1. Add 登录功能

2. 调整字体大小
3. 新增部分功能
2026-06-05 18:00:31 +08:00
parent 06e0967128
commit 9fea9c6a53
58 changed files with 5028 additions and 322 deletions
--- a/backend/app/application/documents/services.py
+++ b/backend/app/application/documents/services.py
@@ -277,7 +277,6 @@ class DocumentCommandService:
            message="Document record created",
        )

-        temp_path = ""
        try:
            self.binary_store.save(
                object_name=object_name,
@@ -297,117 +296,20 @@ class DocumentCommandService:
                stage="store",
                message="Source file stored",
            )
-
-            suffix = os.path.splitext(file_name)[1]
-            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
-                temp_file.write(content)
-                temp_path = temp_file.name
-
-            parsed_document = self.parser.parse(
-                file_path=temp_path,
+            # Delegate parse → embed → index to the shared processing method.
+            # This same method is invoked by the Celery worker for async processing.
+            return self._process_document(
                doc_id=doc_id,
-                doc_name=final_doc_name,
-            )
-            self._safe_mark_run_parsed(doc_id=doc_id, run_id=run_id, parsed_document=parsed_document)
-
-            artifact_keys: dict[str, str] = {}
-            try:
-                artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
-            except Exception:
-                logger.warning("Parse artifact binary persistence failed for doc_id={}", doc_id)
-            self.document_repository.update_status(
-                doc_id,
-                DocumentStatus.PARSED,
-                parser_name=parsed_document.parser_name,
-                metadata={
-                    "parser_backend": parsed_document.parser_name,
-                    "parse_task_id": parsed_document.metadata.get("task_id", ""),
-                    "layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
-                    "structure_node_count": len(parsed_document.structure_nodes),
-                    "semantic_block_count": len(parsed_document.semantic_blocks),
-                    "vector_chunk_count": len(parsed_document.vector_chunks),
-                    "artifact_keys": artifact_keys,
-                    "processing_stage": "parsed",
-                },
-            )
-            current_status = DocumentStatus.PARSED
-            current_stage = "embed"
-            self._safe_replace_processing_artifacts(doc_id=doc_id, run_id=run_id, artifact_keys=artifact_keys)
-            self._safe_append_status_event(
-                doc_id=doc_id,
-                run_id=run_id,
-                from_status=DocumentStatus.STORED.value,
-                to_status=DocumentStatus.PARSED.value,
-                stage="parse",
-                message="Document parsed",
-                metadata={"artifact_count": len(artifact_keys)},
-            )
-            if self.parse_artifact_store:
-                try:
-                    self.parse_artifact_store.save(
-                        doc_id,
-                        parsed_document.structure_nodes,
-                        parsed_document.semantic_blocks,
-                    )
-                except Exception:
-                    logger.warning("ParseArtifactStore.save failed for doc_id={}", doc_id)
-
-            chunks = self.chunk_builder.build(
-                parsed_document=parsed_document,
+                file_name=file_name,
+                final_doc_name=final_doc_name,
+                content=content,
                regulation_type=regulation_type,
                version=version,
-            )
-            if not chunks:
-                raise ValueError("解析完成但没有生成可入库的 chunks")
-
-            vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks])
-            current_stage = "index"
-            inserted = self.vector_index.upsert(chunks, vectors)
-            if inserted != len(chunks):
-                logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
-
-            health = self.vector_index.health()
-            self.document_repository.update_status(
-                doc_id,
-                DocumentStatus.INDEXED,
-                chunk_count=len(chunks),
-                summary="",
-                summary_latency_ms=0,
-                index_name=health.get("collection_name", ""),
-                metadata={
-                    "index_collection": health.get("collection_name", ""),
-                    "processing_stage": "indexed",
-                },
-            )
-            current_status = DocumentStatus.INDEXED
-            index_name = health.get("collection_name", "")
-            self._safe_mark_run_indexed(
-                doc_id=doc_id,
+                generate_summary=generate_summary,
                run_id=run_id,
-                chunk_count=len(chunks),
-                index_name=index_name,
-            )
-            self._safe_append_status_event(
-                doc_id=doc_id,
-                run_id=run_id,
-                from_status=DocumentStatus.PARSED.value,
-                to_status=DocumentStatus.INDEXED.value,
-                stage="index",
-                message="Document indexed",
-                metadata={"chunk_count": len(chunks), "index_name": index_name},
-            )
-            stored = self.document_repository.get(doc_id)
-            return DocumentProcessResult(
-                doc_id=doc_id,
-                doc_name=final_doc_name,
-                status=(stored.status.value if stored else DocumentStatus.INDEXED.value),
-                message="处理成功",
-                num_chunks=len(chunks),
-                summary=stored.summary if stored else "",
-                summary_latency_ms=stored.summary_latency_ms if stored else 0,
            )
        except Exception as exc:
-            logger.exception("文档处理失败: doc_id={}", doc_id)
+            logger.exception("文档存储失败: doc_id={}", doc_id)
            failure_stage = current_stage
            self.document_repository.update_status(
                doc_id,
@@ -439,6 +341,183 @@ class DocumentCommandService:
                status=DocumentStatus.FAILED.value,
                message=f"文档处理失败: {exc}",
            )
+
+    def store_document(
+        self,
+        *,
+        doc_id: str | None = None,
+        file_name: str,
+        content: bytes,
+        content_type: str,
+        doc_name: str | None,
+        regulation_type: str,
+        version: str,
+        generate_summary: bool,
+    ) -> tuple[str, str | None]:
+        """Store the binary file and create the Document record.
+
+        Returns (doc_id, run_id). Does NOT parse, embed, or index.
+        This is the fast synchronous first step; processing is enqueued separately.
+        The caller is responsible for enqueuing the follow-up process_document_task.
+        """
+        doc_id = doc_id or str(uuid.uuid4())[:8]
+        final_doc_name = doc_name or file_name
+        object_name = f"{doc_id}/{file_name}"
+
+        document = Document(
+            doc_id=doc_id,
+            doc_name=final_doc_name,
+            file_name=file_name,
+            object_name=object_name,
+            content_type=content_type,
+            size_bytes=len(content),
+            regulation_type=regulation_type,
+            version=version,
+            metadata={"generate_summary": generate_summary},
+        )
+        self.document_repository.create(document)
+        run_id = self._safe_create_processing_run(
+            doc_id=doc_id, trigger_type="upload", generate_summary=generate_summary
+        )
+        self.binary_store.save(
+            object_name=object_name, data=content,
+            content_type=content_type, metadata={"doc_id": doc_id},
+        )
+        self.document_repository.update_status(doc_id, DocumentStatus.STORED)
+        self._safe_mark_run_stored(doc_id=doc_id, run_id=run_id)
+        self._safe_append_status_event(
+            doc_id=doc_id, run_id=run_id,
+            from_status=DocumentStatus.PENDING.value, to_status=DocumentStatus.STORED.value,
+            stage="store", message="Source file stored",
+        )
+        return doc_id, run_id
+
+    def _process_document(
+        self,
+        *,
+        doc_id: str,
+        file_name: str,
+        final_doc_name: str,
+        content: bytes,
+        regulation_type: str,
+        version: str,
+        generate_summary: bool,
+        run_id: str | None = None,
+    ) -> DocumentProcessResult:
+        """Run parse → chunk → embed → index for a document that is already stored.
+
+        Called both synchronously (from upload_and_process) and asynchronously
+        (from the Celery process_document_task worker). All side-effects write
+        through DocumentProcessingStore so callers can poll progress.
+        """
+        current_status = DocumentStatus.STORED
+        current_stage = "parse"
+        temp_path = ""
+        try:
+            suffix = os.path.splitext(file_name)[1]
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
+                temp_file.write(content)
+                temp_path = temp_file.name
+
+            parsed_document = self.parser.parse(
+                file_path=temp_path,
+                doc_id=doc_id,
+                doc_name=final_doc_name,
+            )
+            self._safe_mark_run_parsed(doc_id=doc_id, run_id=run_id, parsed_document=parsed_document)
+
+            artifact_keys: dict[str, str] = {}
+            try:
+                artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
+            except Exception:
+                logger.warning("Parse artifact binary persistence failed for doc_id={}", doc_id)
+
+            self.document_repository.update_status(
+                doc_id,
+                DocumentStatus.PARSED,
+                parser_name=parsed_document.parser_name,
+                metadata={
+                    "parser_backend": parsed_document.parser_name,
+                    "parse_task_id": parsed_document.metadata.get("task_id", ""),
+                    "layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
+                    "structure_node_count": len(parsed_document.structure_nodes),
+                    "semantic_block_count": len(parsed_document.semantic_blocks),
+                    "vector_chunk_count": len(parsed_document.vector_chunks),
+                    "artifact_keys": artifact_keys,
+                    "processing_stage": "parsed",
+                },
+            )
+            current_status = DocumentStatus.PARSED
+            current_stage = "embed"
+            self._safe_replace_processing_artifacts(doc_id=doc_id, run_id=run_id, artifact_keys=artifact_keys)
+            self._safe_append_status_event(
+                doc_id=doc_id, run_id=run_id,
+                from_status=DocumentStatus.STORED.value, to_status=DocumentStatus.PARSED.value,
+                stage="parse", message="Document parsed", metadata={"artifact_count": len(artifact_keys)},
+            )
+            if self.parse_artifact_store:
+                try:
+                    self.parse_artifact_store.save(
+                        doc_id, parsed_document.structure_nodes, parsed_document.semantic_blocks,
+                    )
+                except Exception:
+                    logger.warning("ParseArtifactStore.save failed for doc_id={}", doc_id)
+
+            chunks = self.chunk_builder.build(
+                parsed_document=parsed_document,
+                regulation_type=regulation_type,
+                version=version,
+            )
+            if not chunks:
+                raise ValueError("解析完成但没有生成可入库的 chunks")
+
+            vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks])
+            current_stage = "index"
+            inserted = self.vector_index.upsert(chunks, vectors)
+            if inserted != len(chunks):
+                logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
+
+            health = self.vector_index.health()
+            index_name = health.get("collection_name", "")
+            self.document_repository.update_status(
+                doc_id, DocumentStatus.INDEXED,
+                chunk_count=len(chunks), summary="", summary_latency_ms=0,
+                index_name=index_name,
+                metadata={"index_collection": index_name, "processing_stage": "indexed"},
+            )
+            self._safe_mark_run_indexed(doc_id=doc_id, run_id=run_id, chunk_count=len(chunks), index_name=index_name)
+            self._safe_append_status_event(
+                doc_id=doc_id, run_id=run_id,
+                from_status=DocumentStatus.PARSED.value, to_status=DocumentStatus.INDEXED.value,
+                stage="index", message="Document indexed",
+                metadata={"chunk_count": len(chunks), "index_name": index_name},
+            )
+            stored = self.document_repository.get(doc_id)
+            return DocumentProcessResult(
+                doc_id=doc_id, doc_name=final_doc_name,
+                status=(stored.status.value if stored else DocumentStatus.INDEXED.value),
+                message="处理成功", num_chunks=len(chunks),
+                summary=stored.summary if stored else "",
+                summary_latency_ms=stored.summary_latency_ms if stored else 0,
+            )
+        except Exception as exc:
+            logger.exception("文档处理失败: doc_id={}", doc_id)
+            self.document_repository.update_status(
+                doc_id, DocumentStatus.FAILED, error_message=str(exc),
+                metadata={"failure_reason": str(exc), "processing_stage": "failed", "failure_stage": current_stage},
+            )
+            self._safe_mark_run_failed(
+                doc_id=doc_id, run_id=run_id, failure_stage=current_stage, error_message=str(exc)
+            )
+            self._safe_append_status_event(
+                doc_id=doc_id, run_id=run_id,
+                from_status=current_status.value, to_status=DocumentStatus.FAILED.value,
+                stage=current_stage, message=str(exc),
+            )
+            return DocumentProcessResult(
+                doc_id=doc_id, doc_name=final_doc_name,
+                status=DocumentStatus.FAILED.value, message=f"文档处理失败: {exc}",
+            )
        finally:
            if temp_path and os.path.exists(temp_path):
                try:
@@ -446,7 +525,6 @@ class DocumentCommandService:
                except OSError:
                    logger.warning("临时文件清理失败: {}", temp_path)

-
    def delete(self, doc_id: str) -> bool:
        """Delete document record, binary file, and vector chunks."""
        document = self.document_repository.get(doc_id)