Fix SSE route dependency and align architecture docs

2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions
--- a/backend/app/infrastructure/parser/init.py
+++ b/backend/app/infrastructure/parser/init.py
@@ -0,0 +1,5 @@
+"""Initialize the app.infrastructure.parser package."""
+# Keep package boundaries explicit so backend imports stay predictable.
+
+
+__all__ = []
--- a/backend/app/infrastructure/parser/aliyun_document_parser.py
+++ b/backend/app/infrastructure/parser/aliyun_document_parser.py
@@ -0,0 +1,55 @@
+"""Adapter that exposes the app.aliyun_parser pipeline through the DocumentParser port."""
+
+from __future__ import annotations
+
+from app.aliyun_parser.parse_pdf import (
+    MAX_CHARS,
+    OVERLAP_CHARS,
+    build_semantic_blocks,
+    build_structure_nodes,
+    build_vector_chunks,
+    collect_all_results,
+    init_client,
+    submit_job,
+    wait_for_completion,
+)
+from app.domain.documents import DocumentParser, ParsedDocument
+# Keep adapter behavior explicit so integration details remain easy to audit.
+
+
+
+class AliyunDocumentParser(DocumentParser):
+    """Parser adapter that delegates to the app.aliyun_parser job pipeline."""
+
+    parser_name = "aliyun_docmind"
+
+    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
+        """Run a parse job for file_path and wrap its results in a ParsedDocument.
+
+        Raises RuntimeError when wait_for_completion reports a falsy outcome.
+        """
+        api = init_client()
+        job_id = submit_job(api, file_path)
+        finished = wait_for_completion(api, job_id)
+        if not finished:
+            raise RuntimeError("阿里云文档解析任务失败")
+
+        layout_items = collect_all_results(api, job_id)
+        nodes = build_structure_nodes(layout_items)
+        blocks = build_semantic_blocks(layout_items)
+        chunk_dicts = build_vector_chunks(
+            blocks, doc_id=doc_id, doc_title=doc_name,
+            max_chars=MAX_CHARS, overlap_chars=OVERLAP_CHARS,
+        )
+        # Blocks with missing or empty text are left out of the joined raw text.
+        paragraphs = [block.get("text", "") for block in blocks if block.get("text")]
+        return ParsedDocument(
+            doc_id=doc_id,
+            doc_name=doc_name,
+            structure_nodes=nodes,
+            semantic_blocks=blocks,
+            vector_chunks=chunk_dicts,
+            parser_name=self.parser_name,
+            raw_text="\n\n".join(paragraphs),
+            metadata={"task_id": job_id, "layout_count": len(layout_items)},
+        )
--- a/backend/app/infrastructure/parser/local_chunk_builder.py
+++ b/backend/app/infrastructure/parser/local_chunk_builder.py
@@ -0,0 +1,66 @@
+"""Local chunk builder adapter for the migrated backend architecture."""
+
+from __future__ import annotations
+
+from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
+from app.services.embedding.text_chunker import RegulationChunker
+
+
+class LocalRegulationChunkBuilder(ChunkBuilder):
+    """Chunk builder port implementation backed by RegulationChunker."""
+
+    def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
+        """Create the wrapped RegulationChunker with the given size and overlap."""
+        self.chunker = RegulationChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+    def build(
+        self,
+        *,
+        parsed_document: ParsedDocument,
+        regulation_type: str,
+        version: str,
+    ) -> list[Chunk]:
+        """Chunk the stripped raw_text and map every resulting piece to a Chunk.
+
+        Returns an empty list when raw_text is empty or whitespace-only.
+        """
+        text = parsed_document.raw_text.strip()
+        if not text:
+            return []
+        doc_id = parsed_document.doc_id
+        doc_name = parsed_document.doc_name
+        pieces = self.chunker.chunk_document(
+            text, doc_id=doc_id, doc_name=doc_name,
+            regulation_type=regulation_type, version=version,
+        )
+        result: list[Chunk] = []
+        for piece in pieces:
+            meta = piece.metadata
+            # Keep only the section identifiers that are actually set.
+            path = [part for part in (meta.section_number, meta.section_title) if part]
+            extra = {
+                "section_number": meta.section_number,
+                "section_title": meta.section_title,
+                "clause_number": meta.clause_number,
+                "start_position": meta.start_position,
+                "end_position": meta.end_position,
+                "token_count": piece.token_count,
+                "source": "local_chunker",
+            }
+            chunk = Chunk(
+                chunk_id=meta.chunk_id,
+                doc_id=doc_id,
+                doc_name=doc_name,
+                content=piece.content,
+                embedding_text=piece.content,
+                section_title=meta.section_title or meta.section_number,
+                section_path=path,
+                page_number=meta.page_number,
+                regulation_type=regulation_type,
+                version=version,
+                semantic_id=meta.clause_number,
+                block_type="local_markdown_chunk",
+                metadata=extra,
+            )
+            result.append(chunk)
+        return result
--- a/backend/app/infrastructure/parser/local_document_parser.py
+++ b/backend/app/infrastructure/parser/local_document_parser.py
@@ -0,0 +1,38 @@
+"""Local parser adapter for the migrated backend architecture."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from app.domain.documents import DocumentParser, ParsedDocument
+from app.services.parser.docx_parser import parse_docx_to_markdown
+from app.services.parser.pdf_parser import parse_pdf_to_markdown
+
+
+class LocalDocumentParser(DocumentParser):
+    """Parser port implementation that routes files to the local PDF/DOCX parsers."""
+
+    parser_name = "local_markdown_parser"
+
+    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
+        """Pick a local parser by lowercased file suffix and wrap its text output.
+
+        Raises ValueError for an unhandled suffix or when the parsed text is blank.
+        """
+        extension = Path(file_path).suffix.lower()
+        converters = {
+            ".pdf": parse_pdf_to_markdown,
+            ".docx": parse_docx_to_markdown,
+            ".doc": parse_docx_to_markdown,  # NOTE(review): confirm legacy .doc is supported
+        }
+        convert = converters.get(extension)
+        if convert is None:
+            raise ValueError(f"不支持的文件类型: {extension}")
+        markdown = convert(file_path)
+        if not markdown.strip():
+            raise ValueError("本地解析完成但未提取到有效文本")
+        return ParsedDocument(
+            doc_id=doc_id, doc_name=doc_name, parser_name=self.parser_name,
+            structure_nodes=[], semantic_blocks=[], vector_chunks=[], raw_text=markdown,
+            metadata={"source": "local_parser", "file_suffix": extension},
+        )
--- a/backend/app/infrastructure/parser/vector_chunk_builder.py
+++ b/backend/app/infrastructure/parser/vector_chunk_builder.py
@@ -0,0 +1,48 @@
+"""Adapter that converts parsed Aliyun vector chunk entries into domain Chunk objects."""
+
+from __future__ import annotations
+
+from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
+# Keep adapter behavior explicit so integration details remain easy to audit.
+
+
+
+class AliyunVectorChunkBuilder(ChunkBuilder):
+    """Chunk builder that maps parsed_document.vector_chunks entries to Chunk objects."""
+
+    def build(
+        self,
+        *,
+        parsed_document: ParsedDocument,
+        regulation_type: str,
+        version: str,
+    ) -> list[Chunk]:
+        """Return one Chunk per vector chunk entry that has non-blank embedding text.
+
+        Entries are read with .get(), so missing or falsy fields fall back to defaults.
+        """
+        skipped_keys = {"content", "embedding_text"}
+        result: list[Chunk] = []
+        for position, entry in enumerate(parsed_document.vector_chunks):
+            body = entry.get("content") or entry.get("text") or ""
+            text_for_embedding = entry.get("embedding_text") or body
+            if not text_for_embedding.strip():
+                continue
+            path = entry.get("section_path") or []
+            title = entry.get("section_title") or (path[-1] if path else "")
+            page = entry.get("page_start") or entry.get("page") or 0
+            # The fallback id uses the entry's index in the source list, skipped ones included.
+            identifier = entry.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{position}"
+            extras = {key: value for key, value in entry.items() if key not in skipped_keys}
+            chunk = Chunk(
+                chunk_id=str(identifier), doc_id=parsed_document.doc_id,
+                doc_name=parsed_document.doc_name,
+                content=body, embedding_text=text_for_embedding,
+                section_title=title, section_path=path, page_number=int(page),
+                regulation_type=regulation_type, version=version,
+                semantic_id=entry.get("semantic_id", ""),
+                block_type=entry.get("block_type", ""),
+                metadata=extras,
+            )
+            result.append(chunk)
+        return result