Fix SSE route dependency and align architecture docs
This commit is contained in:
5
backend/app/infrastructure/parser/__init__.py
Normal file
5
backend/app/infrastructure/parser/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Initialize the app.infrastructure.parser package."""
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
|
||||
# An empty __all__ means a star-import of this package exports no names.
__all__ = []
|
||||
55
backend/app/infrastructure/parser/aliyun_document_parser.py
Normal file
55
backend/app/infrastructure/parser/aliyun_document_parser.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Implement infrastructure support for aliyun document parser."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.aliyun_parser.parse_pdf import (
|
||||
MAX_CHARS,
|
||||
OVERLAP_CHARS,
|
||||
build_semantic_blocks,
|
||||
build_structure_nodes,
|
||||
build_vector_chunks,
|
||||
collect_all_results,
|
||||
init_client,
|
||||
submit_job,
|
||||
wait_for_completion,
|
||||
)
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
# Adapter over app.aliyun_parser.parse_pdf: each pipeline step is called explicitly so the integration stays easy to audit.
|
||||
|
||||
|
||||
|
||||
class AliyunDocumentParser(DocumentParser):
    """Document parser that delegates to the Aliyun parsing job helpers.

    Every step (client creation, job submission, polling, result collection
    and post-processing) is performed by a function imported from
    ``app.aliyun_parser.parse_pdf``; this class only sequences them and packs
    the outcome into a ``ParsedDocument``.
    """

    parser_name = "aliyun_docmind"

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Run a parsing job for ``file_path`` and assemble the parsed document.

        Args:
            file_path: Path handed to ``submit_job``.
            doc_id: Identifier stored on the result and forwarded to
                ``build_vector_chunks``.
            doc_name: Name stored on the result; also forwarded to
                ``build_vector_chunks`` as ``doc_title``.

        Returns:
            A ``ParsedDocument`` carrying the structure nodes, semantic blocks
            and vector chunks derived from the collected layouts. ``raw_text``
            is the text of every semantic block with a non-empty ``"text"``
            entry, joined by blank lines. ``metadata`` records the job id and
            the number of layouts.

        Raises:
            RuntimeError: If ``wait_for_completion`` returns a falsy value.
        """
        api_client = init_client()
        job_id = submit_job(api_client, file_path)

        job_succeeded = wait_for_completion(api_client, job_id)
        if not job_succeeded:
            raise RuntimeError("阿里云文档解析任务失败")

        page_layouts = collect_all_results(api_client, job_id)

        # Derive the three views of the document in the same order as before.
        nodes = build_structure_nodes(page_layouts)
        blocks = build_semantic_blocks(page_layouts)
        chunk_dicts = build_vector_chunks(
            blocks,
            doc_id=doc_id,
            doc_title=doc_name,
            max_chars=MAX_CHARS,
            overlap_chars=OVERLAP_CHARS,
        )

        # Only blocks that actually carry text contribute to the raw text.
        text_parts = []
        for block in blocks:
            block_text = block.get("text")
            if block_text:
                text_parts.append(block_text)
        joined_text = "\n\n".join(text_parts)

        job_info = {"task_id": job_id, "layout_count": len(page_layouts)}
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=nodes,
            semantic_blocks=blocks,
            vector_chunks=chunk_dicts,
            parser_name=self.parser_name,
            raw_text=joined_text,
            metadata=job_info,
        )
|
||||
66
backend/app/infrastructure/parser/local_chunk_builder.py
Normal file
66
backend/app/infrastructure/parser/local_chunk_builder.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Local chunk builder adapter for the migrated backend architecture."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
|
||||
from app.services.embedding.text_chunker import RegulationChunker
|
||||
|
||||
|
||||
class LocalRegulationChunkBuilder(ChunkBuilder):
    """Chunk builder that splits a document's raw text with ``RegulationChunker``.

    The chunker's output items are converted one by one into ``Chunk`` objects.
    """

    def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
        """Create the underlying chunker.

        Args:
            chunk_size: Forwarded unchanged to ``RegulationChunker``.
            chunk_overlap: Forwarded unchanged to ``RegulationChunker``.
        """
        self.chunker = RegulationChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Split ``parsed_document.raw_text`` and return the resulting chunks.

        Args:
            parsed_document: Source of the text, ``doc_id`` and ``doc_name``.
            regulation_type: Passed to the chunker and stored on every chunk.
            version: Passed to the chunker and stored on every chunk.

        Returns:
            One ``Chunk`` per item produced by the chunker, in the same order;
            an empty list when the stripped raw text is empty.
        """
        source_text = parsed_document.raw_text.strip()
        if not source_text:
            return []

        pieces = self.chunker.chunk_document(
            source_text,
            doc_id=parsed_document.doc_id,
            doc_name=parsed_document.doc_name,
            regulation_type=regulation_type,
            version=version,
        )
        return [
            self._to_chunk(
                piece,
                parsed_document=parsed_document,
                regulation_type=regulation_type,
                version=version,
            )
            for piece in pieces
        ]

    def _to_chunk(
        self,
        piece,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> Chunk:
        """Convert a single chunker output item into a ``Chunk``."""
        info = piece.metadata
        number = info.section_number
        title = info.section_title

        details = {
            "section_number": number,
            "section_title": title,
            "clause_number": info.clause_number,
            "start_position": info.start_position,
            "end_position": info.end_position,
            "token_count": piece.token_count,
            "source": "local_chunker",
        }
        # The path keeps only the section parts that are actually present.
        path = [part for part in (number, title) if part]

        return Chunk(
            chunk_id=info.chunk_id,
            doc_id=parsed_document.doc_id,
            doc_name=parsed_document.doc_name,
            content=piece.content,
            embedding_text=piece.content,
            # Fall back to the section number when there is no title.
            section_title=title or number,
            section_path=path,
            page_number=info.page_number,
            regulation_type=regulation_type,
            version=version,
            semantic_id=info.clause_number,
            block_type="local_markdown_chunk",
            metadata=details,
        )
|
||||
38
backend/app/infrastructure/parser/local_document_parser.py
Normal file
38
backend/app/infrastructure/parser/local_document_parser.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""Local parser adapter for the migrated backend architecture."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
from app.services.parser.docx_parser import parse_docx_to_markdown
|
||||
from app.services.parser.pdf_parser import parse_pdf_to_markdown
|
||||
|
||||
|
||||
class LocalDocumentParser(DocumentParser):
    """Document parser that converts local PDF/DOCX files to markdown text.

    The conversion is done by ``parse_pdf_to_markdown`` and
    ``parse_docx_to_markdown``, selected by file extension.
    """

    parser_name = "local_markdown_parser"

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Convert ``file_path`` to markdown and wrap it in a ``ParsedDocument``.

        Args:
            file_path: Path of the file; its lower-cased extension selects the
                converter.
            doc_id: Identifier stored on the result.
            doc_name: Name stored on the result.

        Returns:
            A ``ParsedDocument`` whose ``raw_text`` is the converter output.
            The structure nodes, semantic blocks and vector chunks are empty
            lists.

        Raises:
            ValueError: If the extension is not ``.pdf``, ``.docx`` or
                ``.doc``, or if the converted text is empty after stripping.
        """
        suffix = Path(file_path).suffix.lower()

        # Pick the converter first, then invoke it once below.
        if suffix == ".pdf":
            to_markdown = parse_pdf_to_markdown
        elif suffix in (".docx", ".doc"):
            # NOTE(review): ".doc" goes through the DOCX converter — confirm
            # that it handles legacy .doc files.
            to_markdown = parse_docx_to_markdown
        else:
            raise ValueError(f"不支持的文件类型: {suffix}")

        converted = to_markdown(file_path)
        if not converted.strip():
            raise ValueError("本地解析完成但未提取到有效文本")

        origin = {"source": "local_parser", "file_suffix": suffix}
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=[],
            semantic_blocks=[],
            vector_chunks=[],
            parser_name=self.parser_name,
            raw_text=converted,
            metadata=origin,
        )
|
||||
48
backend/app/infrastructure/parser/vector_chunk_builder.py
Normal file
48
backend/app/infrastructure/parser/vector_chunk_builder.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""Implement infrastructure support for vector chunk builder."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
|
||||
# Adapter from the vector-chunk dicts on a ParsedDocument to Chunk objects; the field mapping is spelled out so it stays easy to audit.
|
||||
|
||||
|
||||
|
||||
class AliyunVectorChunkBuilder(ChunkBuilder):
    """Chunk builder that maps ``ParsedDocument.vector_chunks`` to ``Chunk`` objects."""

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Convert each vector-chunk dict of ``parsed_document`` into a ``Chunk``.

        Field mapping per item:
            * content: ``"content"``, else ``"text"``, else ``""``.
            * embedding text: ``"embedding_text"``, else the content.
            * section title: ``"section_title"``, else the last entry of
              ``"section_path"``, else ``""``.
            * page number: ``"page_start"``, else ``"page"``, else ``0``,
              converted with ``int``.
            * chunk id: ``"chunk_id"``, else ``"<doc_id>-chunk-<index>"``,
              converted with ``str``.
            * metadata: every key of the item except ``"content"`` and
              ``"embedding_text"``.

        Args:
            parsed_document: Provides ``vector_chunks``, ``doc_id`` and
                ``doc_name``.
            regulation_type: Stored on every chunk.
            version: Stored on every chunk.

        Returns:
            The chunks in input order. Items whose embedding text is blank are
            skipped.
        """
        text_keys = frozenset(("content", "embedding_text"))
        built: list[Chunk] = []

        for position, raw in enumerate(parsed_document.vector_chunks):
            body = raw.get("content") or raw.get("text") or ""
            text_for_embedding = raw.get("embedding_text") or body
            if not text_for_embedding.strip():
                # Nothing to embed for this item.
                continue

            path = raw.get("section_path") or []

            title = raw.get("section_title")
            if not title:
                title = path[-1] if path else ""

            page = raw.get("page_start") or raw.get("page") or 0

            identifier = raw.get("chunk_id")
            if not identifier:
                # The fallback id uses the item's position among all input items.
                identifier = f"{parsed_document.doc_id}-chunk-{position}"

            extras = {key: value for key, value in raw.items() if key not in text_keys}

            chunk = Chunk(
                chunk_id=str(identifier),
                doc_id=parsed_document.doc_id,
                doc_name=parsed_document.doc_name,
                content=body,
                embedding_text=text_for_embedding,
                section_title=title,
                section_path=path,
                page_number=int(page),
                regulation_type=regulation_type,
                version=version,
                semantic_id=raw.get("semantic_id", ""),
                block_type=raw.get("block_type", ""),
                metadata=extras,
            )
            built.append(chunk)

        return built
|
||||
Reference in New Issue
Block a user