AIRegulation-DocAnalysis/backend/app/application/documents/services.py

"""Implement application-layer logic for services."""

from __future__ import annotations

import os
import tempfile
import uuid
import json
from dataclasses import dataclass

from loguru import logger

from app.domain.documents import (
    ChunkBuilder,
    Document,
    DocumentBinaryStore,
    DocumentParser,
    DocumentRepository,
    DocumentStatus,
    ParsedDocument,
)
from app.domain.retrieval import EmbeddingProvider, VectorIndex
# Keep orchestration logic centralized so use-case flow stays easy to trace.


@dataclass
class DocumentProcessResult:
    """Outcome of a single ``DocumentCommandService.upload_and_process`` run.

    The same shape is returned for both success and failure; on failure only
    the first four fields are populated and the rest keep their defaults.
    """
    # Identifier of the processed document (caller-supplied or generated).
    doc_id: str
    # Display name: the explicit ``doc_name`` if given, otherwise the file name.
    doc_name: str
    # String value of the document's ``DocumentStatus`` at the end of the run.
    status: str
    # Human-readable result message (success text or the failure description).
    message: str
    # Number of chunks built for indexing; stays 0 when processing failed.
    num_chunks: int = 0
    # Summary text read back from the stored document record, if any.
    summary: str = ""
    # Summary latency in milliseconds read back from the stored record, if any.
    summary_latency_ms: int = 0


class DocumentCommandService:
    """Orchestrate the write-side document pipeline: store, parse, chunk, embed, index.

    All collaborators are injected, so this class only sequences the steps and
    records progress on the document repository after each stage.
    """

    # Fallback key prefix used when the parser metadata carries no usable one.
    _DEFAULT_ARTIFACT_PREFIX = "artifacts"

    def __init__(
        self,
        *,
        document_repository: DocumentRepository,
        binary_store: DocumentBinaryStore,
        parser: DocumentParser,
        chunk_builder: ChunkBuilder,
        embedding_provider: EmbeddingProvider,
        vector_index: VectorIndex,
    ) -> None:
        """Store the collaborators used by the ingest pipeline.

        Args:
            document_repository: Persists document records and status updates.
            binary_store: Stores the uploaded file and the JSON parse artifacts.
            parser: Turns the uploaded file into a ``ParsedDocument``.
            chunk_builder: Builds indexable chunks from the parsed document.
            embedding_provider: Embeds the chunks' ``embedding_text`` values.
            vector_index: Receives the chunks and vectors via ``upsert``.
        """
        self.document_repository = document_repository
        self.binary_store = binary_store
        self.parser = parser
        self.chunk_builder = chunk_builder
        self.embedding_provider = embedding_provider
        self.vector_index = vector_index

    def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
        """Persist parse artifacts so troubleshooting does not depend on provider retention windows.

        Each of the four parse outputs (layouts, structure nodes, semantic
        blocks, vector chunks) is serialized as pretty-printed UTF-8 JSON and
        saved under ``<artifact_prefix>/<doc_id>/<name>.json``.

        Args:
            doc_id: Identifier of the document the artifacts belong to.
            parsed_document: Parser output; its ``metadata["artifact_prefix"]``
                selects the key prefix when it holds a usable value.

        Returns:
            Mapping from artifact name to the object key it was saved under.

        Raises:
            TypeError: If a payload is not JSON-serializable (from ``json.dumps``).
        """
        # A key that is present but None would crash on ``.strip``, and an empty
        # or slash-only value would yield object keys with a leading "/". All of
        # those cases fall back to the default prefix instead.
        configured_prefix = parsed_document.metadata.get("artifact_prefix") or self._DEFAULT_ARTIFACT_PREFIX
        base_prefix = str(configured_prefix).strip("/") or self._DEFAULT_ARTIFACT_PREFIX
        prefix = f"{base_prefix}/{doc_id}"
        artifact_payloads = {
            "layouts": parsed_document.raw_layouts,
            "structure_nodes": parsed_document.structure_nodes,
            "semantic_blocks": parsed_document.semantic_blocks,
            "vector_chunks": parsed_document.vector_chunks,
        }
        artifact_keys: dict[str, str] = {}
        for name, payload in artifact_payloads.items():
            object_name = f"{prefix}/{name}.json"
            self.binary_store.save(
                object_name=object_name,
                # ensure_ascii=False keeps non-ASCII text readable in the stored JSON.
                data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
                content_type="application/json",
                metadata={"doc_id": doc_id, "artifact_type": name},
            )
            artifact_keys[name] = object_name
        return artifact_keys

    def upload_and_process(
        self,
        *,
        doc_id: str | None = None,
        file_name: str,
        content: bytes,
        content_type: str,
        doc_name: str | None,
        regulation_type: str,
        version: str,
        generate_summary: bool,
    ) -> DocumentProcessResult:
        """Store an uploaded file, parse it, and index its chunks.

        The document record is created first and then moved through the
        STORED, PARSED and INDEXED statuses. Any exception raised after the
        record exists marks it FAILED and is reported through the returned
        result rather than re-raised.

        Args:
            doc_id: Identifier to use; when falsy, the first 8 characters of a
                random UUID4 string are used.
            file_name: Original file name; it is part of the storage key and
                its extension becomes the temp-file suffix.
            content: Raw bytes of the uploaded file.
            content_type: MIME type recorded with the stored binary.
            doc_name: Display name; falls back to ``file_name`` when falsy.
            regulation_type: Passed to the chunk builder and stored on the record.
            version: Passed to the chunk builder and stored on the record.
            generate_summary: Only recorded in the document metadata here; this
                method itself writes an empty summary.

        Returns:
            A ``DocumentProcessResult`` describing success or failure.

        Raises:
            Exception: Whatever ``document_repository.create`` raises, since the
                record is created before the guarded section. An error raised
                while recording the FAILED status also propagates.
        """
        doc_id = doc_id or str(uuid.uuid4())[:8]
        final_doc_name = doc_name or file_name
        object_name = f"{doc_id}/{file_name}"

        document = Document(
            doc_id=doc_id,
            doc_name=final_doc_name,
            file_name=file_name,
            object_name=object_name,
            content_type=content_type,
            size_bytes=len(content),
            regulation_type=regulation_type,
            version=version,
            metadata={"generate_summary": generate_summary},
        )
        self.document_repository.create(document)

        temp_path = ""
        try:
            self.binary_store.save(
                object_name=object_name,
                data=content,
                content_type=content_type,
                metadata={"doc_id": doc_id},
            )
            self.document_repository.update_status(doc_id, DocumentStatus.STORED)

            # The parser takes a file path, so the upload is written to a local temp file.
            suffix = os.path.splitext(file_name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                # Record the path before writing: if the write fails, the
                # ``finally`` block must still be able to remove the file.
                temp_path = temp_file.name
                temp_file.write(content)

            parsed_document = self.parser.parse(
                file_path=temp_path,
                doc_id=doc_id,
                doc_name=final_doc_name,
            )
            artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.PARSED,
                parser_name=parsed_document.parser_name,
                metadata={
                    "parser_backend": parsed_document.parser_name,
                    "parse_task_id": parsed_document.metadata.get("task_id", ""),
                    "layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
                    "structure_node_count": len(parsed_document.structure_nodes),
                    "semantic_block_count": len(parsed_document.semantic_blocks),
                    "vector_chunk_count": len(parsed_document.vector_chunks),
                    "artifact_keys": artifact_keys,
                    "processing_stage": "parsed",
                },
            )

            chunks = self.chunk_builder.build(
                parsed_document=parsed_document,
                regulation_type=regulation_type,
                version=version,
            )
            if not chunks:
                # Nothing to index: route through the failure handler below.
                raise ValueError("解析完成但没有生成可入库的 chunks")

            vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks])
            inserted = self.vector_index.upsert(chunks, vectors)
            if inserted != len(chunks):
                # A count mismatch is logged only; the document is still marked INDEXED.
                logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))

            health = self.vector_index.health()
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.INDEXED,
                chunk_count=len(chunks),
                summary="",
                summary_latency_ms=0,
                index_name=health.get("collection_name", ""),
                metadata={
                    "index_collection": health.get("collection_name", ""),
                    "processing_stage": "indexed",
                },
            )
            # Re-read the record so the result reflects what was actually persisted.
            stored = self.document_repository.get(doc_id)
            return DocumentProcessResult(
                doc_id=doc_id,
                doc_name=final_doc_name,
                status=(stored.status.value if stored else DocumentStatus.INDEXED.value),
                message="处理成功",
                num_chunks=len(chunks),
                summary=stored.summary if stored else "",
                summary_latency_ms=stored.summary_latency_ms if stored else 0,
            )
        except Exception as exc:
            # Pipeline boundary: log with traceback, persist the failure, and
            # report it through the result instead of raising.
            logger.exception("文档处理失败: doc_id={}", doc_id)
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.FAILED,
                error_message=str(exc),
                metadata={
                    "failure_reason": str(exc),
                    "processing_stage": "failed",
                },
            )
            return DocumentProcessResult(
                doc_id=doc_id,
                doc_name=final_doc_name,
                status=DocumentStatus.FAILED.value,
                message=f"文档处理失败: {exc}",
            )
        finally:
            # Best-effort cleanup: a leftover temp file is logged, never raised.
            if temp_path and os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError:
                    logger.warning("临时文件清理失败: {}", temp_path)


class DocumentQueryService:
    """Read-side facade over the document repository and the binary store."""

    def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore) -> None:
        """Keep references to the two collaborators that serve read requests."""
        self.binary_store = binary_store
        self.document_repository = document_repository

    def get(self, doc_id: str) -> Document | None:
        """Look up one document record and return the repository's answer unchanged."""
        record = self.document_repository.get(doc_id)
        return record

    def list_documents(self, limit: int | None = None) -> list[Document]:
        """Return the repository's document listing, forwarding ``limit`` as given."""
        listing = self.document_repository.list(limit=limit)
        return listing

    def download(self, doc_id: str) -> tuple[Document, bytes]:
        """Return the document record together with its stored bytes.

        Raises:
            FileNotFoundError: If the repository has no record for ``doc_id``.
        """
        record = self.document_repository.get(doc_id)
        if record:
            # The binary is addressed by the key saved on the record.
            payload = self.binary_store.read(record.object_name)
            return record, payload
        raise FileNotFoundError(f"文档不存在: {doc_id}")
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`"""Implement application-layer logic for services."""`

			`from __future__ import annotations`

			`import os`
			`import tempfile`
			`import uuid`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`import json`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`from dataclasses import dataclass`

			`from loguru import logger`

			`from app.domain.documents import (`
			`ChunkBuilder,`
			`Document,`
			`DocumentBinaryStore,`
			`DocumentParser,`
			`DocumentRepository,`
			`DocumentStatus,`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`ParsedDocument,`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`)`
			`from app.domain.retrieval import EmbeddingProvider, VectorIndex`
			`# Keep orchestration logic centralized so use-case flow stays easy to trace.`



			`@dataclass`
			`class DocumentProcessResult:`
			`"""Represent document process result data."""`
			`doc_id: str`
			`doc_name: str`
			`status: str`
			`message: str`
			`num_chunks: int = 0`
			`summary: str = ""`
			`summary_latency_ms: int = 0`


			`class DocumentCommandService:`
			`"""Provide the Document Command Service service."""`
			`def __init__(`
			`self,`
			`*,`
			`document_repository: DocumentRepository,`
			`binary_store: DocumentBinaryStore,`
			`parser: DocumentParser,`
			`chunk_builder: ChunkBuilder,`
			`embedding_provider: EmbeddingProvider,`
			`vector_index: VectorIndex,`
			`) -> None:`
			`"""Initialize the Document Command Service instance."""`
			`self.document_repository = document_repository`
			`self.binary_store = binary_store`
			`self.parser = parser`
			`self.chunk_builder = chunk_builder`
			`self.embedding_provider = embedding_provider`
			`self.vector_index = vector_index`

feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:`
			`"""Persist parse artifacts so troubleshooting does not depend on provider retention windows."""`
			`prefix = f"{parsed_document.metadata.get('artifact_prefix', 'artifacts').strip('/')}/{doc_id}"`
			`artifact_payloads = {`
			`"layouts": parsed_document.raw_layouts,`
			`"structure_nodes": parsed_document.structure_nodes,`
			`"semantic_blocks": parsed_document.semantic_blocks,`
			`"vector_chunks": parsed_document.vector_chunks,`
			`}`
			`artifact_keys: dict[str, str] = {}`
			`for name, payload in artifact_payloads.items():`
			`object_name = f"{prefix}/{name}.json"`
			`self.binary_store.save(`
			`object_name=object_name,`
			`data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),`
			`content_type="application/json",`
			`metadata={"doc_id": doc_id, "artifact_type": name},`
			`)`
			`artifact_keys[name] = object_name`
			`return artifact_keys`

Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`def upload_and_process(`
			`self,`
			`*,`
			`doc_id: str \| None = None,`
			`file_name: str,`
			`content: bytes,`
			`content_type: str,`
			`doc_name: str \| None,`
			`regulation_type: str,`
			`version: str,`
			`generate_summary: bool,`
			`) -> DocumentProcessResult:`
			`"""Handle upload and process for the Document Command Service instance."""`
			`doc_id = doc_id or str(uuid.uuid4())[:8]`
			`final_doc_name = doc_name or file_name`
			`object_name = f"{doc_id}/{file_name}"`

			`document = Document(`
			`doc_id=doc_id,`
			`doc_name=final_doc_name,`
			`file_name=file_name,`
			`object_name=object_name,`
			`content_type=content_type,`
			`size_bytes=len(content),`
			`regulation_type=regulation_type,`
			`version=version,`
			`metadata={"generate_summary": generate_summary},`
			`)`
			`self.document_repository.create(document)`

			`temp_path = ""`
			`try:`
			`self.binary_store.save(`
			`object_name=object_name,`
			`data=content,`
			`content_type=content_type,`
			`metadata={"doc_id": doc_id},`
			`)`
			`self.document_repository.update_status(doc_id, DocumentStatus.STORED)`

			`suffix = os.path.splitext(file_name)[1]`
			`with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:`
			`temp_file.write(content)`
			`temp_path = temp_file.name`

			`parsed_document = self.parser.parse(`
			`file_path=temp_path,`
			`doc_id=doc_id,`
			`doc_name=final_doc_name,`
			`)`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`self.document_repository.update_status(`
			`doc_id,`
			`DocumentStatus.PARSED,`
			`parser_name=parsed_document.parser_name,`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`metadata={`
			`"parser_backend": parsed_document.parser_name,`
			`"parse_task_id": parsed_document.metadata.get("task_id", ""),`
			`"layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),`
			`"structure_node_count": len(parsed_document.structure_nodes),`
			`"semantic_block_count": len(parsed_document.semantic_blocks),`
			`"vector_chunk_count": len(parsed_document.vector_chunks),`
			`"artifact_keys": artifact_keys,`
			`"processing_stage": "parsed",`
			`},`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`)`

			`chunks = self.chunk_builder.build(`
			`parsed_document=parsed_document,`
			`regulation_type=regulation_type,`
			`version=version,`
			`)`
			`if not chunks:`
			`raise ValueError("解析完成但没有生成可入库的 chunks")`

			`vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks])`
			`inserted = self.vector_index.upsert(chunks, vectors)`
			`if inserted != len(chunks):`
			`logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))`

feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`health = self.vector_index.health()`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`self.document_repository.update_status(`
			`doc_id,`
			`DocumentStatus.INDEXED,`
			`chunk_count=len(chunks),`
			`summary="",`
			`summary_latency_ms=0,`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`index_name=health.get("collection_name", ""),`
			`metadata={`
			`"index_collection": health.get("collection_name", ""),`
			`"processing_stage": "indexed",`
			`},`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`)`
			`stored = self.document_repository.get(doc_id)`
			`return DocumentProcessResult(`
			`doc_id=doc_id,`
			`doc_name=final_doc_name,`
			`status=(stored.status.value if stored else DocumentStatus.INDEXED.value),`
			`message="处理成功",`
			`num_chunks=len(chunks),`
			`summary=stored.summary if stored else "",`
			`summary_latency_ms=stored.summary_latency_ms if stored else 0,`
			`)`
			`except Exception as exc:`
			`logger.exception("文档处理失败: doc_id={}", doc_id)`
			`self.document_repository.update_status(`
			`doc_id,`
			`DocumentStatus.FAILED,`
			`error_message=str(exc),`
feat: Migrate document parsing to Aliyun and update embedding configurations - Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend. 2026-05-18 22:30:28 +08:00			`metadata={`
			`"failure_reason": str(exc),`
			`"processing_stage": "failed",`
			`},`
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`)`
			`return DocumentProcessResult(`
			`doc_id=doc_id,`
			`doc_name=final_doc_name,`
			`status=DocumentStatus.FAILED.value,`
			`message=f"文档处理失败: {exc}",`
			`)`
			`finally:`
			`if temp_path and os.path.exists(temp_path):`
			`try:`
			`os.remove(temp_path)`
			`except OSError:`
			`logger.warning("临时文件清理失败: {}", temp_path)`


			`class DocumentQueryService:`
			`"""Provide the Document Query Service service."""`
			`def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore) -> None:`
			`"""Initialize the Document Query Service instance."""`
			`self.document_repository = document_repository`
			`self.binary_store = binary_store`

			`def get(self, doc_id: str) -> Document \| None:`
			`"""Handle get for the Document Query Service instance."""`
			`return self.document_repository.get(doc_id)`

			`def list_documents(self, limit: int \| None = None) -> list[Document]:`
			`"""List documents for the Document Query Service instance."""`
			`return self.document_repository.list(limit=limit)`

			`def download(self, doc_id: str) -> tuple[Document, bytes]:`
			`"""Handle download for the Document Query Service instance."""`
			`document = self.document_repository.get(doc_id)`
			`if not document:`
			`raise FileNotFoundError(f"文档不存在: {doc_id}")`
			`return document, self.binary_store.read(document.object_name)`