2026-05-18 16:32:42 +08:00
|
|
|
"""Implement application-layer logic for services."""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import tempfile
|
|
|
|
|
import uuid
|
2026-05-18 22:30:28 +08:00
|
|
|
import json
|
2026-05-18 16:32:42 +08:00
|
|
|
from dataclasses import dataclass
|
2026-05-26 12:34:12 +08:00
|
|
|
from datetime import UTC, datetime
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
from loguru import logger
|
2026-05-26 12:34:12 +08:00
|
|
|
from app.config.settings import settings
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
from app.domain.documents import (
|
|
|
|
|
ChunkBuilder,
|
|
|
|
|
Document,
|
2026-05-26 12:34:12 +08:00
|
|
|
DocumentArtifact,
|
2026-05-18 16:32:42 +08:00
|
|
|
DocumentBinaryStore,
|
|
|
|
|
DocumentParser,
|
2026-05-26 12:34:12 +08:00
|
|
|
DocumentProcessingRun,
|
|
|
|
|
DocumentProcessingStore,
|
2026-05-18 16:32:42 +08:00
|
|
|
DocumentRepository,
|
|
|
|
|
DocumentStatus,
|
2026-05-26 12:34:12 +08:00
|
|
|
DocumentStatusEvent,
|
2026-05-20 23:34:08 +08:00
|
|
|
ParseArtifactStore,
|
2026-05-18 22:30:28 +08:00
|
|
|
ParsedDocument,
|
2026-05-18 16:32:42 +08:00
|
|
|
)
|
|
|
|
|
from app.domain.retrieval import EmbeddingProvider, VectorIndex
|
|
|
|
|
# Keep orchestration logic centralized so use-case flow stays easy to trace.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class DocumentProcessResult:
|
|
|
|
|
"""Represent document process result data."""
|
|
|
|
|
doc_id: str
|
|
|
|
|
doc_name: str
|
|
|
|
|
status: str
|
|
|
|
|
message: str
|
|
|
|
|
num_chunks: int = 0
|
|
|
|
|
summary: str = ""
|
|
|
|
|
summary_latency_ms: int = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentCommandService:
|
|
|
|
|
"""Provide the Document Command Service service."""
|
2026-05-26 12:34:12 +08:00
|
|
|
|
2026-05-18 16:32:42 +08:00
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
document_repository: DocumentRepository,
|
|
|
|
|
binary_store: DocumentBinaryStore,
|
|
|
|
|
parser: DocumentParser,
|
|
|
|
|
chunk_builder: ChunkBuilder,
|
|
|
|
|
embedding_provider: EmbeddingProvider,
|
|
|
|
|
vector_index: VectorIndex,
|
2026-05-20 23:34:08 +08:00
|
|
|
parse_artifact_store: ParseArtifactStore | None = None,
|
2026-05-26 12:34:12 +08:00
|
|
|
document_processing_store: DocumentProcessingStore | None = None,
|
2026-05-18 16:32:42 +08:00
|
|
|
) -> None:
|
|
|
|
|
"""Initialize the Document Command Service instance."""
|
|
|
|
|
self.document_repository = document_repository
|
|
|
|
|
self.binary_store = binary_store
|
|
|
|
|
self.parser = parser
|
|
|
|
|
self.chunk_builder = chunk_builder
|
|
|
|
|
self.embedding_provider = embedding_provider
|
|
|
|
|
self.vector_index = vector_index
|
2026-05-20 23:34:08 +08:00
|
|
|
self.parse_artifact_store = parse_artifact_store
|
2026-05-26 12:34:12 +08:00
|
|
|
self.document_processing_store = document_processing_store
|
|
|
|
|
|
|
|
|
|
def _utcnow(self) -> datetime:
|
|
|
|
|
"""Return the current UTC timestamp for persisted processing metadata."""
|
|
|
|
|
return datetime.now(UTC)
|
2026-05-18 16:32:42 +08:00
|
|
|
|
2026-05-18 22:30:28 +08:00
|
|
|
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
|
|
|
|
|
"""Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
|
|
|
|
|
prefix = f"{parsed_document.metadata.get('artifact_prefix', 'artifacts').strip('/')}/{doc_id}"
|
|
|
|
|
artifact_payloads = {
|
|
|
|
|
"layouts": parsed_document.raw_layouts,
|
|
|
|
|
"structure_nodes": parsed_document.structure_nodes,
|
|
|
|
|
"semantic_blocks": parsed_document.semantic_blocks,
|
|
|
|
|
"vector_chunks": parsed_document.vector_chunks,
|
|
|
|
|
}
|
|
|
|
|
artifact_keys: dict[str, str] = {}
|
|
|
|
|
for name, payload in artifact_payloads.items():
|
|
|
|
|
object_name = f"{prefix}/{name}.json"
|
|
|
|
|
self.binary_store.save(
|
|
|
|
|
object_name=object_name,
|
|
|
|
|
data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
|
|
|
|
|
content_type="application/json",
|
|
|
|
|
metadata={"doc_id": doc_id, "artifact_type": name},
|
|
|
|
|
)
|
|
|
|
|
artifact_keys[name] = object_name
|
|
|
|
|
return artifact_keys
|
|
|
|
|
|
2026-05-26 12:34:12 +08:00
|
|
|
def _safe_create_processing_run(self, *, doc_id: str, trigger_type: str, generate_summary: bool) -> str | None:
|
|
|
|
|
"""Create a processing run record when the optional store is available."""
|
|
|
|
|
if not self.document_processing_store:
|
|
|
|
|
return None
|
|
|
|
|
run = DocumentProcessingRun(
|
|
|
|
|
run_id=str(uuid.uuid4()),
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
trigger_type=trigger_type,
|
|
|
|
|
run_status="running",
|
|
|
|
|
parser_backend=settings.parser_backend,
|
|
|
|
|
chunk_backend=settings.chunk_backend,
|
|
|
|
|
embedding_model=settings.embedding_model,
|
|
|
|
|
metadata={"generate_summary": generate_summary},
|
|
|
|
|
)
|
|
|
|
|
try:
|
|
|
|
|
created = self.document_processing_store.create_run(run)
|
|
|
|
|
return created.run_id
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("DocumentProcessingStore.create_run failed for doc_id={}", doc_id)
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
def _safe_append_status_event(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
doc_id: str,
|
|
|
|
|
run_id: str | None,
|
|
|
|
|
from_status: str,
|
|
|
|
|
to_status: str,
|
|
|
|
|
stage: str,
|
|
|
|
|
message: str = "",
|
|
|
|
|
metadata: dict | None = None,
|
|
|
|
|
) -> None:
|
|
|
|
|
"""Append a status event without allowing auxiliary persistence failures to abort processing."""
|
|
|
|
|
if not self.document_processing_store or not run_id:
|
|
|
|
|
return
|
|
|
|
|
event = DocumentStatusEvent(
|
|
|
|
|
event_id=str(uuid.uuid4()),
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
run_id=run_id,
|
|
|
|
|
from_status=from_status,
|
|
|
|
|
to_status=to_status,
|
|
|
|
|
stage=stage,
|
|
|
|
|
message=message,
|
|
|
|
|
metadata=metadata or {},
|
|
|
|
|
)
|
|
|
|
|
try:
|
|
|
|
|
self.document_processing_store.append_status_event(event)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning(
|
|
|
|
|
"DocumentProcessingStore.append_status_event failed for doc_id={}, run_id={}",
|
|
|
|
|
doc_id,
|
|
|
|
|
run_id,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def _safe_mark_run_stored(self, *, doc_id: str, run_id: str | None) -> None:
|
|
|
|
|
"""Mark the processing run as stored without affecting the main workflow."""
|
|
|
|
|
if not self.document_processing_store or not run_id:
|
|
|
|
|
return
|
|
|
|
|
try:
|
|
|
|
|
self.document_processing_store.mark_run_stored(run_id, stored_at=self._utcnow())
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("DocumentProcessingStore.mark_run_stored failed for doc_id={}, run_id={}", doc_id, run_id)
|
|
|
|
|
|
|
|
|
|
def _safe_mark_run_parsed(self, *, doc_id: str, run_id: str | None, parsed_document: ParsedDocument) -> None:
|
|
|
|
|
"""Persist parse completion details without failing the document pipeline."""
|
|
|
|
|
if not self.document_processing_store or not run_id:
|
|
|
|
|
return
|
|
|
|
|
try:
|
|
|
|
|
self.document_processing_store.mark_run_parsed(
|
|
|
|
|
run_id,
|
|
|
|
|
parser_backend=parsed_document.parser_name,
|
|
|
|
|
layout_count=int(parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)) or 0),
|
|
|
|
|
structure_node_count=len(parsed_document.structure_nodes),
|
|
|
|
|
semantic_block_count=len(parsed_document.semantic_blocks),
|
|
|
|
|
vector_chunk_count=len(parsed_document.vector_chunks),
|
|
|
|
|
parsed_at=self._utcnow(),
|
|
|
|
|
metadata={"parse_task_id": parsed_document.metadata.get("task_id", "")},
|
|
|
|
|
)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("DocumentProcessingStore.mark_run_parsed failed for doc_id={}, run_id={}", doc_id, run_id)
|
|
|
|
|
|
|
|
|
|
def _safe_replace_processing_artifacts(self, *, doc_id: str, run_id: str | None, artifact_keys: dict[str, str]) -> None:
|
|
|
|
|
"""Store artifact references without turning persistence drift into a user-visible failure."""
|
|
|
|
|
if not self.document_processing_store or not run_id:
|
|
|
|
|
return
|
|
|
|
|
artifacts = [
|
|
|
|
|
DocumentArtifact(
|
|
|
|
|
artifact_id=str(uuid.uuid4()),
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
run_id=run_id,
|
|
|
|
|
artifact_type=artifact_type,
|
|
|
|
|
object_name=object_name,
|
|
|
|
|
content_type="application/json",
|
|
|
|
|
byte_size=0,
|
|
|
|
|
checksum="",
|
|
|
|
|
)
|
|
|
|
|
for artifact_type, object_name in artifact_keys.items()
|
|
|
|
|
]
|
|
|
|
|
try:
|
|
|
|
|
self.document_processing_store.replace_artifacts_for_run(run_id, artifacts)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning(
|
|
|
|
|
"DocumentProcessingStore.replace_artifacts_for_run failed for doc_id={}, run_id={}",
|
|
|
|
|
doc_id,
|
|
|
|
|
run_id,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def _safe_mark_run_indexed(self, *, doc_id: str, run_id: str | None, chunk_count: int, index_name: str) -> None:
|
|
|
|
|
"""Mark the processing run as indexed without affecting the success path."""
|
|
|
|
|
if not self.document_processing_store or not run_id:
|
|
|
|
|
return
|
|
|
|
|
now = self._utcnow()
|
|
|
|
|
try:
|
|
|
|
|
self.document_processing_store.mark_run_indexed(
|
|
|
|
|
run_id,
|
|
|
|
|
chunk_count=chunk_count,
|
|
|
|
|
index_name=index_name,
|
|
|
|
|
indexed_at=now,
|
|
|
|
|
finished_at=now,
|
|
|
|
|
)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("DocumentProcessingStore.mark_run_indexed failed for doc_id={}, run_id={}", doc_id, run_id)
|
|
|
|
|
|
|
|
|
|
def _safe_mark_run_failed(self, *, doc_id: str, run_id: str | None, failure_stage: str, error_message: str) -> None:
|
|
|
|
|
"""Mark the processing run as failed without masking the original error handling path."""
|
|
|
|
|
if not self.document_processing_store or not run_id:
|
|
|
|
|
return
|
|
|
|
|
try:
|
|
|
|
|
self.document_processing_store.mark_run_failed(
|
|
|
|
|
run_id,
|
|
|
|
|
failure_stage=failure_stage,
|
|
|
|
|
error_message=error_message,
|
|
|
|
|
finished_at=self._utcnow(),
|
|
|
|
|
)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("DocumentProcessingStore.mark_run_failed failed for doc_id={}, run_id={}", doc_id, run_id)
|
|
|
|
|
|
2026-05-18 16:32:42 +08:00
|
|
|
def upload_and_process(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
doc_id: str | None = None,
|
|
|
|
|
file_name: str,
|
|
|
|
|
content: bytes,
|
|
|
|
|
content_type: str,
|
|
|
|
|
doc_name: str | None,
|
|
|
|
|
regulation_type: str,
|
|
|
|
|
version: str,
|
|
|
|
|
generate_summary: bool,
|
2026-05-26 12:34:12 +08:00
|
|
|
trigger_type: str = "upload",
|
2026-05-18 16:32:42 +08:00
|
|
|
) -> DocumentProcessResult:
|
|
|
|
|
"""Handle upload and process for the Document Command Service instance."""
|
|
|
|
|
doc_id = doc_id or str(uuid.uuid4())[:8]
|
|
|
|
|
final_doc_name = doc_name or file_name
|
|
|
|
|
object_name = f"{doc_id}/{file_name}"
|
2026-05-26 12:34:12 +08:00
|
|
|
run_id: str | None = None
|
|
|
|
|
current_status = DocumentStatus.PENDING
|
|
|
|
|
current_stage = "store"
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
document = Document(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
doc_name=final_doc_name,
|
|
|
|
|
file_name=file_name,
|
|
|
|
|
object_name=object_name,
|
|
|
|
|
content_type=content_type,
|
|
|
|
|
size_bytes=len(content),
|
|
|
|
|
regulation_type=regulation_type,
|
|
|
|
|
version=version,
|
|
|
|
|
metadata={"generate_summary": generate_summary},
|
|
|
|
|
)
|
|
|
|
|
self.document_repository.create(document)
|
2026-05-26 12:34:12 +08:00
|
|
|
run_id = self._safe_create_processing_run(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
trigger_type=trigger_type,
|
|
|
|
|
generate_summary=generate_summary,
|
|
|
|
|
)
|
|
|
|
|
self._safe_append_status_event(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
run_id=run_id,
|
|
|
|
|
from_status="",
|
|
|
|
|
to_status=DocumentStatus.PENDING.value,
|
|
|
|
|
stage="document_created",
|
|
|
|
|
message="Document record created",
|
|
|
|
|
)
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
self.binary_store.save(
|
|
|
|
|
object_name=object_name,
|
|
|
|
|
data=content,
|
|
|
|
|
content_type=content_type,
|
|
|
|
|
metadata={"doc_id": doc_id},
|
|
|
|
|
)
|
|
|
|
|
self.document_repository.update_status(doc_id, DocumentStatus.STORED)
|
2026-05-26 12:34:12 +08:00
|
|
|
current_status = DocumentStatus.STORED
|
|
|
|
|
current_stage = "parse"
|
|
|
|
|
self._safe_mark_run_stored(doc_id=doc_id, run_id=run_id)
|
|
|
|
|
self._safe_append_status_event(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
run_id=run_id,
|
|
|
|
|
from_status=DocumentStatus.PENDING.value,
|
|
|
|
|
to_status=DocumentStatus.STORED.value,
|
|
|
|
|
stage="store",
|
|
|
|
|
message="Source file stored",
|
|
|
|
|
)
|
2026-06-05 18:00:31 +08:00
|
|
|
# Delegate parse → embed → index to the shared processing method.
|
|
|
|
|
# This same method is invoked by the Celery worker for async processing.
|
|
|
|
|
return self._process_document(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
file_name=file_name,
|
|
|
|
|
final_doc_name=final_doc_name,
|
|
|
|
|
content=content,
|
|
|
|
|
regulation_type=regulation_type,
|
|
|
|
|
version=version,
|
|
|
|
|
generate_summary=generate_summary,
|
|
|
|
|
run_id=run_id,
|
|
|
|
|
)
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
logger.exception("文档存储失败: doc_id={}", doc_id)
|
|
|
|
|
failure_stage = current_stage
|
|
|
|
|
self.document_repository.update_status(
|
|
|
|
|
doc_id,
|
|
|
|
|
DocumentStatus.FAILED,
|
|
|
|
|
error_message=str(exc),
|
|
|
|
|
metadata={
|
|
|
|
|
"failure_reason": str(exc),
|
|
|
|
|
"processing_stage": "failed",
|
|
|
|
|
"failure_stage": failure_stage,
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
self._safe_mark_run_failed(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
run_id=run_id,
|
|
|
|
|
failure_stage=failure_stage,
|
|
|
|
|
error_message=str(exc),
|
|
|
|
|
)
|
|
|
|
|
self._safe_append_status_event(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
run_id=run_id,
|
|
|
|
|
from_status=current_status.value,
|
|
|
|
|
to_status=DocumentStatus.FAILED.value,
|
|
|
|
|
stage=failure_stage,
|
|
|
|
|
message=str(exc),
|
|
|
|
|
)
|
|
|
|
|
return DocumentProcessResult(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
doc_name=final_doc_name,
|
|
|
|
|
status=DocumentStatus.FAILED.value,
|
|
|
|
|
message=f"文档处理失败: {exc}",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def store_document(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
doc_id: str | None = None,
|
|
|
|
|
file_name: str,
|
|
|
|
|
content: bytes,
|
|
|
|
|
content_type: str,
|
|
|
|
|
doc_name: str | None,
|
|
|
|
|
regulation_type: str,
|
|
|
|
|
version: str,
|
|
|
|
|
generate_summary: bool,
|
|
|
|
|
) -> tuple[str, str | None]:
|
|
|
|
|
"""Store the binary file and create the Document record.
|
|
|
|
|
|
|
|
|
|
Returns (doc_id, run_id). Does NOT parse, embed, or index.
|
|
|
|
|
This is the fast synchronous first step; processing is enqueued separately.
|
|
|
|
|
The caller is responsible for enqueuing the follow-up process_document_task.
|
|
|
|
|
"""
|
|
|
|
|
doc_id = doc_id or str(uuid.uuid4())[:8]
|
|
|
|
|
final_doc_name = doc_name or file_name
|
|
|
|
|
object_name = f"{doc_id}/{file_name}"
|
|
|
|
|
|
|
|
|
|
document = Document(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
doc_name=final_doc_name,
|
|
|
|
|
file_name=file_name,
|
|
|
|
|
object_name=object_name,
|
|
|
|
|
content_type=content_type,
|
|
|
|
|
size_bytes=len(content),
|
|
|
|
|
regulation_type=regulation_type,
|
|
|
|
|
version=version,
|
|
|
|
|
metadata={"generate_summary": generate_summary},
|
|
|
|
|
)
|
|
|
|
|
self.document_repository.create(document)
|
|
|
|
|
run_id = self._safe_create_processing_run(
|
|
|
|
|
doc_id=doc_id, trigger_type="upload", generate_summary=generate_summary
|
|
|
|
|
)
|
|
|
|
|
self.binary_store.save(
|
|
|
|
|
object_name=object_name, data=content,
|
|
|
|
|
content_type=content_type, metadata={"doc_id": doc_id},
|
|
|
|
|
)
|
|
|
|
|
self.document_repository.update_status(doc_id, DocumentStatus.STORED)
|
|
|
|
|
self._safe_mark_run_stored(doc_id=doc_id, run_id=run_id)
|
|
|
|
|
self._safe_append_status_event(
|
|
|
|
|
doc_id=doc_id, run_id=run_id,
|
|
|
|
|
from_status=DocumentStatus.PENDING.value, to_status=DocumentStatus.STORED.value,
|
|
|
|
|
stage="store", message="Source file stored",
|
|
|
|
|
)
|
|
|
|
|
return doc_id, run_id
|
2026-05-18 16:32:42 +08:00
|
|
|
|
2026-06-05 18:00:31 +08:00
|
|
|
def _process_document(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
doc_id: str,
|
|
|
|
|
file_name: str,
|
|
|
|
|
final_doc_name: str,
|
|
|
|
|
content: bytes,
|
|
|
|
|
regulation_type: str,
|
|
|
|
|
version: str,
|
|
|
|
|
generate_summary: bool,
|
|
|
|
|
run_id: str | None = None,
|
|
|
|
|
) -> DocumentProcessResult:
|
|
|
|
|
"""Run parse → chunk → embed → index for a document that is already stored.
|
|
|
|
|
|
|
|
|
|
Called both synchronously (from upload_and_process) and asynchronously
|
|
|
|
|
(from the Celery process_document_task worker). All side-effects write
|
|
|
|
|
through DocumentProcessingStore so callers can poll progress.
|
|
|
|
|
"""
|
|
|
|
|
current_status = DocumentStatus.STORED
|
|
|
|
|
current_stage = "parse"
|
|
|
|
|
temp_path = ""
|
|
|
|
|
try:
|
2026-05-18 16:32:42 +08:00
|
|
|
suffix = os.path.splitext(file_name)[1]
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
|
|
|
|
temp_file.write(content)
|
|
|
|
|
temp_path = temp_file.name
|
|
|
|
|
|
|
|
|
|
parsed_document = self.parser.parse(
|
|
|
|
|
file_path=temp_path,
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
doc_name=final_doc_name,
|
|
|
|
|
)
|
2026-05-26 12:34:12 +08:00
|
|
|
self._safe_mark_run_parsed(doc_id=doc_id, run_id=run_id, parsed_document=parsed_document)
|
|
|
|
|
|
|
|
|
|
artifact_keys: dict[str, str] = {}
|
|
|
|
|
try:
|
|
|
|
|
artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("Parse artifact binary persistence failed for doc_id={}", doc_id)
|
2026-06-05 18:00:31 +08:00
|
|
|
|
2026-05-18 16:32:42 +08:00
|
|
|
self.document_repository.update_status(
|
|
|
|
|
doc_id,
|
|
|
|
|
DocumentStatus.PARSED,
|
|
|
|
|
parser_name=parsed_document.parser_name,
|
2026-05-18 22:30:28 +08:00
|
|
|
metadata={
|
|
|
|
|
"parser_backend": parsed_document.parser_name,
|
|
|
|
|
"parse_task_id": parsed_document.metadata.get("task_id", ""),
|
|
|
|
|
"layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
|
|
|
|
|
"structure_node_count": len(parsed_document.structure_nodes),
|
|
|
|
|
"semantic_block_count": len(parsed_document.semantic_blocks),
|
|
|
|
|
"vector_chunk_count": len(parsed_document.vector_chunks),
|
|
|
|
|
"artifact_keys": artifact_keys,
|
|
|
|
|
"processing_stage": "parsed",
|
|
|
|
|
},
|
2026-05-18 16:32:42 +08:00
|
|
|
)
|
2026-05-26 12:34:12 +08:00
|
|
|
current_status = DocumentStatus.PARSED
|
|
|
|
|
current_stage = "embed"
|
|
|
|
|
self._safe_replace_processing_artifacts(doc_id=doc_id, run_id=run_id, artifact_keys=artifact_keys)
|
|
|
|
|
self._safe_append_status_event(
|
2026-06-05 18:00:31 +08:00
|
|
|
doc_id=doc_id, run_id=run_id,
|
|
|
|
|
from_status=DocumentStatus.STORED.value, to_status=DocumentStatus.PARSED.value,
|
|
|
|
|
stage="parse", message="Document parsed", metadata={"artifact_count": len(artifact_keys)},
|
2026-05-26 12:34:12 +08:00
|
|
|
)
|
2026-05-20 23:34:08 +08:00
|
|
|
if self.parse_artifact_store:
|
|
|
|
|
try:
|
|
|
|
|
self.parse_artifact_store.save(
|
2026-06-05 18:00:31 +08:00
|
|
|
doc_id, parsed_document.structure_nodes, parsed_document.semantic_blocks,
|
2026-05-20 23:34:08 +08:00
|
|
|
)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("ParseArtifactStore.save failed for doc_id={}", doc_id)
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
chunks = self.chunk_builder.build(
|
|
|
|
|
parsed_document=parsed_document,
|
|
|
|
|
regulation_type=regulation_type,
|
|
|
|
|
version=version,
|
|
|
|
|
)
|
|
|
|
|
if not chunks:
|
|
|
|
|
raise ValueError("解析完成但没有生成可入库的 chunks")
|
|
|
|
|
|
|
|
|
|
vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks])
|
2026-05-26 12:34:12 +08:00
|
|
|
current_stage = "index"
|
2026-05-18 16:32:42 +08:00
|
|
|
inserted = self.vector_index.upsert(chunks, vectors)
|
|
|
|
|
if inserted != len(chunks):
|
|
|
|
|
logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
|
|
|
|
|
|
2026-05-18 22:30:28 +08:00
|
|
|
health = self.vector_index.health()
|
2026-05-26 12:34:12 +08:00
|
|
|
index_name = health.get("collection_name", "")
|
2026-06-05 18:00:31 +08:00
|
|
|
self.document_repository.update_status(
|
|
|
|
|
doc_id, DocumentStatus.INDEXED,
|
|
|
|
|
chunk_count=len(chunks), summary="", summary_latency_ms=0,
|
2026-05-26 12:34:12 +08:00
|
|
|
index_name=index_name,
|
2026-06-05 18:00:31 +08:00
|
|
|
metadata={"index_collection": index_name, "processing_stage": "indexed"},
|
2026-05-26 12:34:12 +08:00
|
|
|
)
|
2026-06-05 18:00:31 +08:00
|
|
|
self._safe_mark_run_indexed(doc_id=doc_id, run_id=run_id, chunk_count=len(chunks), index_name=index_name)
|
2026-05-26 12:34:12 +08:00
|
|
|
self._safe_append_status_event(
|
2026-06-05 18:00:31 +08:00
|
|
|
doc_id=doc_id, run_id=run_id,
|
|
|
|
|
from_status=DocumentStatus.PARSED.value, to_status=DocumentStatus.INDEXED.value,
|
|
|
|
|
stage="index", message="Document indexed",
|
2026-05-26 12:34:12 +08:00
|
|
|
metadata={"chunk_count": len(chunks), "index_name": index_name},
|
|
|
|
|
)
|
2026-05-18 16:32:42 +08:00
|
|
|
stored = self.document_repository.get(doc_id)
|
|
|
|
|
return DocumentProcessResult(
|
2026-06-05 18:00:31 +08:00
|
|
|
doc_id=doc_id, doc_name=final_doc_name,
|
2026-05-18 16:32:42 +08:00
|
|
|
status=(stored.status.value if stored else DocumentStatus.INDEXED.value),
|
2026-06-05 18:00:31 +08:00
|
|
|
message="处理成功", num_chunks=len(chunks),
|
2026-05-18 16:32:42 +08:00
|
|
|
summary=stored.summary if stored else "",
|
|
|
|
|
summary_latency_ms=stored.summary_latency_ms if stored else 0,
|
|
|
|
|
)
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
logger.exception("文档处理失败: doc_id={}", doc_id)
|
|
|
|
|
self.document_repository.update_status(
|
2026-06-05 18:00:31 +08:00
|
|
|
doc_id, DocumentStatus.FAILED, error_message=str(exc),
|
|
|
|
|
metadata={"failure_reason": str(exc), "processing_stage": "failed", "failure_stage": current_stage},
|
2026-05-18 16:32:42 +08:00
|
|
|
)
|
2026-05-26 12:34:12 +08:00
|
|
|
self._safe_mark_run_failed(
|
2026-06-05 18:00:31 +08:00
|
|
|
doc_id=doc_id, run_id=run_id, failure_stage=current_stage, error_message=str(exc)
|
2026-05-26 12:34:12 +08:00
|
|
|
)
|
|
|
|
|
self._safe_append_status_event(
|
2026-06-05 18:00:31 +08:00
|
|
|
doc_id=doc_id, run_id=run_id,
|
|
|
|
|
from_status=current_status.value, to_status=DocumentStatus.FAILED.value,
|
|
|
|
|
stage=current_stage, message=str(exc),
|
2026-05-26 12:34:12 +08:00
|
|
|
)
|
2026-05-18 16:32:42 +08:00
|
|
|
return DocumentProcessResult(
|
2026-06-05 18:00:31 +08:00
|
|
|
doc_id=doc_id, doc_name=final_doc_name,
|
|
|
|
|
status=DocumentStatus.FAILED.value, message=f"文档处理失败: {exc}",
|
2026-05-18 16:32:42 +08:00
|
|
|
)
|
|
|
|
|
finally:
|
|
|
|
|
if temp_path and os.path.exists(temp_path):
|
|
|
|
|
try:
|
|
|
|
|
os.remove(temp_path)
|
|
|
|
|
except OSError:
|
|
|
|
|
logger.warning("临时文件清理失败: {}", temp_path)
|
|
|
|
|
|
2026-05-20 23:34:08 +08:00
|
|
|
def delete(self, doc_id: str) -> bool:
|
|
|
|
|
"""Delete document record, binary file, and vector chunks."""
|
|
|
|
|
document = self.document_repository.get(doc_id)
|
|
|
|
|
if not document:
|
|
|
|
|
return False
|
|
|
|
|
try:
|
|
|
|
|
self.binary_store.delete(document.object_name)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("Binary delete failed for doc_id={}", doc_id)
|
|
|
|
|
try:
|
|
|
|
|
self.vector_index.delete_by_document(doc_id)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("Vector delete failed for doc_id={}", doc_id)
|
|
|
|
|
if self.parse_artifact_store:
|
|
|
|
|
try:
|
|
|
|
|
self.parse_artifact_store.delete(doc_id)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("ParseArtifactStore delete failed for doc_id={}", doc_id)
|
2026-05-26 12:34:12 +08:00
|
|
|
if self.document_processing_store:
|
|
|
|
|
try:
|
|
|
|
|
self.document_processing_store.delete_by_document(doc_id)
|
|
|
|
|
except Exception:
|
|
|
|
|
logger.warning("DocumentProcessingStore delete failed for doc_id={}", doc_id)
|
2026-05-20 23:34:08 +08:00
|
|
|
self.document_repository.delete(doc_id)
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
def retry(self, doc_id: str) -> DocumentProcessResult:
|
|
|
|
|
"""Re-process a failed document from its stored binary."""
|
|
|
|
|
document = self.document_repository.get(doc_id)
|
|
|
|
|
if not document:
|
|
|
|
|
return DocumentProcessResult(doc_id=doc_id, doc_name="", status="failed", message="文档不存在")
|
|
|
|
|
content = self.binary_store.read(document.object_name)
|
|
|
|
|
return self.upload_and_process(
|
|
|
|
|
doc_id=doc_id,
|
|
|
|
|
file_name=document.file_name,
|
|
|
|
|
content=content,
|
|
|
|
|
content_type=document.content_type,
|
|
|
|
|
doc_name=document.doc_name,
|
|
|
|
|
regulation_type=document.regulation_type,
|
|
|
|
|
version=document.version,
|
|
|
|
|
generate_summary=bool(document.metadata.get("generate_summary", False)),
|
2026-05-26 12:34:12 +08:00
|
|
|
trigger_type="retry",
|
2026-05-20 23:34:08 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
2026-05-18 16:32:42 +08:00
|
|
|
class DocumentQueryService:
|
|
|
|
|
"""Provide the Document Query Service service."""
|
2026-05-20 23:34:08 +08:00
|
|
|
def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore, vector_index: VectorIndex) -> None:
|
2026-05-18 16:32:42 +08:00
|
|
|
"""Initialize the Document Query Service instance."""
|
|
|
|
|
self.document_repository = document_repository
|
|
|
|
|
self.binary_store = binary_store
|
2026-05-20 23:34:08 +08:00
|
|
|
self.vector_index = vector_index
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
def get(self, doc_id: str) -> Document | None:
|
|
|
|
|
"""Handle get for the Document Query Service instance."""
|
|
|
|
|
return self.document_repository.get(doc_id)
|
|
|
|
|
|
|
|
|
|
def list_documents(self, limit: int | None = None) -> list[Document]:
|
2026-05-20 23:34:08 +08:00
|
|
|
"""Return documents with real-time state from Milvus as the authoritative source.
|
|
|
|
|
|
|
|
|
|
Algorithm:
|
2026-05-26 20:21:31 +08:00
|
|
|
1. Query Milvus for all doc metadata (doc_id, doc_title, chunk_count, …).
|
2026-05-20 23:34:08 +08:00
|
|
|
2. Load JSON/PG metadata records and index them by doc_id.
|
|
|
|
|
3. Merge: Milvus-present docs get status=INDEXED and live chunk_count;
|
|
|
|
|
metadata-only docs with status=INDEXED are demoted to FAILED.
|
|
|
|
|
4. Milvus-only docs (no metadata record) are surfaced as synthetic INDEXED
|
|
|
|
|
entries so they are never invisible to the management list.
|
|
|
|
|
"""
|
|
|
|
|
# Fetch live Milvus state first.
|
|
|
|
|
try:
|
|
|
|
|
milvus_rows = self.vector_index.list_document_metadata()
|
|
|
|
|
except Exception:
|
|
|
|
|
milvus_rows = []
|
|
|
|
|
|
|
|
|
|
milvus_by_id: dict[str, dict] = {r["doc_id"]: r for r in milvus_rows}
|
|
|
|
|
|
|
|
|
|
# Load metadata store records.
|
|
|
|
|
meta_docs = self.document_repository.list(limit=limit)
|
|
|
|
|
meta_by_id: dict[str, Document] = {d.doc_id: d for d in meta_docs}
|
|
|
|
|
|
|
|
|
|
result: list[Document] = []
|
|
|
|
|
|
|
|
|
|
# Reconcile metadata records against Milvus.
|
|
|
|
|
for doc in meta_docs:
|
|
|
|
|
if doc.doc_id in milvus_by_id:
|
|
|
|
|
row = milvus_by_id[doc.doc_id]
|
|
|
|
|
doc.chunk_count = row["chunk_count"]
|
|
|
|
|
doc.status = DocumentStatus.INDEXED
|
|
|
|
|
# Backfill fields that may be missing from older JSON records.
|
2026-05-26 20:21:31 +08:00
|
|
|
if not doc.doc_name and row.get("doc_title"):
|
|
|
|
|
doc.doc_name = row["doc_title"]
|
2026-05-20 23:34:08 +08:00
|
|
|
if not doc.regulation_type and row.get("regulation_type"):
|
|
|
|
|
doc.regulation_type = row["regulation_type"]
|
|
|
|
|
if not doc.version and row.get("version"):
|
|
|
|
|
doc.version = row["version"]
|
|
|
|
|
elif doc.status == DocumentStatus.INDEXED:
|
|
|
|
|
# Metadata says indexed but Milvus has no chunks.
|
|
|
|
|
doc.status = DocumentStatus.FAILED
|
|
|
|
|
doc.error_message = "向量数据库中未找到对应数据"
|
|
|
|
|
result.append(doc)
|
|
|
|
|
|
|
|
|
|
# Surface Milvus-only docs that have no metadata record at all.
|
|
|
|
|
for doc_id, row in milvus_by_id.items():
|
|
|
|
|
if doc_id not in meta_by_id:
|
|
|
|
|
synthetic = Document(
|
|
|
|
|
doc_id=doc_id,
|
2026-05-26 20:21:31 +08:00
|
|
|
doc_name=row.get("doc_title", doc_id),
|
|
|
|
|
file_name=row.get("doc_title", doc_id),
|
2026-05-20 23:34:08 +08:00
|
|
|
object_name="",
|
|
|
|
|
content_type="",
|
|
|
|
|
size_bytes=0,
|
|
|
|
|
status=DocumentStatus.INDEXED,
|
|
|
|
|
regulation_type=row.get("regulation_type", ""),
|
|
|
|
|
version=row.get("version", ""),
|
|
|
|
|
chunk_count=row["chunk_count"],
|
|
|
|
|
)
|
|
|
|
|
result.append(synthetic)
|
|
|
|
|
|
|
|
|
|
result.sort(key=lambda d: d.updated_at, reverse=True)
|
|
|
|
|
return result[:limit] if limit is not None else result
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
def download(self, doc_id: str) -> tuple[Document, bytes]:
|
|
|
|
|
"""Handle download for the Document Query Service instance."""
|
|
|
|
|
document = self.document_repository.get(doc_id)
|
|
|
|
|
if not document:
|
|
|
|
|
raise FileNotFoundError(f"文档不存在: {doc_id}")
|
|
|
|
|
return document, self.binary_store.read(document.object_name)
|