Fix centered content layout widths
This commit is contained in:
@@ -7,16 +7,22 @@ import tempfile
|
||||
import uuid
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from loguru import logger
|
||||
from app.config.settings import settings
|
||||
|
||||
from app.domain.documents import (
|
||||
ChunkBuilder,
|
||||
Document,
|
||||
DocumentArtifact,
|
||||
DocumentBinaryStore,
|
||||
DocumentParser,
|
||||
DocumentProcessingRun,
|
||||
DocumentProcessingStore,
|
||||
DocumentRepository,
|
||||
DocumentStatus,
|
||||
DocumentStatusEvent,
|
||||
ParseArtifactStore,
|
||||
ParsedDocument,
|
||||
)
|
||||
@@ -39,6 +45,7 @@ class DocumentProcessResult:
|
||||
|
||||
class DocumentCommandService:
|
||||
"""Provide the Document Command Service service."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
@@ -49,6 +56,7 @@ class DocumentCommandService:
|
||||
embedding_provider: EmbeddingProvider,
|
||||
vector_index: VectorIndex,
|
||||
parse_artifact_store: ParseArtifactStore | None = None,
|
||||
document_processing_store: DocumentProcessingStore | None = None,
|
||||
) -> None:
|
||||
"""Initialize the Document Command Service instance."""
|
||||
self.document_repository = document_repository
|
||||
@@ -58,6 +66,11 @@ class DocumentCommandService:
|
||||
self.embedding_provider = embedding_provider
|
||||
self.vector_index = vector_index
|
||||
self.parse_artifact_store = parse_artifact_store
|
||||
self.document_processing_store = document_processing_store
|
||||
|
||||
def _utcnow(self) -> datetime:
|
||||
"""Return the current UTC timestamp for persisted processing metadata."""
|
||||
return datetime.now(UTC)
|
||||
|
||||
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
|
||||
"""Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
|
||||
@@ -80,6 +93,143 @@ class DocumentCommandService:
|
||||
artifact_keys[name] = object_name
|
||||
return artifact_keys
|
||||
|
||||
def _safe_create_processing_run(self, *, doc_id: str, trigger_type: str, generate_summary: bool) -> str | None:
|
||||
"""Create a processing run record when the optional store is available."""
|
||||
if not self.document_processing_store:
|
||||
return None
|
||||
run = DocumentProcessingRun(
|
||||
run_id=str(uuid.uuid4()),
|
||||
doc_id=doc_id,
|
||||
trigger_type=trigger_type,
|
||||
run_status="running",
|
||||
parser_backend=settings.parser_backend,
|
||||
chunk_backend=settings.chunk_backend,
|
||||
embedding_model=settings.embedding_model,
|
||||
metadata={"generate_summary": generate_summary},
|
||||
)
|
||||
try:
|
||||
created = self.document_processing_store.create_run(run)
|
||||
return created.run_id
|
||||
except Exception:
|
||||
logger.warning("DocumentProcessingStore.create_run failed for doc_id={}", doc_id)
|
||||
return None
|
||||
|
||||
def _safe_append_status_event(
|
||||
self,
|
||||
*,
|
||||
doc_id: str,
|
||||
run_id: str | None,
|
||||
from_status: str,
|
||||
to_status: str,
|
||||
stage: str,
|
||||
message: str = "",
|
||||
metadata: dict | None = None,
|
||||
) -> None:
|
||||
"""Append a status event without allowing auxiliary persistence failures to abort processing."""
|
||||
if not self.document_processing_store or not run_id:
|
||||
return
|
||||
event = DocumentStatusEvent(
|
||||
event_id=str(uuid.uuid4()),
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
from_status=from_status,
|
||||
to_status=to_status,
|
||||
stage=stage,
|
||||
message=message,
|
||||
metadata=metadata or {},
|
||||
)
|
||||
try:
|
||||
self.document_processing_store.append_status_event(event)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"DocumentProcessingStore.append_status_event failed for doc_id={}, run_id={}",
|
||||
doc_id,
|
||||
run_id,
|
||||
)
|
||||
|
||||
def _safe_mark_run_stored(self, *, doc_id: str, run_id: str | None) -> None:
|
||||
"""Mark the processing run as stored without affecting the main workflow."""
|
||||
if not self.document_processing_store or not run_id:
|
||||
return
|
||||
try:
|
||||
self.document_processing_store.mark_run_stored(run_id, stored_at=self._utcnow())
|
||||
except Exception:
|
||||
logger.warning("DocumentProcessingStore.mark_run_stored failed for doc_id={}, run_id={}", doc_id, run_id)
|
||||
|
||||
def _safe_mark_run_parsed(self, *, doc_id: str, run_id: str | None, parsed_document: ParsedDocument) -> None:
|
||||
"""Persist parse completion details without failing the document pipeline."""
|
||||
if not self.document_processing_store or not run_id:
|
||||
return
|
||||
try:
|
||||
self.document_processing_store.mark_run_parsed(
|
||||
run_id,
|
||||
parser_backend=parsed_document.parser_name,
|
||||
layout_count=int(parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)) or 0),
|
||||
structure_node_count=len(parsed_document.structure_nodes),
|
||||
semantic_block_count=len(parsed_document.semantic_blocks),
|
||||
vector_chunk_count=len(parsed_document.vector_chunks),
|
||||
parsed_at=self._utcnow(),
|
||||
metadata={"parse_task_id": parsed_document.metadata.get("task_id", "")},
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("DocumentProcessingStore.mark_run_parsed failed for doc_id={}, run_id={}", doc_id, run_id)
|
||||
|
||||
def _safe_replace_processing_artifacts(self, *, doc_id: str, run_id: str | None, artifact_keys: dict[str, str]) -> None:
|
||||
"""Store artifact references without turning persistence drift into a user-visible failure."""
|
||||
if not self.document_processing_store or not run_id:
|
||||
return
|
||||
artifacts = [
|
||||
DocumentArtifact(
|
||||
artifact_id=str(uuid.uuid4()),
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
artifact_type=artifact_type,
|
||||
object_name=object_name,
|
||||
content_type="application/json",
|
||||
byte_size=0,
|
||||
checksum="",
|
||||
)
|
||||
for artifact_type, object_name in artifact_keys.items()
|
||||
]
|
||||
try:
|
||||
self.document_processing_store.replace_artifacts_for_run(run_id, artifacts)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"DocumentProcessingStore.replace_artifacts_for_run failed for doc_id={}, run_id={}",
|
||||
doc_id,
|
||||
run_id,
|
||||
)
|
||||
|
||||
def _safe_mark_run_indexed(self, *, doc_id: str, run_id: str | None, chunk_count: int, index_name: str) -> None:
|
||||
"""Mark the processing run as indexed without affecting the success path."""
|
||||
if not self.document_processing_store or not run_id:
|
||||
return
|
||||
now = self._utcnow()
|
||||
try:
|
||||
self.document_processing_store.mark_run_indexed(
|
||||
run_id,
|
||||
chunk_count=chunk_count,
|
||||
index_name=index_name,
|
||||
indexed_at=now,
|
||||
finished_at=now,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("DocumentProcessingStore.mark_run_indexed failed for doc_id={}, run_id={}", doc_id, run_id)
|
||||
|
||||
def _safe_mark_run_failed(self, *, doc_id: str, run_id: str | None, failure_stage: str, error_message: str) -> None:
|
||||
"""Mark the processing run as failed without masking the original error handling path."""
|
||||
if not self.document_processing_store or not run_id:
|
||||
return
|
||||
try:
|
||||
self.document_processing_store.mark_run_failed(
|
||||
run_id,
|
||||
failure_stage=failure_stage,
|
||||
error_message=error_message,
|
||||
finished_at=self._utcnow(),
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("DocumentProcessingStore.mark_run_failed failed for doc_id={}, run_id={}", doc_id, run_id)
|
||||
|
||||
def upload_and_process(
|
||||
self,
|
||||
*,
|
||||
@@ -91,11 +241,15 @@ class DocumentCommandService:
|
||||
regulation_type: str,
|
||||
version: str,
|
||||
generate_summary: bool,
|
||||
trigger_type: str = "upload",
|
||||
) -> DocumentProcessResult:
|
||||
"""Handle upload and process for the Document Command Service instance."""
|
||||
doc_id = doc_id or str(uuid.uuid4())[:8]
|
||||
final_doc_name = doc_name or file_name
|
||||
object_name = f"{doc_id}/{file_name}"
|
||||
run_id: str | None = None
|
||||
current_status = DocumentStatus.PENDING
|
||||
current_stage = "store"
|
||||
|
||||
document = Document(
|
||||
doc_id=doc_id,
|
||||
@@ -109,6 +263,19 @@ class DocumentCommandService:
|
||||
metadata={"generate_summary": generate_summary},
|
||||
)
|
||||
self.document_repository.create(document)
|
||||
run_id = self._safe_create_processing_run(
|
||||
doc_id=doc_id,
|
||||
trigger_type=trigger_type,
|
||||
generate_summary=generate_summary,
|
||||
)
|
||||
self._safe_append_status_event(
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
from_status="",
|
||||
to_status=DocumentStatus.PENDING.value,
|
||||
stage="document_created",
|
||||
message="Document record created",
|
||||
)
|
||||
|
||||
temp_path = ""
|
||||
try:
|
||||
@@ -119,6 +286,17 @@ class DocumentCommandService:
|
||||
metadata={"doc_id": doc_id},
|
||||
)
|
||||
self.document_repository.update_status(doc_id, DocumentStatus.STORED)
|
||||
current_status = DocumentStatus.STORED
|
||||
current_stage = "parse"
|
||||
self._safe_mark_run_stored(doc_id=doc_id, run_id=run_id)
|
||||
self._safe_append_status_event(
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
from_status=DocumentStatus.PENDING.value,
|
||||
to_status=DocumentStatus.STORED.value,
|
||||
stage="store",
|
||||
message="Source file stored",
|
||||
)
|
||||
|
||||
suffix = os.path.splitext(file_name)[1]
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
||||
@@ -130,7 +308,13 @@ class DocumentCommandService:
|
||||
doc_id=doc_id,
|
||||
doc_name=final_doc_name,
|
||||
)
|
||||
artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
|
||||
self._safe_mark_run_parsed(doc_id=doc_id, run_id=run_id, parsed_document=parsed_document)
|
||||
|
||||
artifact_keys: dict[str, str] = {}
|
||||
try:
|
||||
artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
|
||||
except Exception:
|
||||
logger.warning("Parse artifact binary persistence failed for doc_id={}", doc_id)
|
||||
self.document_repository.update_status(
|
||||
doc_id,
|
||||
DocumentStatus.PARSED,
|
||||
@@ -146,6 +330,18 @@ class DocumentCommandService:
|
||||
"processing_stage": "parsed",
|
||||
},
|
||||
)
|
||||
current_status = DocumentStatus.PARSED
|
||||
current_stage = "embed"
|
||||
self._safe_replace_processing_artifacts(doc_id=doc_id, run_id=run_id, artifact_keys=artifact_keys)
|
||||
self._safe_append_status_event(
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
from_status=DocumentStatus.STORED.value,
|
||||
to_status=DocumentStatus.PARSED.value,
|
||||
stage="parse",
|
||||
message="Document parsed",
|
||||
metadata={"artifact_count": len(artifact_keys)},
|
||||
)
|
||||
if self.parse_artifact_store:
|
||||
try:
|
||||
self.parse_artifact_store.save(
|
||||
@@ -165,6 +361,7 @@ class DocumentCommandService:
|
||||
raise ValueError("解析完成但没有生成可入库的 chunks")
|
||||
|
||||
vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks])
|
||||
current_stage = "index"
|
||||
inserted = self.vector_index.upsert(chunks, vectors)
|
||||
if inserted != len(chunks):
|
||||
logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
|
||||
@@ -182,6 +379,23 @@ class DocumentCommandService:
|
||||
"processing_stage": "indexed",
|
||||
},
|
||||
)
|
||||
current_status = DocumentStatus.INDEXED
|
||||
index_name = health.get("collection_name", "")
|
||||
self._safe_mark_run_indexed(
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
chunk_count=len(chunks),
|
||||
index_name=index_name,
|
||||
)
|
||||
self._safe_append_status_event(
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
from_status=DocumentStatus.PARSED.value,
|
||||
to_status=DocumentStatus.INDEXED.value,
|
||||
stage="index",
|
||||
message="Document indexed",
|
||||
metadata={"chunk_count": len(chunks), "index_name": index_name},
|
||||
)
|
||||
stored = self.document_repository.get(doc_id)
|
||||
return DocumentProcessResult(
|
||||
doc_id=doc_id,
|
||||
@@ -194,6 +408,7 @@ class DocumentCommandService:
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.exception("文档处理失败: doc_id={}", doc_id)
|
||||
failure_stage = current_stage
|
||||
self.document_repository.update_status(
|
||||
doc_id,
|
||||
DocumentStatus.FAILED,
|
||||
@@ -201,8 +416,23 @@ class DocumentCommandService:
|
||||
metadata={
|
||||
"failure_reason": str(exc),
|
||||
"processing_stage": "failed",
|
||||
"failure_stage": failure_stage,
|
||||
},
|
||||
)
|
||||
self._safe_mark_run_failed(
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
failure_stage=failure_stage,
|
||||
error_message=str(exc),
|
||||
)
|
||||
self._safe_append_status_event(
|
||||
doc_id=doc_id,
|
||||
run_id=run_id,
|
||||
from_status=current_status.value,
|
||||
to_status=DocumentStatus.FAILED.value,
|
||||
stage=failure_stage,
|
||||
message=str(exc),
|
||||
)
|
||||
return DocumentProcessResult(
|
||||
doc_id=doc_id,
|
||||
doc_name=final_doc_name,
|
||||
@@ -235,6 +465,11 @@ class DocumentCommandService:
|
||||
self.parse_artifact_store.delete(doc_id)
|
||||
except Exception:
|
||||
logger.warning("ParseArtifactStore delete failed for doc_id={}", doc_id)
|
||||
if self.document_processing_store:
|
||||
try:
|
||||
self.document_processing_store.delete_by_document(doc_id)
|
||||
except Exception:
|
||||
logger.warning("DocumentProcessingStore delete failed for doc_id={}", doc_id)
|
||||
self.document_repository.delete(doc_id)
|
||||
return True
|
||||
|
||||
@@ -253,6 +488,7 @@ class DocumentCommandService:
|
||||
regulation_type=document.regulation_type,
|
||||
version=document.version,
|
||||
generate_summary=bool(document.metadata.get("generate_summary", False)),
|
||||
trigger_type="retry",
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user