Files
AIRegulation-DocAnalysis/backend/app/services/document_processor.py

87 lines
2.8 KiB
Python

"""Provide service-layer logic for document processor."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from app.shared.bootstrap import get_document_command_service, get_retrieval_service
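# Thin facade over the bootstrap-provided services: ProcessingResult is the
# value returned to callers, and DocumentProcessor delegates upload/processing
# and retrieval to the services obtained from app.shared.bootstrap.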
@dataclass
class ProcessingResult:
    """Outcome of one document-processing request.

    Only the first three fields are required; the rest default to an empty
    or zero value so a result can be built from the identifiers and the
    success flag alone.
    """

    # Required: which document was handled and whether handling succeeded.
    doc_id: str
    doc_name: str
    success: bool

    # Optional details; each keeps its empty/zero default when not supplied.
    num_chunks: int = 0
    message: str = ""
    markdown_text: str = ""
    summary: str = ""
    summary_latency_ms: int = 0
class DocumentProcessor:
    """Facade that forwards document ingestion and search to shared services.

    The only state kept on the instance is ``generate_summary``. The
    collaborating services are looked up through the bootstrap accessors on
    every call instead of being stored on the instance.
    """

    def __init__(self, *args, generate_summary: bool = False, **kwargs):
        """Remember whether summaries should be requested during processing.

        Positional arguments and any other keyword arguments are accepted
        and ignored.
        """
        self.generate_summary = generate_summary

    def process(
        self,
        file_path: str,
        doc_id: Optional[str] = None,
        doc_name: Optional[str] = None,
        regulation_type: str = "",
        version: str = "",
    ) -> ProcessingResult:
        """Read a file from disk and submit it for upload and processing.

        Args:
            file_path: Location of the file whose bytes are submitted.
            doc_id: Identifier forwarded unchanged to the service.
            doc_name: Display name; the file's base name is used when this
                is ``None`` or empty.
            regulation_type: Forwarded unchanged to the service.
            version: Forwarded unchanged to the service.

        Returns:
            A ``ProcessingResult`` copied from the service response.
            ``success`` is true unless the response status equals
            ``"failed"``; ``markdown_text`` is not filled in here and keeps
            its default.

        Raises:
            OSError: Propagated from reading ``file_path``.
        """
        source = Path(file_path)
        # The file is read before the service is resolved, so an unreadable
        # path fails without touching the service accessor.
        payload = source.read_bytes()
        base_name = source.name

        service = get_document_command_service()
        outcome = service.upload_and_process(
            doc_id=doc_id,
            file_name=base_name,
            content=payload,
            content_type="application/octet-stream",
            doc_name=doc_name if doc_name else base_name,
            regulation_type=regulation_type,
            version=version,
            generate_summary=self.generate_summary,
        )

        return ProcessingResult(
            doc_id=outcome.doc_id,
            doc_name=outcome.doc_name,
            success=outcome.status != "failed",
            num_chunks=outcome.num_chunks,
            message=outcome.message,
            summary=outcome.summary,
            summary_latency_ms=outcome.summary_latency_ms,
        )

    def search(self, query: str, top_k: int = 10, filters: str | None = None) -> list[dict]:
        """Run a retrieval query and return the hits as plain dictionaries.

        Args:
            query: Forwarded to the retrieval service.
            top_k: Forwarded to the retrieval service.
            filters: Forwarded to the retrieval service.

        Returns:
            One dictionary per retrieved item with the keys ``id``,
            ``content``, ``score`` and ``metadata``.
        """
        retriever = get_retrieval_service()
        hits = retriever.retrieve(query=query, top_k=top_k, filters=filters)
        return [self._as_search_hit(hit) for hit in hits]

    @staticmethod
    def _as_search_hit(item) -> dict:
        """Convert one retrieved item into the dictionary shape used by ``search``."""
        hit = {
            "id": item.chunk_id,
            "content": item.content,
            "score": item.score,
        }
        # The item's own metadata is unpacked last, so any key it shares with
        # the fixed fields replaces the fixed value.
        hit["metadata"] = {
            "doc_id": item.doc_id,
            "doc_name": item.doc_name,
            "chunk_id": item.chunk_id,
            "section_title": item.section_title,
            "page_number": item.page_number,
            **item.metadata,
        }
        return hit

    def close(self):
        """Do nothing and return ``None``; the instance stores only a flag."""
        return None