init

vw-document-ai-indexer/business_layer.py (new file, 623 lines)
"""business_layer.py

This module contains the business logic for document processing.
"""


import os
from abc import ABC, abstractmethod
from typing import List, Optional, Dict, Any
from dataclasses import dataclass
import traceback
import datetime
from collections import Counter

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.core.pipeline.policies import RetryPolicy

from app_config import ApplicationConfig, ServiceFactory
from chunk_service import chunk_di_doc
from entity_models import Document, ChunkingResult, DiResult
from database import DatabaseInterface, IndexObject, IndexObjectStatus, LegacyDatabaseAdapter

from di_extractor import di_extract
from blob_service import blob_exists, blob_upload_content, blob_upload_object, downloadToLocalFolder, load_content
from utils import replace_urls_in_content, write_content, write_document, asdict_with_dynamic

from azure_index_service import upload_merge_index, delete_documents_by_field, query_by_field
from vllm_extractor import process_document_figures


class SingletonFormRecognizerClient:
    instance = None

    def __new__(cls, *args, **kwargs):
        if not cls.instance:
            extract_method = os.environ.get("extract_method", "default")
            if extract_method == "vision-llm":
                cls.instance = object()  # dummy object
            else:
                url = os.getenv("form_rec_resource")
                key = os.getenv("form_rec_key")
                if url and key:
                    print("SingletonFormRecognizerClient: Creating instance of Form Recognizer per process")

                    retry = RetryPolicy(total_retries=5, connect_retries=3, read_retries=3, backoff_factor=0.8, retry_backoff_max=60)

                    cls.instance = DocumentIntelligenceClient(endpoint=url, credential=AzureKeyCredential(key), retry_policy=retry, connection_timeout=1200, read_timeout=1200)
                else:
                    print("SingletonFormRecognizerClient: Skipping since credentials were not provided. Assuming NO Form Recognizer extensions (like .pdf) in the directory")
                    cls.instance = object()  # dummy object
        return cls.instance

    def __getstate__(self) -> tuple[Any, Any]:
        # The client itself is not picklable; serialise only the connection details,
        # read from the same environment variables used in __new__.
        return os.getenv("form_rec_resource"), os.getenv("form_rec_key")

    def __setstate__(self, state):
        url, key = state

        retry = RetryPolicy(total_retries=5, connect_retries=3, read_retries=3, backoff_factor=0.8, retry_backoff_max=60)
        self.instance = DocumentIntelligenceClient(endpoint=url, credential=AzureKeyCredential(key), retry_policy=retry, connection_timeout=1200, read_timeout=1200)

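# Note (illustrative, not part of the original logic): the singleton above is
# configured entirely through environment variables -- `extract_method` selects
# the extraction backend, while `form_rec_resource` / `form_rec_key` supply the
# Document Intelligence endpoint and API key. A hypothetical setup with
# placeholder values might look like:
#
#     os.environ["extract_method"] = "default"
#     os.environ["form_rec_resource"] = "https://<your-di-resource>.cognitiveservices.azure.com/"
#     os.environ["form_rec_key"] = "<your-key>"
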
@dataclass
class ProcessingContext:
    """Processing context for a single document."""
    object_key: str
    data_config: Dict[str, Any]
    metadata: Dict[str, Any]
    retry_count: int = 0
    error_message: Optional[str] = None
    current_tmp_directory: str = ""
    datasource_name: str = ""
    config: ApplicationConfig | None = None


@dataclass
class ProcessingResult:
    """Result of processing a single document."""
    status: IndexObjectStatus
    object_key: str
    message: str
    processing_time: float
    chunks_count: int = 0
    error: Optional[Exception] = None
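

# Illustrative example (hypothetical values, not part of the original module):
# the orchestrator below consumes a ProcessingContext and returns a
# ProcessingResult, roughly like this:
#
#     context = ProcessingContext(
#         object_key="manuals/device-a.pdf",
#         data_config={"data_path": "...", "index_schemas": [], "merge_fields": []},
#         metadata={"doc_modified_time": "2024-01-01T00:00:00", "metadata_modified_time": "2024-01-01T00:00:00"},
#         datasource_name="manuals",
#     )
#     result = orchestrator.process_document(context)
#     print(result.status, result.chunks_count, result.processing_time)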


# Keep only the DocumentRepository interface; other services use their concrete implementations directly.
class DocumentRepository(ABC):
    """Document repository interface."""

    @abstractmethod
    def get_index_object(self, object_key: str, datasource_name: str) -> Optional[IndexObject]:
        """Get an index object."""
        pass

    @abstractmethod
    def save_index_object(self, index_object: IndexObject) -> None:
        """Save an index object."""
        pass

    @abstractmethod
    def update_processing_status(self, object_key: str, datasource_name: str, status: IndexObjectStatus, message: str = None) -> None:
        """Update the processing status."""
        pass


# Application service layer
class DocumentProcessingOrchestrator:
    """Document processing orchestrator (application service layer)."""

    def __init__(self,
                 extraction_service: 'DocumentIntelligenceExtractionService',
                 chunking_service: 'DefaultDocumentChunkingService',
                 indexing_service: 'AzureSearchIndexingService',
                 metadata_service: 'BlobMetadataService',
                 repository: DocumentRepository):
        self.extraction_service = extraction_service
        self.chunking_service = chunking_service
        self.indexing_service = indexing_service
        self.metadata_service = metadata_service
        self.repository = repository

    def process_document(self, context: ProcessingContext) -> ProcessingResult:
        """Main flow for processing a single document."""
        start_time = datetime.datetime.now()

        # 1. Get or create the index object
        index_object = self._get_or_create_index_object(context)
        # if not index_object:
        #     raise ValueError(f"Failed to create or retrieve index object for {context.object_key}")

        try:

            # 2. Check the retry count.
            # If the object has been modified since the last failure, reset the retry count and
            # continue. The comparison covers the document modification time and the metadata
            # modification time recorded at the last failure.
            if index_object.last_fail_doc_modifed_time != context.metadata.get("doc_modified_time") or index_object.last_fail_metadata_modifed_time != context.metadata.get("metadata_modified_time"):
                index_object.try_count = 0

            if index_object.status in ["processing", "failed"]:
                # Check whether the maximum retry count has been reached
                if index_object.try_count >= 3:
                    return ProcessingResult(status=IndexObjectStatus.FAILED, object_key=context.object_key, message=f"Object has been retried {index_object.try_count} times, skipping processing", processing_time=0)

                # Increment the retry count and save it immediately
                index_object.try_count += 1
                self.repository.save_index_object(index_object)

            # 3. Update the status to "processing"
            self.repository.update_processing_status(context.object_key, context.datasource_name, IndexObjectStatus.PROCESSING)

            # 4. Check whether processing is needed (based on metadata and document modification times)
            meta_update_flag = self._should_process_metadata(index_object, context)
            doc_update_flag = self._should_process_document(index_object, context)

            chunks_count = 0

            # 5. Process the metadata indexes (if an update is needed)
            if meta_update_flag:
                self._process_metadata_indexes(context)

            # 6. Process the document and chunk indexes (only when meta_update_flag or doc_update_flag is True)
            if meta_update_flag or doc_update_flag:
                chunks_count = self._process_document_and_chunks(context, doc_update_flag)

            # 7. Update the modification times on the index object
            if meta_update_flag:
                index_object.metadata_modifed_time = context.metadata.get("metadata_modified_time")
            if doc_update_flag:
                index_object.doc_modifed_time = context.metadata.get("doc_modified_time")

            index_object.status = IndexObjectStatus.SUCCESS.value

            if index_object.metadata_modifed_time is None:
                index_object.metadata_modifed_time = context.metadata.get("metadata_modified_time")

            self.repository.save_index_object(index_object)

            processing_time = (datetime.datetime.now() - start_time).total_seconds()
            return ProcessingResult(status=IndexObjectStatus.SUCCESS, object_key=context.object_key, message=f"Successfully processed {chunks_count} chunks", processing_time=processing_time, chunks_count=chunks_count)

        except Exception as e:
            error_message: str = traceback.format_exc()
            index_object.status = IndexObjectStatus.FAILED.value
            index_object.last_fail_doc_modifed_time = context.metadata.get("doc_modified_time")
            index_object.last_fail_metadata_modifed_time = context.metadata.get("metadata_modified_time")
            self.repository.save_index_object(index_object)
            processing_time = (datetime.datetime.now() - start_time).total_seconds()
            return ProcessingResult(status=IndexObjectStatus.FAILED, object_key=context.object_key, message=f"Processing failed: {error_message}", processing_time=processing_time, error=e)

    def _get_or_create_index_object(self, context: ProcessingContext) -> IndexObject:
        """Get or create the index object."""
        index_object = self.repository.get_index_object(context.object_key, context.datasource_name)
        if not index_object:
            index_object = IndexObject(
                object_key=context.object_key,
                type="document",
                status=IndexObjectStatus.PROCESSING.value,
                datasource_name=context.datasource_name
            )
            self.repository.save_index_object(index_object)
        return index_object

    def _should_process(self, index_object: IndexObject, context: ProcessingContext) -> bool:
        """Determine whether any processing is needed (kept for backward compatibility)."""
        return self._should_process_metadata(index_object, context) or self._should_process_document(index_object, context)

    def _should_process_metadata(self, index_object: IndexObject, context: ProcessingContext) -> bool:
        """Determine whether metadata processing is needed."""
        if 'metadata_modified_time' in context.metadata:
            metadata_modified_time = context.metadata['metadata_modified_time']
            if index_object.metadata_modifed_time is None:
                return True
            if metadata_modified_time is not None and metadata_modified_time > index_object.metadata_modifed_time:
                return True
        return False

    def _should_process_document(self, index_object: IndexObject, context: ProcessingContext) -> bool:
        """Determine whether document processing is needed."""
        if 'doc_modified_time' in context.metadata:
            doc_modified_time = context.metadata['doc_modified_time']
            if index_object.doc_modifed_time is None:
                return True
            if doc_modified_time is not None and doc_modified_time > index_object.doc_modifed_time:
                return True
        return False

    def _process_metadata_indexes(self, context: ProcessingContext) -> None:
        """Process the metadata indexes."""

        # Push the metadata indexes - only process index schemas whose data_type is exactly ["metadata"]
        meta_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata"])]
        if not any(meta_index_schemas):
            return

        # Get the metadata from the metadata service
        doc_meta = self.metadata_service.get_metadata(context.object_key)
        # Metadata must not be empty; fail fast if it is missing
        if not doc_meta:
            raise ValueError(f"Metadata for object {context.object_key} not found")

        for meta_index_schema in meta_index_schemas:
            self.indexing_service.index_metadata(doc_meta, meta_index_schema, context)
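
    # Illustrative shape of an index schema entry (the field names and values below
    # are hypothetical, but the keys -- "data_type", "index_name", "update_by_field",
    # "fields" -- are the ones this class reads from data_config["index_schemas"]):
    #
    #     {
    #         "data_type": ["metadata", "document", "chunk"],
    #         "index_name": "my-search-index",
    #         "update_by_field": "filepath",
    #         "fields": ["title", "language_code"],
    #     }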

    def _process_document_and_chunks(self, context: ProcessingContext, doc_update_flag: bool) -> int:
        """Process the document and chunk indexes; return the number of processed chunks."""

        doc_dict = {}
        chunk_dict = []
        chunks_count = 0

        # Fetch the document metadata from the metadata service
        doc_meta = self.metadata_service.get_metadata(context.object_key)
        language_code = doc_meta.get("language_code", "zh-Hans")  # Default to "zh-Hans" if not specified
        # TODO: in the future, raise an error or skip processing when no doc_meta configuration file exists
        if not doc_meta:
            doc_meta = {}

        # If the document needs to be updated, re-extract and re-chunk it
        if doc_update_flag:
            # Extract the document
            document = self.extraction_service.extract_document(context, language_code)
            document.title = os.path.splitext(context.object_key)[0]

            # Chunk the document
            chunking_result = self.chunking_service.chunk_document(document, context)
            chunks_count = len(chunking_result.chunks)

            # Convert to dictionary format
            doc_dict = self._convert_document_to_dict(document)
            chunk_dict = [self._convert_document_to_dict(chunk) for chunk in chunking_result.chunks]

        # Process the document indexes - data_type is ["metadata", "document"] or ["document"]
        document_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata", "document"]) or Counter(schema["data_type"]) == Counter(["document"])]

        for document_index_schema in document_index_schemas:
            if not doc_update_flag:
                # Get the existing document data from the Azure Search index
                existing_docs = self.indexing_service.get_existing_document_data(
                    context.object_key, document_index_schema["index_name"],
                    document_index_schema["update_by_field"]
                )
                if existing_docs:
                    doc_dict = existing_docs

            doc_dict.update({k: doc_meta[k] for k in document_index_schema["fields"] if k in doc_meta})

            # Upload the document index
            self.indexing_service.index_document_with_schema(doc_dict, document_index_schema, context)

        # Process the chunk indexes - data_type is ["metadata", "document", "chunk"] or ["chunk"]
        chunk_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata", "document", "chunk"]) or Counter(schema["data_type"]) == Counter(["chunk"])]

        for index_schema in chunk_index_schemas:
            current_chunk_dict = chunk_dict  # Use the freshly chunked data by default
            current_chunks_count = chunks_count
            if not doc_update_flag:
                # Get the existing chunk data from the Azure Search index
                current_chunk_dict = self.indexing_service.get_existing_chunk_data(context.object_key, index_schema["index_name"], index_schema["update_by_field"])
                current_chunks_count = len(current_chunk_dict) if current_chunk_dict else 0

            # Update the total chunks_count (used as the return value)
            chunks_count = current_chunks_count

            for chunk in current_chunk_dict if current_chunk_dict else []:
                chunk.update({k: doc_meta[k] for k in index_schema["fields"] if k in doc_meta})

            # Delete the old chunk data
            self.indexing_service.delete_chunks_by_field(index_schema["index_name"], index_schema["update_by_field"], doc_dict.get(index_schema["update_by_field"], context.object_key))

            # Upload the new chunk data
            if current_chunk_dict:
                self.indexing_service.index_chunks_with_schema(current_chunk_dict, index_schema, context)

        return chunks_count

    def _convert_document_to_dict(self, document: Document) -> Dict[str, Any]:
        """Convert a Document object to a dictionary."""

        try:
            # Use the original asdict_with_dynamic function to maintain compatibility
            return asdict_with_dynamic(document)
        except Exception:
            # If asdict_with_dynamic fails, fall back to simpler conversions
            if hasattr(document, '__dict__'):
                return document.__dict__.copy()
            elif hasattr(document, 'to_dict'):
                return document.to_dict()
            else:
                # If everything fails, return an empty dictionary
                return {}


# Infrastructure layer implementation
class SqlAlchemyDocumentRepository(DocumentRepository):
    """SQLAlchemy-based document repository implementation."""

    def __init__(self, database_interface: DatabaseInterface):
        self.database_interface = database_interface

    def get_index_object(self, object_key: str, datasource_name: str) -> Optional[IndexObject]:
        """Get an index object."""
        return self.database_interface.get_index_object(object_key, datasource_name)

    def save_index_object(self, index_object: IndexObject) -> None:
        """Save an index object."""
        self.database_interface.save_index_object(index_object)

    def update_processing_status(self, object_key: str, datasource_name: str, status: IndexObjectStatus,
                                 message: str = None) -> None:
        """Update the processing status."""

        # Convert the business-layer status to the database status
        self.database_interface.update_processing_status(object_key, datasource_name, status, message)


# Concrete implementation classes
class DocumentIntelligenceExtractionService:
    """Document extraction service based on Document Intelligence."""

    def __init__(self, form_recognizer_client: DocumentIntelligenceClient, vllm_endpoint, vllm_key, tmp_directory, data_directory=None, di_sas_url=None, figure_sas_url=None):
        self.form_recognizer_client: DocumentIntelligenceClient = form_recognizer_client
        self.vllm_endpoint: str = vllm_endpoint
        self.vllm_key: str = vllm_key
        self.tmp_directory: str = tmp_directory
        self.data_directory: str = data_directory or ""
        self.di_sas_url: str = di_sas_url
        self.figure_sas_url: str = figure_sas_url

    def extract_document(self, context: ProcessingContext, language: str) -> Document:
        """Extract document content using Document Intelligence."""

        # Get the data_dir config; use the instance variable if it is not present
        data_dir = context.data_config.get("data_dir", self.data_directory)

        # Download the document file - use the correct parameter order
        local_file_paths = downloadToLocalFolder(blob_url=context.data_config["data_path"], data_dir=data_dir, local_folder=self.tmp_directory, name_starts_with=context.object_key)

        if not local_file_paths or len(local_file_paths) == 0:
            raise ValueError(f"File {context.object_key} not found in blob storage")

        di_blob_file_name = context.object_key + str(context.metadata["doc_modified_time"]) + ".json"
        di_result: DiResult = None
        # Try to download a cached DI result from the blob; if it exists, skip di_extract
        if self.di_sas_url and blob_exists(self.di_sas_url, di_blob_file_name):
            content: str = load_content(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name)
            if content:
                di_result = DiResult.from_json(content)  # type: ignore
        if not di_result:
            di_result = di_extract(source_file_path=local_file_paths.pop(), di_client=self.form_recognizer_client, directory_path=self.tmp_directory, figure_sas_url=self.figure_sas_url, language=language)
            try:
                process_document_figures(di_result=di_result, config=context.config)
            except Exception as e:
                print(f"Error processing document figures: {e}")
            finally:
                # Write the processed result directly to the blob to avoid reprocessing later
                blob_upload_object(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name, obj=di_result)

        under_image_content = replace_urls_in_content(content=di_result.di_content, replacements=di_result.figures)
        # Save the extracted content to a local file (same as the original logic)
        write_content(content=under_image_content, directory_path=self.tmp_directory, file_name=context.object_key)

        blob_upload_content(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name + ".md", content=under_image_content)

        return Document(content=under_image_content, filepath=context.object_key)


class DefaultDocumentChunkingService:
    """Default document chunking service."""

    def __init__(self, tmp_directory: str = None):
        self.tmp_directory = tmp_directory

    def chunk_document(self, document: Document, context: ProcessingContext) -> ChunkingResult:
        """Chunk the document."""

        # Call the original chunking method
        chunking_result = chunk_di_doc(document, data_config=context.data_config, tmp_path=context.current_tmp_directory)

        # If tmp_directory is configured, save the chunking result to a local file
        if self.tmp_directory:
            write_document(chunking_result.chunks, file_path=context.object_key, directory_path=self.tmp_directory, rel_file_path=context.object_key)

        return chunking_result


class AzureSearchIndexingService:
    """Azure Search-based indexing service."""

    def __init__(self):
        pass

    def index_document(self, document: Document, context: ProcessingContext) -> bool:
        """Index a document."""

        # Get the document index schemas
        document_schemas = [schema for schema in context.data_config["index_schemas"]
                            if set(schema["data_type"]) == {"metadata", "document"}]

        doc_dict = asdict_with_dynamic(document)
        doc_dict.update(context.metadata)

        for schema in document_schemas:
            if not upload_merge_index(index_config=schema, docs=[doc_dict], merge_fields=context.data_config["merge_fields"], current_tmp_directory=context.current_tmp_directory):
                return False

        return True

    def index_chunks(self, chunks: List[Document], context: ProcessingContext) -> bool:
        """Index document chunks."""

        # Get the chunk index schemas
        chunk_schemas = [schema for schema in context.data_config["index_schemas"]
                         if set(schema["data_type"]) == {"metadata", "document", "chunk"}]

        chunk_dict = [asdict_with_dynamic(chunk) for chunk in chunks]

        for schema in chunk_schemas:
            # First delete the old chunk data
            delete_documents_by_field(schema["index_name"], schema["update_by_field"], context.object_key)

            # Add the metadata to each chunk
            for chunk in chunk_dict:
                chunk.update(context.metadata)

            # Upload the new chunk data
            if not upload_merge_index(
                index_config=schema,
                docs=chunk_dict,
                merge_fields=context.data_config["merge_fields"],
                current_tmp_directory=context.current_tmp_directory
            ):
                return False

        return True

    def get_existing_document_data(self, object_key: str, index_name: str, field_name: str) -> Optional[dict[str, Any]]:
        """Get existing document data from the Azure Search index."""

        results = query_by_field(
            index_name=index_name,
            field_name=field_name,
            value=object_key
        )

        return results[0] if results else None

    def get_existing_chunk_data(self, object_key: str, index_name: str, field_name: str) -> List[dict[str, Any]]:
        """Get existing chunk data from the Azure Search index."""

        results = query_by_field(index_name=index_name, field_name=field_name, value=object_key)

        return results if results else []

    def index_metadata(self, metadata: dict[str, Any], schema: Any, context: ProcessingContext) -> bool:
        """Index metadata."""

        return upload_merge_index(index_config=schema, docs=[metadata], merge_fields=context.data_config["merge_fields"], current_tmp_directory=context.current_tmp_directory)

    def index_document_with_schema(self, doc_dict: Dict[str, Any], schema: Any, context: ProcessingContext) -> bool:
        """Index a document using the specified schema."""
        return upload_merge_index(
            index_config=schema,
            docs=[doc_dict],
            merge_fields=context.data_config["merge_fields"],
            current_tmp_directory=context.current_tmp_directory
        )

    def index_chunks_with_schema(self, chunk_dict: List[Dict[str, Any]], schema: Any, context: ProcessingContext) -> bool:
        """Index chunks using the specified schema."""

        return upload_merge_index(
            index_config=schema,
            docs=chunk_dict,
            merge_fields=context.data_config["merge_fields"],
            current_tmp_directory=context.current_tmp_directory
        )

    def delete_chunks_by_field(self, index_name: str, field_name: str, field_value: str) -> bool:
        """Delete chunks by field value."""

        try:
            delete_documents_by_field(index_name, field_name, field_value)
            return True
        except Exception:
            return False


class BlobMetadataService:
    """Metadata service based on Blob storage."""

    def __init__(self, datasource: Dict[str, Any]):
        self.datasource = datasource

    def get_metadata(self, object_key: str) -> Dict[str, Any]:
        """Get the metadata for an object."""
        if "metadata" not in self.datasource:
            return {}

        return self.datasource["metadata"].get(object_key, {})
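
    # Illustrative shape of the datasource dict consumed above (keys and values are
    # hypothetical, but the nesting -- a "metadata" mapping keyed by object_key -- is
    # what get_metadata expects):
    #
    #     {
    #         "metadata": {
    #             "manuals/device-a.pdf": {"title": "Device A Manual", "language_code": "en"},
    #         }
    #     }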


# Factory class wiring the concrete implementations together
class DocumentProcessingFactory:
    """Document processing factory class."""

    def __init__(self, service_factory: ServiceFactory, tmp_directory: str, datasource: Optional[Dict[str, Any]] = None, config: ApplicationConfig = None):
        """
        Initialize the factory.

        Args:
            service_factory: Service factory (used to get the database engine)
            tmp_directory: Shared temporary directory for downloads and intermediate files
            datasource: Data source configuration
            config: Application configuration
        """
        self.service_factory: ServiceFactory = service_factory
        self.datasource = datasource or {}
        self.shared_tmp_directory = tmp_directory
        self.config: ApplicationConfig = config

    def create_orchestrator(self) -> DocumentProcessingOrchestrator:
        """Create the document processing orchestrator."""
        extraction_service = self._create_extraction_service()
        chunking_service = self._create_chunking_service()
        indexing_service = self._create_indexing_service()
        metadata_service = self._create_metadata_service()
        repository = self._create_repository()

        return DocumentProcessingOrchestrator(
            extraction_service=extraction_service,
            chunking_service=chunking_service,
            indexing_service=indexing_service,
            metadata_service=metadata_service,
            repository=repository
        )

    def _create_extraction_service(self) -> 'DocumentIntelligenceExtractionService':
        """Create the document extraction service."""

        # Use the factory's shared temporary directory (same as the original app.py logic)
        tmp_directory = self.shared_tmp_directory

        # Get the configuration from environment variables (same as the original worker.py logic)
        vllm_endpoint = os.environ.get("captioning_model_endpoint", "")
        vllm_key = os.environ.get("captioning_model_key", "")

        form_recognizer_client = SingletonFormRecognizerClient()
        return DocumentIntelligenceExtractionService(
            form_recognizer_client=form_recognizer_client,
            vllm_endpoint=vllm_endpoint,
            vllm_key=vllm_key,
            tmp_directory=tmp_directory,
            data_directory="",  # Will be fetched dynamically from data_config
            di_sas_url=self.config.azure_services.di_blob_account_url,
            figure_sas_url=self.config.azure_services.figure_blob_account_url
        )

    def _create_chunking_service(self) -> 'DefaultDocumentChunkingService':
        """Create the document chunking service."""

        # Use the factory's shared temporary directory
        tmp_directory = self.shared_tmp_directory

        return DefaultDocumentChunkingService(tmp_directory=tmp_directory)

    def _create_indexing_service(self) -> 'AzureSearchIndexingService':
        """Create the indexing service."""
        return AzureSearchIndexingService()

    def _create_metadata_service(self) -> 'BlobMetadataService':
        """Create the metadata service."""
        return BlobMetadataService(self.datasource)

    def _create_repository(self) -> DocumentRepository:
        """Create the document repository."""
        database_interface = LegacyDatabaseAdapter(self.service_factory.get_database_engine())
        return SqlAlchemyDocumentRepository(database_interface)
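

# Illustrative end-to-end wiring (a minimal sketch; the ServiceFactory / ApplicationConfig
# construction and the context values below are hypothetical and depend on the host
# application, e.g. app.py / worker.py):
#
#     factory = DocumentProcessingFactory(service_factory, tmp_directory="/tmp/indexer", datasource=datasource, config=config)
#     orchestrator = factory.create_orchestrator()
#     result = orchestrator.process_document(context)
#     if result.status == IndexObjectStatus.FAILED:
#         print(result.message)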