""" business_layer.py
This module contains the business logic for document processing."""
import os
from abc import ABC, abstractmethod
from typing import List, Optional, Dict, Any
from dataclasses import dataclass
import traceback
import datetime
from collections import Counter
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.core.pipeline.policies import RetryPolicy
from app_config import ApplicationConfig, ServiceFactory
from chunk_service import chunk_di_doc
from entity_models import Document, ChunkingResult, DiResult
from database import DatabaseInterface, IndexObject, IndexObjectStatus, LegacyDatabaseAdapter
from di_extractor import di_extract
from blob_service import blob_exists, blob_upload_content, blob_upload_object, downloadToLocalFolder, load_content
from utils import replace_urls_in_content, write_content, write_document, asdict_with_dynamic
from azure_index_service import upload_merge_index, delete_documents_by_field, query_by_field
from vllm_extractor import process_document_figures
class SingletonFormRecognizerClient:
instance = None
def __new__(cls, *args, **kwargs):
if not cls.instance:
extract_method = os.environ.get("extract_method", "default")
if extract_method == "vision-llm":
cls.instance = object() # dummy object
else:
url = os.getenv("form_rec_resource")
key = os.getenv("form_rec_key")
if url and key:
print("SingletonFormRecognizerClient: Creating instance of Form recognizer per process")
retry = RetryPolicy(total_retries=5,connect_retries=3,read_retries=3,backoff_factor=0.8,retry_backoff_max=60)
cls.instance = DocumentIntelligenceClient(endpoint=url, credential=AzureKeyCredential(key), retry_policy=retry, connection_timeout=1200,read_timeout=1200)
else:
print("SingletonFormRecognizerClient: Skipping since credentials not provided. Assuming NO form recognizer extensions(like .pdf) in directory")
cls.instance = object() # dummy object
return cls.instance
    def __getstate__(self) -> tuple[Any, Any]:
        # NOTE: the endpoint/key are not stored on the instance, so re-read them from the
        # environment (the same variables used in __new__) when pickling.
        return os.getenv("form_rec_resource"), os.getenv("form_rec_key")
    def __setstate__(self, state):
        url, key = state
        retry = RetryPolicy(total_retries=5, connect_retries=3, read_retries=3, backoff_factor=0.8, retry_backoff_max=60)
        self.instance = DocumentIntelligenceClient(endpoint=url, credential=AzureKeyCredential(key), retry_policy=retry, connection_timeout=1200, read_timeout=1200)
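# Usage note (descriptive, based on the class above): because __new__ returns cls.instance
# directly, calling SingletonFormRecognizerClient() yields the shared DocumentIntelligenceClient
# (or a dummy object when the "vision-llm" extract method is configured or credentials are missing),
# not an instance of the wrapper class itself. Callers treat the return value as a DI client.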
@dataclass
class ProcessingContext:
"""Processing Context"""
object_key: str
data_config: Dict[str, Any]
metadata: Dict[str, Any]
retry_count: int = 0
error_message: Optional[str] = None
current_tmp_directory: str = ""
datasource_name: str = ""
config: ApplicationConfig | None = None
@dataclass
class ProcessingResult:
"""Processing Result"""
status: IndexObjectStatus
object_key: str
message: str
processing_time: float
chunks_count: int = 0
error: Optional[Exception] = None
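# Usage note (descriptive): on success the orchestrator fills chunks_count and processing_time;
# on failure, message carries the formatted traceback and error carries the original exception,
# while status mirrors IndexObjectStatus.SUCCESS / IndexObjectStatus.FAILED.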
# Only the DocumentRepository interface is kept; the other services use their concrete implementations directly
class DocumentRepository(ABC):
"""Document Repository Interface"""
@abstractmethod
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
"""Get index object"""
pass
@abstractmethod
def save_index_object(self, index_object: IndexObject) -> None:
"""Save index object"""
pass
@abstractmethod
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus, message: str = None) -> None:
"""Update processing status"""
pass
# Application service layer
class DocumentProcessingOrchestrator:
"""Document Processing Orchestrator (Application Service Layer)"""
def __init__(self,
extraction_service: 'DocumentIntelligenceExtractionService',
chunking_service: 'DefaultDocumentChunkingService',
indexing_service: 'AzureSearchIndexingService',
metadata_service: 'BlobMetadataService',
repository: DocumentRepository):
self.extraction_service = extraction_service
self.chunking_service = chunking_service
self.indexing_service = indexing_service
self.metadata_service = metadata_service
self.repository = repository
def process_document(self, context: ProcessingContext) -> ProcessingResult:
"""Main process for handling a single document"""
start_time = datetime.datetime.now()
# 1. Get or create index object
index_object = self._get_or_create_index_object(context)
# if not index_object:
# raise ValueError(f"Failed to create or retrieve index object for {context.object_key}")
try:
            # 2. Check retry count
            # If the object's document or metadata modification time has changed since the last failure,
            # reset the retry count and continue with the normal flow. The comparison is against the
            # document and metadata modification times recorded at the last failure.
if index_object.last_fail_doc_modifed_time != context.metadata.get("doc_modified_time") or index_object.last_fail_metadata_modifed_time != context.metadata.get("metadata_modified_time"):
index_object.try_count = 0
if index_object.status in ["processing", "failed"]:
# Check if the maximum retry count has been reached
if index_object.try_count >= 3:
return ProcessingResult(status=IndexObjectStatus.FAILED, object_key=context.object_key, message=f"Object has been retried {index_object.try_count} times, skipping processing", processing_time=0)
# Increase the retry count and save immediately
index_object.try_count += 1
# Immediately save the retry count update
self.repository.save_index_object(index_object)
# 3. Update status to processing
self.repository.update_processing_status(context.object_key,context.datasource_name, IndexObjectStatus.PROCESSING)
# 4. Check if processing is needed (metadata and document modification times)
meta_update_flag = self._should_process_metadata(index_object, context)
doc_update_flag = self._should_process_document(index_object, context)
chunks_count = 0
# 5. Process metadata index (if update is needed)
if meta_update_flag:
self._process_metadata_indexes(context)
# 6. Process document and chunk indexes (Important: Only process when meta_update_flag OR doc_update_flag=True)
if meta_update_flag or doc_update_flag:
chunks_count = self._process_document_and_chunks(context, doc_update_flag)
# 7. Update the modification time of the index object
if meta_update_flag:
index_object.metadata_modifed_time = context.metadata.get("metadata_modified_time")
if doc_update_flag:
index_object.doc_modifed_time = context.metadata.get("doc_modified_time")
index_object.status = IndexObjectStatus.SUCCESS.value
if index_object.metadata_modifed_time is None:
index_object.metadata_modifed_time = context.metadata.get("metadata_modified_time")
self.repository.save_index_object(index_object)
processing_time = (datetime.datetime.now() - start_time).total_seconds()
return ProcessingResult(status=IndexObjectStatus.SUCCESS, object_key=context.object_key, message=f"Successfully processed {chunks_count} chunks", processing_time=processing_time, chunks_count=chunks_count)
except Exception as e:
error_message:str = traceback.format_exc()
index_object.status = IndexObjectStatus.FAILED.value
index_object.last_fail_doc_modifed_time = context.metadata.get("doc_modified_time")
index_object.last_fail_metadata_modifed_time = context.metadata.get("metadata_modified_time")
self.repository.save_index_object(index_object)
processing_time = (datetime.datetime.now() - start_time).total_seconds()
return ProcessingResult(status=IndexObjectStatus.FAILED, object_key=context.object_key, message=f"Processing failed: {error_message}", processing_time=processing_time, error=e )
def _get_or_create_index_object(self, context: ProcessingContext) -> IndexObject:
"""Get or create index object"""
index_object = self.repository.get_index_object(context.object_key,context.datasource_name)
if not index_object:
index_object = IndexObject(
object_key=context.object_key,
type="document",
status=IndexObjectStatus.PROCESSING.value,
datasource_name=context.datasource_name
)
self.repository.save_index_object(index_object)
return index_object
def _should_process(self, index_object: IndexObject, context: ProcessingContext) -> bool:
"""Determine whether processing is needed (keep the original logic for backward compatibility)"""
return self._should_process_metadata(index_object, context) or self._should_process_document(index_object, context)
def _should_process_metadata(self, index_object: IndexObject, context: ProcessingContext) -> bool:
"""Determine whether metadata processing is needed"""
if 'metadata_modified_time' in context.metadata:
metadata_modified_time = context.metadata['metadata_modified_time']
if index_object.metadata_modifed_time is None:
return True
if metadata_modified_time is not None and metadata_modified_time > index_object.metadata_modifed_time:
return True
return False
def _should_process_document(self, index_object: IndexObject, context: ProcessingContext) -> bool:
"""Determine whether document processing is needed"""
if 'doc_modified_time' in context.metadata:
doc_modified_time = context.metadata['doc_modified_time']
if index_object.doc_modifed_time is None:
return True
if doc_modified_time is not None and doc_modified_time > index_object.doc_modifed_time:
return True
return False
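    # Decision summary (descriptive): an object is (re)processed when it has never been indexed
    # (the stored *_modifed_time is None) or when the corresponding modification time coming in
    # through context.metadata is strictly newer than the stored one. For example, a document whose
    # doc_modified_time advanced since the last run triggers re-extraction and re-chunking.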
def _process_metadata_indexes(self, context: ProcessingContext) -> None:
"""Process metadata index"""
# Push metadata index - only process index with data_type of ["metadata"]
meta_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata"])]
if not any(meta_index_schemas):
return
# Get metadata - from metadata service
doc_meta = self.metadata_service.get_metadata(context.object_key)
        # Metadata must not be empty for metadata indexes; fail fast if it is missing
if not doc_meta:
raise ValueError(f"Metadata for object {context.object_key} not found")
for meta_index_schema in meta_index_schemas:
self.indexing_service.index_metadata(doc_meta, meta_index_schema, context)
def _process_document_and_chunks(self, context: ProcessingContext, doc_update_flag: bool) -> int:
"""Process document and chunk indexes, return the number of processed chunks"""
doc_dict = {}
chunk_dict = []
chunks_count = 0
        # Fetch document metadata from the metadata service
        doc_meta = self.metadata_service.get_metadata(context.object_key)
        language_code = doc_meta.get("language_code", "zh-Hans")  # Default to "zh-Hans" if not specified
        # TODO: in the future, raise an error or skip processing when no doc_meta configuration file exists
        if not doc_meta:
            doc_meta = {}
# If the document needs to be updated, re-extract and chunk
if doc_update_flag:
# Extract document
document = self.extraction_service.extract_document(context, language_code)
document.title = os.path.splitext(context.object_key)[0]
# Chunk processing
chunking_result = self.chunking_service.chunk_document(document, context)
chunks_count = len(chunking_result.chunks)
# Convert to dictionary format
doc_dict = self._convert_document_to_dict(document)
chunk_dict = [self._convert_document_to_dict(chunk) for chunk in chunking_result.chunks]
        # Process document indexes - data_type is ["metadata","document"] or ["document"]
document_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata","document"]) or Counter(schema["data_type"]) == Counter(["document"])]
for document_index_schema in document_index_schemas:
if not doc_update_flag:
# Get existing document data from Azure Search Index
existing_docs = self.indexing_service.get_existing_document_data(
context.object_key, document_index_schema["index_name"],
document_index_schema["update_by_field"]
)
if existing_docs:
doc_dict = existing_docs
doc_dict.update({k: doc_meta[k] for k in document_index_schema["fields"] if k in doc_meta})
# Upload document index
self.indexing_service.index_document_with_schema(doc_dict, document_index_schema, context)
        # Process chunk indexes - data_type is ["metadata","document","chunk"] or ["chunk"]
chunk_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata","document","chunk"]) or Counter(schema["data_type"]) == Counter(["chunk"])]
for index_schema in chunk_index_schemas:
current_chunk_dict = chunk_dict # Use existing chunk_dict
current_chunks_count = chunks_count # Use existing chunks_count
if not doc_update_flag:
# Get existing chunk data from Azure Search Index
current_chunk_dict = self.indexing_service.get_existing_chunk_data(context.object_key, index_schema["index_name"], index_schema["update_by_field"])
current_chunks_count = len(current_chunk_dict) if current_chunk_dict else 0
# Update the total chunks_count (for return value)
chunks_count = current_chunks_count
for chunk in current_chunk_dict if current_chunk_dict else []:
chunk.update({k: doc_meta[k] for k in index_schema["fields"] if k in doc_meta})
# Delete old chunk data
self.indexing_service.delete_chunks_by_field(index_schema["index_name"], index_schema["update_by_field"], doc_dict.get(index_schema["update_by_field"], context.object_key))
# Upload new chunk data
if current_chunk_dict:
self.indexing_service.index_chunks_with_schema(current_chunk_dict, index_schema, context)
return chunks_count
def _convert_document_to_dict(self, document:Document) -> Dict[str, Any]:
"""Convert Document object to dictionary"""
try:
# Use the original asdict_with_dynamic function to maintain compatibility
return asdict_with_dynamic(document)
except Exception:
# If asdict_with_dynamic fails, use the fallback method
if hasattr(document, '__dict__'):
return document.__dict__.copy()
elif hasattr(document, 'to_dict'):
return document.to_dict()
else:
# If all fails, return empty dictionary
return {}
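# Schema routing note (descriptive): the orchestrator matches each entry of data_config["index_schemas"]
# by comparing the multiset of its "data_type" values, so ordering is irrelevant:
#   ["metadata"]                                   -> metadata-only indexes (_process_metadata_indexes)
#   ["document"] or ["metadata", "document"]       -> document-level indexes
#   ["chunk"] or ["metadata", "document", "chunk"] -> chunk-level indexes
# e.g. Counter(["chunk", "metadata", "document"]) == Counter(["metadata", "document", "chunk"]) is True.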
# Infrastructure layer implementation
class SqlAlchemyDocumentRepository(DocumentRepository):
"""SQLAlchemy-based document repository implementation"""
def __init__(self, database_interface: DatabaseInterface):
self.database_interface = database_interface
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
"""Get index object"""
return self.database_interface.get_index_object(object_key,datasource_name)
def save_index_object(self, index_object: IndexObject) -> None:
"""Save index object"""
self.database_interface.save_index_object(index_object)
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus,
message: str = None) -> None:
"""Update processing status"""
# Convert business layer status to database status
self.database_interface.update_processing_status(object_key,datasource_name, status, message)
# Concrete implementation class
class DocumentIntelligenceExtractionService:
"""Document extraction service based on Document Intelligence"""
def __init__(self, form_recognizer_client: DocumentIntelligenceClient, vllm_endpoint, vllm_key, tmp_directory, data_directory=None,di_sas_url=None, figure_sas_url=None):
self.form_recognizer_client: DocumentIntelligenceClient = form_recognizer_client
self.vllm_endpoint: str = vllm_endpoint
self.vllm_key: str = vllm_key
self.tmp_directory: str = tmp_directory
self.data_directory: str = data_directory or ""
        self.di_sas_url: Optional[str] = di_sas_url
        self.figure_sas_url: Optional[str] = figure_sas_url
def extract_document(self, context: ProcessingContext,language:str) -> Document:
"""Extract document content using Document Intelligence"""
# Get data_dir config, use instance variable if not present
data_dir = context.data_config.get("data_dir", self.data_directory)
# Download document file - use correct parameter order
local_file_paths = downloadToLocalFolder(blob_url=context.data_config["data_path"], data_dir=data_dir, local_folder=self.tmp_directory, name_starts_with=context.object_key)
if not local_file_paths or len(local_file_paths) == 0:
raise ValueError(f"File {context.object_key} not found in blob storage")
di_blob_file_name = context.object_key + str(context.metadata["doc_modified_time"]) + ".json"
        di_result: Optional[DiResult] = None
        # Try to download a cached DI result from the blob; if it is available, skip di_extract
if self.di_sas_url and blob_exists(self.di_sas_url, di_blob_file_name):
content:str = load_content(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name)
if content:
di_result = DiResult.from_json(content) # type: ignore
if not di_result:
di_result = di_extract(source_file_path=local_file_paths.pop(), di_client=self.form_recognizer_client, directory_path=self.tmp_directory, figure_sas_url=self.figure_sas_url, language=language)
try:
process_document_figures(di_result=di_result,config=context.config)
        except Exception as e:
            print(f"Error processing document figures: {e}")
        finally:
            # Write the enriched DI result back to the blob so the same document is not reprocessed next time
            if self.di_sas_url:
                blob_upload_object(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name, obj=di_result)
under_image_content = replace_urls_in_content(content=di_result.di_content, replacements=di_result.figures)
# Save extracted content to local file (same as original logic)
write_content(content=under_image_content, directory_path=self.tmp_directory, file_name=context.object_key)
        if self.di_sas_url:
            blob_upload_content(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name + ".md", content=under_image_content)
return Document(content=under_image_content, filepath=context.object_key)
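# Caching note (descriptive): when di_sas_url is configured, extract_document caches the Document
# Intelligence output in the DI blob container under "<object_key><doc_modified_time>.json", so a
# document is only sent to di_extract again after its doc_modified_time changes; the extracted content
# with figure references substituted is uploaded alongside it as "<object_key><doc_modified_time>.json.md".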
class DefaultDocumentChunkingService:
"""Default document chunking service"""
def __init__(self, tmp_directory: str = None):
self.tmp_directory = tmp_directory
def chunk_document(self, document: Document, context: ProcessingContext) -> ChunkingResult:
"""Chunk document"""
# Call the original chunking method
chunking_result = chunk_di_doc(document, data_config=context.data_config, tmp_path=context.current_tmp_directory)
# If tmp_directory is configured, save chunk result to local file
if self.tmp_directory:
write_document( chunking_result.chunks, file_path=context.object_key, directory_path=self.tmp_directory, rel_file_path=context.object_key )
return chunking_result
class AzureSearchIndexingService:
"""Azure Search-based indexing service"""
def __init__(self):
pass
def index_document(self, document: Document, context: ProcessingContext) -> bool:
"""Index document"""
# Get document index schema
document_schemas = [schema for schema in context.data_config["index_schemas"]
if set(schema["data_type"]) == {"metadata", "document"}]
doc_dict = asdict_with_dynamic(document)
doc_dict.update(context.metadata)
for schema in document_schemas:
if not upload_merge_index(index_config=schema, docs=[doc_dict], merge_fields=context.data_config["merge_fields"], current_tmp_directory=context.current_tmp_directory):
return False
return True
def index_chunks(self, chunks: List[Document], context: ProcessingContext) -> bool:
"""Index document chunks"""
# Get chunk index schema
chunk_schemas = [schema for schema in context.data_config["index_schemas"]
if set(schema["data_type"]) == {"metadata", "document", "chunk"}]
chunk_dict = [asdict_with_dynamic(chunk) for chunk in chunks]
for schema in chunk_schemas:
# First delete old chunk data
delete_documents_by_field(schema["index_name"], schema["update_by_field"], context.object_key)
# Add metadata to each chunk
for chunk in chunk_dict:
chunk.update(context.metadata)
# Upload new chunk data
if not upload_merge_index(
index_config=schema,
docs=chunk_dict,
merge_fields=context.data_config["merge_fields"],
current_tmp_directory=context.current_tmp_directory
):
return False
return True
def get_existing_document_data(self, object_key: str, index_name: str, field_name: str) -> Optional[dict[str,Any]]:
"""Get existing document data from Azure Search Index"""
results = query_by_field(
index_name=index_name,
field_name=field_name,
value=object_key
)
return results[0] if results else None
def get_existing_chunk_data(self, object_key: str, index_name: str, field_name: str) -> List[dict[str,Any]]:
"""Get existing chunk data from Azure Search Index"""
results = query_by_field( index_name=index_name, field_name=field_name, value=object_key )
return results if results else []
def index_metadata(self, metadata: dict[str,Any], schema: Any, context: ProcessingContext) -> bool:
"""Index metadata"""
return upload_merge_index(index_config=schema, docs=[metadata], merge_fields=context.data_config["merge_fields"], current_tmp_directory=context.current_tmp_directory )
def index_document_with_schema(self, doc_dict: Dict[str,Any], schema: Any, context: ProcessingContext) -> bool:
"""Index document using specified schema"""
return upload_merge_index(
index_config=schema,
docs=[doc_dict],
merge_fields=context.data_config["merge_fields"],
current_tmp_directory=context.current_tmp_directory
)
def index_chunks_with_schema(self, chunk_dict: List[Dict[str,Any]], schema: Any, context: ProcessingContext) -> bool:
"""Index chunks using specified schema"""
return upload_merge_index(
index_config=schema,
docs=chunk_dict,
merge_fields=context.data_config["merge_fields"],
current_tmp_directory=context.current_tmp_directory
)
def delete_chunks_by_field(self, index_name: str, field_name: str, field_value: str) -> bool:
"""Delete chunks by field"""
try:
delete_documents_by_field(index_name, field_name, field_value)
return True
except Exception:
return False
class BlobMetadataService:
"""Metadata service based on Blob storage"""
def __init__(self, datasource: Dict[str, Any]):
self.datasource = datasource
def get_metadata(self, object_key: str) -> Dict[str, Any]:
"""Get metadata"""
if "metadata" not in self.datasource:
return {}
return self.datasource["metadata"].get(object_key, {})
# Factory class wiring the concrete service implementations together
class DocumentProcessingFactory:
"""Document processing factory class"""
    def __init__(self, service_factory: ServiceFactory, tmp_directory: str, datasource: Optional[Dict[str, Any]] = None, config: Optional[ApplicationConfig] = None):
        """
        Initialize factory
        Args:
            service_factory: Service factory (used to get the database engine)
            tmp_directory: Shared temporary directory for downloads and intermediate files
            datasource: Data source configuration
            config: Application configuration
        """
self.service_factory: ServiceFactory = service_factory
self.datasource = datasource or {}
self.shared_tmp_directory = tmp_directory
self.config:ApplicationConfig = config
def create_orchestrator(self) -> DocumentProcessingOrchestrator:
"""Create document processing orchestrator"""
extraction_service = self._create_extraction_service()
chunking_service = self._create_chunking_service()
indexing_service = self._create_indexing_service()
metadata_service = self._create_metadata_service()
repository = self._create_repository()
return DocumentProcessingOrchestrator(
extraction_service=extraction_service,
chunking_service=chunking_service,
indexing_service=indexing_service,
metadata_service=metadata_service,
repository=repository
)
def _create_extraction_service(self) -> 'DocumentIntelligenceExtractionService':
"""Create document extraction service"""
# Use the factory shared temporary directory (same as original app.py logic)
tmp_directory = self.shared_tmp_directory
# Get configuration from environment variables (same as original worker.py logic)
vllm_endpoint = os.environ.get("captioning_model_endpoint", "")
vllm_key = os.environ.get("captioning_model_key", "")
form_recognizer_client = SingletonFormRecognizerClient()
return DocumentIntelligenceExtractionService(
form_recognizer_client=form_recognizer_client,
vllm_endpoint=vllm_endpoint,
vllm_key=vllm_key,
tmp_directory=tmp_directory,
data_directory="", # Will be dynamically fetched from data_config
di_sas_url=self.config.azure_services.di_blob_account_url,
figure_sas_url=self.config.azure_services.figure_blob_account_url
)
def _create_chunking_service(self) -> 'DefaultDocumentChunkingService':
"""Create document chunking service"""
# Use the factory shared temporary directory
tmp_directory = self.shared_tmp_directory
return DefaultDocumentChunkingService(tmp_directory=tmp_directory)
def _create_indexing_service(self) -> 'AzureSearchIndexingService':
"""Create indexing service"""
return AzureSearchIndexingService()
def _create_metadata_service(self) -> 'BlobMetadataService':
"""Create metadata service"""
return BlobMetadataService(self.datasource)
def _create_repository(self) -> DocumentRepository:
"""Create document repository"""
database_interface = LegacyDatabaseAdapter(self.service_factory.get_database_engine())
return SqlAlchemyDocumentRepository(database_interface)
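# Minimal end-to-end sketch (illustrative only, never called by the pipeline): it shows how the factory,
# orchestrator and ProcessingContext defined above fit together. All literal values below (paths, object
# key, datasource name, schema list) are hypothetical placeholders; a real data_config comes from the
# application's datasource configuration rather than being built inline like this.
def _example_process_single_document(service_factory: ServiceFactory, config: ApplicationConfig) -> ProcessingResult:
    factory = DocumentProcessingFactory(
        service_factory=service_factory,
        tmp_directory="/tmp/indexer",  # hypothetical shared temporary directory
        datasource={"metadata": {}},   # hypothetical datasource with no per-object metadata
        config=config,
    )
    orchestrator = factory.create_orchestrator()
    context = ProcessingContext(
        object_key="manuals/owner-guide.pdf",  # hypothetical blob object key
        data_config={
            "data_path": "<blob-sas-url>",  # hypothetical source container SAS URL
            "index_schemas": [],            # real configs list metadata/document/chunk schemas here
            "merge_fields": [],
        },
        metadata={
            "doc_modified_time": "2025-09-26T00:00:00Z",
            "metadata_modified_time": "2025-09-26T00:00:00Z",
        },
        current_tmp_directory="/tmp/indexer",
        datasource_name="example-datasource",
        config=config,
    )
    return orchestrator.process_document(context)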