2025-09-26 17:15:54 +08:00
commit db0e5965ec
211 changed files with 40437 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
[flake8]
ignore = W293
exclude =
.git,
__pycache__,
.venv,
venv,
tests,
docs,
build,
dist,
*.egg-info,
.tox,
.mypy_cache,
.pytest_cache

vw-document-ai-indexer/.gitignore
View File

@@ -0,0 +1,209 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.env.production
.env.development
config.json
config.prd.json
config.dev.json
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.conda/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.idea/
.DS_Store
web/.vscode/settings.json
# Intellij IDEA Files
.idea/*
!.idea/vcs.xml
!.idea/icon.png
.ideaDataSources/
*.iml
api/.idea
api/.env
api/storage/*
docker-legacy/volumes/app/storage/*
docker-legacy/volumes/db/data/*
docker-legacy/volumes/redis/data/*
docker-legacy/volumes/weaviate/*
docker-legacy/volumes/qdrant/*
docker-legacy/volumes/etcd/*
docker-legacy/volumes/minio/*
docker-legacy/volumes/milvus/*
docker-legacy/volumes/chroma/*
docker-legacy/volumes/opensearch/data/*
docker-legacy/volumes/pgvectors/data/*
docker-legacy/volumes/pgvector/data/*
docker/volumes/app/storage/*
docker/volumes/certbot/*
docker/volumes/db/data/*
docker/volumes/redis/data/*
docker/volumes/weaviate/*
docker/volumes/qdrant/*
docker/volumes/etcd/*
docker/volumes/minio/*
docker/volumes/milvus/*
docker/volumes/chroma/*
docker/volumes/opensearch/data/*
docker/volumes/myscale/data/*
docker/volumes/myscale/log/*
docker/volumes/unstructured/*
docker/volumes/pgvector/data/*
docker/volumes/pgvecto_rs/data/*
docker/volumes/couchbase/*
docker/volumes/oceanbase/*
!docker/volumes/oceanbase/init.d
docker/nginx/conf.d/default.conf
docker/nginx/ssl/*
!docker/nginx/ssl/.gitkeep
docker/middleware.env
sdks/python-client/build
sdks/python-client/dist
sdks/python-client/dify_client.egg-info
pyrightconfig.json
api/.vscode
.idea/
#.tmp
.tmp/
.vscode/
tests/
.playground/
.vscode/
.vs/
/version1/
/doc/
/.vibe

View File

@@ -0,0 +1,10 @@
[MASTER]
ignore=tests,venv
disable=
C0114, # missing-module-docstring
C0115, # missing-class-docstring
C0116, # missing-function-docstring
C0303, # trailing-whitespace
W1203, # logging-fstring-interpolation
W0718, # broad-exception-caught
W0719  # broad-exception-raised

View File

@@ -0,0 +1,391 @@
# Document Extractor - Deployment Guide
This document provides a complete deployment guide for Document Extractor, including on-premises development, Docker containerized deployment, and Kubernetes production environment deployment.
## 📋 Pre-deployment preparation
### System Requirements
- Python 3.12+
- Docker (optional, for containerized deployment)
- Kubernetes (production environment deployment)
- Azure subscription and related services
### Azure Service Preparation
Ensure that you have configured the following Azure services:
- Azure Document Intelligence
- Azure AI Search
- Azure Blob Storage
- Azure OpenAI (for vector embeddings)
## 🔧 Configuration File Preparation
### 1. Environment Configuration (env.yaml)
```yaml
# Configuration file reference
config: config.yaml
# Processing settings
njobs: 8 # Number of parallel processing jobs
# Azure AI Search configuration
search_service_name: "https://your-search-service.search.windows.net"
search_admin_key: "your-search-admin-key"
# Azure OpenAI Embedding service
embedding_model_endpoint: "https://your-openai.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview"
embedding_model_key: "your-openai-key"
VECTOR_DIMENSION: 1536
FLAG_AOAI: "V3" # Azure OpenAI version
FLAG_EMBEDDING_MODEL: "AOAI" # Embedding model type: "AOAI" or "qwen3-embedding-8b"
# Document Intelligence configuration
extract_method: "di+vision-llm" # Extraction method: "di+vision-llm", "vision-llm", "di"
form_rec_resource: "https://your-di-service.cognitiveservices.azure.com/"
form_rec_key: "your-di-key"
# Document Intelligence features
di-hiRes: true # High resolution OCR
di-Formulas: true # Mathematical expression detection
di_allow_features_ext: "pdf;jpeg;jpg;png;bmp;tiff;heif" # Supported file extensions
# Vision and captioning models
captioning_model_endpoint: "https://your-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview"
captioning_model_key: "your-openai-key"
vision_max_images: 200 # Maximum images to process per document (0 = no limit)
vision_image_method: "openai" # Image processing method: "openai" or "newapi"
FIGURE_CONTENT_CLEAR: true # Clear DI recognized image content
# Blob storage for figures and DI results
FIGURE_BLOB_ACCOUNT_URL: "https://your-storage.blob.core.windows.net/container?sas-token"
DI_BLOB_ACCOUNT_URL: "https://your-storage.blob.core.windows.net/container?sas-token"
# Database configuration
DB_URI: "postgresql://user:password@host:port/database_name"
# Processing flags
header_fix: false # Enable/disable header fixing
```
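At startup the indexer loads `env.yaml` and exports every top-level key as a process environment variable, which the rest of the code then reads via `os.getenv` (see `app_config.py` in this commit). A minimal sketch of that loading step, assuming PyYAML:
```python
import os
import yaml  # PyYAML

def export_env_yaml(path: str = "env.yaml") -> None:
    """Load env.yaml and export each top-level key as an environment variable."""
    if not os.path.exists(path):
        return
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    for key, value in data.items():
        if isinstance(value, bool):
            value = str(value).lower()  # booleans become "true"/"false"
        os.environ[str(key)] = str(value)
```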
### 2. Business Configuration (config.yaml)
```yaml
# Main data configuration (array format)
- data_path: "https://your-blob-storage.blob.core.windows.net/container?sas-token"
datasource_name: "CATOnline-cn" # data source name
data_dir: "" # Optional local data directory
base_path: "/app/run_tmp" # Temporary processing directory
# File processing limits
process_file_num: 0 # 0 = process all files
process_file_last_modify: "2025-06-24 00:00:00" # Only process files modified after this date
# Chunking configuration
chunk_size: 2048 # Maximum tokens per chunk
token_overlap: 128 # Overlap between chunks
# Index schemas configuration
index_schemas:
# Chunk-level index for search
- index_name: "your-knowledge-chunk-index"
data_type: ["metadata", "document", "chunk"]
field_type: "append" # How to handle existing data
upload_batch_size: 50 # Documents per batch upload
# Metadata fields to include
fields: [
"filepath", "timestamp", "title", "publisher", "publish_date",
"document_category", "document_code", "language_code",
"x_Standard_Regulation_Id", "x_Attachment_Type",
"x_Standard_Title_CN", "x_Standard_Title_EN",
"x_Standard_Published_State", "x_Standard_Drafting_Status",
"x_Standard_Range", "x_Standard_Kind", "x_Standard_No",
"x_Standard_Code", "x_Standard_Technical_Committee",
"x_Standard_Vehicle_Type", "x_Standard_Power_Type",
"x_Standard_CCS", "x_Standard_ICS",
"x_Standard_Published_Date", "x_Standard_Effective_Date",
"x_Regulation_Status", "x_Regulation_Title_CN",
"x_Regulation_Title_EN", "x_Regulation_Document_No",
"x_Regulation_Issued_Date", "x_Classification",
"x_Work_Group", "x_Reference_Standard",
"x_Replaced_by", "x_Refer_To", "func_uuid",
"update_time", "status"
]
# Vector configuration
vector_fields:
- field: "contentVector"
append_fields: ["content"] # Fields to vectorize for content
- field: "full_metadata_vector"
append_fields: ["full_headers", "doc_metadata"] # Metadata vectorization
# Azure AI Search configuration
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath" # Field to use for updates
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
# Document-level index
- index_name: "your-knowledge-document-index"
data_type: ["document", "metadata"]
field_type: "full" # Replace entire documents
key_fields: ["filepath"] # Primary key fields
upload_batch_size: 1
fields: [
# Same field list as chunk index
"filepath", "timestamp", "title", "publisher"
# ... (same as above)
]
merge_content_fields: ["content"] # Fields to merge from chunks
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
# Regulation-specific index
- index_name: "your-regulation-index"
data_type: ["metadata"]
field_type: "full"
key_fields: ["x_Standard_Regulation_Id"] # Regulation ID as key
upload_batch_size: 50
fields: [
# Regulation-specific fields
"x_Standard_Regulation_Id", "x_Standard_Title_CN",
"x_Standard_Title_EN", "x_Regulation_Status"
# ... (regulation metadata fields)
]
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
update_by_field: "x_Standard_Regulation_Id"
# Field merging configuration
merge_fields:
- key: "doc_metadata" # Combined metadata field
fields: [
"title", "publisher", "document_category", "document_code",
"x_Standard_Title_CN", "x_Standard_Title_EN",
"x_Standard_Published_State", "x_Standard_Drafting_Status"
# ... (all metadata fields to combine)
]
# Vector field configuration
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
```
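For reference, each `merge_fields` entry collapses the listed metadata fields into a single JSON string stored under its `key` (here `doc_metadata`), which in turn is one of the fields embedded into `full_metadata_vector`. A small illustrative sketch, using hypothetical values:
```python
import json

# Hypothetical chunk record; field names follow the merge_fields entry above.
doc = {"title": "Example Standard", "publisher": "Example Org",
       "document_code": "STD-001", "x_Standard_Title_EN": ""}
merge_field = {"key": "doc_metadata",
               "fields": ["title", "publisher", "document_code", "x_Standard_Title_EN"]}

# Empty or missing values are dropped before the combined field is serialized.
doc[merge_field["key"]] = json.dumps(
    {f: doc[f] for f in merge_field["fields"] if doc.get(f) not in (None, "")},
    ensure_ascii=False,
)
print(doc["doc_metadata"])
# {"title": "Example Standard", "publisher": "Example Org", "document_code": "STD-001"}
```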
## 🚀 Deployment methods
### Method 1: Local Development Deployment
#### 1. Environment Preparation
```bash
# Clone the repository
git clone <repository-url>
cd document-extractor
# Create a virtual environment
python -m venv .venv
# Activate the virtual environment
# Linux/Mac:
source .venv/bin/activate
# Windows:
.venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt
```
#### 2. Configuration File Setup
```bash
# Copy configuration templates
cp config.yaml.example config.yaml
cp env.yaml.example env.yaml
# Edit config.yaml and env.yaml with your actual configuration
```
#### 3. Run the application
```bash
# Directly run
python main.py --config config.yaml --env env.yaml
```
### Method 2: Kubernetes Production Deployment
#### 1. Build the image
```bash
docker build . -t document-ai-indexer:latest
docker tag document-ai-indexer:latest acrsales2caiprd.azurecr.cn/document-ai-indexer:latest
docker login acrsales2caiprd.azurecr.cn -u username -p password
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:latest
```
#### 2. Prepare Configuration Files
```bash
# Create namespace (if not exists)
kubectl create namespace knowledge-agent
# Create ConfigMap
kubectl create configmap document-ai-indexer-config \
--from-file=config.yaml \
--from-file=env.yaml \
-n knowledge-agent
```
#### 3. One-time Task Deployment
```bash
# Deploy Pod
kubectl apply -f deploy/document-ai-indexer_k8s.yml -n knowledge-agent
# Check status
kubectl get pods -n knowledge-agent
kubectl logs -f document-ai-indexer -n knowledge-agent
```
#### 4. CronJob Deployment
```bash
# Deploy CronJob
kubectl apply -f deploy/document-ai-indexer-cronjob.yml -n knowledge-agent
# Check CronJob status
kubectl get cronjobs -n knowledge-agent
# Check job history
kubectl get jobs -n knowledge-agent
# Trigger execution manually
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
```
## 📊 Deployment architecture diagram
```mermaid
graph TB
subgraph "Azure Cloud Services"
ABS[Azure Blob Storage]
ADI[Azure Document Intelligence]
AAS[Azure AI Search]
AOI[Azure OpenAI]
end
subgraph "Kubernetes Cluster"
subgraph "Namespace: knowledge-agent"
CM[ConfigMap<br/>Configuration File]
CJ[CronJob<br/>Timing tasks]
POD[Pod<br/>Processing container]
end
end
subgraph "Container Registry"
ACR[Azure Container Registry<br/>acrsales2caiprd.azurecr.cn]
end
CM --> POD
CJ --> POD
ACR --> POD
POD --> ABS
POD --> ADI
POD --> AAS
POD --> AOI
style POD fill:#e1f5fe
style CM fill:#e8f5e8
style CJ fill:#fff3e0
```
## 📈 Monitoring and logging
### View logs
```bash
# Kubernetes environment
kubectl logs -f document-ai-indexer -n knowledge-agent
# Filter error logs
kubectl logs document-ai-indexer -n knowledge-agent | grep ERROR
# Check the processing progress
kubectl logs document-ai-indexer -n knowledge-agent | grep "Processing"
```
#### 4. Kubernetes Deployment Issues
**Symptoms**: Pod fails to start or keeps restarting
**Solutions**:
```bash
# Check Pod Status
kubectl describe pod document-ai-indexer -n knowledge-agent
# Check Events
kubectl get events -n knowledge-agent
# Check ConfigMap
kubectl get configmap document-ai-indexer-config -n knowledge-agent -o yaml
```
### Debugging Commands
```bash
# Check Configuration
kubectl exec -it document-ai-indexer -n knowledge-agent -- cat /app/config.yaml
# Enter Container for Debugging
kubectl exec -it document-ai-indexer -n knowledge-agent -- /bin/bash
# Manually run processing
kubectl exec -it document-ai-indexer -n knowledge-agent -- python main.py --config config.yaml --env env.yaml
```
## 🔄 Update deployment
### Application update
```bash
# Build new image
docker build -t document-ai-indexer:v0.21.0 .
# Push to repository
docker tag document-ai-indexer:v0.21.0 acrsales2caiprd.azurecr.cn/document-ai-indexer:v0.21.0
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:v0.21.0
# Update Kubernetes deployment
kubectl set image cronjob/document-ai-indexer-cronjob \
document-ai-indexer=acrsales2caiprd.azurecr.cn/document-ai-indexer:v0.21.0 \
-n knowledge-agent
```
### Configuration update
```bash
# Update ConfigMap
kubectl create configmap document-ai-indexer-config \
--from-file=config.yaml \
--from-file=env.yaml \
-n knowledge-agent \
--dry-run=client -o yaml | kubectl apply -f -
# CronJob pods read the updated ConfigMap on their next scheduled run;
# to apply the change immediately, trigger a manual run
kubectl create job --from=cronjob/document-ai-indexer-cronjob config-refresh -n knowledge-agent
```
---
*Last updated: August 2025*

View File

@@ -0,0 +1,19 @@
FROM acraiflowlab.azurecr.io/python:3.12-bullseye
RUN echo "Asia/Shanghai" > /etc/timezone
WORKDIR /app
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
COPY ./*.py /app
# RUN rm -f /app/env.yaml
# RUN rm -f /app/config.yaml
ENTRYPOINT ["python", "main.py"]

View File

@@ -0,0 +1,260 @@
# Document AI Indexer
An intelligent document processing and indexing system based on Azure AI services, supporting content extraction, processing, and vectorized indexing for multiple document formats.
## Features
### 🚀 Core Features
- **Multi-format Document Support**: PDF, DOCX, image formats, etc.
- **Intelligent Content Extraction**: OCR and structured extraction using Azure Document Intelligence (a minimal sketch follows this list)
- **Document Chunking**: Smart document chunking and vectorization
- **Azure AI Search Integration**: Automatically create search indexes and upload documents
- **Metadata Management**: Complete document metadata extraction and management
- **Hierarchy Structure Repair**: Automatically fix title hierarchy structure in Markdown documents
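A minimal, stand-alone sketch of the extraction step with the prebuilt layout model is shown below; the endpoint, key, and file name are placeholders, and the project's actual extraction path (`di_extractor.py`) adds its own processing on top.
```python
# Stand-alone sketch only; endpoint, key and file name are placeholders.
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

client = DocumentAnalysisClient(
    endpoint="https://your-di-service.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("your-di-key"),
)
with open("sample.pdf", "rb") as f:
    poller = client.begin_analyze_document("prebuilt-layout", document=f)
result = poller.result()
print(result.content[:500])  # extracted text, including heading and table structure
```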
### 🔧 Technical Features
- **Asynchronous Processing**: High-performance async processing based on asyncio
- **Containerized Deployment**: Complete Docker and Kubernetes support
- **Configuration Management**: Flexible YAML configuration file management
- **Database Support**: SQLAlchemy ORM supporting multiple databases
- **Resilient Processing**: Built-in retry mechanisms and error handling
- **Monitoring & Logging**: Complete logging and progress monitoring
## System Architecture
```mermaid
graph LR
subgraph "Data Sources"
DS[Document Sources<br/>Blob Storage/Local]
MD[Metadata<br/>Extraction]
end
subgraph "Azure AI Services"
ADI[Azure Document<br/>Intelligence]
AAS[Azure AI Search<br/>Index]
EMB[Vector<br/>Embedding]
end
subgraph "Processing Pipeline"
HF[Hierarchy<br/>Fix]
CH[Content<br/>Chunking]
end
DS --> ADI
MD --> HF
ADI --> HF
HF --> CH
CH --> EMB
EMB --> AAS
style DS fill:#e1f5fe
style ADI fill:#e8f5e8
style AAS fill:#fff3e0
style EMB fill:#f3e5f5
style HF fill:#ffebee
style CH fill:#f1f8e9
```
### Document Processing Flow
```mermaid
flowchart TD
START([Document Input]) --> DOWNLOAD[Download Document]
DOWNLOAD --> EXTRACT[AI Content Extraction]
EXTRACT --> FIX[Hierarchy Structure Fix]
FIX --> CHUNK[Content Chunking]
CHUNK --> EMBED[Vector Embedding]
EMBED --> INDEX[Search Index Upload]
INDEX --> END([Processing Complete])
style START fill:#c8e6c9
style END fill:#c8e6c9
style EXTRACT fill:#e1f5fe
style FIX fill:#fff3e0
style CHUNK fill:#f3e5f5
```
## Quick Start
### Requirements
- Python 3.12+
- Azure subscription and related services
For detailed deployment guides, please refer to: [Deployment.md](Deployment.md)
### Install Dependencies
```bash
pip install -r requirements.txt
```
### Configuration Files
The system uses two main configuration files:
- `config.yaml` - Business configuration (data source, index configuration, etc.)
- `env.yaml` - Environment variable configuration (Azure service keys, etc.)
**Quick Start Configuration:**
```yaml
# env.yaml - Essential Azure services
search_service_name: "https://your-search-service.search.windows.net"
search_admin_key: "your-search-admin-key"
form_rec_resource: "https://your-di-service.cognitiveservices.azure.com/"
form_rec_key: "your-di-key"
embedding_model_endpoint: "https://your-openai.openai.azure.com/..."
embedding_model_key: "your-openai-key"
# config.yaml - Basic data source
data_configs:
- data_path: "https://your-blob-storage.blob.core.windows.net/container?sas-token"
index_schemas:
- index_name: "your-knowledge-index"
data_type: ["metadata", "document", "chunk"]
```
📖 **Detailed configuration instructions**: see [Deployment.md - Configuration file preparation](Deployment.md#Configuration-file-preparation) for the complete configuration parameters and examples
### Run Application
```bash
# Direct execution
python main.py
# Or use predefined tasks
# (In VS Code, use Ctrl+Shift+P -> Run Task)
```
## 📚 Document Navigation
- **[Deployment Guide (Deployment.md)](Deployment.md)** - Complete deployment guide, including Docker and Kubernetes deployments
- **[Configuration instructions](Deployment.md#Configuration-file-preparation)** - Detailed configuration file description
## Project Structure
```
document-extractor/
├── main.py # Application entry point
├── app_config.py # Configuration management
├── business_layer.py # Business logic layer
├── document_task_processor.py # Document task processor
├── di_extractor.py # Document Intelligence extractor
├── azure_index_service.py # Azure Search service
├── blob_service.py # Blob storage service
├── chunk_service.py # Document chunking service
├── hierarchy_fix.py # Hierarchy structure repair
├── database.py # Database models
├── entity_models.py # Entity models
├── utils.py # Utility functions
├── config.yaml # Business configuration
├── env.yaml # Environment configuration
├── requirements.txt # Dependencies
├── Dockerfile # Docker build file
├── pyproject.toml # Project configuration
├── build-script/ # Build scripts
│ └── document-ai-indexer.sh
├── deploy/ # Deployment files
│ ├── document-ai-indexer.sh
│ ├── document-ai-indexer_k8s.yml
│ ├── document-ai-indexer_cronjob.yml
│ └── embedding-api-proxy_k8s.yml
└── doc/ # Documentation
```
## Core Components
### 1. Document Processing Pipeline
- **Document Loading**: Support loading from Azure Blob Storage or local file system
- **Content Extraction**: OCR and structured extraction using Azure Document Intelligence
- **Content Chunking**: Smart chunking algorithms maintaining semantic integrity
- **Vectorization**: Generate vector representations of document content
### 2. Index Management
- **Dynamic Index Creation**: Automatically create Azure AI Search indexes based on configuration
- **Batch Upload**: Efficient batch document upload
- **Metadata Management**: Complete document metadata indexing
- **Incremental Updates**: Support incremental document updates (batch and incremental upload are sketched below)
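A minimal sketch of the two upload patterns behind batch upload and incremental updates, using the `azure-search-documents` SDK (endpoint, key, index name, and documents are placeholders; the project's own upload logic lives in `azure_index_service.py`):
```python
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Placeholder endpoint, key, index name and documents.
client = SearchClient(
    endpoint="https://your-search-service.search.windows.net",
    index_name="your-knowledge-chunk-index",
    credential=AzureKeyCredential("your-search-admin-key"),
)
docs = [{"id": "doc-1", "content": "chunk text", "filepath": "a.pdf"}]
client.upload_documents(documents=docs)           # initial batch upload
client.merge_or_upload_documents(documents=docs)  # incremental update keyed on "id"
```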
### 3. Data Processing
- **Hierarchy Structure Repair**: Automatically fix title hierarchy in Markdown documents (see the sketch after this list)
- **Metadata Extraction**: Extract structured metadata from documents and filenames
- **Format Conversion**: Unified processing support for multiple document formats
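A simplified sketch of the hierarchy-repair idea is shown below: headings that skip levels are demoted so that each heading is at most one level deeper than its predecessor (the real `hierarchy_fix.py` may apply different rules).
```python
import re

def fix_heading_hierarchy(markdown: str) -> str:
    """Demote Markdown headings that skip levels (e.g. '#' followed directly by '###')."""
    fixed, last_level = [], 0
    for line in markdown.splitlines():
        match = re.match(r"^(#{1,6})\s+(.*)$", line)
        if match:
            level = min(len(match.group(1)), last_level + 1)  # at most one level deeper
            last_level = level
            fixed.append("#" * level + " " + match.group(2))
        else:
            fixed.append(line)
    return "\n".join(fixed)
```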
## API and Integration
### Azure Service Integration
- **Azure Document Intelligence**: Document analysis and OCR
- **Azure AI Search**: Search indexing and querying
- **Azure Blob Storage**: Document storage
- **Azure OpenAI**: Vector embedding generation
### Database Support
- PostgreSQL (recommended)
- SQLite (development and testing)
- Other SQLAlchemy-supported databases
## Monitoring and Logging
The system provides comprehensive logging capabilities:
- Processing progress monitoring
- Error logging
- Performance statistics
- Task status tracking
View logs:
```bash
# Kubernetes environment
kubectl logs -f document-ai-indexer -n knowledge-agent
# Docker environment
docker logs -f <container-id>
```
## Development
### Development Mode
```bash
# Activate virtual environment
source .venv/bin/activate # Linux/Mac
# or
.venv\Scripts\activate # Windows
# Install development dependencies
pip install -e .[dev,test]
# Run code checks
mypy .
```
### Log Analysis
```bash
# View error logs
kubectl logs document-ai-indexer -n knowledge-agent | grep ERROR
# View processing progress
kubectl logs document-ai-indexer -n knowledge-agent | grep "Processing"
```
## Version Information
- **Current Version**: 0.20.4
- **Python Version**: 3.12+
- **Main Dependencies**:
- azure-ai-documentintelligence
- azure-search-documents
- SQLAlchemy 2.0.41
- openai 1.55.3
---
*Last updated: August 2025*

View File

@@ -0,0 +1,197 @@
"""
Refactored configuration management system
Uses dependency injection and config classes instead of global variables
"""
from dataclasses import dataclass, field
from typing import Optional, Dict, Any
import os
import yaml
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from sqlalchemy import create_engine
@dataclass
class DatabaseConfig:
"""Database configuration"""
uri: str
pool_size: int = 5
max_overflow: int = 10
pool_timeout: int = 30
@dataclass
class AzureServiceConfig:
"""Azure service configuration"""
form_recognizer_endpoint: str
form_recognizer_key: str
search_service_name: str
search_admin_key: str
embedding_model_endpoint: Optional[str] = None
embedding_model_key: Optional[str] = None
captioning_model_endpoint: Optional[str] = None
captioning_model_key: Optional[str] = None
di_blob_account_url: Optional[str] = None
figure_blob_account_url: Optional[str] = None
@dataclass
class CaptionServiceConfig:
"""Caption service configuration"""
include_di_content: bool = True
description_gen_max_images: int = 0
model_endpoint: Optional[str] = None
model_key: Optional[str] = None
model: Optional[str] = None
azure_deployment: Optional[str] = None
api_version: Optional[str] = None
prompts: Optional[dict[str, Any]] = None
@dataclass
class ProcessingConfig:
"""Processing configuration"""
max_workers: int = 8
chunk_size: int = 2048
token_overlap: int = 128
min_chunk_size: int = 10
retry_count: int = 3
retry_delay: int = 15
tmp_directory: str = '/tmp'
@dataclass
class LoggingConfig:
"""Logging configuration"""
level: str = "INFO"
format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file_path: Optional[str] = None
console_output: bool = True
console_level: str = "WARNING" # Console only shows WARNING and above
console_format: str = "%(message)s" # Simplified format for console
console_progress_only: bool = True # Only show progress and key info in console
@dataclass
class ApplicationConfig:
"""Main application configuration"""
database: DatabaseConfig
azure_services: AzureServiceConfig
processing: ProcessingConfig
data_configs: list[Dict[str, Any]] = field(default_factory=list)
current_tmp_directory: str = ''
caption: Optional[CaptionServiceConfig] = None
env_data: Dict[str, Any] = field(default_factory=dict)
@classmethod
def from_env_and_config_files(cls, config_yaml_path: str, env_yaml_path: str = "env.yaml",prompt_path:str="prompt.yaml") -> 'ApplicationConfig':
"""Load configuration from environment variable file and config file."""
# 1. Load environment variable config file first
cls._load_env_yaml(env_yaml_path)
# 2. Load business config file
with open(config_yaml_path, 'r', encoding='utf-8') as f:
config_data = yaml.safe_load(f)
# 3. Load prompt config file (optional; defaults to None when the file is missing)
prompt_data: Optional[Dict[str, Any]] = None
if os.path.exists(prompt_path):
with open(prompt_path, 'r', encoding='utf-8') as f:
prompt_data = yaml.safe_load(f)
# 4. Build config object
return cls(
database=DatabaseConfig(
uri=os.getenv('DB_URI', 'sqlite:///app.db'),
pool_size=int(os.getenv('DB_POOL_SIZE', '5')),
max_overflow=int(os.getenv('DB_MAX_OVERFLOW', '10')),
pool_timeout=int(os.getenv('DB_POOL_TIMEOUT', '30'))
),
azure_services=AzureServiceConfig(
form_recognizer_endpoint=os.getenv('form_rec_resource', ''),
form_recognizer_key=os.getenv('form_rec_key', ''),
search_service_name=os.getenv('search_service_name', ''),
search_admin_key=os.getenv('search_admin_key', ''),
embedding_model_endpoint=os.getenv('embedding_model_endpoint'),
embedding_model_key=os.getenv('embedding_model_key'),
captioning_model_endpoint=os.getenv('captioning_model_endpoint'),
captioning_model_key=os.getenv('captioning_model_key'),
di_blob_account_url=os.getenv('DI_BLOB_ACCOUNT_URL',None),
figure_blob_account_url=os.getenv('FIGURE_BLOB_ACCOUNT_URL', '')
),
processing=ProcessingConfig(
max_workers=int(os.getenv('njobs', '8')),
retry_count=int(os.getenv('RETRY_COUNT', '3')),
retry_delay=int(os.getenv('RETRY_DELAY', '15')),
tmp_directory=os.getenv('TMP_DIRECTORY', '/tmp')
),
caption=CaptionServiceConfig(
description_gen_max_images= int(cls.env_data["figure_caption"]["description_gen_max_images"]),
include_di_content = cls.env_data["figure_caption"]["include_di_content"],
model_endpoint= cls.env_data["figure_caption"]["model_endpoint"],
model_key= cls.env_data["figure_caption"]["model_key"],
model= cls.env_data["figure_caption"]["model"],
azure_deployment= cls.env_data["figure_caption"]["azure_deployment"],
api_version=cls.env_data["figure_caption"]["api_version"],
prompts=prompt_data["caption"] if prompt_data and "caption" in prompt_data else None
),
data_configs=config_data if isinstance(config_data, list) else [config_data]
)
@classmethod
def _load_env_yaml(cls, env_yaml_path: str):
"""Load the environment variable YAML file and export its values as environment variables."""
if not os.path.exists(env_yaml_path):
return
with open(env_yaml_path, 'r', encoding='utf-8') as f:
cls.env_data = yaml.safe_load(f)
# Set environment variables to system environment
if cls.env_data:
for key, value in cls.env_data.items():
if isinstance(value, bool):
value = str(value).lower()
os.environ[str(key)] = str(value)
def validate(self) -> None:
"""Validate configuration."""
if not self.database.uri:
raise ValueError("Database URI cannot be empty")
if not self.azure_services.form_recognizer_endpoint:
raise ValueError("Form Recognizer endpoint cannot be empty")
if not self.azure_services.form_recognizer_key:
raise ValueError("Form Recognizer key cannot be empty")
if self.processing.max_workers < 1:
raise ValueError("Number of worker threads must be greater than 0")
class ServiceFactory:
"""Service factory class, responsible for creating and managing various service instances."""
def __init__(self, config: ApplicationConfig):
self.config = config
self._form_recognizer_client = None
def get_form_recognizer_client(self) -> DocumentAnalysisClient:
"""Get Form Recognizer client (singleton)."""
if self._form_recognizer_client is None:
self._form_recognizer_client = DocumentAnalysisClient(
endpoint=self.config.azure_services.form_recognizer_endpoint,
credential=AzureKeyCredential(self.config.azure_services.form_recognizer_key)
)
return self._form_recognizer_client
def get_database_engine(self):
"""Get database engine."""
return create_engine(
self.config.database.uri,
pool_size=self.config.database.pool_size,
max_overflow=self.config.database.max_overflow,
pool_timeout=self.config.database.pool_timeout
)

View File

@@ -0,0 +1,751 @@
"""
Azure AI index search service
Provides operations for Azure AI Search Index, including creating indexes, uploading documents, checking if an index exists, etc.
"""
import base64
import json
import logging
import os
import time
import uuid
from dataclasses import fields
from typing import List, Dict, Any, Optional
from tqdm import tqdm
import uuid6
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.search.documents import SearchClient, IndexDocumentsBatch
from azure.search.documents._generated.models import IndexingResult
from azure.search.documents.indexes.models import SearchIndex, SimpleField # type: ignore
from azure.search.documents.indexes import SearchIndexClient
from resilient_http_pool import get_cloud_api_client
from entity_models import Document
from utils import asdict_with_dynamic, write_log, write_grouped_index_files
from di_extractor import retry_get_embedding
SUPPORTED_LANGUAGE_CODES = {
"ar": "Arabic",
"hy": "Armenian",
"eu": "Basque",
"bg": "Bulgarian",
"ca": "Catalan",
"zh-Hans": "Chinese Simplified",
"zh-Hant": "Chinese Traditional",
"cs": "Czech",
"da": "Danish",
"nl": "Dutch",
"en": "English",
"fi": "Finnish",
"fr": "French",
"gl": "Galician",
"de": "German",
"el": "Greek",
"hi": "Hindi",
"hu": "Hungarian",
"id": "Indonesian (Bahasa)",
"ga": "Irish",
"it": "Italian",
"ja": "Japanese",
"ko": "Korean",
"lv": "Latvian",
"no": "Norwegian",
"fa": "Persian",
"pl": "Polish",
"pt-Br": "Portuguese (Brazil)",
"pt-Pt": "Portuguese (Portugal)",
"ro": "Romanian",
"ru": "Russian",
"es": "Spanish",
"sv": "Swedish",
"th": "Thai",
"tr": "Turkish"
}
def index_init(data_config: dict[str, Any], search_admin_key: str, search_service_name: str) -> None:
index_schemas: list[dict[str, Any]] = (data_config.get("index_schemas") or []) if data_config else []
admin_key = search_admin_key if search_admin_key else None
service_name = search_service_name
for schema in index_schemas:
language = data_config.get("language", None)
if language and language not in SUPPORTED_LANGUAGE_CODES:
raise Exception(f"ERROR: Ingestion does not support {language} documents. "
f"Please use one of {SUPPORTED_LANGUAGE_CODES}. "
f"The language is set as a two-letter code, e.g. 'en' for English. "
f"If you do not want to set a language, remove this config entry or set it to None.")
# Basic index structure initialization
create_or_update_search_index(service_name=service_name, index_name=schema["index_name"],
semantic_config_name=schema["semantic_config_name"],
vector_config_name=schema["vector_config_name"],
language=language, admin_key=admin_key,
meta_fields=schema["fields"])
def create_or_update_search_index(service_name: str|None, index_name: str|None, semantic_config_name: str = "default", vector_config_name: str = "", language:str="", admin_key: str = "", meta_fields: list[str]|None = None):
url = f"{service_name}/indexes/{index_name}?api-version=2024-11-01-Preview"
headers: dict[str, str] = {"Content-Type": "application/json", "api-key": admin_key}
body: dict[str, Any] = {
"fields": [
{"name":"session_id","type":"Edm.String", "searchable": True, "sortable": False, "facetable": False, "filterable": True},
{"name": "id","type": "Edm.String","searchable": True,"key": True,},
{"name": "content","type": "Edm.String","searchable": True,"sortable": False,"facetable": False,"filterable": False,"analyzer": f"{language}.lucene" if language else None,},
{"name": "title","type": "Edm.String","searchable": True,"sortable": True,"facetable": False,"filterable": False,"analyzer": f"{language}.lucene" if language else None,},
{"name": "filepath","type": "Edm.String", "searchable": True,"sortable": True,"facetable": False,"filterable": True},
{"name": "url","type": "Edm.String","searchable": True,"sortable": True,"filterable": True},
{ "name": "metadata", "type": "Edm.String", "searchable": True, "filterable": True },
{ "name": "image_mapping", "type": "Edm.String", "searchable": False, "sortable": False, "facetable": False, "filterable": True },
{ "name": "doc_metadata", "type": "Edm.String", "searchable": True, "sortable": False, "facetable": False, "filterable": False },
{ "name": "document_schema", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True },
{ "name": "main_title", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True },
{
"name": "sub_title",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "publisher",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "document_code",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "document_category",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "main_title_sec_language",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "sub_title_sec_language",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "primary_language",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "secondary_language",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "full_headers",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "h1",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "h2",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "h3",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "h4",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "h5",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "h6",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "timestamp",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": True,
"filterable": True
},
{
"name": "publish_date",
"type": "Edm.String",
"searchable": True,
"sortable": True,
"facetable": False,
"filterable": True
},
{
"name": "description",
"type": "Edm.String",
"searchable": True,
"sortable": False,
"facetable": False,
"filterable": True
}
],
"suggesters": [],
"scoringProfiles": [],
"semantic": {
"configurations": [
{
"name": semantic_config_name,
"prioritizedFields": {
"titleField": {"fieldName": "title"},
"prioritizedContentFields": [{"fieldName": "content"}],
"prioritizedKeywordsFields": [{"fieldName": "full_headers"}, {"fieldName": "doc_metadata"}],
},
}
]
},
}
if vector_config_name:
body["fields"].append({
"name": "contentVector",
"type": "Collection(Edm.Single)",
"searchable": True,
"retrievable": True,
"stored": True,
"dimensions": int(os.getenv("VECTOR_DIMENSION", "1536")),
"vectorSearchProfile": vector_config_name
})
body["fields"].append({
"name": "full_metadata_vector",
"type": "Collection(Edm.Single)",
"searchable": True,
"retrievable": True,
"stored": True,
"dimensions": int(os.getenv("VECTOR_DIMENSION", "1536")),
"vectorSearchProfile": vector_config_name
})
body["vectorSearch"] = {
"algorithms": [
{
"name": "my-hnsw-config-1",
"kind": "hnsw",
"hnswParameters": {
"m": 4,
"efConstruction": 400,
"efSearch": 500,
"metric": "cosine"
}
}
],
"profiles": [
{
"name": "vectorSearchProfile",
"algorithm": "my-hnsw-config-1",
# "vectorizer": "azure_vectorizer"
}
],
}
if os.getenv("AOAI_EMBEDDING_ENDPOINT"):
body["vectorSearch"]["profiles"][0]["vectorizer"] = "azure_vectorizer"
body["vectorSearch"]["vectorizers"] = [
{
"name": "azure_vectorizer",
"kind": "azureOpenAI",
"azureOpenAIParameters": {
"resourceUri": os.getenv("AOAI_EMBEDDING_ENDPOINT"),
"deploymentId": os.getenv("AOAI_EMBEDDING_DEPLOYMENT"),
"apiKey": os.getenv("AOAI_EMBEDDING_KEY"),
"modelName": os.getenv("AOAI_EMBEDDING_MODEL")
}
}
]
for field in meta_fields if meta_fields is not None else []:
if not any(str(item["name"]) == field for item in body['fields']):
sortable:bool = True
facetable:bool = True
filterable:bool = True
if field in ["x_Standard_Range"]:
sortable = False
facetable = False
filterable = False
body["fields"].append({
"name": field,
"type": "Edm.String",
"searchable": True,
"sortable": sortable,
"facetable": facetable,
"filterable": filterable
})
client = get_cloud_api_client()
response = client.put(url, json=body, headers=headers)
if response.status_code == 201:
print(f"Created search index {index_name}")
elif response.status_code == 204:
print(f"Updated existing search index {index_name}")
else:
raise Exception(f"Failed to create search index. Status Code:{response.status_code}, Error: {response.text}")
return True
def upload_documents_to_index(service_name:str, index_name:str, docs, upload_batch_size:int=50, admin_key:str|None=None):
if admin_key is None:
raise ValueError("credential and admin_key cannot be None")
to_upload_dicts = []
for d in docs:
# Get dynamically added attributes
if type(d) is not dict:
d = asdict_with_dynamic(d)
# add id to documents
d.update({"@search.action": "upload", "id": d["id"]})
if "contentVector" in d and d["contentVector"] is None:
del d["contentVector"]
if "full_metadata_vector" in d and d["full_metadata_vector"] is None:
del d["full_metadata_vector"]
to_upload_dicts.append(d)
# endpoint = "https://{}.search.windows.net/".format(service_name)
endpoint: str = service_name
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(admin_key))
# Upload the documents in batches of upload_batch_size
for i in tqdm(range(0, len(to_upload_dicts), upload_batch_size), desc="Indexing Chunks..."):
batch = to_upload_dicts[i: i + upload_batch_size]
results = search_client.upload_documents(documents=batch)
num_failures = 0
errors = set()
for result in results:
if not result.succeeded:
print(f"Indexing Failed for {result.key} with ERROR: {result.error_message}")
num_failures += 1
errors.add(result.error_message)
if num_failures > 0:
raise Exception(f"INDEXING FAILED for {num_failures} documents. Please recreate the index."
f"To Debug: PLEASE CHECK chunk_size and upload_batch_size. \n Error Messages: {list(errors)}")
def upload_merge_index(index_config: Any, docs:list[dict[str,Any]],merge_fields:list[dict[str,Any]]|None=None,current_tmp_directory:str='') -> bool:
"""
Merge chunk information and upload to AI search index
"""
index_name: str = index_config["index_name"]
embedding_endpoint: str = os.environ.get("embedding_model_endpoint", '')
embedding_model_key: str = os.environ.get("embedding_model_key", '') #config.embedding_model_key
fields_meta: Any = index_config["fields"] or []
merge_content_fields: Any = index_config[ "merge_content_fields"] if "merge_content_fields" in index_config.keys() else []
key_fields: Any = index_config["key_fields"] if "key_fields" in index_config.keys() else []
all_fields = list(dict.fromkeys(["id"] + fields_meta + merge_content_fields + key_fields + [f.name for f in fields(Document)] ))
upload_batch_size = index_config["upload_batch_size"] if "upload_batch_size" in index_config.keys() else 1
original_to_upload_dicts: list[Any] = []
for d in docs:
# Get dynamically added attributes
if type(d) is not dict:
d = asdict_with_dynamic(d)
for key in list(d.keys()):
if key not in all_fields:
del d[key]
if ("contentVector" in d) and (d["contentVector"] is None or "contentVector" not in all_fields):
del d["contentVector"]
if ("full_metadata_vector" in d) and (
d["full_metadata_vector"] is None or "full_metadata_vector" not in all_fields):
del d["full_metadata_vector"]
# Default id primary key assignment, key_fields content merge and base64
id_value = d["id"] if "id" in d else ""
if "key_fields" in index_config.keys():
id_value = '_'.join(str(d[k]) for k in key_fields if k in d)
if id_value is None or id_value == "":
continue
# Select certain fields, concatenate to another field
for merge_field in merge_fields or []:
d[merge_field["key"]] = json.dumps({field: d[field] for field in merge_field["fields"] if field in d and (value := d[field]) is not None and value != ""}, ensure_ascii=False)
d["id"] = base64.urlsafe_b64encode(id_value.encode('utf-8')).decode('utf-8')
# add id to documents
d.update({"@search.action": "upload", "id": d["id"]})
d.update({"session_id":str(uuid6.uuid7())})
original_to_upload_dicts.append(d)
to_upload_dicts = original_to_upload_dicts
current_object_key = to_upload_dicts[0]["filepath"] if len(to_upload_dicts) > 0 and "filepath" in to_upload_dicts[0] else ''
# Calculate vector data based on configuration fields
for vector_config in index_config["vector_fields"] if "vector_fields" in index_config.keys() else []:
for i in tqdm(range(0, len(to_upload_dicts), 1), desc=f"{current_object_key} vector {vector_config["field"]} embedding..."):
d = to_upload_dicts[i: i + 1][0]
vector_dict = {}
for field in vector_config["append_fields"]:
if isinstance(d[field], dict):
vector_dict |= d[field]
elif isinstance(d[field], str):
vector_dict[field] = d[field]
vector_str = str(vector_dict) if vector_dict else ""
embedding = retry_get_embedding(text=vector_str, embedding_model_key=embedding_model_key, embedding_endpoint=embedding_endpoint)
if embedding:
d[vector_config["field"]] = embedding
# Group to_upload_dicts by the filepath field and write each group to the corresponding JSON file under the .index directory
write_grouped_index_files(to_upload_dicts, index_name=index_name, base_directory=current_tmp_directory)
results: list[bool] = []
# Upload the documents in batches of upload_batch_size
for i in tqdm(range(0, len(to_upload_dicts), upload_batch_size), desc=f"Indexing {index_name} Chunks..."):
batch = to_upload_dicts[i: i + upload_batch_size]
results.append(upload_and_ensure(index_name=index_name, docs=batch, key_field="session_id"))
return all(results)
def merge_dicts(data_list, key_fields, merge_fields, separator='\n'):
"""
Merge dictionary list based on specified fields
Arguments:
data_list -- Original dictionary list
key_fields -- Fields used for deduplication (e.g., ['title', 'filepath'])
merge_fields -- Fields to be merged (e.g., ['content'])
separator -- Separator used for merging fields (default is newline)
Returns:
New dictionary list after merging
"""
merged_dict = {}
for item in data_list:
# Create a unique key - a tuple of all key fields
key = tuple(item.get(field) for field in key_fields)
if key in merged_dict:
# Merge fields
existing = merged_dict[key]
for field in merge_fields:
# Merge new value with old value
existing[field] = separator.join([
existing.get(field, ''),
item.get(field, '')
]).strip(separator)
else:
# Create new record
merged_dict[key] = {
**item, # Copy original fields
# Pre-initialize merged fields
**{field: item.get(field, '') for field in merge_fields}
}
return list(merged_dict.values())
def validate_index(service_name: str, index_name: str, admin_key: str | None = None):
api_version = "2024-11-01-Preview"
headers = {"Content-Type": "application/json", "api-key": admin_key}
params = {"api-version": api_version}
url = f"{service_name}/indexes/{index_name}/stats"
client = get_cloud_api_client()
max_retries = 10
for retry_count in range(max_retries + 1):
response = client.get(url, headers=headers, params=params)
if response.status_code == 200:
response_data = response.json()
num_chunks = response_data['documentCount']
if num_chunks == 0 and retry_count < max_retries:
print("Index is empty. Waiting 20 seconds to check again...")
time.sleep(20)
elif num_chunks == 0 and retry_count == max_retries:
print("Index is empty. Please investigate and re-index.")
else:
print(f"The index contains {num_chunks} chunks.")
average_chunk_size = response_data['storageSize'] / num_chunks
print(f"The average chunk size of the index is {average_chunk_size} bytes.")
break
else:
if response.status_code == 404:
print("The index does not seem to exist. Please make sure the index was created correctly, and that you are using the correct service and index names")
elif response.status_code == 403:
print("Authentication Failure: Make sure you are using the correct key")
else:
print(f"Request failed. Please investigate. Status code: {response.status_code}")
break
def index_exists(index_name: str) -> bool:
try:
search_service_name = os.getenv("search_service_name", "")
search_admin_key = os.getenv("search_admin_key", "")
endpoint = search_service_name
credential = AzureKeyCredential(search_admin_key)
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
index_client.get_index(index_name)
return True
except Exception as e:
write_log(f"Index '{index_name}' does not exist: {e}")
return False
def create_index(index_name:str, index_fields: list[dict[str, Any]], suggesters: Optional[list[dict[str, Any]]] = None) -> None:
search_service_name = os.getenv("search_service_name", "")
search_admin_key = os.getenv("search_admin_key", "")
endpoint = search_service_name
credential = AzureKeyCredential(search_admin_key)
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
if index_exists(index_name=index_name):
write_log(f"Index '{index_name}' already exists.")
return
search_fields = [SimpleField(**field) for field in index_fields]
index = SearchIndex(name=index_name, fields=search_fields, suggesters=suggesters or [])
index_client.create_index(index)
write_log(f"Index '{index_name}' created.")
def upload_documents(index_name:str, documents: List[Dict[str, Any]]) -> None:
search_service_name = os.getenv("search_service_name", "")
search_admin_key = os.getenv("search_admin_key", "")
endpoint = search_service_name
credential = AzureKeyCredential(search_admin_key)
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
batch = IndexDocumentsBatch()
batch.add_merge_or_upload_actions(documents) #type: ignore
results = search_client.index_documents(batch)
write_log(f"Uploaded {len(documents)} documents to index '{index_name}'. Result: {results}")
def delete_index(index_name:str) -> None:
search_service_name = os.getenv("search_service_name", "")
search_admin_key = os.getenv("search_admin_key", "")
endpoint = search_service_name
credential = AzureKeyCredential(search_admin_key)
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
if index_exists(index_name=index_name):
index_client.delete_index(index_name)
write_log(f"Index '{index_name}' deleted.")
else:
write_log(f"Index '{index_name}' does not exist.")
def search(index_name, search_text: str, **kwargs) -> Any:
endpoint = os.getenv("search_service_name","")
credential = AzureKeyCredential(os.getenv("search_admin_key",""))
index_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
return index_client.search(search_text, **kwargs)
def documents_with_field_value_exist(index_name:str, field_name: str, value: Any) -> bool:
"""
Check if there are documents in the index where a specific field equals the given value.
"""
endpoint = os.getenv("search_service_name", "")
credential = AzureKeyCredential(os.getenv("search_admin_key", ""))
index_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
filter_query = f"{field_name} eq '{value}'" if isinstance(value, str) else f"{field_name} eq {value}"
results: Any = index_client.search("*", filter=filter_query, top=1)
for _ in results:
return True
return False
def delete_documents_by_field(index_name:str,field_name: str, value: Any) -> bool:
"""
Delete all documents where the specified field equals the given value.
"""
search_service_name = os.getenv("search_service_name", "")
search_admin_key = os.getenv("search_admin_key", "")
search_client = SearchClient(endpoint=search_service_name, index_name=index_name, credential=AzureKeyCredential(search_admin_key))
# Step 1: Retrieve documents that meet the criteria (here looking for documents with status field as "inactive")
query = f"{field_name} eq '{value}'"
results: Any = search_client.search(select=["id"], filter=query)
if not results:
return True
# Step 2: Extract the primary keys (id) of the documents to be deleted
keys_to_delete = [doc['id'] for doc in results]
# Step 3: Delete the documents that meet the criteria
if keys_to_delete:
# Use batch delete API to remove documents
delete_results:list[IndexingResult] = search_client.delete_documents(documents=[{'id': key} for key in keys_to_delete])#type: ignore
logging.getLogger().info(f"Deleted documents with keys: {keys_to_delete}")
return all(result.succeeded for result in delete_results)
else:
return False
def query_by_field( index_name: str, field_name: str, value: Any, top: int = 99999) -> list[dict[Any,Any]]:
"""
Query documents in the index where a specific field equals the given value.
:param field_name: The field to filter on.
:param value: The value to match.
:param top: Maximum number of results to return.
:return: List of matching documents.
"""
search_service_name = os.getenv("search_service_name", "")
search_admin_key = os.getenv("search_admin_key", "")
search_client = SearchClient(endpoint = search_service_name, index_name=index_name,credential=AzureKeyCredential(search_admin_key))
filter_query = f"{field_name} eq '{value}'" if isinstance(value, str) else f"{field_name} eq {value}"
results:Any = search_client.search("*", filter=filter_query, top=top)
return [doc for doc in results]
def upload_and_ensure(index_name:str, docs: list[dict[Any, Any]], key_field="session_id", delay_seconds:int=5, max_retries:int=5) -> bool:
search_service_name = os.getenv("search_service_name", "")
search_admin_key = os.getenv("search_admin_key", "")
endpoint = search_service_name
api_key = search_admin_key
client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
# Step 1: Batch submit MergeOrUpload
batch = IndexDocumentsBatch()
batch.add_merge_or_upload_actions(docs) # type: ignore
results = client.index_documents(batch)
# Step 2: Check status of each document
failed = [r.key for r in results if not r.succeeded]
if failed:
raise Exception(f"Initial submission failed for documents: {failed}")
return True
# # Step 3: Delay waiting for background index
# time.sleep(delay_seconds)
# # Step 4: Verify and retry
# keys: list[str] = [doc[key_field] for doc in docs]
# return verify_and_retry(client, keys, docs, key_field, delay_seconds, max_retries)
def verify_and_retry(client: SearchClient, keys: list[str], docs, key_field, delay_seconds, max_retries) -> bool:
attempt = 0
session_id = str(uuid.uuid4())
while attempt <= max_retries:
missing = find_missing(client, keys, session_id)
if not missing:
return True
attempt += 1
print(f"Retry {attempt}, missing: {missing}")
to_retry = [doc for doc in docs if doc[key_field] in missing]
batch = IndexDocumentsBatch()
batch.add_merge_or_upload_actions(to_retry)  # type: ignore
client.index_documents(batch)
time.sleep(delay_seconds)
# Final check
missing = find_missing(client, keys, session_id)
if missing:
raise Exception(f"Index verification failed, the following documents were not indexed: {missing}")
return True
def find_missing(client: SearchClient, keys: list[str], session_id: str) -> list[str]:
missing: list[str] = []
for key in keys:
try:
results = client.search(filter=f"session_id eq '{key}'", top=1)
if not any(results):
missing.append(key)
except HttpResponseError:
missing.append(key)
return missing

View File

@@ -0,0 +1,150 @@
import json
import os
import time
from datetime import datetime
from typing import Any
from azure.storage.blob import ContainerClient, BlobProperties
from utils import custom_serializer, keep_latest
def check_files(blob_url:str, doc_time:datetime|None) -> list[dict[str, Any]]:
# If blob, get blob properties; if local file, get system modification time
container_client = ContainerClient.from_container_url(blob_url)
updated_files: list[dict[str, Any]] = []
blobs: list[BlobProperties] = list(container_client.list_blobs())
# Sort by modification time ascending
blobs_by_last_modified = sorted(blobs, key=lambda b: b.last_modified) #datetime.fromisoformat()
for blob in blobs_by_last_modified:
if blob.name.endswith('.doc_metadata.json'):
continue
else:
last_modified: datetime = blob.last_modified.replace(tzinfo=None) #datetime.fromisoformat(blob.last_modified)
name = blob.name
if doc_time is None or last_modified > doc_time:
updated_files.append({"name": name, "doc_upper_time": last_modified})
return updated_files
def load_metadata(blob_url:str, directory_path: str, data_directory: str) -> list[Any]:
"""Download .doc_metadata.json file from blob_url and return the parsed metadata list."""
downloadToLocalFolder(blob_url, data_directory, directory_path, ".doc_metadata.json")
if not os.path.exists(f"{directory_path}/.doc_metadata.json"):
return []
#raise FileNotFoundError(f"Metadata file not found in {directory_path}")
with open(f"{directory_path}/.doc_metadata.json", "rb") as doc_metadata_file:
doc_metadata = json.load(doc_metadata_file)
sorted_list = sorted(doc_metadata["doc_metadata"], key=lambda x: x["timestamp"], reverse=True)
# Replace '-' with '_' in metadata keys
for dic in sorted_list:
for k in list(dic.keys()):
if "-" in k:
dic[k.replace("-", "_")] = dic.pop(k)
return sorted_list
def check_meta(blob_url:str, meta_upper_time:Any, current_tmp_directory: str, data_dir: str) -> list[dict[Any,Any]]:
"""Check .doc_metadata.json records under blob_url and compare with processed meta_upper_time, return updated metadata list."""
sorted_list = load_metadata(blob_url, current_tmp_directory, data_directory=data_dir)
filter_list = filter(lambda x: meta_upper_time is None or datetime.fromisoformat(x["timestamp"]).replace(tzinfo=None) > meta_upper_time, sorted_list)
updated_metas: list[dict[str,Any]] = []
for item in filter_list:
# Parse string to datetime object
dt = datetime.fromisoformat(item["timestamp"]).replace(tzinfo=None)
# Keep the latest meta_upper_time data
updated_metas.append({"name": item["filepath"], "meta_upper_time": dt})
return keep_latest(updated_metas, "name", "meta_upper_time")
def downloadToLocalFolder(blob_url:str, data_dir:str, local_folder: str, name_starts_with:str) -> list[str]:
"""Check if .doc_metadata.json exists in the directory, download if not."""
# If local_folder is empty, use temp directory
if os.path.exists(f"{local_folder}/{name_starts_with}"):
return []
path = data_dir
if path and not path.endswith('/'):
path = path + '/'
container_client = ContainerClient.from_container_url(blob_url)
last_destination_folder = None
destination_paths: list[str] = []
for blob in container_client.list_blobs(name_starts_with=name_starts_with):
relative_path = blob.name[len(path):]
destination_path = os.path.join(local_folder, relative_path)
destination_folder = os.path.dirname(destination_path)
if destination_folder != last_destination_folder:
os.makedirs(destination_folder, exist_ok=True)
last_destination_folder = destination_folder
blob_client = container_client.get_blob_client(blob.name)
with open(file=destination_path, mode='wb') as local_file:
stream = blob_client.download_blob()
local_file.write(stream.readall())
destination_paths.append(destination_path)
return destination_paths
def blob_upload_content(blob_sas_url: str, file_name: str, content: str, retry_count: int = 3) -> str:
for i in range(retry_count):
try:
# Upload file to Azure blob
container_client: ContainerClient = ContainerClient.from_container_url(blob_sas_url)
container_client.upload_blob(name=file_name, data=content, overwrite=True) # type: ignore
return f"{blob_sas_url}/{file_name}"
except Exception as e:
print(f"Error uploading content for {file_name} with error={e}, retrying, currently at {i + 1} retry, {retry_count - (i + 1)} retries left")
time.sleep(5)
raise Exception(f"Error uploading content for: {file_name}")
def blob_upload_object(blob_sas_url: str, file_name: str, obj: Any, retry_count: int = 3) -> str:
if not blob_sas_url:
return ''
content = json.dumps(obj, default=custom_serializer,ensure_ascii=False, indent=4)
for i in range(retry_count):
try:
# Upload file to Azure blob
container_client: ContainerClient = ContainerClient.from_container_url(blob_sas_url)
container_client.upload_blob(name=file_name, data=content, overwrite=True) # type: ignore
return f"{blob_sas_url}/{file_name}"
except Exception as e:
print(f"Error uploading content for {file_name} with error={e}, retrying, currently at {i + 1} retry, {retry_count - (i + 1)} retries left")
time.sleep(5)
raise Exception(f"Error uploading content for: {file_name}")
def blob_exists(blob_sas_url: str, file_name: str) -> bool:
"""Check if a blob exists in the container."""
try:
container_client = ContainerClient.from_container_url(blob_sas_url)
blob_client = container_client.get_blob_client(file_name)
return blob_client.exists()
except Exception as e:
print(f"Error checking existence of blob {file_name}: {e}")
return False
def load_content(blob_sas_url: str, file_name: str, retry_count: int = 3) -> str:
"""Download the file from blob storage."""
for i in range(retry_count):
try:
container_client = ContainerClient.from_container_url(blob_sas_url)
blob_client = container_client.get_blob_client(file_name)
# Download blob content as bytes and decode to string
blob_data = blob_client.download_blob().readall() # type: ignore
# Try to decode as UTF-8 first, fallback to other encodings if needed
try:
return blob_data.decode('utf-8')
except UnicodeDecodeError:
# Try other common encodings
for encoding in ['gbk', 'latin-1', 'cp1252']:
try:
return blob_data.decode(encoding)
except UnicodeDecodeError:
continue
# If all encodings fail, return with error replacement
return blob_data.decode('utf-8', errors='replace')
except Exception as e:
print(f"Error loading content from {file_name} with error={e}, retrying, currently at {i + 1} retry, {retry_count - (i + 1)} retries left")
if i < retry_count - 1:
time.sleep(5)
# If all retries fail, raise exception
raise Exception(f"Error loading content from blob: {file_name} after {retry_count} retries")

View File

@@ -0,0 +1,623 @@
""" business_layer.py
This module contains the business logic for document processing."""
import os
from abc import ABC, abstractmethod
from typing import List, Optional, Dict, Any
from dataclasses import dataclass
import traceback
import datetime
from collections import Counter
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.core.pipeline.policies import RetryPolicy
from app_config import ApplicationConfig, ServiceFactory
from chunk_service import chunk_di_doc
from entity_models import Document, ChunkingResult,DiResult
from database import DatabaseInterface, IndexObject, IndexObjectStatus,LegacyDatabaseAdapter
from di_extractor import di_extract
from blob_service import blob_exists, blob_upload_content, blob_upload_object, downloadToLocalFolder, load_content
from utils import replace_urls_in_content, write_content,write_document,asdict_with_dynamic
from azure_index_service import upload_merge_index, delete_documents_by_field,query_by_field
from vllm_extractor import process_document_figures
class SingletonFormRecognizerClient:
instance = None
def __new__(cls, *args, **kwargs):
if not cls.instance:
extract_method = os.environ.get("extract_method", "default")
if extract_method == "vision-llm":
cls.instance = object() # dummy object
else:
url = os.getenv("form_rec_resource")
key = os.getenv("form_rec_key")
if url and key:
print("SingletonFormRecognizerClient: Creating instance of Form recognizer per process")
retry = RetryPolicy(total_retries=5,connect_retries=3,read_retries=3,backoff_factor=0.8,retry_backoff_max=60)
cls.instance = DocumentIntelligenceClient(endpoint=url, credential=AzureKeyCredential(key), retry_policy=retry, connection_timeout=1200,read_timeout=1200)
else:
print("SingletonFormRecognizerClient: Skipping since credentials not provided. Assuming NO form recognizer extensions(like .pdf) in directory")
cls.instance = object() # dummy object
return cls.instance
def __getstate__(self)->tuple[Any,Any]:
# The DI client is not picklable; serialize only the endpoint and key (read from env, as in __new__)
return os.getenv("form_rec_resource"), os.getenv("form_rec_key")
def __setstate__(self, state):
url, key = state
retry = RetryPolicy(total_retries=5,connect_retries=3,read_retries=3,backoff_factor=0.8,retry_backoff_max=60)
self.instance = DocumentIntelligenceClient(endpoint=url, credential=AzureKeyCredential(key), retry_policy=retry, connection_timeout=1200,read_timeout=1200)
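# Note (illustrative): repeated construction returns the same per-process instance, e.g.
#   client = SingletonFormRecognizerClient()
#   client is SingletonFormRecognizerClient()  # -> True
# When extract_method is "vision-llm" or the DI credentials are missing, the shared
# instance is a plain placeholder object rather than a DocumentIntelligenceClient.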
@dataclass
class ProcessingContext:
"""Processing Context"""
object_key: str
data_config: Dict[str, Any]
metadata: Dict[str, Any]
retry_count: int = 0
error_message: Optional[str] = None
current_tmp_directory: str = ""
datasource_name: str = ""
config: ApplicationConfig | None = None
@dataclass
class ProcessingResult:
"""Processing Result"""
status: IndexObjectStatus
object_key: str
message: str
processing_time: float
chunks_count: int = 0
error: Optional[Exception] = None
# Keep only the DocumentRepository interface, other services directly use the specific implementation
class DocumentRepository(ABC):
"""Document Repository Interface"""
@abstractmethod
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
"""Get index object"""
pass
@abstractmethod
def save_index_object(self, index_object: IndexObject) -> None:
"""Save index object"""
pass
@abstractmethod
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus, message: str = None) -> None:
"""Update processing status"""
pass
# Application service layer
class DocumentProcessingOrchestrator:
"""Document Processing Orchestrator (Application Service Layer)"""
def __init__(self,
extraction_service: 'DocumentIntelligenceExtractionService',
chunking_service: 'DefaultDocumentChunkingService',
indexing_service: 'AzureSearchIndexingService',
metadata_service: 'BlobMetadataService',
repository: DocumentRepository):
self.extraction_service = extraction_service
self.chunking_service = chunking_service
self.indexing_service = indexing_service
self.metadata_service = metadata_service
self.repository = repository
def process_document(self, context: ProcessingContext) -> ProcessingResult:
"""Main process for handling a single document"""
start_time = datetime.datetime.now()
# 1. Get or create index object
index_object = self._get_or_create_index_object(context)
# if not index_object:
# raise ValueError(f"Failed to create or retrieve index object for {context.object_key}")
try:
# 2. Check retry count
# If the object changed since the last failure (document or metadata modification time differs), reset the retry count so it can be processed again
if index_object.last_fail_doc_modifed_time != context.metadata.get("doc_modified_time") or index_object.last_fail_metadata_modifed_time != context.metadata.get("metadata_modified_time"):
index_object.try_count = 0
if index_object.status in ["processing", "failed"]:
# Check if the maximum retry count has been reached
if index_object.try_count >= 3:
return ProcessingResult(status=IndexObjectStatus.FAILED, object_key=context.object_key, message=f"Object has been retried {index_object.try_count} times, skipping processing", processing_time=0)
# Increase the retry count and save immediately
index_object.try_count += 1
# Immediately save the retry count update
self.repository.save_index_object(index_object)
# 3. Update status to processing
self.repository.update_processing_status(context.object_key,context.datasource_name, IndexObjectStatus.PROCESSING)
# 4. Check if processing is needed (metadata and document modification times)
meta_update_flag = self._should_process_metadata(index_object, context)
doc_update_flag = self._should_process_document(index_object, context)
chunks_count = 0
# 5. Process metadata index (if update is needed)
if meta_update_flag:
self._process_metadata_indexes(context)
# 6. Process document and chunk indexes (Important: Only process when meta_update_flag OR doc_update_flag=True)
if meta_update_flag or doc_update_flag:
chunks_count = self._process_document_and_chunks(context, doc_update_flag)
# 7. Update the modification time of the index object
if meta_update_flag:
index_object.metadata_modifed_time = context.metadata.get("metadata_modified_time")
if doc_update_flag:
index_object.doc_modifed_time = context.metadata.get("doc_modified_time")
index_object.status = IndexObjectStatus.SUCCESS.value
if index_object.metadata_modifed_time is None:
index_object.metadata_modifed_time = context.metadata.get("metadata_modified_time")
self.repository.save_index_object(index_object)
processing_time = (datetime.datetime.now() - start_time).total_seconds()
return ProcessingResult(status=IndexObjectStatus.SUCCESS, object_key=context.object_key, message=f"Successfully processed {chunks_count} chunks", processing_time=processing_time, chunks_count=chunks_count)
except Exception as e:
error_message:str = traceback.format_exc()
index_object.status = IndexObjectStatus.FAILED.value
index_object.last_fail_doc_modifed_time = context.metadata.get("doc_modified_time")
index_object.last_fail_metadata_modifed_time = context.metadata.get("metadata_modified_time")
self.repository.save_index_object(index_object)
processing_time = (datetime.datetime.now() - start_time).total_seconds()
return ProcessingResult(status=IndexObjectStatus.FAILED, object_key=context.object_key, message=f"Processing failed: {error_message}", processing_time=processing_time, error=e )
def _get_or_create_index_object(self, context: ProcessingContext) -> IndexObject:
"""Get or create index object"""
index_object = self.repository.get_index_object(context.object_key,context.datasource_name)
if not index_object:
index_object = IndexObject(
object_key=context.object_key,
type="document",
status=IndexObjectStatus.PROCESSING.value,
datasource_name=context.datasource_name
)
self.repository.save_index_object(index_object)
return index_object
def _should_process(self, index_object: IndexObject, context: ProcessingContext) -> bool:
"""Determine whether processing is needed (keep the original logic for backward compatibility)"""
return self._should_process_metadata(index_object, context) or self._should_process_document(index_object, context)
def _should_process_metadata(self, index_object: IndexObject, context: ProcessingContext) -> bool:
"""Determine whether metadata processing is needed"""
if 'metadata_modified_time' in context.metadata:
metadata_modified_time = context.metadata['metadata_modified_time']
if index_object.metadata_modifed_time is None:
return True
if metadata_modified_time is not None and metadata_modified_time > index_object.metadata_modifed_time:
return True
return False
def _should_process_document(self, index_object: IndexObject, context: ProcessingContext) -> bool:
"""Determine whether document processing is needed"""
if 'doc_modified_time' in context.metadata:
doc_modified_time = context.metadata['doc_modified_time']
if index_object.doc_modifed_time is None:
return True
if doc_modified_time is not None and doc_modified_time > index_object.doc_modifed_time:
return True
return False
def _process_metadata_indexes(self, context: ProcessingContext) -> None:
"""Process metadata index"""
# Push metadata index - only process index with data_type of ["metadata"]
meta_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata"])]
if not any(meta_index_schemas):
return
# Get metadata - from metadata service
doc_meta = self.metadata_service.get_metadata(context.object_key)
# Metadata is required for metadata indexes; fail fast if it is missing
if not doc_meta:
raise ValueError(f"Metadata for object {context.object_key} not found")
for meta_index_schema in meta_index_schemas:
self.indexing_service.index_metadata(doc_meta, meta_index_schema, context)
def _process_document_and_chunks(self, context: ProcessingContext, doc_update_flag: bool) -> int:
"""Process document and chunk indexes, return the number of processed chunks"""
doc_dict = {}
chunk_dict = []
chunks_count = 0
# Update document dictionary with metadata
doc_meta = self.metadata_service.get_metadata(context.object_key)
language_code = doc_meta.get("language_code", "zh-Hans") # Default to "zh-Hans" if not specified
# TODO: decide whether a missing doc_meta file should raise or be skipped; for now fall back to an empty dict
if not doc_meta:
doc_meta = {}
# If the document needs to be updated, re-extract and chunk
if doc_update_flag:
# Extract document
document = self.extraction_service.extract_document(context, language_code)
document.title = os.path.splitext(context.object_key)[0]
# Chunk processing
chunking_result = self.chunking_service.chunk_document(document, context)
chunks_count = len(chunking_result.chunks)
# Convert to dictionary format
doc_dict = self._convert_document_to_dict(document)
chunk_dict = [self._convert_document_to_dict(chunk) for chunk in chunking_result.chunks]
# Process document index - data_type is ["metadata","document"]
document_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata","document"]) or Counter(schema["data_type"]) == Counter(["document"])]
for document_index_schema in document_index_schemas:
if not doc_update_flag:
# Get existing document data from Azure Search Index
existing_docs = self.indexing_service.get_existing_document_data(
context.object_key, document_index_schema["index_name"],
document_index_schema["update_by_field"]
)
if existing_docs:
doc_dict = existing_docs
doc_dict.update({k: doc_meta[k] for k in document_index_schema["fields"] if k in doc_meta})
# Upload document index
self.indexing_service.index_document_with_schema(doc_dict, document_index_schema, context)
# Process chunk index - data_type is ["metadata","document","chunk"]
chunk_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata","document","chunk"]) or Counter(schema["data_type"]) == Counter(["chunk"])]
for index_schema in chunk_index_schemas:
current_chunk_dict = chunk_dict # Use existing chunk_dict
current_chunks_count = chunks_count # Use existing chunks_count
if not doc_update_flag:
# Get existing chunk data from Azure Search Index
current_chunk_dict = self.indexing_service.get_existing_chunk_data(context.object_key, index_schema["index_name"], index_schema["update_by_field"])
current_chunks_count = len(current_chunk_dict) if current_chunk_dict else 0
# Update the total chunks_count (for return value)
chunks_count = current_chunks_count
for chunk in current_chunk_dict if current_chunk_dict else []:
chunk.update({k: doc_meta[k] for k in index_schema["fields"] if k in doc_meta})
# Delete old chunk data
self.indexing_service.delete_chunks_by_field(index_schema["index_name"], index_schema["update_by_field"], doc_dict.get(index_schema["update_by_field"], context.object_key))
# Upload new chunk data
if current_chunk_dict:
self.indexing_service.index_chunks_with_schema(current_chunk_dict, index_schema, context)
return chunks_count
def _convert_document_to_dict(self, document:Document) -> Dict[str, Any]:
"""Convert Document object to dictionary"""
try:
# Use the original asdict_with_dynamic function to maintain compatibility
return asdict_with_dynamic(document)
except Exception:
# If asdict_with_dynamic fails, use the fallback method
if hasattr(document, '__dict__'):
return document.__dict__.copy()
elif hasattr(document, 'to_dict'):
return document.to_dict()
else:
# If all fails, return empty dictionary
return {}
# Infrastructure layer implementation
class SqlAlchemyDocumentRepository(DocumentRepository):
"""SQLAlchemy-based document repository implementation"""
def __init__(self, database_interface: DatabaseInterface):
self.database_interface = database_interface
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
"""Get index object"""
return self.database_interface.get_index_object(object_key,datasource_name)
def save_index_object(self, index_object: IndexObject) -> None:
"""Save index object"""
self.database_interface.save_index_object(index_object)
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus,
message: str = None) -> None:
"""Update processing status"""
# Convert business layer status to database status
self.database_interface.update_processing_status(object_key,datasource_name, status, message)
# Concrete implementation class
class DocumentIntelligenceExtractionService:
"""Document extraction service based on Document Intelligence"""
def __init__(self, form_recognizer_client: DocumentIntelligenceClient, vllm_endpoint, vllm_key, tmp_directory, data_directory=None,di_sas_url=None, figure_sas_url=None):
self.form_recognizer_client: DocumentIntelligenceClient = form_recognizer_client
self.vllm_endpoint: str = vllm_endpoint
self.vllm_key: str = vllm_key
self.tmp_directory: str = tmp_directory
self.data_directory: str = data_directory or ""
self.di_sas_url: str = di_sas_url
self.figure_sas_url: str = figure_sas_url
def extract_document(self, context: ProcessingContext,language:str) -> Document:
"""Extract document content using Document Intelligence"""
# Get data_dir config, use instance variable if not present
data_dir = context.data_config.get("data_dir", self.data_directory)
# Download document file - use correct parameter order
local_file_paths = downloadToLocalFolder(blob_url=context.data_config["data_path"], data_dir=data_dir, local_folder=self.tmp_directory, name_starts_with=context.object_key)
if not local_file_paths or len(local_file_paths) == 0:
raise ValueError(f"File {context.object_key} not found in blob storage")
di_blob_file_name = context.object_key + str(context.metadata["doc_modified_time"]) + ".json"
di_result:DiResult = None
# Try to download a cached DI result from blob storage; if it exists, skip di_extract
if self.di_sas_url and blob_exists(self.di_sas_url, di_blob_file_name):
content:str = load_content(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name)
if content:
di_result = DiResult.from_json(content) # type: ignore
if not di_result:
di_result = di_extract(source_file_path=local_file_paths.pop(), di_client=self.form_recognizer_client, directory_path=self.tmp_directory, figure_sas_url=self.figure_sas_url, language=language)
try:
process_document_figures(di_result=di_result,config=context.config)
except Exception as e:
print(f"Error processing document figures: {e}")
finally:
# Persist the enriched result to blob storage so later runs do not reprocess it
blob_upload_object(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name, obj=di_result)
under_image_content = replace_urls_in_content(content=di_result.di_content, replacements=di_result.figures)
# Save extracted content to local file (same as original logic)
write_content(content=under_image_content, directory_path=self.tmp_directory, file_name=context.object_key)
blob_upload_content(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name+".md", content=under_image_content)
return Document(content=under_image_content, filepath=context.object_key)
class DefaultDocumentChunkingService:
"""Default document chunking service"""
def __init__(self, tmp_directory: str = None):
self.tmp_directory = tmp_directory
def chunk_document(self, document: Document, context: ProcessingContext) -> ChunkingResult:
"""Chunk document"""
# Call the original chunking method
chunking_result = chunk_di_doc(document, data_config=context.data_config, tmp_path=context.current_tmp_directory)
# If tmp_directory is configured, save chunk result to local file
if self.tmp_directory:
write_document( chunking_result.chunks, file_path=context.object_key, directory_path=self.tmp_directory, rel_file_path=context.object_key )
return chunking_result
class AzureSearchIndexingService:
"""Azure Search-based indexing service"""
def __init__(self):
pass
def index_document(self, document: Document, context: ProcessingContext) -> bool:
"""Index document"""
# Get document index schema
document_schemas = [schema for schema in context.data_config["index_schemas"]
if set(schema["data_type"]) == {"metadata", "document"}]
doc_dict = asdict_with_dynamic(document)
doc_dict.update(context.metadata)
for schema in document_schemas:
if not upload_merge_index(index_config=schema, docs=[doc_dict], merge_fields=context.data_config["merge_fields"], current_tmp_directory=context.current_tmp_directory):
return False
return True
def index_chunks(self, chunks: List[Document], context: ProcessingContext) -> bool:
"""Index document chunks"""
# Get chunk index schema
chunk_schemas = [schema for schema in context.data_config["index_schemas"]
if set(schema["data_type"]) == {"metadata", "document", "chunk"}]
chunk_dict = [asdict_with_dynamic(chunk) for chunk in chunks]
for schema in chunk_schemas:
# First delete old chunk data
delete_documents_by_field(schema["index_name"], schema["update_by_field"], context.object_key)
# Add metadata to each chunk
for chunk in chunk_dict:
chunk.update(context.metadata)
# Upload new chunk data
if not upload_merge_index(
index_config=schema,
docs=chunk_dict,
merge_fields=context.data_config["merge_fields"],
current_tmp_directory=context.current_tmp_directory
):
return False
return True
def get_existing_document_data(self, object_key: str, index_name: str, field_name: str) -> Optional[dict[str,Any]]:
"""Get existing document data from Azure Search Index"""
results = query_by_field(
index_name=index_name,
field_name=field_name,
value=object_key
)
return results[0] if results else None
def get_existing_chunk_data(self, object_key: str, index_name: str, field_name: str) -> List[dict[str,Any]]:
"""Get existing chunk data from Azure Search Index"""
results = query_by_field( index_name=index_name, field_name=field_name, value=object_key )
return results if results else []
def index_metadata(self, metadata: dict[str,Any], schema: Any, context: ProcessingContext) -> bool:
"""Index metadata"""
return upload_merge_index(index_config=schema, docs=[metadata], merge_fields=context.data_config["merge_fields"], current_tmp_directory=context.current_tmp_directory )
def index_document_with_schema(self, doc_dict: Dict[str,Any], schema: Any, context: ProcessingContext) -> bool:
"""Index document using specified schema"""
return upload_merge_index(
index_config=schema,
docs=[doc_dict],
merge_fields=context.data_config["merge_fields"],
current_tmp_directory=context.current_tmp_directory
)
def index_chunks_with_schema(self, chunk_dict: List[Dict[str,Any]], schema: Any, context: ProcessingContext) -> bool:
"""Index chunks using specified schema"""
return upload_merge_index(
index_config=schema,
docs=chunk_dict,
merge_fields=context.data_config["merge_fields"],
current_tmp_directory=context.current_tmp_directory
)
def delete_chunks_by_field(self, index_name: str, field_name: str, field_value: str) -> bool:
"""Delete chunks by field"""
try:
delete_documents_by_field(index_name, field_name, field_value)
return True
except Exception:
return False
class BlobMetadataService:
"""Metadata service based on Blob storage"""
def __init__(self, datasource: Dict[str, Any]):
self.datasource = datasource
def get_metadata(self, object_key: str) -> Dict[str, Any]:
"""Get metadata"""
if "metadata" not in self.datasource:
return {}
return self.datasource["metadata"].get(object_key, {})
# Update the factory class with specific implementations
class DocumentProcessingFactory:
"""Document processing factory class"""
def __init__(self, service_factory: ServiceFactory, tmp_directory:str, datasource: Optional[Dict[str, Any]] = None, config:ApplicationConfig = None):
"""
Initialize factory
Args:
service_factory: Service factory (used to get database engine)
tmp_directory: Shared temporary directory for downloads and intermediate files
datasource: Data source configuration
config: Application configuration (blob SAS URLs, model settings, etc.)
"""
self.service_factory: ServiceFactory = service_factory
self.datasource = datasource or {}
self.shared_tmp_directory = tmp_directory
self.config:ApplicationConfig = config
def create_orchestrator(self) -> DocumentProcessingOrchestrator:
"""Create document processing orchestrator"""
extraction_service = self._create_extraction_service()
chunking_service = self._create_chunking_service()
indexing_service = self._create_indexing_service()
metadata_service = self._create_metadata_service()
repository = self._create_repository()
return DocumentProcessingOrchestrator(
extraction_service=extraction_service,
chunking_service=chunking_service,
indexing_service=indexing_service,
metadata_service=metadata_service,
repository=repository
)
def _create_extraction_service(self) -> 'DocumentIntelligenceExtractionService':
"""Create document extraction service"""
# Use the factory shared temporary directory (same as original app.py logic)
tmp_directory = self.shared_tmp_directory
# Get configuration from environment variables (same as original worker.py logic)
vllm_endpoint = os.environ.get("captioning_model_endpoint", "")
vllm_key = os.environ.get("captioning_model_key", "")
form_recognizer_client = SingletonFormRecognizerClient()
return DocumentIntelligenceExtractionService(
form_recognizer_client=form_recognizer_client,
vllm_endpoint=vllm_endpoint,
vllm_key=vllm_key,
tmp_directory=tmp_directory,
data_directory="", # Will be dynamically fetched from data_config
di_sas_url=self.config.azure_services.di_blob_account_url,
figure_sas_url=self.config.azure_services.figure_blob_account_url
)
def _create_chunking_service(self) -> 'DefaultDocumentChunkingService':
"""Create document chunking service"""
# Use the factory shared temporary directory
tmp_directory = self.shared_tmp_directory
return DefaultDocumentChunkingService(tmp_directory=tmp_directory)
def _create_indexing_service(self) -> 'AzureSearchIndexingService':
"""Create indexing service"""
return AzureSearchIndexingService()
def _create_metadata_service(self) -> 'BlobMetadataService':
"""Create metadata service"""
return BlobMetadataService(self.datasource)
def _create_repository(self) -> DocumentRepository:
"""Create document repository"""
database_interface = LegacyDatabaseAdapter(self.service_factory.get_database_engine())
return SqlAlchemyDocumentRepository(database_interface)
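# --- Illustrative wiring sketch (not executed by the pipeline) ---
# Shows how the factory, orchestrator and ProcessingContext fit together. It assumes a
# ServiceFactory, ApplicationConfig, data_config dict, metadata dict and datasource dict
# are already built by the application entry point (their construction is not shown here).
def _process_one_document_example(service_factory: ServiceFactory, config: ApplicationConfig,
                                  data_config: Dict[str, Any], metadata: Dict[str, Any],
                                  datasource: Dict[str, Any], object_key: str,
                                  tmp_directory: str) -> ProcessingResult:
    factory = DocumentProcessingFactory(
        service_factory=service_factory,
        tmp_directory=tmp_directory,
        datasource=datasource,  # carries the optional "metadata" mapping used by BlobMetadataService
        config=config,
    )
    orchestrator = factory.create_orchestrator()
    context = ProcessingContext(
        object_key=object_key,
        data_config=data_config,
        metadata=metadata,  # expects doc_modified_time / metadata_modified_time keys
        current_tmp_directory=tmp_directory,
        datasource_name=data_config.get("datasource_name", ""),
        config=config,
    )
    return orchestrator.process_document(context)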

View File

@@ -0,0 +1,177 @@
import json
import os
from os import makedirs
import re
import time
from typing import Any, List
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter, RecursiveCharacterTextSplitter
from entity_models import Document, ChunkingResult
from hierarchy_fix import HierarchyFixer
from third_level_service import get_recommended_hash_count_simple
from utils import TOKEN_ESTIMATOR, custom_serializer
# Compile once for efficiency
_specific_comments = re.compile(
r"""<!--\s* # opening
(?:PageFooter="[^"]*" # PageFooter=""
|PageNumber="[^"]*" # PageNumber=""
|PageBreak # PageBreak
|PageHeader="[^"]*") # PageHeader=""
\s*--> # closing
""",
flags=re.VERBOSE
)
def remove_specific_comments(text: str) -> str:
return _specific_comments.sub('', text)
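# Example (illustrative): the pattern above strips Document Intelligence page artifacts such as
#   <!-- PageHeader="..." -->, <!-- PageNumber="3" --> and <!-- PageBreak -->
# while leaving ordinary markdown and other HTML comments untouched.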
def infer_level_from_number():
pass
def chunk_docs_by_section(extracted_doc: Document, num_tokens:int, token_overlap:int,tmp_path:str) -> List[Document]:
headers_to_split_on = [
("#", "h1"),
("##", "h2"),
("###", "h3"),
("####", "h4"),
("#####", "h5"),
("######", "h6")
]
filepath:str = extracted_doc.filepath if extracted_doc.filepath else ""
extracted_content:str = extracted_doc.content or ""
merged_content:str = extracted_content
if os.getenv("header_fix","false").lower() == "true":
#merge content of all extracted_docs into one string
fixer = HierarchyFixer()
fix_result:dict[str,Any] = fixer.fix_hierarchy(content=extracted_content)
# If fixes were applied, save a per-file fix report
merged_content = fix_result["fixed_content"]
makedirs(tmp_path + f"/.extracted/{filepath}", exist_ok=True)
if tmp_path and fix_result["fixes_applied"] > 0:
with open(tmp_path + f"/.extracted/{filepath}/hierarchy_fix_log.json", "a", encoding="utf-8") as log_file:
json.dump(fix_result, log_file, default=custom_serializer, ensure_ascii=False)
# Dynamically determine the recommended number of '#' levels to split headers on
third_level_counts:int = get_recommended_hash_count_simple(merged_content)['recommendation']
headers_to_split_on = [( "#" * i, f"h{i}") for i in range(1, third_level_counts + 1)]
with open(tmp_path + f"/.extracted/{filepath}/get_recommended_hash_count.txt", "a", encoding="utf-8") as md_file:
md_file.write(str(headers_to_split_on))
with open(tmp_path + f"/.extracted/{filepath}/new_merged_hierarchy.md", "a", encoding="utf-8") as md_file:
md_file.write(merged_content)
# MD splits
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False )
md_header_splits = markdown_splitter.split_text(merged_content)
chunk_size = num_tokens
chunk_overlap = token_overlap
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
splits = text_splitter.split_documents(md_header_splits)
pre_document = extracted_doc
chunked_docs: List[Document] = []
for i, split in enumerate(splits):
if TOKEN_ESTIMATOR.estimate_tokens(split.page_content) < num_tokens * 1.5:
chunked_doc = Document(
document_schema=pre_document.document_schema,
main_title=pre_document.main_title,
sub_title=pre_document.sub_title,
publisher=pre_document.publisher,
document_code=pre_document.document_code,
document_category=pre_document.document_category,
main_title_sec_language=pre_document.main_title_sec_language,
sub_title_sec_language=pre_document.sub_title_sec_language,
primary_language=pre_document.primary_language,
secondary_language=pre_document.secondary_language,
title=pre_document.title,
doc_metadata=pre_document.doc_metadata,
filepath=pre_document.filepath,
)
chunked_doc.copy_dynamic_attrs(pre_document)
chunked_doc.content = split.page_content
chunked_doc.h1 = split.metadata.get("h1", "")
chunked_doc.h2 = split.metadata.get("h2", "")
chunked_doc.h3 = split.metadata.get("h3", "")
chunked_doc.h4 = split.metadata.get("h4", "")
chunked_doc.h5 = split.metadata.get("h5", "")
chunked_doc.h6 = split.metadata.get("h6", "")
chunked_doc.h7 = split.metadata.get("h7", "")
chunked_doc.full_headers = "||".join(h for h in [chunked_doc.h6, chunked_doc.h5, chunked_doc.h4, chunked_doc.h3, chunked_doc.h2, chunked_doc.h1] if h)
chunked_doc.id = chunked_doc.filepath + f"_{i}"
chunked_docs.append(chunked_doc)
else:
splitter = MarkdownTextSplitter.from_tiktoken_encoder(
chunk_size=num_tokens, chunk_overlap=token_overlap)
chunked_content_list = splitter.split_text(
split.page_content)
# chunk the original content
for j, chunked_content in enumerate(chunked_content_list):
chunked_doc = Document(
document_schema=pre_document.document_schema,
main_title=pre_document.main_title,
sub_title=pre_document.sub_title,
publisher=pre_document.publisher,
document_code=pre_document.document_code,
document_category=pre_document.document_category,
main_title_sec_language=pre_document.main_title_sec_language,
sub_title_sec_language=pre_document.sub_title_sec_language,
primary_language=pre_document.primary_language,
secondary_language=pre_document.secondary_language,
title=pre_document.title,
doc_metadata=pre_document.doc_metadata,
filepath=pre_document.filepath
)
chunked_doc.copy_dynamic_attrs(pre_document)
chunked_doc.content = chunked_content
chunked_doc.h1 = split.metadata.get("h1", "")
chunked_doc.h2 = split.metadata.get("h2", "")
chunked_doc.h3 = split.metadata.get("h3", "")
chunked_doc.h4 = split.metadata.get("h4", "")
chunked_doc.h5 = split.metadata.get("h5", "")
chunked_doc.h6 = split.metadata.get("h6", "")
chunked_doc.h7 = split.metadata.get("h7", "")
chunked_doc.full_headers = "||".join(h for h in [chunked_doc.h6, chunked_doc.h5, chunked_doc.h4, chunked_doc.h3, chunked_doc.h2, chunked_doc.h1] if h)
chunked_doc.id = chunked_doc.filepath + f"_{i}_{j}"
chunked_docs.append(chunked_doc)
return chunked_docs
def chunk_di_doc(extracted_doc: Document, data_config: dict[str, Any], tmp_path: str) -> ChunkingResult:
"""
Chunk the document.
Args:
extracted_doc: The document object to be processed.
data_config: Processing configuration.
tmp_path: Temporary directory for intermediate chunking artifacts.
Returns:
ChunkingResult: The result containing the list of chunks and total files.
"""
num_tokens:int = data_config.get("chunk_size", 1024)
token_overlap:int = data_config.get("token_overlap", 128)
print({"index_name":extracted_doc.filepath , "num_tokens": num_tokens, "token_overlap": token_overlap})
extracted_doc.content = remove_specific_comments(text=extracted_doc.content or "")
chunked_docs: List[Document] = chunk_docs_by_section(extracted_doc= extracted_doc,num_tokens=num_tokens, token_overlap=token_overlap,tmp_path=tmp_path)
time.sleep(0.1)
return ChunkingResult(chunks=chunked_docs, total_files=1)
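# --- Illustrative usage sketch (not part of the pipeline) ---
# Chunks a small in-memory markdown document with the splitter settings above; the sample
# content, data_config values and tmp_path are placeholders chosen for this example only.
if __name__ == "__main__":
    sample = Document(content="# Title\n\nSome body text.\n\n## Section\n\nMore text.", filepath="sample.md")
    result = chunk_di_doc(sample, data_config={"chunk_size": 256, "token_overlap": 32}, tmp_path="/tmp/chunk_demo")
    for chunk in result.chunks:
        print(chunk.id, chunk.full_headers)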

View File

@@ -0,0 +1,27 @@
- data_path: "blob sas url"
datasource_name: "demo-vw-03"
data_dir: ""
base_path: "D:\\tmp\\"
process_file_num: 0
process_file_last_modify: "2025-06-24 00:00:00"
chunk_size: 2048
token_overlap: 128
index_schemas:
- index_name: "index-dev-figure-01-chunk"
data_type: ["metadata", "document", "chunk"]
field_type: "append"
upload_batch_size: 50
fields: ["filepath", "title"]
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
vector_fields:
- field: "contentVector"
append_fields: ["content"]
- field: "full_metadata_vector"
append_fields: ["full_headers", "doc_metadata"]
merge_fields:
- key: "doc_metadata"
fields: ["title"]
full_metadata_vector_fields: ["full_headers", "doc_metadata"]

View File

@@ -0,0 +1,109 @@
# Main data configuration (array format)
- data_path: "https://your-blob-storage.blob.core.windows.net/container?sas-token"
datasource_name: "CATOnline-cn" # data source name
data_dir: "" # Optional local data directory
base_path: "/app/run_tmp" # Temporary processing directory
# File processing limits
process_file_num: 0 # 0 = process all files
process_file_last_modify: "2025-06-24 00:00:00" # Only process files modified after this date
# Chunking configuration
chunk_size: 2048 # Maximum tokens per chunk
token_overlap: 128 # Overlap between chunks
# Index schemas configuration
index_schemas:
# Chunk-level index for search
- index_name: "your-knowledge-chunk-index"
data_type: ["metadata", "document", "chunk"]
field_type: "append" # How to handle existing data
upload_batch_size: 50 # Documents per batch upload
# Metadata fields to include
fields: [
"filepath", "timestamp", "title", "publisher", "publish_date",
"document_category", "document_code", "language_code",
"x_Standard_Regulation_Id", "x_Attachment_Type",
"x_Standard_Title_CN", "x_Standard_Title_EN",
"x_Standard_Published_State", "x_Standard_Drafting_Status",
"x_Standard_Range", "x_Standard_Kind", "x_Standard_No",
"x_Standard_Code", "x_Standard_Technical_Committee",
"x_Standard_Vehicle_Type", "x_Standard_Power_Type",
"x_Standard_CCS", "x_Standard_ICS",
"x_Standard_Published_Date", "x_Standard_Effective_Date",
"x_Regulation_Status", "x_Regulation_Title_CN",
"x_Regulation_Title_EN", "x_Regulation_Document_No",
"x_Regulation_Issued_Date", "x_Classification",
"x_Work_Group", "x_Reference_Standard",
"x_Replaced_by", "x_Refer_To", "func_uuid",
"update_time", "status"
]
# Vector configuration
vector_fields:
- field: "contentVector"
append_fields: ["content"] # Fields to vectorize for content
- field: "full_metadata_vector"
append_fields: ["full_headers", "doc_metadata"] # Metadata vectorization
# Azure AI Search configuration
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath" # Field to use for updates
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
# Document-level index
- index_name: "your-knowledge-document-index"
data_type: ["document", "metadata"]
field_type: "full" # Replace entire documents
key_fields: ["filepath"] # Primary key fields
upload_batch_size: 1
fields: [
# Same field list as chunk index
"filepath", "timestamp", "title", "publisher"
# ... (same as above)
]
merge_content_fields: ["content"] # Fields to merge from chunks
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
# Regulation-specific index
- index_name: "your-regulation-index"
data_type: ["metadata"]
field_type: "full"
key_fields: ["x_Standard_Regulation_Id"] # Regulation ID as key
upload_batch_size: 50
fields: [
# Regulation-specific fields
"x_Standard_Regulation_Id", "x_Standard_Title_CN",
"x_Standard_Title_EN", "x_Regulation_Status"
# ... (regulation metadata fields)
]
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
update_by_field: "x_Standard_Regulation_Id"
# Field merging configuration
merge_fields:
- key: "doc_metadata" # Combined metadata field
fields: [
"title", "publisher", "document_category", "document_code",
"x_Standard_Title_CN", "x_Standard_Title_EN",
"x_Standard_Published_State", "x_Standard_Drafting_Status"
# ... (all metadata fields to combine)
]
# Vector field configuration
full_metadata_vector_fields: ["full_headers", "doc_metadata"]

View File

@@ -0,0 +1,189 @@
from enum import Enum
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any
import datetime
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.orm import Mapped, declarative_base, mapped_column
Base = declarative_base()
class IndexJobStatus(Enum):
"""Enumeration for index job status"""
PENDING = 'pending' # todo
PROCESSING = 'processing'
SUCCESS = 'success'
PARTIAL_SUCCESS = 'partial_success'
FAILED = 'failed'
class IndexObjectStatus(Enum):
"""Enumeration for index object status"""
SUCCESS = 'success'
PROCESSING = 'processing'
FAILED = 'failed'
class IndexJob(Base): # type: ignore
"""Index job model, represents a single index run"""
__tablename__ = 'index_run'
id = Column(Integer, primary_key=True, autoincrement=True)
start_time = Column(DateTime, nullable=True)
finished_time = Column(DateTime)
status: Mapped[str] = mapped_column(String(20), default=IndexJobStatus.PENDING.value)
detailed_message = Column(Text,nullable=True)
doc_lower_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
doc_upper_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
metadata_lower_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
metadata_upper_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
total_process_count = Column(Integer)
success_object_count = Column(Integer, default=0)
failed_object_count = Column(Integer, default=0)
datasource_name: Mapped[str] = mapped_column(String(255), nullable=False)
class IndexObject(Base):
"""Index object model, represents a document or metadata file to be processed"""
__tablename__ = 'index_object'
object_key: Mapped[str] = mapped_column(String(255), primary_key=True)
type = Column(String(20), nullable=False)
doc_modifed_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
metadata_modifed_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
status: Mapped[str] = mapped_column(String(20), default=IndexObjectStatus.PROCESSING.value)
try_count: Mapped[int] = mapped_column(Integer, default=0)
last_run_id = Column(Integer)
last_start_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
last_finished_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
detailed_message: Mapped[str] = mapped_column(Text,nullable=True)
last_fail_doc_modifed_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
last_fail_metadata_modifed_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
datasource_name: Mapped[str] = mapped_column(String(255), primary_key=True)
def init_database(database_uri: str = '') -> Any:
engine = create_engine(database_uri)
Base.metadata.create_all(engine)
return engine
class DatabaseInterface(ABC):
"""Database interface for the refactored system"""
@abstractmethod
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
"""Get index object by key"""
pass
@abstractmethod
def save_index_object(self, index_object: IndexObject) -> None:
"""Save index object"""
pass
@abstractmethod
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus,
message: str = None) -> None:
"""Update processing status"""
pass
class InMemoryDatabase(DatabaseInterface):
"""In-memory database implementation for testing"""
def __init__(self):
self._objects: Dict[str, IndexObject] = {}
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
"""Get index object by key"""
return self._objects.get(object_key)
def save_index_object(self, index_object: IndexObject) -> None:
"""Save index object"""
self._objects[index_object.object_key] = index_object
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus,
message: str = None) -> None:
"""Update processing status"""
if object_key in self._objects:
self._objects[object_key].status = status.value
self._objects[object_key].detailed_message = message
else:
# Create a new object if it doesn't exist, using only columns mapped on IndexObject
obj = IndexObject(
object_key=object_key,
datasource_name=datasource_name,
status=status.value,
detailed_message=message
)
self._objects[object_key] = obj
class LegacyDatabaseAdapter(DatabaseInterface):
"""Adapter to bridge the old database module with the new interface"""
def __init__(self, database_engine):
self.database_engine = database_engine
self._session_factory = None
def _get_session_factory(self):
"""Get session factory (lazy initialization)"""
if self._session_factory is None:
from sqlalchemy.orm import sessionmaker
self._session_factory = sessionmaker(bind=self.database_engine)
return self._session_factory
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
"""Get index object by key"""
session_factory = self._get_session_factory()
with session_factory() as session:
return session.query(IndexObject).get({"object_key":object_key,"datasource_name":datasource_name})
def save_index_object(self, index_object: IndexObject) -> None:
"""Save index object"""
object_key = index_object.object_key
datasource_name = index_object.datasource_name
session_factory = self._get_session_factory()
with session_factory() as session:
old_obj = session.query(IndexObject).get({"object_key":object_key,"datasource_name":datasource_name})
if old_obj:
# Update existing
old_obj.doc_modifed_time = index_object.doc_modifed_time
old_obj.metadata_modifed_time = index_object.metadata_modifed_time
old_obj.try_count = index_object.try_count
old_obj.status = index_object.status
old_obj.last_fail_doc_modifed_time = index_object.last_fail_doc_modifed_time
old_obj.last_fail_metadata_modifed_time = index_object.last_fail_metadata_modifed_time
old_obj.datasource_name = index_object.datasource_name
# Note: legacy IndexObject might not have all fields
else:
# Create new
old_obj = IndexObject(
object_key=index_object.object_key,
type=index_object.type,
doc_modifed_time=index_object.doc_modifed_time,
metadata_modifed_time=index_object.metadata_modifed_time,
try_count=index_object.try_count,
status=index_object.status,
last_fail_doc_modifed_time=index_object.last_fail_doc_modifed_time,
last_fail_metadata_modifed_time=index_object.last_fail_metadata_modifed_time,
datasource_name=index_object.datasource_name
)
session.add(old_obj)
session.commit()
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus, message: str = None) -> None:
"""Update processing status"""
session_factory = self._get_session_factory()
with session_factory() as session:
old_obj = session.query(IndexObject).get({"object_key":object_key,"datasource_name":datasource_name})
if old_obj:
old_obj.status = status.value
old_obj.detailed_message = message
session.commit()
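# --- Illustrative usage sketch (not part of the pipeline) ---
# Creates the schema in a local SQLite file and round-trips one IndexObject through the
# LegacyDatabaseAdapter; the SQLite URI and key values are placeholders for this example.
if __name__ == "__main__":
    engine = init_database("sqlite:///demo_index.db")
    adapter = LegacyDatabaseAdapter(engine)
    adapter.save_index_object(IndexObject(
        object_key="docs/a.pdf",
        type="document",
        status=IndexObjectStatus.PROCESSING.value,
        datasource_name="demo-source"
    ))
    adapter.update_processing_status("docs/a.pdf", "demo-source", IndexObjectStatus.SUCCESS, "done")
    print(adapter.get_index_object("docs/a.pdf", "demo-source").status)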

View File

@@ -0,0 +1,50 @@
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-abroad-prd?sp=rl&st=2025-08-02T08:25:56Z&se=2125-08-02T16:40:56Z&spr=https&sv=2024-11-04&sr=c&sig=lJui2%2BOs8V%2BdzCkjchQCR7ITWT28tJ0HAq8bIhkkM%2Bk%3D"
datasource_name: "cat-standard-regulation-oversea"
data_dir: ""
base_path: "/app/run_tmp"
process_file_num: 0
process_file_last_modify: "2025-06-24 00:00:00"
chunk_size: 2048
token_overlap: 256
index_schemas:
- index_name: "index-catonline-chunk-oversea"
data_type: ["metadata", "document", "chunk"]
upload_batch_size: 50
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
vector_fields:
- field: "contentVector"
append_fields: ["content"]
- field: "full_metadata_vector"
append_fields: ["full_headers", "doc_metadata"]
- index_name: "index-catonline-document-oversea"
data_type: ["document", "metadata"]
key_fields: ["filepath"]
upload_batch_size: 1
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
merge_content_fields: ["content"]
full_metadata_vector_fields: ["doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
- index_name: "index-catonline-standard-regulation-oversea"
data_type: ["metadata"]
key_fields: ["standard_Id"]
upload_batch_size: 1
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
vector_config_name: "vectorSearchProfile"
full_metadata_vector_fields: ["doc_metadata"]
semantic_config_name: "default"
update_by_field: "standard_Id"
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
merge_fields:
- key: "doc_metadata"
fields: ["file_Name","entity_Attribute","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","version_Name","version_Parent_Name","technical_Series_No","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str " ]

View File

@@ -0,0 +1,50 @@
# docker build
docker login acrsales2caiprd.azurecr.cn -u username -p password
docker build . -t document-ai-indexer:2.0.1
docker tag document-ai-indexer:2.0.1 acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
# login AKS
# az cloud set --name AzureCloud # Switch CLI to Azure cloud
# az login # Log in to Azure China account (browser or device code flow)
# az account set -s 079d8bd8-b4cc-4892-9307-aa6dedf890e9 #! set subs
# az aks get-credentials -g rg-aiflow-lab -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
kubectl config use-context aks-sales2c-ai-prd
kubectl config current-context
# Create Azure Files Volume
# kubectl create secret generic azure-files-cred \
# --from-literal=azurestorageaccountname=saaisearchlab \
# --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
# -n knowledge-agent
# kubectl delete configmap document-ai-indexer-config -n knowledge-agent
# Deploy ConfigMap
kubectl delete configmap document-ai-indexer-config -n knowledge-agent
kubectl create configmap document-ai-indexer-config -n knowledge-agent --from-file=.\deploy\prd\env.yaml --from-file=.\deploy\prd\config.yaml
# Deploy Pod
# kubectl create namespace knowledge-agent
# kubectl delete pod document-ai-indexer -n knowledge-agent
kubectl apply -f document-ai-indexer_k8s.yml -n knowledge-agent
# Monitor Pod
kubectl logs -f document-ai-indexer -n knowledge-agent
# Deploy CronJob
kubectl apply -f ./deploy/prd/document-ai-indexer-cronjob.yml --namespace knowledge-agent
# Check CronJob Status
kubectl get cronjobs -n knowledge-agent
# Check Job Execution History
kubectl get jobs -n knowledge-agent
###########
# Manually trigger a job (for testing)
kubectl delete job manual-test -n knowledge-agent
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
# Check Job Logs
kubectl logs -f job/manual-test -n knowledge-agent

View File

@@ -0,0 +1,64 @@
apiVersion: batch/v1
kind: CronJob
metadata:
name: document-ai-indexer-cronjob
spec:
# Scheduling configuration - execute every 10 minutes
schedule: "*/10 * * * *"
# Concurrency policy: Disable concurrent execution. If the previous job is still running, new execution will be skipped.
concurrencyPolicy: Forbid
# Successful jobs history limit: keep the last 10 successful job records.
successfulJobsHistoryLimit: 10
# Failed jobs history limit: keep the last 10 failed job records.
failedJobsHistoryLimit: 10
# Job template
jobTemplate:
spec:
backoffLimit: 0
template:
metadata:
labels:
app: document-ai-indexer
job-type: cronjob
spec:
restartPolicy: Never
volumes:
# 1. ConfigMap volume
- name: config-volume
configMap:
name: document-ai-indexer-config
items:
- key: env.yaml
path: env.yaml
- key: config.yaml
path: config.yaml
# 2. Azure File Share volume
- name: data-volume
azureFile:
secretName: azure-files-cred # Reference the Secret you created
shareName: fs-document-ai-indexer # Your file share name
readOnly: false # Write permission
containers:
- name: document-ai-indexer
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.2
imagePullPolicy: Always
# Mount the volume into the container
volumeMounts:
# ConfigMap Mount
- name: config-volume
mountPath: /app/env.yaml
subPath: env.yaml
- name: config-volume
mountPath: /app/config.yaml
subPath: config.yaml
# Azure File Share mount
- name: data-volume
mountPath: /app/run_tmp # Program write/read directory

View File

@@ -0,0 +1,42 @@
apiVersion: v1
kind: Pod
metadata:
name: document-ai-indexer
spec:
restartPolicy: Never
volumes:
# 1. ConfigMap volume
- name: config-volume
configMap:
name: document-ai-indexer-config
items:
- key: env.yaml
path: env.yaml
- key: config.yaml
path: config.yaml
# 2. Azure File Share volume
- name: data-volume
azureFile:
secretName: azure-files-cred # Reference the Secret you created
shareName: fs-document-ai-indexer # Your file share name
readOnly: false
containers:
- name: document-ai-indexer
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.2
imagePullPolicy: Always
# Mount the volume into the container
volumeMounts:
# ConfigMap Mount
- name: config-volume
mountPath: /app/env.yaml
subPath: env.yaml
- name: config-volume
mountPath: /app/config.yaml
subPath: config.yaml
# Azure File Share Mount
- name: data-volume
mountPath: /app/run_tmp # Directory for program read/write

View File

@@ -0,0 +1,10 @@
# login AKS
# az cloud set -n AzureChinaCloud
# az login
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
kubectl config use-context aks-sales2c-ai-prd
kubectl config current-context
# kubectl create namespace knowledge-agent
kubectl apply -f embedding-api-proxy_k8s.yml -n knowledge-agent

View File

@@ -0,0 +1,39 @@
# Service resource: map the external domain name to an in-cluster Service
apiVersion: v1
kind: Service
metadata:
name: itpai-backend
spec:
type: ExternalName
externalName: itpai.infer.api.vgcserv.com.cn
ports:
- port: 443
protocol: TCP
targetPort: 443
---
# Ingress resource: proxy the /v1-openai path to the Service above
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: itpai-proxy
annotations:
kubernetes.io/ingress.class: nginx
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
nginx.ingress.kubernetes.io/proxy-ssl-server-name: "on"
nginx.ingress.kubernetes.io/proxy-ssl-verify: "off"
nginx.ingress.kubernetes.io/upstream-vhost: "itpai.infer.api.vgcserv.com.cn"
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
nginx.ingress.kubernetes.io/proxy-send-timeout: "30"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "5"
spec:
rules:
- host: sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn
http:
paths:
- path: /v1-openai
pathType: Prefix
backend:
service:
name: itpai-backend
port:
number: 443

View File

@@ -0,0 +1,40 @@
config: config.yaml
njobs: 12
search_service_name: https://search-sales2c-ai-prd.search.azure.cn
search_admin_key: ev6B0OtF66WkDmQKJBa4n1Haa8e8p8N3zdaEBnbWtoAzSeAMWSid
embedding_model_endpoint: http://sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn/v1-openai/embeddings
embedding_model_key: gpustack_0e3d5b35adaf239b_99adacd6f540c7d81006365c8030b16c
VECTOR_DIMENSION: 4096
FLAG_AOAI: "V3"
FLAG_EMBEDDING_MODEL: qwen3-embedding-8b
extract_method: di+vision-llm
form_rec_resource: https://di-sales2c-ai-prd.cognitiveservices.azure.cn/
form_rec_key: G0vhH3twd5K3YYCgfnttf5V6XTMMU4PMdVvRHsgaTb8kZDoU8ZHjJQQJ99BDAEHpCsCfT1gyAAALACOGmOcn
di-Formulas: true
di-hiRes: true
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
FIGURE_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/extracted-image-cat-prd?sp=racwdl&st=2025-08-04T06:34:42Z&se=2035-08-04T14:49:42Z&spr=https&sv=2024-11-04&sr=c&sig=t0DTjfht%2FNaPlXUtxhKr40NzZY5kWovgNxJUeAepvgA%3D
DI_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/di-result-cat-prd?sp=racwdl&st=2025-08-04T06:34:11Z&se=2035-08-04T14:49:11Z&spr=https&sv=2024-11-04&sr=c&sig=26wxy5M9lcIO2o9zzr6jOtdw2gQTZnGmampHx5EyXbo%3D
DB_URI: postgresql://pgadmin:vwb54pSQDp8vYkusKms@pg-sales2c-ai-prd.postgres.database.chinacloudapi.cn/document-ai-indexer
# Figure/image understanding
figure_caption:
include_di_content: false
description_gen_max_images: 0
model_endpoint: null
model_key: null
model: null # leave empty for Azure
azure_deployment: gpt-4o # Azure deployment name; leave empty for other platforms
api_version: 2024-08-01-preview # Azure API version; leave empty for other platforms
header_fix: true

View File

@@ -0,0 +1,46 @@
# login AKS
# az cloud set --name AzureCloud # Switch CLI to Azure cloud
# az login # Log in to Azure China account (browser or device code flow)
# az account set -s 079d8bd8-b4cc-4892-9307-aa6dedf890e9 #! set subs
# az aks get-credentials -g rg-aiflow-lab -n aks-aiflow-lab --overwrite-existing --file ~/.kube/config
kubectl config use-context aks-aiflow-lab
kubectl config current-context
# kubectl create secret generic azure-files-cred \
# --from-literal=azurestorageaccountname=saaisearchlab \
# --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
# -n knowledge-agent
# kubectl delete configmap document-ai-indexer-config -n knowledge-agent
docker build . -t document-ai-indexer:2.0.2
docker tag document-ai-indexer:2.0.2 acraiflowlab.azurecr.io/document-ai-indexer:2.0.2
docker push acraiflowlab.azurecr.io/document-ai-indexer:2.0.2
# dev
kubectl delete configmap document-ai-indexer-config -n knowledge-agent
kubectl create configmap document-ai-indexer-config -n knowledge-agent --from-file=env.yaml --from-file=config.yaml
# kubectl create namespace knowledge-agent
# # kubectl delete pod document-ai-indexer -n knowledge-agent
# kubectl apply -f document-ai-indexer_k8s.yml -n knowledge-agent
# kubectl logs -f document-ai-indexer -n knowledge-agent
# Deploy CronJob
kubectl apply -f deploy/dev/document-ai-indexer-cronjob.yml --namespace knowledge-agent
# Check CronJob Status
kubectl get cronjobs -n knowledge-agent
# Check Job Execution History
kubectl get jobs -n knowledge-agent
###########
# Manually trigger a job (for testing)
kubectl delete job manual-test -n knowledge-agent
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
# Check Job Logs
kubectl logs -f job/manual-test -n knowledge-agent

View File

@@ -0,0 +1,64 @@
apiVersion: batch/v1
kind: CronJob
metadata:
name: document-ai-indexer-cronjob
spec:
# Scheduling configuration - execute every 10 minutes
schedule: "*/10 * * * *"
# Concurrency policy: Disable concurrent execution. If the previous job is still running, new execution will be skipped.
concurrencyPolicy: Forbid
  # Successful jobs history limit: Keep the last 10 successful job records.
  successfulJobsHistoryLimit: 10
  # Failed jobs history limit: Keep the last 10 failed job records.
  failedJobsHistoryLimit: 10
# Job template
jobTemplate:
spec:
backoffLimit: 0
template:
metadata:
labels:
app: document-ai-indexer
job-type: cronjob
spec:
restartPolicy: Never
volumes:
# 1. ConfigMap volume
- name: config-volume
configMap:
name: document-ai-indexer-config
items:
- key: env.yaml
path: env.yaml
- key: config.yaml
path: config.yaml
# 2. Azure File Share volume
- name: data-volume
azureFile:
                secretName: azure-files-cred # References the Secret you created
shareName: fs-document-ai-indexer # Your file share name
readOnly: false # Write permission
containers:
- name: document-ai-indexer
image: acraiflowlab.azurecr.io/document-ai-indexer:2.0.1
imagePullPolicy: Always
# Mount the volume into the container
volumeMounts:
# ConfigMap Mount
- name: config-volume
mountPath: /app/env.yaml
subPath: env.yaml
- name: config-volume
mountPath: /app/config.yaml
subPath: config.yaml
# Azure File Shared mount
- name: data-volume
mountPath: /app/run_tmp # Program write/read directory

View File

@@ -0,0 +1,42 @@
apiVersion: v1
kind: Pod
metadata:
name: document-ai-indexer
spec:
restartPolicy: Never
volumes:
    # 1. ConfigMap volume
- name: config-volume
configMap:
name: document-ai-indexer-config
items:
- key: env.yaml
path: env.yaml
- key: config.yaml
path: config.yaml
    # 2. Azure File Share volume
- name: data-volume
azureFile:
        secretName: azure-files-cred # References the Secret you created
        shareName: fs-document-ai-indexer # Your file share name
        readOnly: false # Write permission
containers:
- name: document-ai-indexer
image: acraiflowlab.azurecr.io/document-ai-indexer:2.0.1
imagePullPolicy: Always
      # Mount the volume into the container
volumeMounts:
        # ConfigMap Mount
- name: config-volume
mountPath: /app/env.yaml
subPath: env.yaml
- name: config-volume
mountPath: /app/config.yaml
subPath: config.yaml
        # Azure File Share Mount
- name: data-volume
          mountPath: /app/run_tmp # Program read/write directory

View File

@@ -0,0 +1,27 @@
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-usermanual-prd?sp=racwdl&st=2025-08-27T06:26:11Z&se=2035-08-27T14:41:11Z&spr=https&sv=2024-11-04&sr=c&sig=7GVqfbWPM5VDRW8crTeR06KsSPX%2BuuDLjN7ceqBuLCE%3D"
datasource_name: "cat-usermanual-prd"
data_dir: ""
base_path: "/app/run_tmp"
process_file_num: 0
process_file_last_modify: "2025-06-24 00:00:00"
chunk_size: 2048
token_overlap: 128
index_schemas:
- index_name: "index-cat-usermanual-chunk-prd"
data_type: ["chunk"]
field_type: "append"
upload_batch_size: 50
fields: ["filepath", "title"]
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
vector_fields:
- field: "contentVector"
append_fields: ["content"]
- field: "full_metadata_vector"
append_fields: ["full_headers", "doc_metadata"]
merge_fields:
- key: "doc_metadata"
fields: ["title"]
full_metadata_vector_fields: ["full_headers", "doc_metadata"]

View File

@@ -0,0 +1,50 @@
# docker build
docker login acrsales2caiprd.azurecr.cn -u username -p password
docker build . -t document-ai-indexer:2.0.4
docker tag document-ai-indexer:2.0.4 acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
# login AKS
# az cloud set --name AzureChinaCloud # Switch CLI to Azure China cloud
# az login # Log in to Azure China account (browser or device code flow)
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c #! set subs
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
kubectl config use-context aks-sales2c-ai-prd
kubectl config current-context
# Create Azure Files Volume
# kubectl create secret generic azure-files-cred \
# --from-literal=azurestorageaccountname=saaisearchlab \
# --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
# -n knowledge-agent
# kubectl delete configmap document-ai-indexer-usermanual-config -n knowledge-agent
# Deploy ConfigMap
kubectl delete configmap document-ai-indexer-usermanual-config -n knowledge-agent
kubectl create configmap document-ai-indexer-usermanual-config -n knowledge-agent --from-file=.\deploy\prd-usermanual\env.yaml --from-file=.\deploy\prd-usermanual\config.yaml --from-file=prompt.yaml
# Deploy Pod
# kubectl create namespace knowledge-agent
# kubectl delete pod document-ai-indexer-usermanual -n knowledge-agent
kubectl apply -f .\deploy\prd-usermanual\document-ai-indexer-usermanual.yml -n knowledge-agent
# Monitor Pod
kubectl logs -f document-ai-indexer-usermanual -n knowledge-agent
# Deploy CronJob
kubectl apply -f deploy/prd-usermanual/document-ai-indexer-cronjob.yml --namespace knowledge-agent
# Check CronJob Status
kubectl get cronjobs -n knowledge-agent
# Check Job Execution History
kubectl get jobs -n knowledge-agent
###########
# Manually trigger a job (for testing)
kubectl delete job manual-test -n knowledge-agent
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
# Check Job Logs
kubectl logs -f job/manual-test -n knowledge-agent

View File

@@ -0,0 +1,64 @@
apiVersion: batch/v1
kind: CronJob
metadata:
name: document-ai-indexer-cronjob
spec:
# Scheduling configuration - execute every 10 minutes
schedule: "*/10 * * * *"
# Concurrency policy: Disable concurrent execution. If the previous job is still running, new execution will be skipped.
concurrencyPolicy: Forbid
  # Successful jobs history limit: Keep the last 10 successful job records.
  successfulJobsHistoryLimit: 10
  # Failed jobs history limit: Keep the last 10 failed job records.
  failedJobsHistoryLimit: 10
# Job template
jobTemplate:
spec:
backoffLimit: 0
template:
metadata:
labels:
app: document-ai-indexer
job-type: cronjob
spec:
restartPolicy: Never
volumes:
# 1. ConfigMap volume
- name: config-volume
configMap:
name: document-ai-indexer-config
items:
- key: env.yaml
path: env.yaml
- key: config.yaml
path: config.yaml
# 2. Azure File Share volume
- name: data-volume
azureFile:
                secretName: azure-files-cred # References the Secret you created
shareName: fs-document-ai-indexer # Your file share name
readOnly: false # Write permission
containers:
- name: document-ai-indexer
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
imagePullPolicy: Always
# Mount the volume into the container
volumeMounts:
# ConfigMap Mount
- name: config-volume
mountPath: /app/env.yaml
subPath: env.yaml
- name: config-volume
mountPath: /app/config.yaml
subPath: config.yaml
# Azure File Shared mount
- name: data-volume
mountPath: /app/run_tmp # Program write/read directory

View File

@@ -0,0 +1,47 @@
apiVersion: v1
kind: Pod
metadata:
name: document-ai-indexer-usermanual
spec:
restartPolicy: Never
volumes:
# 1. ConfigMap volume
- name: config-volume
configMap:
name: document-ai-indexer-usermanual-config
items:
- key: env.yaml
path: env.yaml
- key: config.yaml
path: config.yaml
- key: prompt.yaml
path: prompt.yaml
# 2. Azure File Share volume
- name: data-volume
azureFile:
        secretName: azure-files-cred # References the Secret you created
shareName: fs-document-ai-indexer # Your file share name
readOnly: false
containers:
- name: document-ai-indexer-usermanual
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
imagePullPolicy: Always
# Mount the volume into the container
volumeMounts:
# ConfigMap Mount
- name: config-volume
mountPath: /app/env.yaml
subPath: env.yaml
- name: config-volume
mountPath: /app/config.yaml
subPath: config.yaml
- name: config-volume
mountPath: /app/prompt.yaml
subPath: prompt.yaml
# Azure File Share Mount
- name: data-volume
mountPath: /app/run_tmp # Directory for program read/write

View File

@@ -0,0 +1,10 @@
# login AKS
# az cloud set -n AzureChinaCloud
# az login
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
kubectl config use-context aks-sales2c-ai-prd
kubectl config current-context
# kubectl create namespace knowledge-agent
kubectl apply -f embedding-api-proxy_k8s.yml -n knowledge-agent

View File

@@ -0,0 +1,39 @@
# Service resource: map the external domain to an in-cluster Service
apiVersion: v1
kind: Service
metadata:
name: itpai-backend
spec:
type: ExternalName
externalName: itpai.infer.api.vgcserv.com.cn
ports:
- port: 443
protocol: TCP
targetPort: 443
---
# Ingress resource: proxy the /v1-openai path to the Service above
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: itpai-proxy
annotations:
kubernetes.io/ingress.class: nginx
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
nginx.ingress.kubernetes.io/proxy-ssl-server-name: "on"
nginx.ingress.kubernetes.io/proxy-ssl-verify: "off"
nginx.ingress.kubernetes.io/upstream-vhost: "itpai.infer.api.vgcserv.com.cn"
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
nginx.ingress.kubernetes.io/proxy-send-timeout: "30"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "5"
spec:
rules:
- host: sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn
http:
paths:
- path: /v1-openai
pathType: Prefix
backend:
service:
name: itpai-backend
port:
number: 443

View File

@@ -0,0 +1,42 @@
config: config.yaml
njobs: 12
search_service_name: https://search-sales2c-ai-prd.search.azure.cn
search_admin_key: ev6B0OtF66WkDmQKJBa4n1Haa8e8p8N3zdaEBnbWtoAzSeAMWSid
embedding_model_endpoint: http://sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn/v1-openai/embeddings
embedding_model_key: gpustack_0e3d5b35adaf239b_99adacd6f540c7d81006365c8030b16c
VECTOR_DIMENSION: 4096
FLAG_AOAI: "V3"
FLAG_EMBEDDING_MODEL: qwen3-embedding-8b
extract_method: di+vision-llm
form_rec_resource: https://di-sales2c-ai-prd.cognitiveservices.azure.cn/
form_rec_key: G0vhH3twd5K3YYCgfnttf5V6XTMMU4PMdVvRHsgaTb8kZDoU8ZHjJQQJ99BDAEHpCsCfT1gyAAALACOGmOcn
di-Formulas: false
di-hiRes: true
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
FIGURE_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/extracted-image-cat-prd?sp=racwdl&st=2025-08-04T06:34:42Z&se=2035-08-04T14:49:42Z&spr=https&sv=2024-11-04&sr=c&sig=t0DTjfht%2FNaPlXUtxhKr40NzZY5kWovgNxJUeAepvgA%3D
DI_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/di-result-cat-prd?sp=racwdl&st=2025-08-04T06:34:11Z&se=2035-08-04T14:49:11Z&spr=https&sv=2024-11-04&sr=c&sig=26wxy5M9lcIO2o9zzr6jOtdw2gQTZnGmampHx5EyXbo%3D
DB_URI: postgresql://pgadmin:vwb54pSQDp8vYkusKms@pg-sales2c-ai-prd.postgres.database.chinacloudapi.cn/document-ai-indexer
# Image understanding
figure_caption:
  include_di_content: false # Whether to include the DI-extracted figure content
  description_gen_max_images: 0 # Maximum number of images to describe; 0 disables descriptions
model_endpoint: null
model_key: null
  model: null # leave null for Azure OpenAI
  azure_deployment: gpt-4o # Azure OpenAI deployment name; leave empty for other platforms
  api_version: 2024-08-01-preview # Azure OpenAI API version; leave empty for other platforms
header_fix: true

View File

@@ -0,0 +1,103 @@
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-prd?sp=rl&st=2025-08-02T08:25:56Z&se=2125-08-02T16:40:56Z&spr=https&sv=2024-11-04&sr=c&sig=lJui2%2BOs8V%2BdzCkjchQCR7ITWT28tJ0HAq8bIhkkM%2Bk%3D"
datasource_name: "cat-standard-regulation-prd"
data_dir: ""
base_path: "/app/run_tmp"
process_file_num: 0
process_file_last_modify: "2025-06-24 00:00:00"
chunk_size: 2048
token_overlap: 256
index_schemas:
- index_name: "index-catonline-chunk-v2-prd"
data_type: ["metadata", "document", "chunk"]
# field_type: "append"
upload_batch_size: 50
fields: ["doc_metadata", "full_metadata_vector", "filepath", "timestamp", "title", "publisher", "publish_date", "document_category", "document_code", "language_code", "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status", "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code", "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type", "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date", "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN", "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date", "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by", "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN","x_Replaced_Standard"]
full_metadata_vector_fields: ["full_headers", "doc_metadata"] #todo check
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
vector_fields:
- field: "contentVector"
append_fields: ["content"]
- field: "full_metadata_vector"
append_fields: ["full_headers", "doc_metadata"]
- index_name: "index-catonline-document-v2-prd"
data_type: ["document", "metadata"]
# field_type: "full"
key_fields: ["filepath"]
upload_batch_size: 1
fields: ["doc_metadata", "full_metadata_vector", "url", "metadata", "image_mapping", "document_schema", "main_title", "filepath", "timestamp", "title", "publisher", "publish_date", "document_category", "document_code", "language_code", "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status", "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code", "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type", "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date", "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN", "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date", "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by", "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN","x_Replaced_Standard"]
merge_content_fields: ["content"]
full_metadata_vector_fields: ["doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
- index_name: "index-catonline-standard-regulation-v2-prd"
data_type: ["metadata"]
# field_type: "full"
key_fields: ["x_Standard_Regulation_Id"]
upload_batch_size: 1
fields: ["doc_metadata", "full_metadata_vector", "filepath", "timestamp", "title", "publisher", "publish_date", "document_category", "document_code", "language_code", "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status", "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code", "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type", "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date", "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN", "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date", "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by", "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN","x_Replaced_Standard"]
vector_config_name: "vectorSearchProfile"
full_metadata_vector_fields: ["doc_metadata"]
semantic_config_name: "default"
update_by_field: "x_Standard_Regulation_Id"
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
merge_fields:
- key: "doc_metadata"
fields: ["title", "publisher", "document_category", "document_code", "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Kind", "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type", "x_Standard_CCS", "x_Standard_ICS", "x_Regulation_Status", "x_Regulation_Title_CN", "x_Regulation_Title_EN", "x_Classification", "x_Work_Group", "status", "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN","x_Replaced_Standard"]
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-abroad-prd?sp=rl&st=2025-09-08T05:32:13Z&se=2099-09-08T13:47:13Z&sv=2024-11-04&sr=c&sig=ebYoiKrSwCk12cRnQqov197LvuBv7m%2FxNoQv4VDMY5o%3D"
datasource_name: "cat-standard-regulation-oversea"
data_dir: ""
base_path: "/app/run_tmp"
process_file_num: 0
process_file_last_modify: "2025-06-24 00:00:00"
chunk_size: 2048
token_overlap: 256
index_schemas:
- index_name: "index-catonline-chunk-oversea"
data_type: ["metadata", "document", "chunk"]
upload_batch_size: 50
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
vector_fields:
- field: "contentVector"
append_fields: ["content"]
- field: "full_metadata_vector"
append_fields: ["full_headers", "doc_metadata"]
- index_name: "index-catonline-document-oversea"
data_type: ["document", "metadata"]
key_fields: ["filepath"]
upload_batch_size: 1
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
merge_content_fields: ["content"]
full_metadata_vector_fields: ["doc_metadata"]
semantic_config_name: "default"
vector_config_name: "vectorSearchProfile"
update_by_field: "filepath"
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
- index_name: "index-catonline-standard-regulation-oversea"
data_type: ["metadata"]
key_fields: ["standard_Id"]
upload_batch_size: 1
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
vector_config_name: "vectorSearchProfile"
full_metadata_vector_fields: ["doc_metadata"]
semantic_config_name: "default"
update_by_field: "standard_Id"
vector_fields:
- field: "full_metadata_vector"
append_fields: ["doc_metadata"]
merge_fields:
- key: "doc_metadata"
fields: ["file_Name","entity_Attribute","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","version_Name","version_Parent_Name","technical_Series_No","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str " ]

View File

@@ -0,0 +1,50 @@
# docker build
docker login acrsales2caiprd.azurecr.cn -u username -p password
docker build . -t document-ai-indexer:2.0.1
docker tag document-ai-indexer:2.0.1 acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
# login AKS
# az cloud set --name AzureChinaCloud # Switch CLI to Azure China cloud
# az login # Log in to Azure China account (browser or device code flow)
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c #! set subs
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
kubectl config use-context aks-sales2c-ai-prd
kubectl config current-context
# Create Azure Files Volume
# kubectl create secret generic azure-files-cred \
# --from-literal=azurestorageaccountname=saaisearchlab \
# --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
# -n knowledge-agent
# kubectl delete configmap document-ai-indexer-config -n knowledge-agent
# Deploy ConfigMap
kubectl delete configmap document-ai-indexer-config -n knowledge-agent
kubectl create configmap document-ai-indexer-config -n knowledge-agent --from-file=.\deploy\prd\env.yaml --from-file=.\deploy\prd\config.yaml --from-file=prompt.yaml
# Deploy Pod
# kubectl create namespace knowledge-agent
# kubectl delete pod document-ai-indexer -n knowledge-agent
kubectl apply -f document-ai-indexer_k8s.yml -n knowledge-agent
# Monitor Pod
kubectl logs -f document-ai-indexer -n knowledge-agent
# Deploy CronJob
kubectl apply -f ./deploy/prd/document-ai-indexer-cronjob.yml --namespace knowledge-agent
# Check CronJob Status
kubectl get cronjobs -n knowledge-agent
# Check Job Execution History
kubectl get jobs -n knowledge-agent
###########
# Manually trigger a job (for testing)
kubectl delete job manual-test -n knowledge-agent
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
# Check Job Logs
kubectl logs -f job/manual-test -n knowledge-agent

View File

@@ -0,0 +1,69 @@
apiVersion: batch/v1
kind: CronJob
metadata:
name: document-ai-indexer-cronjob
spec:
# Scheduling configuration - execute every 10 minutes
schedule: "*/10 * * * *"
# Concurrency policy: Disable concurrent execution. If the previous job is still running, new execution will be skipped.
concurrencyPolicy: Forbid
  # Successful jobs history limit: Keep the last 10 successful job records.
  successfulJobsHistoryLimit: 10
  # Failed jobs history limit: Keep the last 10 failed job records.
  failedJobsHistoryLimit: 10
# Job template
jobTemplate:
spec:
backoffLimit: 0
template:
metadata:
labels:
app: document-ai-indexer
job-type: cronjob
spec:
restartPolicy: Never
volumes:
# 1. ConfigMap volume
- name: config-volume
configMap:
name: document-ai-indexer-config
items:
- key: env.yaml
path: env.yaml
- key: config.yaml
path: config.yaml
- key: prompt.yaml
path: prompt.yaml
# 2. Azure File Share volume
- name: data-volume
azureFile:
                secretName: azure-files-cred # References the Secret you created
shareName: fs-document-ai-indexer # Your file share name
readOnly: false # Write permission
containers:
- name: document-ai-indexer
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
imagePullPolicy: Always
# Mount the volume into the container
volumeMounts:
# ConfigMap Mount
- name: config-volume
mountPath: /app/env.yaml
subPath: env.yaml
- name: config-volume
mountPath: /app/config.yaml
subPath: config.yaml
- name: config-volume
mountPath: /app/prompt.yaml
subPath: prompt.yaml
# Azure File Shared mount
- name: data-volume
mountPath: /app/run_tmp # Program write/read directory

View File

@@ -0,0 +1,47 @@
apiVersion: v1
kind: Pod
metadata:
name: document-ai-indexer
spec:
restartPolicy: Never
volumes:
# 1. ConfigMap volume
- name: config-volume
configMap:
name: document-ai-indexer-config
items:
- key: env.yaml
path: env.yaml
- key: config.yaml
path: config.yaml
- key: prompt.yaml
path: prompt.yaml
# 2. Azure File Share volume
- name: data-volume
azureFile:
        secretName: azure-files-cred # References the Secret you created
shareName: fs-document-ai-indexer # Your file share name
readOnly: false
containers:
- name: document-ai-indexer
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.2
imagePullPolicy: Always
# Mount the volume into the container
volumeMounts:
# ConfigMap Mount
- name: config-volume
mountPath: /app/env.yaml
subPath: env.yaml
- name: config-volume
mountPath: /app/config.yaml
subPath: config.yaml
- name: config-volume
mountPath: /app/prompt.yaml
subPath: prompt.yaml
# Azure File Share Mount
- name: data-volume
mountPath: /app/run_tmp # Directory for program read/write

View File

@@ -0,0 +1,10 @@
# login AKS
# az cloud set -n AzureChinaCloud
# az login
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
kubectl config use-context aks-sales2c-ai-prd
kubectl config current-context
# kubectl create namespace knowledge-agent
kubectl apply -f embedding-api-proxy_k8s.yml -n knowledge-agent

View File

@@ -0,0 +1,39 @@
# Service resource: map the external domain to an in-cluster Service
apiVersion: v1
kind: Service
metadata:
name: itpai-backend
spec:
type: ExternalName
externalName: itpai.infer.api.vgcserv.com.cn
ports:
- port: 443
protocol: TCP
targetPort: 443
---
# Ingress resource: proxy the /v1-openai path to the Service above
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: itpai-proxy
annotations:
kubernetes.io/ingress.class: nginx
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
nginx.ingress.kubernetes.io/proxy-ssl-server-name: "on"
nginx.ingress.kubernetes.io/proxy-ssl-verify: "off"
nginx.ingress.kubernetes.io/upstream-vhost: "itpai.infer.api.vgcserv.com.cn"
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
nginx.ingress.kubernetes.io/proxy-send-timeout: "30"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "5"
spec:
rules:
- host: sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn
http:
paths:
- path: /v1-openai
pathType: Prefix
backend:
service:
name: itpai-backend
port:
number: 443

View File

@@ -0,0 +1,36 @@
config: config.yaml
njobs: 12
search_service_name: https://search-sales2c-ai-prd.search.azure.cn
search_admin_key: ev6B0OtF66WkDmQKJBa4n1Haa8e8p8N3zdaEBnbWtoAzSeAMWSid
embedding_model_endpoint: http://sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn/v1-openai/embeddings
embedding_model_key: gpustack_0e3d5b35adaf239b_99adacd6f540c7d81006365c8030b16c
VECTOR_DIMENSION: 4096
FLAG_AOAI: "V3"
FLAG_EMBEDDING_MODEL: qwen3-embedding-8b
extract_method: di+vision-llm
form_rec_resource: https://di-sales2c-ai-prd.cognitiveservices.azure.cn/
form_rec_key: G0vhH3twd5K3YYCgfnttf5V6XTMMU4PMdVvRHsgaTb8kZDoU8ZHjJQQJ99BDAEHpCsCfT1gyAAALACOGmOcn
di-Formulas: true
di-hiRes: true
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
FIGURE_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/extracted-image-cat-prd?sp=racwdl&st=2025-08-04T06:34:42Z&se=2035-08-04T14:49:42Z&spr=https&sv=2024-11-04&sr=c&sig=t0DTjfht%2FNaPlXUtxhKr40NzZY5kWovgNxJUeAepvgA%3D
DI_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/di-result-cat-prd?sp=racwdl&st=2025-08-04T06:34:11Z&se=2035-08-04T14:49:11Z&spr=https&sv=2024-11-04&sr=c&sig=26wxy5M9lcIO2o9zzr6jOtdw2gQTZnGmampHx5EyXbo%3D
DB_URI: postgresql://pgadmin:vwb54pSQDp8vYkusKms@pg-sales2c-ai-prd.postgres.database.chinacloudapi.cn/document-ai-indexer
# Image understanding
figure_caption:
  include_di_content: false # Whether to include the DI-extracted figure content
  description_gen_max_images: 0 # Maximum number of images to describe; 0 disables descriptions
model_endpoint: null
model_key: null
  model: null # leave null for Azure OpenAI
  azure_deployment: null # Azure OpenAI deployment name; leave empty for other platforms
  api_version: null # Azure OpenAI API version; leave empty for other platforms
header_fix: true

View File

@@ -0,0 +1,215 @@
import json
import os
import re
import time
from pathlib import Path
from urllib.parse import urlparse, urlunparse
import base64
import uuid
from openai import AzureOpenAI
from azure.storage.blob import ContainerClient
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentContentFormat, AnalyzeResult, \
DocumentAnalysisFeature, AnalyzeOutputOption, DocumentSpan
from entity_models import DiResult, Document, FigureFlat
from utils import TOKEN_ESTIMATOR, custom_serializer, resize_image, file_rename
from resilient_http_pool import get_ai_inference_client
def di_extract(source_file_path:str, di_client: DocumentIntelligenceClient, directory_path:str, figure_sas_url:str, language:str="zh-Hans") -> DiResult:
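    """Run Azure Document Intelligence layout analysis on the given file, persist the markdown output under .extracted/, and return a DiResult with the extracted figures."""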
di_features:list[str|DocumentAnalysisFeature] = []
allow_features_exts: list[str] = os.getenv("di_allow_features_ext", "").lower().split(';')
# get file name from source_file_path without extension
file_name = os.path.basename(source_file_path)
di_source_file_path = source_file_path
    # Supported inputs: PDF, JPEG/JPG, PNG, BMP, TIFF, HEIF
file_ext: str = (source_file_path.split('.')[-1] if '.' in source_file_path.split('/')[-1] else '' ).lower()
if file_ext in ['jpg', 'jpeg', 'jpe', 'jfif', 'pjpeg', 'pjp', 'png', 'gif', 'webp', 'tif', 'tiff', 'bmp', 'dib', 'heif', 'heic', 'avif', 'apng', 'svg']:
di_source_file_path = resize_image(source_file_path)
# doc to docx
di_source_file_path = file_rename(di_source_file_path)
if os.getenv("di-hiRes",'').lower() == "true" and file_ext in allow_features_exts:
di_features.append(DocumentAnalysisFeature.OCR_HIGH_RESOLUTION)
if os.getenv("di-Formulas",'').lower() == "true" and file_ext in allow_features_exts:
di_features.append(DocumentAnalysisFeature.FORMULAS)
print(f"di_features: {di_features},file_path:{file_name}")
with open(di_source_file_path, "rb") as file:
poller = di_client.begin_analyze_document(model_id="prebuilt-layout", body=file,
features=di_features, output_content_format=DocumentContentFormat.MARKDOWN, output=[AnalyzeOutputOption.FIGURES]) # type: ignore
result: AnalyzeResult = poller.result()
extracted_doc = Document()
source_rel_file_path = os.path.relpath(source_file_path, directory_path)
extracted_doc.filepath = source_rel_file_path
result_content: str = result.content
# The operation id is required to later query individual figures
operation_id: str = str(poller.details.get("operation_id"))
output_folder = directory_path + "/.extracted/" + file_name
os.makedirs(f"{output_folder}", exist_ok=True)
extracted_doc.content = result_content
with open(f"{output_folder}/_merged_origin.md", "w", encoding="utf-8") as doc_meta_file:
doc_meta_file.write(result_content)
# Download and process images
figures = extract_figures(di_client, result, operation_id, directory_path, file_name, figure_sas_url)
di_result:DiResult = DiResult(
figures = figures,
di_content = result_content,
filepath= source_rel_file_path,
language=language
)
return di_result
def extract_figures(di_client: DocumentIntelligenceClient, result:AnalyzeResult, result_id:str, directory_path:str, file_name:str, figure_sas_url:str)->list[FigureFlat]:
"""Extracts figures and their metadata from the analyzed result."""
figures:list[FigureFlat] = []
base_path: Path = Path(os.path.join(directory_path, ".extracted", file_name, ".images"))
base_path.mkdir(parents=True, exist_ok=True)
with open(f"{base_path}/result.json", "w", encoding="utf-8") as figures_file:
json.dump(result, figures_file, default=custom_serializer, ensure_ascii=False, indent=4)
for figure in result.figures if result.figures is not None else []:
if not any(figure.spans):
continue
span:DocumentSpan = figure.spans[0]
# Image extraction
stream = di_client.get_analyze_result_figure(model_id=result.model_id, result_id=result_id, figure_id=figure.id)
image_bytes = b"".join(list(stream))
path_image: Path = Path(os.path.join(base_path, f"figure_{figure.id}.png"))
path_image.write_bytes(image_bytes)
blob_url = upload_figure(figure_sas_url,f"figure_{figure.id}.png", image_bytes)
image_str:str = base64.b64encode(image_bytes).decode('utf-8')
        figures.append(FigureFlat(offset=span.offset, length=span.length, url=blob_url, content="", image=image_str, understand_flag=False, caption=figure.caption.content if figure.caption else ""))
return figures
# Compile once for efficiency
_specific_comments = re.compile(
r"""<!--\s* # opening
(?:PageFooter="[^"]*" # PageFooter=""
|PageNumber="[^"]*" # PageNumber=""
|PageBreak # PageBreak
|PageHeader="[^"]*") # PageHeader=""
\s*--> # closing
""",
flags=re.VERBOSE
)
def remove_specific_comments(text: str) -> str:
return _specific_comments.sub('', text)
def retry_get_embedding(text: str, embedding_model_key:str, embedding_endpoint:str,min_chunk_size:int=10,retry_num:int = 3):
""" Retries getting embedding for the provided text until it succeeds or reaches the retry limit."""
full_metadata_size = TOKEN_ESTIMATOR.estimate_tokens(text)
if full_metadata_size >= min_chunk_size:
for i in range(retry_num):
try:
return get_embedding(text, embedding_model_key=embedding_model_key,embedding_model_endpoint=embedding_endpoint)
except Exception as e:
print(f"Error getting embedding for full_metadata_vector with error={e}, retrying, currently at {i + 1} retry, {retry_num - (i + 1)} retries left")
time.sleep(10)
raise Exception(f"Error getting embedding for full_metadata_vector={text}")
return None
def get_embedding(text:str, embedding_model_endpoint:str="", embedding_model_key:str="", azure_credential=None):
endpoint = embedding_model_endpoint if embedding_model_endpoint else os.environ.get("EMBEDDING_MODEL_ENDPOINT")
FLAG_EMBEDDING_MODEL = os.getenv("FLAG_EMBEDDING_MODEL", "AOAI")
FLAG_COHERE = os.getenv("FLAG_COHERE", "ENGLISH")
FLAG_AOAI = os.getenv("FLAG_AOAI", "V3")
if azure_credential is None and (endpoint is None or embedding_model_key is None):
raise Exception("EMBEDDING_MODEL_ENDPOINT and EMBEDDING_MODEL_KEY are required for embedding")
try:
if FLAG_EMBEDDING_MODEL == "AOAI":
endpoint_parts = endpoint.split("/openai/deployments/")
base_url = endpoint_parts[0]
deployment_id = endpoint_parts[1].split("/embeddings")[0]
api_version = endpoint_parts[1].split("api-version=")[1].split("&")[0]
if azure_credential is not None:
api_key = azure_credential.get_token("https://cognitiveservices.azure.com/.default").token
else:
api_key = embedding_model_key if embedding_model_key else os.getenv("AZURE_OPENAI_API_KEY")
client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, api_key=api_key)
if FLAG_AOAI == "V2":
embeddings = client.embeddings.create(model=deployment_id, input=text, timeout=120)
elif FLAG_AOAI == "V3":
embeddings = client.embeddings.create(model=deployment_id,
input=text,
dimensions=int(os.getenv("VECTOR_DIMENSION", 1536)), timeout=120)
return embeddings.model_dump()['data'][0]['embedding']
if FLAG_EMBEDDING_MODEL == "COHERE":
raise Exception("COHERE is not supported for now")
# if FLAG_COHERE == "MULTILINGUAL":
# key = embedding_model_key if embedding_model_key else os.getenv("COHERE_MULTILINGUAL_API_KEY")
# elif FLAG_COHERE == "ENGLISH":
# key = embedding_model_key if embedding_model_key else os.getenv("COHERE_ENGLISH_API_KEY")
# data, headers = get_payload_and_headers_cohere(text, key)
# with httpx.Client() as client:
# response = client.post(endpoint, json=data, headers=headers)
# result_content = response.json()
# return result_content["embeddings"][0]
if FLAG_EMBEDDING_MODEL:
headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {embedding_model_key}' }
data = { "model": FLAG_EMBEDDING_MODEL, "input": text }
client = get_ai_inference_client()
response = client.post(endpoint, json=data, headers=headers)
result_content = response.json()
return result_content["data"][0]["embedding"]
except Exception as e:
print(f"Error getting embeddings with endpoint={endpoint} with error={e}")
raise Exception(f"Error getting embeddings with endpoint={endpoint} with error={e}")
def upload_figure(blob_sas_url: str, orgin_file_name: str, data: bytes) -> str:
for i in range(3):
try:
# Upload image to Azure Blob
fileName = generate_filename()
container_client = ContainerClient.from_container_url(blob_sas_url)
blob = container_client.upload_blob(name=f"{fileName}.png", data=data)
return urlunparse(urlparse(blob.url)._replace(query='', fragment=''))
except Exception as e:
print(
f"Error uploading figure with error={e}, retrying, currently at {i + 1} retry, {3 - (i + 1)} retries left")
time.sleep(3)
raise Exception(f"Error uploading figure for: {orgin_file_name}")
def generate_filename(length:int=8):
    """Generate a unique filename: a 6-hex-digit millisecond-timestamp prefix plus `length` hex characters from a UUID."""
t = int(time.time() * 1000) % 1000000
base = uuid.uuid4().hex[:length]
return f"{t:06x}{base}"

View File

@@ -0,0 +1,841 @@
# Document AI Indexer - Design Document
## Overview
The Document AI Indexer is an intelligent document processing and indexing system built on Azure AI services. It provides comprehensive document extraction, processing, and vectorized indexing capabilities for multiple document formats, enabling advanced search and retrieval functionality.
### Design Philosophy
The system is designed with several key principles in mind:
**Modularity and Separation of Concerns**: The architecture follows a layered approach with clear separation between application logic, business logic, service layer, and data access. This ensures maintainability and allows for easy testing and modification of individual components.
**Scalability and Performance**: Built with asynchronous processing capabilities and horizontal scaling in mind. The system can handle large volumes of documents through configurable parallel processing and efficient resource utilization.
**Resilience and Fault Tolerance**: Implements comprehensive error handling, retry mechanisms, and graceful degradation to ensure reliable operation even when external services experience issues.
**Configuration-Driven Architecture**: Utilizes YAML-based configuration management that allows for flexible deployment across different environments without code changes.
**Cloud-Native Design**: Leverages Azure services for AI processing, storage, and search capabilities while maintaining vendor independence through abstraction layers.
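To make the configuration-driven approach concrete, the sketch below loads a few keys from `env.yaml` into a typed settings object with PyYAML. It is a minimal illustration rather than the project's actual loader: the `RuntimeSettings` dataclass and `load_settings` helper are hypothetical names, while the keys themselves (`search_service_name`, `embedding_model_endpoint`, `VECTOR_DIMENSION`, `njobs`) come from the sample environment files in this repository.
```python
from dataclasses import dataclass
from pathlib import Path

import yaml  # PyYAML


@dataclass
class RuntimeSettings:
    """Small, typed view over a few env.yaml keys; the real application reads many more."""
    search_service_name: str
    embedding_model_endpoint: str
    vector_dimension: int
    njobs: int


def load_settings(env_path: str = "env.yaml") -> RuntimeSettings:
    """Parse the YAML environment file and map selected keys onto the settings object."""
    raw = yaml.safe_load(Path(env_path).read_text(encoding="utf-8"))
    return RuntimeSettings(
        search_service_name=raw["search_service_name"],
        embedding_model_endpoint=raw["embedding_model_endpoint"],
        vector_dimension=int(raw.get("VECTOR_DIMENSION", 1536)),
        njobs=int(raw.get("njobs", 1)),
    )


if __name__ == "__main__":
    print(load_settings())
```
Because every environment ships its own `env.yaml`/`config.yaml` pair through a ConfigMap, the same container image can be promoted from dev to prd without a rebuild.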
## Features
### 🚀 Core Features
- **Multi-format Document Support**: Handles PDF, DOCX, images (JPEG, PNG, TIFF, etc.), and other document formats
- **Intelligent Content Extraction**: Leverages Azure Document Intelligence for OCR and structured data extraction
- **Smart Document Chunking**: Implements hierarchy-aware chunking with configurable token limits and overlap
- **Vector Search Integration**: Automatic Azure AI Search index creation and document vectorization
- **Metadata Management**: Complete extraction and management of document metadata and custom fields
- **Hierarchy Structure Repair**: Automatic correction of title hierarchy structure in Markdown documents
- **Figure and Formula Extraction**: Advanced extraction of visual elements and mathematical formulas
### 🔧 Technical Features
- **Asynchronous Processing**: High-performance async processing using asyncio and task queues
- **Containerized Deployment**: Complete Docker and Kubernetes support with configurable environments
- **Configuration Management**: Flexible YAML-based configuration for different deployment scenarios
- **Database Support**: SQLAlchemy ORM with support for multiple database backends
- **Resilient Processing**: Built-in retry mechanisms, error handling, and fault tolerance
- **Monitoring & Logging**: Comprehensive logging, progress monitoring, and processing statistics
- **Scalable Architecture**: Horizontal scaling support through containerization and task distribution
## System Architecture
The Document AI Indexer follows a multi-layered architecture designed for scalability, maintainability, and robust error handling. The system processes documents through a well-defined pipeline that transforms raw documents into searchable, vectorized content.
### Architectural Patterns
**Service Factory Pattern**: The system uses a centralized ServiceFactory to manage dependencies and service creation. This pattern ensures consistent configuration across all services and enables easy testing through dependency injection.
**Repository Pattern**: Data access is abstracted through repository interfaces, allowing for different storage backends and simplified testing with mock implementations.
**Command Pattern**: Document processing tasks are encapsulated as commands that can be queued, retried, and executed asynchronously.
**Pipeline Pattern**: The document processing workflow follows a clear pipeline with distinct stages: extraction, hierarchy fixing, chunking, vectorization, and indexing.
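As a minimal sketch of the Service Factory pattern described above, the example below registers services as builder callables over a shared configuration dict and creates them lazily. The `register`/`get` interface shown here is illustrative and not necessarily the project's actual API.
```python
from typing import Callable, Dict


class ServiceFactory:
    """Toy service factory: builders share one configuration dict and instances are cached."""

    def __init__(self, config: dict):
        self._config = config
        self._builders: Dict[str, Callable[[dict], object]] = {}
        self._instances: Dict[str, object] = {}

    def register(self, name: str, builder: Callable[[dict], object]) -> None:
        """Register a builder; nothing is constructed until the service is first requested."""
        self._builders[name] = builder

    def get(self, name: str) -> object:
        """Create the service on first use (lazy), then return the cached instance."""
        if name not in self._instances:
            self._instances[name] = self._builders[name](self._config)
        return self._instances[name]


# Usage sketch with stand-in builders for chunking and indexing services.
factory = ServiceFactory({"chunk_size": 2048, "token_overlap": 256})
factory.register("chunker", lambda cfg: f"ChunkService(size={cfg['chunk_size']})")
factory.register("indexer", lambda cfg: "AzureIndexService()")
print(factory.get("chunker"))
```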
### High-Level Architecture
The high-level architecture represents a distributed, service-oriented system designed for scalable document processing and intelligent content extraction. The architecture emphasizes separation of concerns, fault tolerance, and cloud-native principles to handle enterprise-scale document processing workloads.
#### Architectural Overview
**Multi-Layered Design**: The system is organized into distinct functional layers that separate data ingestion, processing logic, AI services, and storage concerns. This layered approach enables independent scaling, testing, and maintenance of different system components.
**Service-Oriented Architecture**: Each major functional area is implemented as a distinct service or component group, enabling independent deployment, scaling, and maintenance. Services communicate through well-defined interfaces and can be replaced or upgraded independently.
**Cloud-Native Integration**: The architecture leverages Azure cloud services for AI processing, storage, and search capabilities while maintaining abstraction layers that enable portability and testing flexibility.
**Event-Driven Processing**: The system follows an event-driven model where document processing is triggered by events (new documents, configuration changes, etc.) and progresses through a series of processing stages with clear state transitions.
#### System Components and Responsibilities
**Data Sources Layer**: Manages document ingestion from various sources including Azure Blob Storage and local file systems. This layer handles authentication, access control, and metadata extraction from source systems. It provides a unified interface for document discovery regardless of the underlying storage mechanism.
**Processing Engine Layer**: Orchestrates the entire document processing workflow through a hierarchical task management system. The Main Application serves as the central coordinator, while the Task Processor manages work distribution and the Document Task Processor handles individual document processing operations with full state tracking and error recovery.
**AI Services Layer**: Provides intelligent document processing capabilities through integration with Azure AI services and optional Vision LLM systems. These services handle complex operations like OCR, layout analysis, content extraction, and embedding generation. The modular design allows for easy integration of additional AI services or replacement of existing ones.
**Processing Pipeline Layer**: Implements the core document transformation logic through a series of processing stages. Each stage has specific responsibilities: content extraction converts raw documents to structured text, hierarchy fixing normalizes document structure, chunking creates manageable content segments, and vector generation produces searchable embeddings.
**Storage & Search Layer**: Manages persistent data storage and search capabilities through a combination of relational database storage for metadata and state management, Azure AI Search for vector-based content search, and blob storage for processed content and temporary files.
#### Data Flow and Integration Patterns
**Asynchronous Processing Flow**: Documents flow through the system asynchronously, enabling high throughput and efficient resource utilization. Each processing stage can operate independently, with clear handoff points and state persistence between stages.
**Fault-Tolerant Design**: The architecture includes comprehensive error handling and recovery mechanisms at every level. Failed operations are tracked, logged, and can be retried with exponential backoff. The system maintains processing state to enable recovery from failures without losing work.
**Scalability Patterns**: The architecture supports both vertical and horizontal scaling through stateless processing components, connection pooling, and queue-based work distribution. Different components can be scaled independently based on their specific resource requirements and bottlenecks.
**Configuration-Driven Behavior**: The system behavior is largely controlled through configuration rather than code changes, enabling flexible deployment across different environments and use cases without requiring code modifications or redeployment.
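The sketch below illustrates the queue-based, asynchronous work distribution described above using plain `asyncio`. Here `process_document` is a stand-in for the real per-document pipeline, and the concurrency value is arbitrary (the production configuration controls parallelism via `njobs`).
```python
import asyncio
from typing import Iterable


async def process_document(path: str) -> str:
    """Stand-in for the real pipeline (extract -> fix hierarchy -> chunk -> index)."""
    await asyncio.sleep(0.1)  # simulate I/O-bound calls to Azure services
    return f"indexed {path}"


async def worker(queue: "asyncio.Queue[str]", results: list[str]) -> None:
    """Pull document paths from the shared queue until cancelled."""
    while True:
        path = await queue.get()
        try:
            results.append(await process_document(path))
        finally:
            queue.task_done()


async def run(paths: Iterable[str], concurrency: int = 4) -> list[str]:
    queue: "asyncio.Queue[str]" = asyncio.Queue()
    for p in paths:
        queue.put_nowait(p)
    results: list[str] = []
    workers = [asyncio.create_task(worker(queue, results)) for _ in range(concurrency)]
    await queue.join()   # block until every queued document has been processed
    for w in workers:
        w.cancel()       # workers loop forever, so stop them explicitly
    return results


if __name__ == "__main__":
    print(asyncio.run(run([f"doc_{i}.pdf" for i in range(10)])))
```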
```mermaid
graph TB
subgraph "Data Sources"
DS[Document Sources<br/>Azure Blob Storage/Local Files]
META[Metadata<br/>Configuration]
end
subgraph "Processing Engine"
MAIN[Main Application<br/>Orchestrator]
TP[Task Processor<br/>Queue Management]
DTP[Document Task<br/>Processor]
end
subgraph "AI Services"
ADI[Azure Document<br/>Intelligence]
EMBED[Embedding<br/>Service]
VLLM[Vision LLM<br/>Optional]
end
subgraph "Processing Pipeline"
EXTRACT[Content<br/>Extraction]
HIERARCHY[Hierarchy<br/>Fix]
CHUNK[Document<br/>Chunking]
VECTOR[Vector<br/>Generation]
end
subgraph "Storage & Search"
DB[(Database<br/>SQLAlchemy)]
AAS[Azure AI Search<br/>Index]
BLOB[Azure Blob<br/>Storage]
end
DS --> MAIN
META --> MAIN
MAIN --> TP
TP --> DTP
DTP --> EXTRACT
EXTRACT --> ADI
EXTRACT --> VLLM
ADI --> HIERARCHY
HIERARCHY --> CHUNK
CHUNK --> VECTOR
VECTOR --> EMBED
DTP --> DB
VECTOR --> AAS
EXTRACT --> BLOB
style DS fill:#e1f5fe
style AI fill:#f3e5f5
style STORAGE fill:#e8f5e8
```
### Component Architecture
The component architecture illustrates the internal structure and dependencies between different layers of the system. Each layer has specific responsibilities and communicates through well-defined interfaces.
**Application Layer**: Handles application initialization, configuration loading, and high-level orchestration. The ApplicationContext manages the overall application state and provides access to configuration and services.
**Business Layer**: Contains the core business logic for document processing. The DocumentProcessingOrchestrator coordinates the entire processing workflow, while the DocumentProcessor handles individual document processing tasks.
**Service Layer**: Provides abstracted access to external services and resources. The ServiceFactory manages service creation and configuration, ensuring consistent behavior across the application.
**Data Layer**: Manages data persistence and retrieval through repository patterns and entity models. This layer abstracts database operations and provides a clean interface for data access.
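A minimal sketch of the repository abstraction in the Data Layer is shown below. The `IndexObject` fields and the in-memory backend are illustrative stand-ins for the SQLAlchemy-backed implementation; the point is only that processing state is read and written through one narrow interface, so the storage backend can be swapped or mocked in tests.
```python
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional, Protocol


@dataclass
class IndexObject:
    """Illustrative entity: one record per source document and its processing state."""
    filepath: str
    status: str = "pending"  # pending -> processing -> succeeded / failed
    updated_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))


class DocumentRepository(Protocol):
    def get(self, filepath: str) -> Optional[IndexObject]: ...
    def upsert(self, obj: IndexObject) -> None: ...


class InMemoryDocumentRepository:
    """In-memory stand-in for the database-backed repository."""

    def __init__(self) -> None:
        self._rows: dict[str, IndexObject] = {}

    def get(self, filepath: str) -> Optional[IndexObject]:
        return self._rows.get(filepath)

    def upsert(self, obj: IndexObject) -> None:
        obj.updated_at = datetime.now(timezone.utc)
        self._rows[obj.filepath] = obj


repo: DocumentRepository = InMemoryDocumentRepository()
repo.upsert(IndexObject(filepath="manuals/brakes.pdf", status="processing"))
print(repo.get("manuals/brakes.pdf"))
```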
```mermaid
graph LR
subgraph "Application Layer"
APP[DocumentProcessingApplication]
CTX[ApplicationContext]
CONFIG[ApplicationConfig]
end
subgraph "Business Layer"
BL[Business Layer]
ORCH[DocumentProcessingOrchestrator]
PROC[DocumentProcessor]
FACTORY[DocumentProcessingFactory]
end
subgraph "Service Layer"
SF[ServiceFactory]
DI[DocumentIntelligenceService]
CHUNK[ChunkService]
INDEX[AzureIndexService]
BLOB[BlobService]
end
subgraph "Data Layer"
DB[DatabaseInterface]
REPO[DocumentRepository]
MODELS[Entity Models]
end
APP --> BL
CTX --> CONFIG
APP --> CTX
BL --> SF
ORCH --> PROC
FACTORY --> ORCH
SF --> DI
SF --> CHUNK
SF --> INDEX
SF --> BLOB
PROC --> DB
DB --> REPO
REPO --> MODELS
style APP fill:#bbdefb
style BL fill:#c8e6c9
style SF fill:#ffecb3
style DB fill:#f8bbd9
```
## Workflow
The document processing workflow is designed to handle large-scale document processing with fault tolerance and efficient resource utilization. The system processes documents asynchronously through a task-based architecture.
### Processing Strategy
**Asynchronous Task Processing**: Documents are processed as individual tasks that can be executed in parallel. This approach maximizes throughput and allows for efficient resource utilization across multiple processing nodes.
**Stateful Processing**: Each document's processing state is tracked in the database, enabling recovery from failures and preventing duplicate processing. The system maintains detailed status information and processing history.
**Batch Operations**: Where possible, operations are batched to improve efficiency. This is particularly important for operations like embedding generation and search index uploads.
**Retry Logic**: Failed operations are automatically retried with exponential backoff. The system distinguishes between transient failures (which should be retried) and permanent failures (which should be logged and skipped).
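A small sketch of retry with exponential backoff, assuming transient failures can be recognized by exception type; the helper and the `flaky` demo function are hypothetical, and the actual services apply their own retry helpers around embedding and upload calls.
```python
import random
import time
from typing import Callable, Tuple, Type, TypeVar

T = TypeVar("T")


def retry_with_backoff(
    fn: Callable[[], T],
    retries: int = 3,
    base_delay: float = 2.0,
    transient: Tuple[Type[BaseException], ...] = (TimeoutError, ConnectionError),
) -> T:
    """Retry only transient errors, doubling the delay each attempt and adding jitter."""
    for attempt in range(retries + 1):
        try:
            return fn()
        except transient as exc:
            if attempt == retries:
                raise  # out of retries: surface the transient error
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"transient failure ({exc!r}); retry {attempt + 1}/{retries} in {delay:.1f}s")
            time.sleep(delay)
    raise RuntimeError("unreachable")


# Usage sketch: the simulated call fails twice with a transient error, then succeeds.
calls = {"n": 0}

def flaky() -> str:
    calls["n"] += 1
    if calls["n"] < 3:
        raise TimeoutError("upstream timeout")
    return "ok"

print(retry_with_backoff(flaky, base_delay=0.5))
```
Permanent failures (any exception type not listed as transient) are raised immediately so they can be logged against the document's processing record instead of being retried.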
### Document Processing Workflow
```mermaid
sequenceDiagram
participant USER as User/Scheduler
participant MAIN as Main App
participant TP as Task Processor
participant DTP as Document Task Processor
participant ORCH as Orchestrator
participant ADI as Azure DI
participant CHUNK as Chunk Service
participant INDEX as Index Service
participant DB as Database
USER->>MAIN: Start Processing
MAIN->>MAIN: Initialize Configuration
MAIN->>DB: Initialize Database
MAIN->>TP: Create Task Processor
loop For Each Document
MAIN->>TP: Submit Document Task
TP->>DTP: Process Task
DTP->>DB: Create/Update IndexObject
DTP->>ORCH: Execute Processing
ORCH->>ADI: Extract Document Content
ADI-->>ORCH: Return Extracted Content
ORCH->>ORCH: Fix Hierarchy
ORCH->>CHUNK: Chunk Document
CHUNK-->>ORCH: Return Chunks
ORCH->>INDEX: Generate Embeddings
INDEX-->>ORCH: Return Vectors
ORCH->>INDEX: Upload to Search Index
INDEX-->>ORCH: Confirm Upload
ORCH-->>DTP: Return Processing Result
DTP->>DB: Update IndexObject Status
DTP-->>TP: Return Result
end
TP-->>MAIN: Processing Complete
MAIN-->>USER: Return Statistics
```
### Data Flow Architecture
The data flow architecture represents the end-to-end processing pipeline from document ingestion to search index publication. This design emphasizes fault tolerance, scalability, and efficient resource utilization throughout the processing lifecycle.
#### Design Principles for Data Flow
**Pipeline-Based Processing**: The data flow follows a clear pipeline pattern where each stage has specific responsibilities and well-defined inputs and outputs. This design enables parallel processing, easier debugging, and modular testing of individual stages.
**Decision Points and Routing**: The architecture includes intelligent decision points that route documents through appropriate processing paths based on their characteristics. This ensures optimal processing strategies for different document types while maintaining a unified interface.
**State Management**: Processing state is carefully managed throughout the pipeline, with persistent state stored in the database and transient state maintained in memory. This approach enables recovery from failures at any point in the pipeline.
**Resource Optimization**: The flow is designed to minimize resource usage through efficient batching, connection reuse, and memory management. Processing stages are optimized to balance throughput with resource consumption.
#### Processing Flow Stages
**Initialization Phase**: The system performs comprehensive initialization including configuration validation, database connectivity checks, and service authentication. This phase ensures that all dependencies are available before processing begins.
**Discovery and Task Creation**: Document sources are scanned to identify new or modified documents that require processing. Tasks are created based on configured criteria such as file modification dates and processing history.
**Format Detection and Routing**: Documents are analyzed to determine their format and complexity, enabling the system to select the most appropriate extraction method. This intelligent routing ensures optimal processing quality and efficiency.
**Content Extraction**: Multiple extraction paths are available depending on document characteristics. The system can leverage Azure Document Intelligence for complex documents, Vision LLM for advanced image analysis, or direct processing for simple text documents.
**Content Enhancement**: Extracted content undergoes enhancement through hierarchy fixing and structure normalization. This stage ensures that the processed content maintains logical structure and is suitable for effective chunking.
**Vectorization and Indexing**: The final stages convert processed content into searchable vectors and upload them to the search index. These operations are batched for efficiency and include comprehensive error handling and retry logic.
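The flowchart that follows shows the full decision flow; the toy sketch below reduces it to a linear chain of stage functions over a shared document dict, where each stage body is only a placeholder for the real extraction, hierarchy-fixing, chunking, vectorization, and upload logic.
```python
from typing import Callable, List

Stage = Callable[[dict], dict]


def extract(doc: dict) -> dict:
    doc["content"] = f"# {doc['filepath']}\n\nextracted markdown ..."  # placeholder extraction
    return doc

def fix_hierarchy(doc: dict) -> dict:
    doc["content"] = doc["content"].replace("\n\n\n", "\n\n")  # placeholder normalization
    return doc

def chunk(doc: dict) -> dict:
    doc["chunks"] = [doc["content"][i:i + 64] for i in range(0, len(doc["content"]), 64)]
    return doc

def vectorize(doc: dict) -> dict:
    doc["vectors"] = [[float(len(c))] for c in doc["chunks"]]  # fake one-dimensional embeddings
    return doc

def upload(doc: dict) -> dict:
    doc["status"] = f"uploaded {len(doc['chunks'])} chunks"
    return doc


PIPELINE: List[Stage] = [extract, fix_hierarchy, chunk, vectorize, upload]


def run_pipeline(filepath: str) -> dict:
    doc = {"filepath": filepath}
    for stage in PIPELINE:
        doc = stage(doc)  # each stage is a clear hand-off point with inspectable state
    return doc


print(run_pipeline("standards/example.pdf")["status"])
```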
```mermaid
flowchart TD
START([Start Processing]) --> INIT[Initialize Application]
INIT --> LOAD_CONFIG[Load Configuration]
LOAD_CONFIG --> INIT_DB[Initialize Database]
INIT_DB --> SCAN_DOCS[Scan Document Sources]
SCAN_DOCS --> CREATE_TASKS[Create Processing Tasks]
CREATE_TASKS --> PROCESS_TASK{Process Each Task}
PROCESS_TASK --> EXTRACT[Extract Content]
EXTRACT --> CHECK_FORMAT{Check Document Format}
CHECK_FORMAT -->|PDF/Images| USE_DI[Use Azure Document Intelligence]
CHECK_FORMAT -->|Vision Mode| USE_VLLM[Use Vision LLM]
CHECK_FORMAT -->|Text| DIRECT_PROCESS[Direct Processing]
USE_DI --> EXTRACT_RESULT[Content + Metadata]
USE_VLLM --> EXTRACT_RESULT
DIRECT_PROCESS --> EXTRACT_RESULT
EXTRACT_RESULT --> FIX_HIERARCHY[Fix Document Hierarchy]
FIX_HIERARCHY --> CHUNK_DOC[Chunk Document]
CHUNK_DOC --> GENERATE_VECTORS[Generate Embeddings]
GENERATE_VECTORS --> UPLOAD_INDEX[Upload to Search Index]
UPLOAD_INDEX --> UPDATE_DB[Update Database Status]
UPDATE_DB --> MORE_TASKS{More Tasks?}
MORE_TASKS -->|Yes| PROCESS_TASK
MORE_TASKS -->|No| COMPLETE[Processing Complete]
COMPLETE --> STATS[Generate Statistics]
STATS --> END([End])
style START fill:#c8e6c9
style END fill:#ffcdd2
style EXTRACT fill:#fff3e0
style GENERATE_VECTORS fill:#e1f5fe
style UPLOAD_INDEX fill:#f3e5f5
```
## Functional Logic
The functional logic of the Document AI Indexer encompasses three main processing areas: document extraction, content chunking, and search indexing. Each area implements sophisticated algorithms to ensure high-quality output.
### Design Principles for Document Processing
**Format-Agnostic Processing**: The system handles multiple document formats through a unified interface. Different extractors are used based on document type, but all produce a standardized Document object.
**Intelligent Content Analysis**: Before processing, the system analyzes document structure to determine the optimal processing strategy. This includes detecting header hierarchies, identifying figures and tables, and understanding document layout.
**Quality Assurance**: Each processing stage includes validation and quality checks. For example, the hierarchy fixer validates that document structure is logical and coherent before proceeding to chunking.
**Metadata Preservation**: Throughout the processing pipeline, important metadata is preserved and enriched. This includes document properties, processing timestamps, and structural information.
### Document Extraction Logic
The document extraction logic is the foundation of the processing pipeline. It handles the complex task of converting various document formats into structured, searchable content while preserving important layout and formatting information.
**Multi-Modal Processing**: The system supports both traditional OCR-based extraction and advanced vision-language model processing. The choice of extraction method depends on document complexity and available resources.
**Feature Detection**: Azure Document Intelligence features are selectively enabled based on document characteristics and configuration. This includes high-resolution OCR for detailed documents, formula extraction for technical content, and figure extraction for visual elements.
**Content Structure Preservation**: The extraction process maintains document structure through markdown formatting, preserving headers, lists, tables, and other formatting elements that provide context for the content.
**Error Handling and Fallbacks**: If advanced extraction features fail, the system falls back to basic extraction methods to ensure that content is not lost due to processing errors.
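The hedged sketch below illustrates how selective feature enablement and a basic-extraction fallback might look, assuming the azure-ai-formrecognizer SDK; the helper name and fallback behaviour are illustrative, not the project's actual extractor.
```python
# Hedged sketch of selective Azure Document Intelligence feature enablement,
# assuming the azure-ai-formrecognizer SDK; helper name and fallback behaviour
# are illustrative, not the project's actual extractor.
from azure.ai.formrecognizer import AnalysisFeature, DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError


def analyze_layout(path: str, endpoint: str, key: str,
                   hi_res: bool = True, formulas: bool = True) -> str:
    """Run the prebuilt layout model and return the extracted content."""
    client = DocumentAnalysisClient(endpoint, AzureKeyCredential(key))
    features = []
    if hi_res:
        features.append(AnalysisFeature.OCR_HIGH_RESOLUTION)
    if formulas:
        features.append(AnalysisFeature.FORMULAS)
    with open(path, "rb") as f:
        try:
            poller = client.begin_analyze_document(
                "prebuilt-layout", document=f, features=features)
        except HttpResponseError:
            # Fall back to basic extraction so content is not lost
            f.seek(0)
            poller = client.begin_analyze_document("prebuilt-layout", document=f)
    return poller.result().content
```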
```mermaid
flowchart TD
DOC[Document Input] --> DETECT[Detect Format]
DETECT --> PDF{PDF?}
DETECT --> IMG{Image?}
DETECT --> OFFICE{Office Doc?}
DETECT --> TEXT{Text File?}
PDF -->|Yes| DI_PDF[Azure DI Layout Model]
IMG -->|Yes| RESIZE[Resize if Needed]
OFFICE -->|Yes| CONVERT[Convert to Supported Format]
TEXT -->|Yes| DIRECT[Direct Content Read]
RESIZE --> DI_IMG[Azure DI OCR + Layout]
CONVERT --> DI_OFFICE[Azure DI Document Analysis]
DI_PDF --> FEATURES[Apply DI Features]
DI_IMG --> FEATURES
DI_OFFICE --> FEATURES
FEATURES --> HIGH_RES{High Resolution OCR?}
FEATURES --> FORMULAS{Extract Formulas?}
FEATURES --> FIGURES{Extract Figures?}
HIGH_RES -->|Yes| ENABLE_HIRES[Enable High-Res OCR]
FORMULAS -->|Yes| ENABLE_FORMULAS[Enable Formula Extraction]
FIGURES -->|Yes| ENABLE_FIGURES[Enable Figure Extraction]
ENABLE_HIRES --> PROCESS_DI[Process with Azure DI]
ENABLE_FORMULAS --> PROCESS_DI
ENABLE_FIGURES --> PROCESS_DI
HIGH_RES -->|No| PROCESS_DI
FORMULAS -->|No| PROCESS_DI
FIGURES -->|No| PROCESS_DI
DIRECT --> EXTRACT_META[Extract Metadata]
PROCESS_DI --> EXTRACT_CONTENT[Extract Content + Structure]
EXTRACT_CONTENT --> EXTRACT_META
EXTRACT_META --> RESULT[Document Object]
style DOC fill:#e3f2fd
style RESULT fill:#c8e6c9
style PROCESS_DI fill:#fff3e0
```
### Chunking Strategy
The chunking strategy is critical for creating meaningful, searchable segments from large documents. The system implements intelligent chunking that respects document structure while maintaining optimal chunk sizes for search and retrieval.
**Hierarchy-Aware Chunking**: The system analyzes document structure and uses markdown headers to create logical chunks. This ensures that related content stays together and that chunks maintain contextual coherence.
**Adaptive Chunking**: Chunk boundaries are determined by both content structure and token limits. The system balances the need for complete thoughts with search engine constraints.
**Overlap Strategy**: Configurable token overlap between chunks ensures that important information at chunk boundaries is not lost during retrieval operations.
**Token Management**: Precise token counting using tiktoken ensures that chunks stay within specified limits while maximizing content density.
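A minimal sketch of this strategy, assuming the LangChain text splitters with tiktoken-based token counting; the 2048-token chunk size and 128-token overlap mirror the configuration shown in the diagram below, while the header mapping is illustrative.
```python
# Minimal chunking sketch, assuming the LangChain text splitters and tiktoken
# token counting; chunk size and overlap mirror the configuration shown in the
# diagram below, and the header mapping is illustrative.
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

HEADERS_TO_SPLIT_ON = [("#", "h1"), ("##", "h2"), ("###", "h3")]


def chunk_markdown(markdown_text: str):
    # 1. Split on markdown headers so related content stays together
    header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=HEADERS_TO_SPLIT_ON)
    sections = header_splitter.split_text(markdown_text)
    # 2. Enforce token limits with a tiktoken-based recursive splitter
    token_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=2048, chunk_overlap=128)
    return token_splitter.split_documents(sections)
```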
```mermaid
flowchart TD
CONTENT[Extracted Content] --> HIERARCHY_FIX{Apply Hierarchy Fix?}
HIERARCHY_FIX -->|Yes| FIX[Fix Header Hierarchy]
HIERARCHY_FIX -->|No| CHUNK_STRATEGY[Determine Chunking Strategy]
FIX --> ANALYZE[Analyze Document Structure]
ANALYZE --> CHUNK_STRATEGY
CHUNK_STRATEGY --> MARKDOWN{Markdown Headers?}
CHUNK_STRATEGY --> RECURSIVE{Use Recursive Split?}
MARKDOWN -->|Yes| HEADER_SPLIT[Markdown Header Splitter]
MARKDOWN -->|No| RECURSIVE
RECURSIVE -->|Yes| CHAR_SPLIT[Recursive Character Splitter]
HEADER_SPLIT --> CONFIG[Apply Chunk Configuration]
CHAR_SPLIT --> CONFIG
CONFIG --> SIZE[Chunk Size: 2048 tokens]
CONFIG --> OVERLAP[Token Overlap: 128]
SIZE --> SPLIT[Split Document]
OVERLAP --> SPLIT
SPLIT --> VALIDATE[Validate Chunk Sizes]
VALIDATE --> METADATA[Add Chunk Metadata]
METADATA --> RESULT[Chunked Documents]
style CONTENT fill:#e3f2fd
style RESULT fill:#c8e6c9
style FIX fill:#fff3e0
style SPLIT fill:#f3e5f5
```
### Indexing and Search Integration
The indexing and search integration component handles the final stage of the processing pipeline, converting processed documents into searchable vector representations and uploading them to Azure AI Search.
**Vector Generation**: The system generates high-quality embeddings using Azure OpenAI services. Multiple vector fields can be configured to support different search scenarios (content-based, metadata-based, etc.).
**Batch Processing**: Documents are processed in configurable batches to optimize upload performance and manage API rate limits effectively.
**Schema Management**: The system automatically creates and manages search index schemas based on configuration files, ensuring that all required fields and vector configurations are properly set up.
**Error Recovery**: Failed uploads are tracked and retried, with detailed logging to help diagnose and resolve issues. The system can recover from partial batch failures without losing processed content.
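A hedged sketch of batched embedding generation and upload, assuming the openai and azure-search-documents SDKs; the chunks are plain dictionaries whose keys match index fields, and the 50-document batch size follows the description above rather than reproducing the project's indexer.
```python
# Hedged sketch of batched embedding generation and upload, assuming the openai
# and azure-search-documents SDKs; chunks are plain dicts whose keys match the
# index fields, and the 50-document batch size follows the description above.
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from openai import AzureOpenAI


def index_chunks(chunks: list[dict], aoai: AzureOpenAI, embedding_deployment: str,
                 search_endpoint: str, index_name: str, search_key: str,
                 batch_size: int = 50) -> list[str]:
    search = SearchClient(search_endpoint, index_name, AzureKeyCredential(search_key))
    # Generate an embedding for each chunk's content field
    for chunk in chunks:
        resp = aoai.embeddings.create(model=embedding_deployment, input=chunk["content"])
        chunk["contentVector"] = resp.data[0].embedding
    # Upload in batches and return the keys of documents that failed
    failed: list[str] = []
    for i in range(0, len(chunks), batch_size):
        results = search.upload_documents(documents=chunks[i:i + batch_size])
        failed.extend(r.key for r in results if not r.succeeded)
    return failed
```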
```mermaid
flowchart TD
CHUNKS[Document Chunks] --> EMBED[Generate Embeddings]
EMBED --> OPENAI[Azure OpenAI API]
OPENAI --> VECTORS[Vector Embeddings]
VECTORS --> PREPARE[Prepare Index Documents]
PREPARE --> METADATA[Add Metadata Fields]
METADATA --> CUSTOM[Add Custom Fields]
CUSTOM --> BATCH[Create Upload Batches]
BATCH --> SIZE[Batch Size: 50 docs]
SIZE --> UPLOAD[Upload to Azure AI Search]
UPLOAD --> SUCCESS{Upload Successful?}
SUCCESS -->|Yes| UPDATE_STATUS[Update Success Status]
SUCCESS -->|No| RETRY[Retry Upload]
RETRY --> MAX_RETRIES{Max Retries Reached?}
MAX_RETRIES -->|No| UPLOAD
MAX_RETRIES -->|Yes| ERROR[Mark as Failed]
UPDATE_STATUS --> NEXT_BATCH{More Batches?}
NEXT_BATCH -->|Yes| BATCH
NEXT_BATCH -->|No| COMPLETE[Index Complete]
ERROR --> LOG[Log Error Details]
LOG --> COMPLETE
style CHUNKS fill:#e3f2fd
style COMPLETE fill:#c8e6c9
style EMBED fill:#fff3e0
style UPLOAD fill:#f3e5f5
style ERROR fill:#ffcdd2
```
## Database Schema
The database schema is designed to support scalable document processing operations while maintaining data integrity and enabling efficient querying. The schema tracks processing state, manages job coordination, and provides audit trails.
### Design Rationale
**Composite Primary Keys**: The IndexObject table uses composite primary keys (object_key, datasource_name) to support multi-tenant scenarios where the same document might exist in different data sources.
**State Tracking**: Detailed status tracking allows the system to resume processing after failures and provides visibility into processing progress and issues.
**Audit Trail**: Comprehensive timestamp tracking and detailed message logging provide full audit trails for compliance and debugging purposes.
**Job Coordination**: The IndexJob table enables coordination of processing jobs across multiple instances and provides reporting on job completion and success rates.
### Core Entities
```mermaid
erDiagram
IndexObject {
string object_key PK
string datasource_name PK
string type
string status
datetime created_time
datetime updated_time
datetime last_start_time
datetime last_finished_time
int try_count
int last_run_id
text detailed_message
text error_message
text last_message
}
IndexJob {
int id PK
string datasource_name
string status
datetime start_time
datetime end_time
int total_files
int processed_files
int failed_files
int skipped_files
text config_snapshot
text error_message
}
IndexObject }o--|| IndexJob : belongs_to
```
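The composite-key pattern can be expressed in SQLAlchemy roughly as follows; this is a simplified sketch, and the real models in database.py carry additional columns such as timestamps and error messages.
```python
# Simplified SQLAlchemy sketch of the composite-primary-key pattern described
# above; the real models in database.py carry additional columns.
from sqlalchemy import Column, DateTime, Integer, String, Text
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class IndexObject(Base):
    __tablename__ = "index_object"
    # Composite primary key: the same object key may exist in several data sources
    object_key = Column(String, primary_key=True)
    datasource_name = Column(String, primary_key=True)
    type = Column(String)
    status = Column(String)
    try_count = Column(Integer, default=0)
    last_run_id = Column(Integer)
    detailed_message = Column(Text)


class IndexJob(Base):
    __tablename__ = "index_job"
    id = Column(Integer, primary_key=True, autoincrement=True)
    datasource_name = Column(String)
    status = Column(String)
    start_time = Column(DateTime)
    end_time = Column(DateTime)
```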
## Configuration Management
The configuration management system is designed to support flexible deployment across different environments while maintaining security and ease of management. The system separates business configuration from sensitive credentials and provides environment-specific overrides.
### Configuration Strategy
**Separation of Concerns**: Business logic configuration (data sources, processing parameters) is separated from sensitive credentials (API keys, connection strings) to enable secure deployment practices.
**Environment-Specific Configuration**: The system supports multiple configuration files that can be combined to create environment-specific deployments without duplicating common settings.
**Validation and Defaults**: Configuration values are validated at startup, and sensible defaults are provided to minimize required configuration while ensuring the system operates correctly.
**Dynamic Reconfiguration**: Many configuration parameters can be modified without requiring application restarts, enabling operational flexibility and optimization.
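A minimal sketch of layered loading with startup validation, assuming PyYAML; the file names mirror the config.yaml / env.yaml split used elsewhere in this document, while the required keys and defaults shown here are illustrative.
```python
# Minimal sketch of layered configuration loading with startup validation,
# assuming PyYAML; required keys and defaults shown here are illustrative.
import yaml


def load_config(config_path: str = "config.yaml", env_path: str = "env.yaml") -> dict:
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    with open(env_path, encoding="utf-8") as f:
        env = yaml.safe_load(f) or {}
    # Credentials and environment overrides take precedence over business settings
    merged = {**config, **env}
    # Fail fast on missing values, then apply sensible defaults
    for required in ("search_service_name", "embedding_model_endpoint", "DB_URI"):
        if not merged.get(required):
            raise ValueError(f"Missing required configuration value: {required}")
    merged.setdefault("njobs", 1)
    merged.setdefault("VECTOR_DIMENSION", 1536)
    return merged
```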
### Configuration Structure
```mermaid
mindmap
root((Configuration))
Data Sources
Blob Storage
SAS Tokens
Container Paths
Local Files
Directory Paths
File Filters
Processing
Chunk Size
Token Overlap
Batch Sizes
Retry Limits
AI Services
Azure Document Intelligence
Endpoint
API Key
Features
Azure OpenAI
Endpoint
API Key
Model Settings
Database
Connection String
Connection Pool
Index Schemas
Field Mappings
Vector Configurations
Search Index Settings
```
## Deployment Architecture
The deployment architecture is designed for cloud-native operations with support for both batch processing and continuous operation modes. The system leverages Kubernetes for orchestration and scaling while maintaining compatibility with various deployment scenarios.
### Cloud-Native Design Principles
**Containerization**: The application is fully containerized, enabling consistent deployment across different environments and easy scaling based on demand.
**Stateless Processing**: Processing pods are designed to be stateless, with all persistent state managed through external databases and storage services. This enables horizontal scaling and fault tolerance.
**Configuration Externalization**: All configuration is externalized through ConfigMaps and Secrets, allowing for environment-specific configuration without rebuilding container images.
**Resource Management**: The deployment configuration includes resource limits and requests to ensure proper resource allocation and prevent resource contention in multi-tenant environments.
### Scaling Strategy
**Horizontal Pod Autoscaling**: The system can automatically scale the number of processing pods based on CPU utilization, memory usage, or custom metrics like queue depth.
**Job-Based Processing**: For batch operations, the system uses Kubernetes Jobs and CronJobs to ensure processing completion and automatic cleanup of completed jobs.
**Load Distribution**: Multiple pods process documents in parallel, with work distribution managed through the database-backed task queue system.
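One way to implement safe work distribution is to claim rows atomically from the task table; the sketch below assumes a PostgreSQL backend, the simplified IndexObject model from the database-schema section, and a hypothetical "pending" status, and is not the project's actual coordination code.
```python
# Hedged sketch of claiming pending work from the database-backed queue so that
# parallel pods never double-claim a row; assumes a PostgreSQL backend, the
# simplified IndexObject model sketched earlier, and a hypothetical "pending" status.
from sqlalchemy.orm import Session


def claim_next_object(session: Session, datasource_name: str):
    obj = (
        session.query(IndexObject)
        .filter(IndexObject.datasource_name == datasource_name,
                IndexObject.status == "pending")
        .with_for_update(skip_locked=True)  # other pods skip rows locked here
        .first()
    )
    if obj is not None:
        obj.status = "processing"
        session.commit()
    return obj
```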
### Kubernetes Deployment
```mermaid
graph TB
subgraph "Kubernetes Cluster"
subgraph "Namespace: document-ai"
POD1[Document Processor Pod 1]
POD2[Document Processor Pod 2]
POD3[Document Processor Pod N]
CM[ConfigMap<br/>config.yaml]
SECRET[Secret<br/>env.yaml]
PVC[PersistentVolumeClaim<br/>Temp Storage]
end
subgraph "Services"
SVC[LoadBalancer Service]
CRON[CronJob Controller]
end
end
subgraph "External Services"
AZURE_DI[Azure Document Intelligence]
AZURE_OPENAI[Azure OpenAI]
AZURE_SEARCH[Azure AI Search]
AZURE_STORAGE[Azure Blob Storage]
DATABASE[(Database)]
end
CM --> POD1
CM --> POD2
CM --> POD3
SECRET --> POD1
SECRET --> POD2
SECRET --> POD3
PVC --> POD1
PVC --> POD2
PVC --> POD3
SVC --> POD1
SVC --> POD2
SVC --> POD3
CRON --> POD1
POD1 --> AZURE_DI
POD1 --> AZURE_OPENAI
POD1 --> AZURE_SEARCH
POD1 --> AZURE_STORAGE
POD1 --> DATABASE
POD2 --> AZURE_DI
POD2 --> AZURE_OPENAI
POD2 --> AZURE_SEARCH
POD2 --> AZURE_STORAGE
POD2 --> DATABASE
POD3 --> AZURE_DI
POD3 --> AZURE_OPENAI
POD3 --> AZURE_SEARCH
POD3 --> AZURE_STORAGE
POD3 --> DATABASE
style POD1 fill:#e1f5fe
style POD2 fill:#e1f5fe
style POD3 fill:#e1f5fe
style CM fill:#fff3e0
style SECRET fill:#ffebee
```
## Performance and Scalability
The system is designed to handle large-scale document processing operations efficiently while maintaining high quality output. Performance optimization occurs at multiple levels: application design, resource utilization, and operational practices.
### Performance Optimization Strategies
**Asynchronous Processing**: All I/O-bound operations are implemented asynchronously to maximize throughput and resource utilization. This is particularly important for operations involving external API calls and database operations.
**Connection Pooling**: Database and HTTP connections are pooled and reused to minimize connection overhead and improve response times.
**Caching Strategies**: Frequently accessed configuration data and metadata are cached in memory to reduce database load and improve response times.
**Batch Operations**: Operations that can be batched (such as database writes and API calls) are grouped together to reduce overhead and improve efficiency.
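A hedged sketch of the async-plus-batching pattern; the batch size and concurrency limit are placeholders rather than tuned project values.
```python
# Illustrative asyncio pattern for I/O-bound work: bound concurrency with a
# semaphore and batch items to reduce per-call overhead; the batch size and
# concurrency limit are placeholders, not tuned project values.
import asyncio
from typing import Awaitable, Callable, List, TypeVar

T = TypeVar("T")
R = TypeVar("R")


async def process_batched(items: List[T],
                          handler: Callable[[List[T]], Awaitable[R]],
                          batch_size: int = 50,
                          max_concurrency: int = 8) -> List[R]:
    semaphore = asyncio.Semaphore(max_concurrency)

    async def run(batch: List[T]) -> R:
        async with semaphore:  # respect external service rate limits
            return await handler(batch)

    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
    return await asyncio.gather(*(run(b) for b in batches))
```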
### Scalability Considerations
**Horizontal Scaling**: The stateless design of processing components enables horizontal scaling by adding more processing instances without architectural changes.
**Database Optimization**: Database operations are optimized through proper indexing, connection pooling, and efficient query patterns to support high-concurrency operations.
**Rate Limiting and Throttling**: The system implements rate limiting and throttling mechanisms to respect external service limits while maintaining optimal throughput.
**Resource Monitoring**: Comprehensive monitoring of resource utilization enables proactive scaling decisions and performance optimization.
### Processing Pipeline Performance
```mermaid
graph LR
subgraph "Performance Metrics"
TPS[Throughput<br/>Documents/Second]
LAT[Latency<br/>Processing Time]
ERR[Error Rate<br/>Failed Documents]
RES[Resource Usage<br/>CPU/Memory]
end
subgraph "Optimization Strategies"
ASYNC[Async Processing]
BATCH[Batch Operations]
CACHE[Caching Layer]
RETRY[Retry Logic]
end
subgraph "Scaling Options"
HSCALE[Horizontal Scaling<br/>More Pods]
VSCALE[Vertical Scaling<br/>Larger Pods]
QUEUE[Queue Management<br/>Task Distribution]
end
TPS --> ASYNC
LAT --> BATCH
ERR --> RETRY
RES --> CACHE
ASYNC --> HSCALE
BATCH --> QUEUE
CACHE --> VSCALE
style TPS fill:#c8e6c9
style LAT fill:#fff3e0
style ERR fill:#ffcdd2
style RES fill:#e1f5fe
```
## Error Handling and Monitoring
The error handling and monitoring system is designed to provide comprehensive visibility into system operations while implementing robust recovery mechanisms. The system distinguishes between different types of errors and responds appropriately to each.
### Error Classification and Response
**Transient Errors**: Network timeouts, temporary service unavailability, and rate limiting are handled through exponential backoff retry mechanisms. These errors are expected in distributed systems and are handled automatically.
**Configuration Errors**: Invalid configuration values, missing credentials, and similar issues are detected at startup and cause immediate failure with clear error messages to facilitate quick resolution.
**Resource Errors**: Insufficient disk space, memory exhaustion, and similar resource constraints are detected and handled gracefully, often by pausing processing until resources become available.
**Service Errors**: Failures in external services (Azure Document Intelligence, Azure OpenAI, etc.) are handled through fallback mechanisms where possible, or graceful degradation when fallbacks are not available.
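A minimal sketch of the backoff behaviour described above for transient errors; the exception types, retry count, and delays are illustrative.
```python
# Minimal sketch of jittered exponential backoff for transient errors; the
# exception types, retry count, and delays are illustrative.
import logging
import random
import time

logger = logging.getLogger(__name__)


def with_backoff(operation, max_retries: int = 5, base_delay: float = 1.0):
    """Retry a callable on transient failures with jittered exponential backoff."""
    for attempt in range(max_retries + 1):
        try:
            return operation()
        except (TimeoutError, ConnectionError) as exc:  # treated as transient
            if attempt == max_retries:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 0.5)
            logger.warning("Transient error (%s); retrying in %.1fs", exc, delay)
            time.sleep(delay)
```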
### Monitoring and Observability
**Structured Logging**: All log messages follow a structured format that enables efficient searching and analysis. Log levels are used appropriately to balance information content with log volume.
**Processing Metrics**: Key performance indicators such as processing rates, error rates, and resource utilization are tracked and can be exported to monitoring systems.
**Health Checks**: The system implements health check endpoints that can be used by orchestration systems to determine system health and restart unhealthy instances.
**Audit Trails**: Complete audit trails of document processing operations are maintained for compliance and debugging purposes.
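A hedged sketch of a JSON log formatter for structured logging; the field names and logger name are illustrative and can be adapted to any log aggregation backend.
```python
# Hedged sketch of a JSON log formatter for structured logging; field names and
# the logger name are illustrative.
import json
import logging


class JsonFormatter(logging.Formatter):
    def format(self, record: logging.LogRecord) -> str:
        return json.dumps({
            "timestamp": self.formatTime(record),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }, ensure_ascii=False)


handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logging.getLogger("document_indexer").addHandler(handler)
```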
### Error Handling Strategy
```mermaid
flowchart TD
ERROR[Error Detected] --> CLASSIFY[Classify Error Type]
CLASSIFY --> TRANSIENT{Transient Error?}
CLASSIFY --> CONFIG{Configuration Error?}
CLASSIFY --> RESOURCE{Resource Error?}
CLASSIFY --> SERVICE{Service Error?}
TRANSIENT -->|Yes| RETRY[Retry with Backoff]
CONFIG -->|Yes| LOG_FATAL[Log Fatal Error]
RESOURCE -->|Yes| WAIT[Wait for Resources]
SERVICE -->|Yes| CHECK_SERVICE[Check Service Status]
RETRY --> MAX_RETRY{Max Retries?}
MAX_RETRY -->|No| ATTEMPT[Retry Attempt]
MAX_RETRY -->|Yes| MARK_FAILED[Mark as Failed]
ATTEMPT --> SUCCESS{Success?}
SUCCESS -->|Yes| UPDATE_SUCCESS[Update Success]
SUCCESS -->|No| RETRY
WAIT --> RESOURCE_CHECK{Resources Available?}
RESOURCE_CHECK -->|Yes| RETRY
RESOURCE_CHECK -->|No| WAIT
CHECK_SERVICE --> SERVICE_OK{Service OK?}
SERVICE_OK -->|Yes| RETRY
SERVICE_OK -->|No| ESCALATE[Escalate Error]
LOG_FATAL --> STOP[Stop Processing]
MARK_FAILED --> LOG_ERROR[Log Detailed Error]
ESCALATE --> LOG_ERROR
UPDATE_SUCCESS --> CONTINUE[Continue Processing]
LOG_ERROR --> CONTINUE
style ERROR fill:#ffcdd2
style UPDATE_SUCCESS fill:#c8e6c9
style CONTINUE fill:#e8f5e8
```
## Conclusion
The Document AI Indexer provides a comprehensive, scalable solution for intelligent document processing and indexing. Its modular architecture, robust error handling, and integration with Azure AI services make it suitable for enterprise-scale document processing workflows. The system's flexibility allows for easy customization and extension to meet specific business requirements while maintaining high performance and reliability.

View File

@@ -0,0 +1,103 @@
"""
Document Task Processor
Integrates business logic and database operations
"""
import datetime
import json
import logging
from typing import Any, Optional
from sqlalchemy import and_
from sqlalchemy.orm import sessionmaker
from app_config import ServiceFactory
from task_processor import Task, TaskProcessorInterface
from business_layer import ApplicationConfig, DocumentProcessingFactory, ProcessingContext
from database import IndexObject, IndexObjectStatus, IndexJob
from utils import custom_serializer
class DocumentTaskProcessor(TaskProcessorInterface):
"""Document task processor"""
def __init__(self, config:ApplicationConfig, service_factory:ServiceFactory, tmp_directory:str, database_engine:Any, logger: Optional[logging.Logger] , datasource: dict[str,Any] ,data_config:dict[str,Any]):
self.config = config
self.service_factory = service_factory
self.database_engine = database_engine
self.logger = logger or logging.getLogger(__name__)
self.datasource = datasource or {}
self.processing_factory = DocumentProcessingFactory(service_factory=service_factory, tmp_directory=tmp_directory, datasource=datasource, config=config)
self.data_config: dict[str, Any] = data_config
self.datasource_name: str = data_config.get("datasource_name", "default")
def process(self, task: Task) -> Any:
"""Process document task"""
if not isinstance(task.payload, ProcessingContext):
raise ValueError(f"Expected ProcessingContext, got {type(task.payload)}")
context = task.payload
detailed_message:dict[str,Any] = {}
detailed_message["start_time"] = datetime.datetime.now(datetime.timezone.utc)
Session = sessionmaker(bind=self.database_engine)
session = Session()
try:
# 1. Query or create IndexObject record
index_object_db = session.query(IndexObject).get({"object_key":context.object_key,"datasource_name":context.datasource_name})
if not index_object_db:
self.logger.info(f"Creating new IndexObject entry for {context.object_key}")
index_object_db = IndexObject(
object_key=context.object_key,
type="document",
status=IndexObjectStatus.PROCESSING.value,
try_count=0,
datasource_name=context.datasource_name
)
session.add(index_object_db)
session.commit()
# 2. Update only task-related fields; business fields are no longer updated here
index_object_db.last_start_time = datetime.datetime.now(datetime.timezone.utc)
current_job = session.query(IndexJob).filter(and_(IndexJob.status == "processing",IndexJob.datasource_name== context.datasource_name)).order_by(IndexJob.id.desc()).first()
if current_job:
index_object_db.last_run_id = current_job.id
session.commit()
# 3. Execute business processing
self.logger.info(f"Processing document: {context.object_key}")
orchestrator = self.processing_factory.create_orchestrator()
result = orchestrator.process_document(context)
# 4. Update only task-related fields; business fields are no longer updated here
detailed_message["success"] = result.status == IndexObjectStatus.SUCCESS
detailed_message["chunks_count"] = result.chunks_count
detailed_message["processing_time"] = result.processing_time
detailed_message["message"] = result.message
if result.status != IndexObjectStatus.SUCCESS:
self.logger.error(f"Failed to process {context.object_key}: {result.message}")
detailed_message["error"] = result.message
if result.error:
detailed_message["error_details"] = str(result.error)
else:
self.logger.info(f"Successfully processed {context.object_key}")
index_object_db.last_finished_time = datetime.datetime.now(datetime.timezone.utc)
detailed_message["end_time"] = datetime.datetime.now(datetime.timezone.utc)
index_object_db.detailed_message = json.dumps(detailed_message, default=custom_serializer, ensure_ascii=False)
session.commit()
# If processing failed, raise exception to trigger retry mechanism
if result.status == IndexObjectStatus.FAILED:
raise Exception(result.message)
return result
except Exception as e:
# Handle exceptions - only update database in case of unexpected exceptions
# Business logic failures are already handled above
self.logger.error(f"Error processing {context.object_key}: {e}")
raise
finally:
session.close()

View File

@@ -0,0 +1,132 @@
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, fields
from dataclasses_json import dataclass_json
@dataclass_json
@dataclass
class DiResult:
"""Data class for storing"""
figures: List['FigureFlat']
di_content: str
filepath:str
language:str
@dataclass_json
@dataclass
class FigureFlat:
offset: int
length: int
url: str
content: str
image: str
understand_flag:bool
caption:str
def dict_to_str(v):
return v if isinstance(v, str) else str(v)
@dataclass
class Document(object):
"""A data class for storing documents
Attributes:
content (str): The content of the document.
id (Optional[str]): The id of the document.
title (Optional[str]): The title of the document.
filepath (Optional[str]): The filepath of the document.
url (Optional[str]): The url of the document.
metadata (Optional[Dict]): The metadata of the document.
"""
content: Optional[str] = None
id: Optional[str] = None
title: Optional[str] = None
filepath: Optional[str] = None
url: Optional[str] = None
metadata: Optional[Dict] = None
image_mapping: Optional[Dict] = None
doc_metadata: Optional[str] = None
document_schema: Optional[str] = None
main_title: Optional[str] = None
sub_title: Optional[str] = None
publisher: Optional[str] = None
document_code: Optional[str] = None
document_category: Optional[str] = None
main_title_sec_language: Optional[str] = None
sub_title_sec_language: Optional[str] = None
primary_language: Optional[str] = None
secondary_language: Optional[str] = None
full_headers: Optional[str] = None
h1: Optional[str] = None
h2: Optional[str] = None
h3: Optional[str] = None
h4: Optional[str] = None
h5: Optional[str] = None
h6: Optional[str] = None
contentVector: Optional[List[float]] = None
full_metadata_vector: Optional[List[float]] = None
def __setattr__(self, key, value) -> None:
# If the attribute is a list or dictionary, convert it to a string for storage
if key =="doc_metadata" and value is not None and isinstance(value, (list, dict)):
value = dict_to_str(value)
# Avoid infinite recursion of __setattr__ calls
object.__setattr__(self, key, value)
def __setitem__(self, key, value) -> None:
# Store the attribute directly in the instance's __dict__
self.__dict__[key] = value
def __getitem__(self, key) -> Any:
# Retrieve the attribute from the instance's __dict__
return self.__dict__[key]
def copy_dynamic_attrs(self, source) -> None:
"""Copy dynamic attributes from the source object to the current object"""
predefined = {f.name for f in fields(source)}
for attr in dir(source):
# Filter dynamic attributes
if (attr not in predefined and
not attr.startswith('__') and
not callable(getattr(source, attr))):
value = getattr(source, attr)
setattr(self, attr, value)
@dataclass
class ChunkingResult:
"""Data model for chunking result
Attributes:
chunks (List[Document]): List of chunks.
total_files (int): Total number of files.
num_unsupported_format_files (int): Number of files with unsupported format.
num_files_with_errors (int): Number of files with errors.
skipped_chunks (int): Number of chunks skipped due to too few tokens.
"""
chunks: List[Document]
total_files: int
num_unsupported_format_files: int = 0
num_files_with_errors: int = 0
# some chunks might be skipped due to too few tokens
skipped_chunks: int = 0
failed_files: Optional[List[Any]] = None
class UnsupportedFormatError(Exception):
"""Exception raised when a format is not supported by a parser."""
pass

View File

@@ -0,0 +1,51 @@
config: config.yaml
njobs: 1
search_service_name: https://<resource name>.search.windows.net
search_admin_key:
embedding_model_endpoint: https://<resource name>.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview
embedding_model_key:
VECTOR_DIMENSION: 1536
extract_method: di+vision-llm
# extract_method=vision-llm
form_rec_resource: https://<resource name>.cognitiveservices.azure.cn/
form_rec_key:
# Perform OCR at a higher resolution to handle documents with fine print
di-hiRes: true
# Enable the detection of mathematical expressions in the document.
di-Formulas: true
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
# Figure understanding (image captioning)
figure_caption:
include_di_content: false
description_gen_max_images: 0
model_endpoint: null
model_key: null
model: null # leave empty for Azure
azure_deployment: gpt-4o # Azure deployment name; leave empty for models on other platforms
api_version: 2024-08-01-preview # Azure API version; leave empty for other platforms
FLAG_AOAI: "V3"
#FLAG_EMBEDDING_MODEL: "qwen3-embedding-8b"
FLAG_EMBEDDING_MODEL: "AOAI"
FIGURE_BLOB_ACCOUNT_URL: https://<blob sas url>
DI_BLOB_ACCOUNT_URL: https://<blob sas url>
DB_URI: postgresql+psycopg2://user:passwords@localhost:5433/document_indexer
header_fix: true

View File

@@ -0,0 +1,43 @@
# Configuration file reference
config: config.yaml
# Processing settings
njobs: 8 # Number of parallel processing jobs
# Azure AI Search configuration
search_service_name: "https://your-search-service.search.windows.net"
search_admin_key: "your-search-admin-key"
# Azure OpenAI Embedding service
embedding_model_endpoint: "https://your-openai.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview"
embedding_model_key: "your-openai-key"
VECTOR_DIMENSION: 1536
FLAG_AOAI: "V3" # Azure OpenAI version
FLAG_EMBEDDING_MODEL: "AOAI" # Embedding model type: "AOAI" or "qwen3-embedding-8b"
# Document Intelligence configuration
extract_method: "di+vision-llm" # Extraction method: "di+vision-llm", "vision-llm", "di"
form_rec_resource: "https://your-di-service.cognitiveservices.azure.com/"
form_rec_key: "your-di-key"
# Document Intelligence features
di-hiRes: true # High resolution OCR
di-Formulas: true # Mathematical expression detection
di_allow_features_ext: "pdf;jpeg;jpg;png;bmp;tiff;heif" # Supported file extensions
# Vision and captioning models
captioning_model_endpoint: "https://your-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview"
captioning_model_key: "your-openai-key"
vision_max_images: 200 # Maximum images to process per document (0 = no limit)
vision_image_method: "openai" # Image processing method: "openai"
# Blob storage for figures and DI results
FIGURE_BLOB_ACCOUNT_URL: "https://your-storage.blob.core.windows.net/container?sas-token"
DI_BLOB_ACCOUNT_URL: "https://your-storage.blob.core.windows.net/container?sas-token"
# Database configuration
DB_URI: "postgresql://user:password@host:port/database_name"
# Processing flags
header_fix: false # Enable/disable header fixing

View File

@@ -0,0 +1,473 @@
"""
Fixes mismatched header levels in Markdown documents, i.e. cases where a child heading's '#' count is raised to the same level as, or above, its parent heading.
"""
import re
from typing import Any, List, Dict, Optional
class HeaderInfo:
"""Title information"""
def __init__(self, line_number: int, original_line: str, hash_count: int,
level: int, number_pattern: str, title_text: str):
self.line_number = line_number
self.original_line = original_line
self.hash_count = hash_count
self.level = level
self.number_pattern = number_pattern
self.title_text = title_text
self.correct_hash_count = hash_count # Will be updated by Fixer
class HierarchyFixer:
"""Special fixer for title hierarchy # number mismatch issues"""
def __init__(self):
# Number pattern matching - supports both formats with and without trailing dots
self.number_patterns = [
r'^(\d+)\.?$', # 1 or 1.
r'^(\d+)\.(\d+)\.?$', # 1.1 or 1.1.
r'^(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1 or 1.1.1.
r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1 or 1.1.1.1.
r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1.1 or 1.1.1.1.1.
r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1.1.1 or 1.1.1.1.1.1.
]
# Letter+number pattern matching - supports both "A.x.x.x" and "C. x.x.x" formats
self.letter_number_patterns = [
# Single letter: A, B, C (followed by space or end)
(r'^([A-Z])(?:\s|$)', 1),
# Letter + space + numbers: "C. 1", "A. 2"
(r'^([A-Z])\.\s+(\d+)(?:\s|$)', 2),
(r'^([A-Z])\.\s+(\d+)\.(\d+)(?:\s|$)', 3), # C. 1.1, A. 2.3
(r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)(?:\s|$)', 4), # C. 1.1.1, A. 2.3.4
(r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$)', 5), # C. 1.1.1.1, A. 2.3.4.5
(r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$)', 6), # C. 1.1.1.1.1, A. 2.3.4.5.6
# Compact format (no space): A.1, A.1.2, A.1.2.3 etc.
(r'^([A-Z])\.(\d+)(?:\s|$|[^\d\.])', 2), # A.1, A.2
(r'^([A-Z])\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 3), # A.1.2, A.1.3
(r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 4), # A.1.2.3
(r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 5), # A.1.2.3.4
(r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 6), # A.1.2.3.4.5
]
def detect_headers(self, content: str) -> List[HeaderInfo]:
"""Detect all headers and determine their logical levels"""
lines = content.split('\n')
headers: List[HeaderInfo] = []
for line_num, line in enumerate(lines):
if line.strip().startswith('#'):
header_info = self._parse_header_line(line_num, line)
if header_info:
headers.append(header_info)
return headers
def _parse_header_line(self, line_num: int, line: str) -> Optional[HeaderInfo]:
"""Analyze the title line"""
line = line.strip()
# Count the number of # characters
hash_count = 0
for char in line:
if char == '#':
hash_count += 1
else:
break
if hash_count == 0:
return None
# Extract title content
title_content = line[hash_count:].strip()
# Try to match number pattern
level = 1
number_pattern = ""
# Check for letter+number patterns first (A.1.2.3 format)
for pattern, expected_level in self.letter_number_patterns:
match = re.match(pattern, title_content)
if match:
level = expected_level
# Extract the complete matched numbering pattern
matched_text = match.group(0)
# For space-separated patterns like "C. 1.1", we need to extract the full pattern
if '. ' in matched_text:
# This is a space-separated pattern like "C. 1.1"
# The match already contains the complete pattern we want
number_pattern = matched_text.rstrip() # Remove trailing space if any
else:
# This is a compact pattern like "A.1.2.3"
number_pattern = matched_text
return HeaderInfo(
line_number=line_num,
original_line=line,
hash_count=hash_count,
level=level,
number_pattern=number_pattern,
title_text=title_content
)
# If no letter+number pattern, try traditional number patterns
if title_content:
# First, try to identify and extract the complete numbering part
# Look for patterns like "1.2.3", "1 . 2 . 3", "1. 2. 3", etc.
words = title_content.split()
numbering_words = []
# Collect words that could be part of the numbering (digits, dots, spaces)
for word in words:
if re.match(r'^[\d\.]+$', word) or word == '.':
numbering_words.append(word)
else:
break # Stop at first non-numbering word
if numbering_words:
# Join and normalize the numbering part
numbering_text = ' '.join(numbering_words)
# Normalize: "1 . 2 . 3" -> "1.2.3", "1. 2. 3" -> "1.2.3"
normalized = re.sub(r'\s*\.\s*', '.', numbering_text)
normalized = re.sub(r'\.+$', '', normalized) # Remove trailing dots
normalized = normalized.strip()
# Try to match the normalized pattern
for i, pattern in enumerate(self.number_patterns, 1):
match = re.match(pattern, normalized)
if match:
level = i
number_pattern = normalized
break
else:
# If no numbering pattern found in separate words, try the first word directly
first_word = words[0] if words else ""
for i, pattern in enumerate(self.number_patterns, 1):
match = re.match(pattern, first_word)
if match:
level = i
number_pattern = match.group(0).rstrip('.')
break
# If no number pattern is found, infer level from # count
if not number_pattern:
level = hash_count
return HeaderInfo(
line_number=line_num,
original_line=line,
hash_count=hash_count,
level=level,
number_pattern=number_pattern,
title_text=title_content
)
def find_hierarchy_problems(self, headers: List[HeaderInfo]) -> List[Dict]:
"""Find problems with mismatched # counts using adaptive analysis"""
problems = []
# First, analyze the document's adaptive level-to-hash mapping
level_hash_mapping = self._analyze_document_hash_pattern(headers)
# 1. Check for level-hash mismatch based on adaptive mapping
for header in headers:
if header.number_pattern: # Only check numbered headers
expected_hash_count = level_hash_mapping.get(header.level, header.level)
if header.hash_count != expected_hash_count:
problems.append({
'type': 'level_hash_mismatch',
'line': header.line_number + 1,
'level': header.level,
'current_hash': header.hash_count,
'expected_hash': expected_hash_count,
'title': header.title_text[:50],
'pattern': header.number_pattern,
'problem': f"Level {header.level} header '{header.number_pattern}' uses {header.hash_count} #, but document pattern suggests {expected_hash_count} #"
})
# 2. Check for parent-child hierarchy issues
for i in range(len(headers) - 1):
current = headers[i]
next_header = headers[i + 1]
# Only consider headers with a clear number pattern
if current.number_pattern and next_header.number_pattern:
# Check if the child header's # count is less than or equal to the parent header's
if next_header.level > current.level: # Child header
expected_parent_hash = level_hash_mapping.get(current.level, current.level)
expected_child_hash = level_hash_mapping.get(next_header.level, next_header.level)
if next_header.hash_count <= current.hash_count:
problems.append({
'type': 'hierarchy_violation',
'parent_line': current.line_number + 1,
'parent_level': current.level,
'parent_hash': current.hash_count,
'parent_title': current.title_text[:50],
'child_line': next_header.line_number + 1,
'child_level': next_header.level,
'child_hash': next_header.hash_count,
'child_title': next_header.title_text[:50],
'problem': f"Child header ({next_header.level} level) # count ({next_header.hash_count}) should be greater than parent header ({current.level} level, {current.hash_count} #). Expected pattern: parent {expected_parent_hash}#, child {expected_child_hash}#"
})
# 3. Check for significant inconsistency within same level (now less strict)
same_level_problems = self._find_same_level_inconsistency(headers)
problems.extend(same_level_problems)
return problems
def _find_same_level_inconsistency(self, headers: List[HeaderInfo]) -> List[Dict]:
"""Check the problem of inconsistent number of titles # numbers at the same level"""
problems = []
# Group by level, only numbered titles
level_groups = {}
for header in headers:
if header.number_pattern: # Only numbered titles
if header.level not in level_groups:
level_groups[header.level] = []
level_groups[header.level].append(header)
# Check the consistency of # numbers within each level
for level, group_headers in level_groups.items():
if len(group_headers) < 2:
continue # Only one header, no need to check
# Count the usage of different # numbers within the same level
hash_count_stats = {}
for header in group_headers:
hash_count = header.hash_count
if hash_count not in hash_count_stats:
hash_count_stats[hash_count] = []
hash_count_stats[hash_count].append(header)
# If there are different # numbers in the same level
if len(hash_count_stats) > 1:
# Find the most common # number as the standard
most_common_hash_count = max(hash_count_stats.keys(),
key=lambda x: len(hash_count_stats[x]))
# Report titles that do not meet the standard
for hash_count, headers_with_this_count in hash_count_stats.items():
if hash_count != most_common_hash_count:
for header in headers_with_this_count:
problems.append({
'type': 'same_level_inconsistency',
'line': header.line_number + 1,
'level': header.level,
'current_hash': header.hash_count,
'expected_hash': most_common_hash_count,
'title': header.title_text[:50],
'pattern': header.number_pattern,
'problem': f"{header.level} level header uses {header.hash_count} #, but the majority of siblings use {most_common_hash_count} #"
})
return problems
def fix_hierarchy(self, content: str) -> Dict[str,Any]:
"""Fix hierarchy issues"""
headers = self.detect_headers(content)
if not headers:
return {
'fixed_content': content,
'problems_found': [],
'fixes_applied': 0,
'message': 'No headers detected'
}
# Check for problems
problems = self.find_hierarchy_problems(headers)
if not problems:
return {
'fixed_content': content,
'problems_found': [],
'fixes_applied': 0,
'message': 'No hierarchy issues found'
}
# Apply fixes
lines = content.split('\n')
fixes_applied = 0
# To ensure child headers have more # than parent headers, we need to recalculate the # count for each header
fixed_headers = self._calculate_correct_hash_counts(headers)
# Apply fixes
for header in fixed_headers:
if header.hash_count != header.correct_hash_count:
old_line = lines[header.line_number]
new_hash = '#' * header.correct_hash_count
# Replace # part
new_line = re.sub(r'^#+', new_hash, old_line)
lines[header.line_number] = new_line
fixes_applied += 1
fixed_content = '\n'.join(lines)
return {
'fixed_content': fixed_content,
'original_content': content,
'problems_found': problems,
'fixes_applied': fixes_applied,
'fixed_headers': [(h.line_number + 1, h.hash_count, h.correct_hash_count, h.title_text[:30])
for h in fixed_headers if h.hash_count != h.correct_hash_count]
}
def _calculate_correct_hash_counts(self, headers: List[HeaderInfo]) -> List[HeaderInfo]:
"""Calculate the correct number of #'s based on adaptive analysis of the document"""
if not headers:
return []
# 1. Analyze the '#' usage pattern at each level of the document (adaptive analysis)
level_hash_mapping = self._analyze_document_hash_pattern(headers)
# Create copies with the correct number of #'s
fixed_headers: list[HeaderInfo] = []
for header in headers:
# Copy original information
fixed_header = HeaderInfo(
line_number=header.line_number,
original_line=header.original_line,
hash_count=header.hash_count,
level=header.level,
number_pattern=header.number_pattern,
title_text=header.title_text
)
if fixed_header.number_pattern:
# For numbered headers, use the adaptive mapping
if fixed_header.level in level_hash_mapping:
fixed_header.correct_hash_count = level_hash_mapping[fixed_header.level]
else:
# Fallback: extrapolate from existing pattern
fixed_header.correct_hash_count = self._extrapolate_hash_count(
fixed_header.level, level_hash_mapping)
else:
# For non-numbered headers, keep the original # count
fixed_header.correct_hash_count = fixed_header.hash_count
fixed_headers.append(fixed_header)
return fixed_headers
def _analyze_document_hash_pattern(self, headers: List[HeaderInfo]) -> Dict[int, int]:
"""Analyze the document's # pattern to determine the adaptive mapping"""
# Count the number of #'s used at each level
level_hash_stats = {}
for header in headers:
if header.number_pattern: # Only numbered titles are considered
level = header.level
hash_count = header.hash_count
if level not in level_hash_stats:
level_hash_stats[level] = {}
if hash_count not in level_hash_stats[level]:
level_hash_stats[level][hash_count] = 0
level_hash_stats[level][hash_count] += 1
# Find the most commonly used '#' count for each level
level_hash_mapping = {}
for level, hash_stats in level_hash_stats.items():
most_common_hash = max(hash_stats.keys(), key=lambda x: hash_stats[x])
level_hash_mapping[level] = most_common_hash
# Verify and adjust the mapping so that deeper levels always use more '#' characters
level_hash_mapping = self._ensure_monotonic_mapping(level_hash_mapping)
return level_hash_mapping
def _ensure_monotonic_mapping(self, level_hash_mapping: Dict[int, int]) -> Dict[int, int]:
"""Ensure that the level mapping is monotonically increasing (higher level = more #'s)"""
if not level_hash_mapping:
return level_hash_mapping
# Sort by level
sorted_levels = sorted(level_hash_mapping.keys())
adjusted_mapping = {}
# Ensure that the # count for each level is at least 1 more than the previous level
for i, level in enumerate(sorted_levels):
current_hash = level_hash_mapping[level]
if i == 0:
# First level, use as is
adjusted_mapping[level] = current_hash
else:
# Ensure at least 1 more # than the previous level
prev_level = sorted_levels[i-1]
min_required_hash = adjusted_mapping[prev_level] + 1
adjusted_mapping[level] = max(current_hash, min_required_hash)
return adjusted_mapping
def _extrapolate_hash_count(self, level: int, level_hash_mapping: Dict[int, int]) -> int:
"""Infer the number of # numbers for the hierarchy that have not appeared"""
if not level_hash_mapping:
return level # Fallback to simple 1:1 mapping
sorted_levels = sorted(level_hash_mapping.keys())
if level < sorted_levels[0]:
# Below the smallest known level: extrapolate downward
diff = sorted_levels[0] - level
return max(1, level_hash_mapping[sorted_levels[0]] - diff)
elif level > sorted_levels[-1]:
# Above the largest known level: extrapolate upward
diff = level - sorted_levels[-1]
return level_hash_mapping[sorted_levels[-1]] + diff
else:
# Between known levels: interpolate
for i in range(len(sorted_levels) - 1):
if sorted_levels[i] < level < sorted_levels[i + 1]:
# Simple linear interpolation
lower_level = sorted_levels[i]
upper_level = sorted_levels[i + 1]
lower_hash = level_hash_mapping[lower_level]
upper_hash = level_hash_mapping[upper_level]
# Linear interpolation
ratio = (level - lower_level) / (upper_level - lower_level)
return int(lower_hash + ratio * (upper_hash - lower_hash))
return level # Fallback
def _fix_same_level_inconsistency(self, headers: List[HeaderInfo]) -> None:
"""Fix inconsistency of # count at the same level"""
# Group by level, only process headers with a numbering pattern
level_groups = {}
for header in headers:
if header.number_pattern: # Only process headers with a numbering pattern
if header.level not in level_groups:
level_groups[header.level] = []
level_groups[header.level].append(header)
# Fix inconsistency of # count within each level
for level, group_headers in level_groups.items():
if len(group_headers) < 2:
continue # Only one header, no need to fix
# Count the usage of different # counts within the same level
hash_count_stats = {}
for header in group_headers:
hash_count = header.correct_hash_count
if hash_count not in hash_count_stats:
hash_count_stats[hash_count] = []
hash_count_stats[hash_count].append(header)
# If different # counts exist at the same level
if len(hash_count_stats) > 1:
# Find the most common # count as the standard
most_common_hash_count = max(hash_count_stats.keys(),
key=lambda x: len(hash_count_stats[x]))
# Normalize all headers at this level to the most commonly used '#' count
for header in group_headers:
header.correct_hash_count = most_common_hash_count

View File

@@ -0,0 +1,370 @@
"""Main application entry point for document processing."""
import asyncio
import json
import logging
import sys
import os
import traceback
from typing import Optional, List, Dict, Any
from contextlib import asynccontextmanager
from dataclasses import dataclass
import argparse
import datetime
from sqlalchemy import and_
from sqlalchemy.orm import sessionmaker
from app_config import ApplicationConfig, ServiceFactory
from business_layer import ProcessingContext
from document_task_processor import DocumentTaskProcessor
from task_processor import ProcessingStats, Task, TaskProcessor
from database import init_database,IndexObject,IndexJob
from utils import custom_serializer, init_current_data_directory,max_datetime_safe, min_datetime_safe
from blob_service import check_files, check_meta,load_metadata
from azure_index_service import index_init
@dataclass
class ApplicationContext:
"""Application context."""
config: ApplicationConfig
service_factory: ServiceFactory
database_engine: Any
logger: logging.Logger
class DocumentProcessingApplication:
"""Main class for document processing application."""
def __init__(self, config_path: str, env_path: str = "env.yaml"):
self.config_path = config_path
self.env_path = env_path
self.context: ApplicationContext = None # type: ignore
self.logger = logging.getLogger(__name__)
self.console_logger = logging.getLogger("data_preparation")
async def initialize(self):
"""Initialize the application."""
try:
# Load config - load environment and business config separately
config = ApplicationConfig.from_env_and_config_files(config_yaml_path=self.config_path, env_yaml_path=self.env_path)
config.validate()
# Set up logging
self._setup_app_logging()
# Create service factory
service_factory = ServiceFactory(config)
# Initialize database (create tables)
database_engine = init_database(config.database.uri)
self.logger.info("Database initialized successfully")
# Validate database engine
service_engine = service_factory.get_database_engine()
if database_engine.url != service_engine.url:
self.logger.warning("Database engines have different URLs, using init_database result")
database_engine = service_engine
# Create application context
self.context = ApplicationContext(config=config, service_factory=service_factory, database_engine=database_engine, logger=self.logger)
# Initialize task processor
self._initialize_task_processor()
self.console_logger.info("Application initialized successfully")
except Exception as e:
self.logger.error(f"Failed to initialize application: {e}")
raise
def _setup_app_logging(self):
self.console_logger.handlers = []
self.console_logger.setLevel(logging.DEBUG)
self.console_logger.propagate = False
# Console output - only show progress and key info
console_handler = logging.StreamHandler(sys.stdout)
console_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(console_formatter)
console_handler.setLevel(logging.DEBUG)
self.console_logger.addHandler(console_handler)
def _setup_logging(self, log_file: str = '~'):
"""Set up logging configuration."""
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
# Remove existing handlers
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
file_path = f"{log_file}/.chunked/.run.log"
# File output - log all details
os.makedirs(os.path.dirname(file_path), exist_ok=True)
file_handler = logging.FileHandler(file_path, encoding='utf-8')
file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)
file_handler.setLevel(logging.INFO)
root_logger.addHandler(file_handler)
self.console_logger.addHandler(file_handler)
async def _initialize_datasource(self, data_config: Dict[str, Any]) -> Dict[str, Any]:
"""Initialize datasource."""
try:
self.console_logger.info("Loading metadata from blob storage...")
sorted_list = await asyncio.to_thread(load_metadata, data_config["data_path"], self.context.config.current_tmp_directory, data_config["data_dir"])
doc_metadata_map: dict[str, dict[str, Any]] = {}
for item in sorted_list:
key = item["filepath"]
# Assume there is a timestamp field, keep the latest
if key not in doc_metadata_map or item.get("timestamp", 0) > doc_metadata_map[key].get("timestamp", 0):
doc_metadata_map[key] = item
datasource = {"metadata": doc_metadata_map}
self.console_logger.info(f"Loaded {len(doc_metadata_map)} metadata entries")
return datasource
except Exception as e:
self.logger.error(f"Error initializing datasource: {e}")
raise
def _initialize_task_processor(self):
"""Initialize task processor (basic init only)."""
if not self.context:
raise RuntimeError("Application context not initialized")
# Basic task processor config, actual processor will be created per data config
self.logger.info("Task processor configuration initialized")
async def run(self):
"""Run the application."""
if not self.context:
raise RuntimeError("Application not initialized")
try:
self.console_logger.info("Starting document processing application")
for i, data_config in enumerate(self.context.config.data_configs, 1):
self.console_logger.info(f"Processing data source {i}/{len(self.context.config.data_configs)}")
await self._process_data_config(data_config)
self.console_logger.info("Document processing application completed")
except Exception as e:
self.logger.error(f"Application error: {e}")
raise
async def _process_data_config(self, data_config: Dict[str, Any]):
"""Process a single data config."""
data_path = data_config.get('data_path', '/')
self.console_logger.info(f"Processing data source: {data_path}")
if not self.context:
raise RuntimeError("Application context not initialized")
try:
base_path: str = data_config.get('base_path', '')
self.context.config.current_tmp_directory = init_current_data_directory(base_path)
self._setup_logging(self.context.config.current_tmp_directory)
# 1. Initialize datasource (load metadata)
datasource = await self._initialize_datasource(data_config)
# 2. Get objects to process
objects_to_process = await self._get_objects_to_process(data_config)
if not objects_to_process:
self.console_logger.info("No new documents to process")
return
self.console_logger.info(f"Found {len(objects_to_process)} documents to process")
# 3. Initialize search index schema (ensure search index is created and configured)
await self._initialize_search_index(data_config, self.context.config)
# 4. Create task processor with datasource
task_processor_impl = DocumentTaskProcessor(config=self.context.config, service_factory=self.context.service_factory, tmp_directory=self.context.config.current_tmp_directory, database_engine=self.context.database_engine, logger=self.logger, datasource=datasource,data_config=data_config)
# 5. Task processor
simple_processor = TaskProcessor(task_processor=task_processor_impl, max_workers=self.context.config.processing.max_workers, logger=self.console_logger, database_engine=self.context.database_engine,data_config=data_config)
# Create tasks
tasks = self._create_tasks(objects_to_process, data_config,self.context.config)
self.console_logger.info(f"Starting processing of {len(tasks)} tasks")
# Synchronously process all tasks
await asyncio.to_thread(simple_processor.process_tasks, tasks)
# Get processing stats
stats = ProcessingStats(total_tasks=simple_processor.total_tasks, completed_tasks=simple_processor.completed_tasks, failed_tasks=simple_processor.failed_tasks, start_time=simple_processor.start_time or datetime.datetime.now())
self.console_logger.info(json.dumps(stats, ensure_ascii=False, default=custom_serializer))
# Update job status
datasource_name = data_config.get("datasource_name", "default")
await self._update_index_job_status(stats, datasource_name)
except Exception as e:
self.console_logger.error(f"Error processing data config: {traceback.format_exc()}")
self.console_logger.error(f"Error processing data config: {str(e)}")
raise
async def _get_objects_to_process(self, data_config: Dict[str, Any]) -> List[IndexObject]:
"""Get objects to process."""
try:
# 1. Get last successful processing time from database
datasource_name = data_config.get("datasource_name", "default")
Session = sessionmaker(bind=self.context.database_engine)
session = Session()
try:
last_success_doc_job = session.query(IndexJob).filter(
and_(
IndexJob.status == "success",
IndexJob.doc_upper_time.is_not(None),
IndexJob.datasource_name == datasource_name
)
).order_by(IndexJob.id.desc()).first()
last_success_meta_job = session.query(IndexJob).filter(
and_(
IndexJob.status == "success",
IndexJob.metadata_upper_time.is_not(None),
IndexJob.datasource_name == datasource_name
)
).order_by(IndexJob.id.desc()).first()
doc_upper_time = last_success_doc_job.doc_upper_time if last_success_doc_job and last_success_doc_job.doc_upper_time else None
metadata_upper_time = last_success_meta_job.metadata_upper_time if last_success_meta_job and last_success_meta_job.metadata_upper_time else None
self.console_logger.info(f"Checking for updates in datasource '{datasource_name}' since doc: {doc_upper_time}, metadata: {metadata_upper_time}")
finally:
session.close()
# 2. Check file updates (only get files updated after baseline)
new_files = await asyncio.to_thread(check_files, data_config["data_path"], doc_upper_time)
# 3. Check metadata updates (only get metadata updated after baseline)
new_metas:list[dict[Any, Any]] = await asyncio.to_thread(check_meta, data_config["data_path"], metadata_upper_time, self.context.config.current_tmp_directory, data_config["data_dir"])
self.console_logger.info(f"Found {len(new_files)} updated files and {len(new_metas)} updated metadata entries")
# Limit the workload according to process_file_num: keep only files that have a matching metadata entry (by name), truncate to process_file_num files, then keep only the metadata entries for the remaining files
if data_config["process_file_num"]>0:
new_files = [file_info for file_info in new_files if file_info["name"] in {meta["name"] for meta in new_metas}]
if len(new_files) > data_config["process_file_num"]:
new_files = new_files[:data_config["process_file_num"]]
# Filter new_metas according to the latest number of new_files
new_metas = [meta_info for meta_info in new_metas if meta_info["name"] in {file_info["name"] for file_info in new_files}]
self.console_logger.info(f"After filtering, {len(new_files)} files and {len(new_metas)} metadata entries to process")
# 4. Merge file and metadata info, create processing objects
objects_to_process:list[IndexObject] = []
for file_info in new_files:
index_object = IndexObject(object_key=file_info["name"], type="document", doc_modifed_time=file_info.get("doc_upper_time"))
objects_to_process.append(index_object)
for meta_info in new_metas:
existing_obj = next((obj for obj in objects_to_process if obj.object_key == meta_info["name"]), None)
if existing_obj:
existing_obj.metadata_modifed_time = meta_info.get("meta_upper_time")
else:
index_object = IndexObject(object_key=meta_info["name"], type="document", metadata_modifed_time=meta_info.get("meta_upper_time"))
objects_to_process.append(index_object)
# 5. If there are objects to process, create a new job record
if objects_to_process:
await self._create_index_job(objects_to_process, data_config.get("datasource_name", "default"))
return objects_to_process
except Exception as e:
self.logger.error(f"Error getting objects to process: {e}")
raise
async def _create_index_job(self, objects_to_process: List[IndexObject], datasource_name: str):
"""Create index job record."""
try:
Session = sessionmaker(bind=self.context.database_engine)
session = Session()
try:
index_job_db = IndexJob(
start_time=datetime.datetime.now(datetime.timezone.utc),
status="processing",
total_process_count=len(objects_to_process),
datasource_name=datasource_name
)
for index_object in objects_to_process:
index_job_db.doc_upper_time = max_datetime_safe(index_object.doc_modifed_time, index_job_db.doc_upper_time)
index_job_db.doc_lower_time = min_datetime_safe(index_object.doc_modifed_time, index_job_db.doc_lower_time)
index_job_db.metadata_upper_time = max_datetime_safe(index_object.metadata_modifed_time, index_job_db.metadata_upper_time)
index_job_db.metadata_lower_time = min_datetime_safe(index_object.metadata_modifed_time, index_job_db.metadata_lower_time)
# Set datasource_name for each index object
index_object.datasource_name = datasource_name
session.add(index_job_db)
session.commit()
self.console_logger.info(f"Created processing job for {len(objects_to_process)} objects in datasource: {datasource_name}")
finally:
session.close()
except Exception as e:
self.console_logger.error(f"Error creating index job: {e}")
raise
async def _update_index_job_status(self, stats: ProcessingStats, datasource_name: str = "default"):
"""Update index job status."""
try:
Session = sessionmaker(bind=self.context.database_engine)
session = Session()
try:
current_job = session.query(IndexJob).filter(
and_(
IndexJob.status == "processing",
IndexJob.datasource_name == datasource_name
)
).order_by(IndexJob.id.desc()).first()
if current_job:
if stats.failed_tasks == 0 and stats.completed_tasks == stats.total_tasks:
current_job.status = "success"
elif stats.completed_tasks > 0 and stats.failed_tasks > 0:
current_job.status = "partial_success"
else:
current_job.status = "failed"
current_job.end_time = datetime.datetime.now(datetime.timezone.utc)
current_job.success_count = stats.completed_tasks
current_job.failed_count = stats.failed_tasks
session.commit()
self.console_logger.info(f"Job completed for datasource '{datasource_name}': {current_job.status}")
finally:
session.close()
except Exception as e:
self.console_logger.error(f"Error updating job status: {e}")
def _create_tasks(self, objects: List[IndexObject], data_config: Dict[str, Any], config: ApplicationConfig) -> List[Task]:
"""Create task list."""
tasks:list[Task] = []
datasource_name = data_config.get("datasource_name", "default")
for obj in objects:
context = ProcessingContext(
object_key=obj.object_key,
data_config=data_config,
metadata={
"doc_modified_time": obj.doc_modifed_time,
"metadata_modified_time": obj.metadata_modifed_time
},
current_tmp_directory=self.context.config.current_tmp_directory,
datasource_name=datasource_name,
config=config
)
            task = Task(id=obj.object_key, payload=context, priority=0)
tasks.append(task)
return tasks
async def shutdown(self):
"""Shutdown application."""
self.console_logger.info("Application shutdown completed")
@asynccontextmanager
async def application_context(self):
"""Application context manager."""
await self.initialize()
try:
yield self
finally:
await self.shutdown()
async def _initialize_search_index(self, data_config: Dict[str, Any],applicationconfig: ApplicationConfig):
"""Initialize search index schema, ensure search index is created and configured."""
try:
self.console_logger.info("Initializing search index schema...")
await asyncio.to_thread(index_init, data_config, applicationconfig.azure_services.search_admin_key, applicationconfig.azure_services.search_service_name)
self.console_logger.info("Search index schema initialized successfully")
except Exception as e:
self.console_logger.error(f"Error initializing search index: {e}")
raise
async def main():
"""Main function."""
parser = argparse.ArgumentParser(description="Document Processing Application (Refactored)")
parser.add_argument("--config", type=str, default="config.yaml", help="Business configuration file path")
parser.add_argument("--env", type=str, default="env.yaml", help="Environment variables file path")
parser.add_argument("--log-level", type=str, default="INFO", help="Log level")
args = parser.parse_args()
app = DocumentProcessingApplication(args.config, args.env)
try:
async with app.application_context():
await app.run()
except KeyboardInterrupt:
print("Application interrupted by user")
except Exception as e:
print(f"Application error: {e}")
sys.exit(1)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,9 @@
# Prompt
caption:
en:
system: "yaml You are a captioning model that helps uses find descriptive captions."
user: "yaml Describe this image as if you were describing it to someone who can't see it."
"zh-Hans":
system: "yaml 您是一个帮助用户寻找描述性字幕的字幕模型。"
user: "yaml 描述此图像就像您将其描述给看不见的人一样。"

View File

@@ -0,0 +1,37 @@
[project]
name = "data preparation"
version = "0.1.0"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"azure-identity == 1.15.0",
"openai == 1.55.3",
"azure-search-documents == 11.4.0b6",
"azure-storage-blob == 12.17.0",
"python-dotenv == 1.0.0",
"httpx",
"azure-ai-documentintelligence",
"azure-ai-formrecognizer == 3.3.0",
"markdown",
"tqdm",
"PyMuPDF",
"tiktoken",
"langchain",
"bs4",
"urllib3",
"six",
"pdf2image",
"opencv-python",
"Pillow",
"chardet",
"SQLAlchemy == 2.0.41",
]
[project.optional-dependencies]
test = ["pytest", "pytest-asyncio"]
dev = []
[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"

View File

@@ -0,0 +1,30 @@
azure-identity==1.15.0
openai==1.55.3
azure-search-documents==11.5.0
azure-storage-blob==12.17.0
python-dotenv==1.0.0
httpx
azure-ai-documentintelligence
azure-ai-formrecognizer==3.3.0
markdown
tqdm
PyMuPDF
tiktoken
langchain
langchain-openai
langchain-core
langchain-community
bs4
urllib3
pytest
pytest-asyncio
six
pdf2image
opencv-python
Pillow
chardet
SQLAlchemy==2.0.41
psycopg2==2.9.10
pyyaml==6.0.2
uuid6==2025.0.1
dataclasses-json==0.6.7

View File

@@ -0,0 +1,209 @@
"""
Resilient HTTP Connection Pool Manager
"""
import atexit
import threading
from contextlib import contextmanager
from typing import Dict, Generator
import httpx
class ResilientConnectionManager:
"""
    Resilient connection manager that maintains persistent connection pools per service profile.
"""
def __init__(self):
self._connection_pools: Dict[str, httpx.Client] = {}
self._pool_lock = threading.Lock()
self._is_closed = False
        # Register resource cleanup for program exit
atexit.register(self._cleanup_all_pools)
def get_persistent_client(self, service_profile: str = "standard") -> httpx.Client:
"""
Get persistent client - main interface
Args:
            service_profile: Service profile name, one of:
- "standard": General API (60s timeout)
- "cloud_api": Cloud API (120s timeout, suitable for Azure)
- "ai_inference": AI Reasoning Services (180s timeout, suitable for OpenAI/VLLM)
- "batch_processing": Batch Processing Services (300s timeout)
"""
if self._is_closed:
raise RuntimeError("Connection manager is closed")
if service_profile not in self._connection_pools:
with self._pool_lock:
# Double-checked locking pattern
if service_profile not in self._connection_pools:
self._connection_pools[service_profile] = self._create_optimized_client(service_profile)
return self._connection_pools[service_profile]
def _create_optimized_client(self, service_profile: str) -> httpx.Client:
"""Create an optimized client based on the service profile"""
# Service profile mapping
profile_configs = {
"standard": {
"timeout": 60.0,
"max_keepalive": 10,
"max_connections": 50,
"description": "General API Services"
},
"cloud_api": {
"timeout": 120.0,
"max_keepalive": 8,
"max_connections": 25,
"description": "Cloud API Services (Azure Search, Storage ...)"
},
"ai_inference": {
"timeout": 180.0,
"max_keepalive": 5,
"max_connections": 15,
"description": "AI Reasoning Services (OpenAI, VLLM ...)"
},
"batch_processing": {
"timeout": 300.0,
"max_keepalive": 3,
"max_connections": 10,
"description": "Batch processing and long-term tasks"
}
}
config = profile_configs.get(service_profile, profile_configs["standard"])
return httpx.Client(
timeout=config["timeout"],
limits=httpx.Limits(
max_keepalive_connections=config["max_keepalive"],
max_connections=config["max_connections"],
keepalive_expiry=300 # 5 minutes to keep alive
),
follow_redirects=True,
            verify=False  # NOTE: TLS certificate verification is disabled; acceptable only for trusted internal endpoints
)
@contextmanager
def resilient_session(self, service_profile: str = "standard"):
"""
        Resilient session context manager - recommended for retry scenarios
Example of usage:
with connection_manager.resilient_session("ai_inference") as client:
for retry in range(3):
response = client.post(...)
"""
client = self.get_persistent_client(service_profile)
# Directly return the client without using the with statement
# Because the client is already managed in the connection pool, no additional context management is needed
try:
yield client
finally:
# Do not close the client here, keep the connection pool alive
pass
def get_pool_statistics(self) -> Dict[str, Dict]:
"""Get connection pool statistics - for monitoring"""
stats = {}
with self._pool_lock:
for profile, client in self._connection_pools.items():
try:
# httpx internal connection pool information
pool_info = {
"is_closed": client.is_closed,
"timeout": str(client.timeout),
"max_connections": client._transport._pool._pool_factory.limits.max_connections, # type: ignore
"profile": profile
}
stats[profile] = pool_info
except Exception:
stats[profile] = {"error": "Statistical information cannot be obtained"}
return stats
def force_refresh_pool(self, service_profile: str):
"""Force refresh the specified connection pool - for fault recovery"""
with self._pool_lock:
if service_profile in self._connection_pools:
try:
self._connection_pools[service_profile].close()
except Exception:
pass
del self._connection_pools[service_profile]
def _cleanup_all_pools(self):
"""Clean all connection pools - Memory security"""
with self._pool_lock:
if not self._is_closed:
for profile, client in list(self._connection_pools.items()):
try:
client.close()
except Exception:
pass # Ignore errors during cleaning
self._connection_pools.clear()
self._is_closed = True
# =============================================================================
# Global instances and convenient interfaces
# =============================================================================
# Global resilient connection manager
_resilient_manager = ResilientConnectionManager()
# Main public interface
def get_persistent_http_client(service_profile: str = "standard") -> httpx.Client:
"""
Get persistent HTTP client - main interface
Recommended service configuration profiles:
- "standard": generic API
- "cloud_api": Azure/cloud service API
- "ai_inference": OpenAI/VLLM etc. AI services
- "batch_processing": long-term batch processing tasks
"""
return _resilient_manager.get_persistent_client(service_profile)
def resilient_http_session(service_profile: str = "standard"):
"""
    Resilient HTTP session context manager - recommended for retry logic
Example of usage:
with resilient_http_session("ai_inference") as client:
for retry in range(3):
response = client.post(endpoint, json=data)
"""
return _resilient_manager.resilient_session(service_profile)
def get_connection_pool_stats() -> Dict[str, Dict]:
"""Get connection pool statistics"""
return _resilient_manager.get_pool_statistics()
def refresh_connection_pool(service_profile: str):
"""Refresh the specified connection pool"""
_resilient_manager.force_refresh_pool(service_profile)
# =============================================================================
# Convenient dedicated client interfaces - more intuitive naming
# =============================================================================
def get_standard_client() -> httpx.Client:
"""Get the standard client (generic HTTP request)"""
return get_persistent_http_client("standard")
def get_cloud_api_client() -> httpx.Client:
"""Get dedicated cloud API clients (Azure Search, Storage, etc.)"""
return get_persistent_http_client("cloud_api")
def get_ai_inference_client() -> httpx.Client:
"""Get AI Inference Dedicated Clients (OpenAI, VLLM, etc.)"""
return get_persistent_http_client("ai_inference")
def get_batch_processing_client() -> httpx.Client:
"""Get a batch-specific client (long-term task)"""
return get_persistent_http_client("batch_processing")

View File

@@ -0,0 +1,243 @@
import time
from typing import List, Any, Optional, Dict
import logging
from dataclasses import dataclass, field
import json
import datetime
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from abc import ABC, abstractmethod
from sqlalchemy import and_
from sqlalchemy.orm import sessionmaker
from database import IndexJobStatus, IndexJob
from utils import custom_serializer
@dataclass
class Task:
"""Task object"""
id: str
payload: Any
priority: int = 0
status: IndexJobStatus = IndexJobStatus.PENDING
created_at: float = field(default_factory=time.time)
started_at: Optional[float] = None
completed_at: Optional[float] = None
error: Optional[Exception] = None
result: Any = None
def __lt__(self, other):
"""Used for priority queue sorting"""
return self.priority > other.priority
@dataclass
class ProcessingStats:
"""Processing statistics information"""
total_tasks: int = 0
completed_tasks: int = 0
failed_tasks: int = 0
cancelled_tasks: int = 0
average_processing_time: float = 0.0
throughput: float = 0.0 # Number of tasks processed per second
    start_time: datetime.datetime = field(default_factory=datetime.datetime.now)
@property
def success_rate(self) -> float:
"""Success rate"""
if self.total_tasks == 0:
return 0.0
return self.completed_tasks / self.total_tasks
@property
def pending_tasks(self) -> int:
"""Number of pending tasks"""
return self.total_tasks - self.completed_tasks - self.failed_tasks - self.cancelled_tasks
@property
def elapsed_time(self) -> float:
"""Elapsed time"""
time_diff = datetime.datetime.now() - self.start_time
return time_diff.total_seconds()
@property
def eta(self) -> float:
"""Estimated remaining time"""
if self.completed_tasks == 0:
return 0.0
rate = self.completed_tasks / self.elapsed_time
if rate == 0:
return 0.0
return self.pending_tasks / rate
class TaskProcessorInterface(ABC):
@abstractmethod
def process(self, task: Task) -> Any:
pass
class TaskProcessor:
"""Task processor"""
def __init__(self,
task_processor: TaskProcessorInterface,
max_workers: int = 4,
logger: Optional[logging.Logger] = None,
database_engine: Optional[Any] = None,
data_config:Optional[dict[str,Any]] = None):
if data_config is None:
raise ValueError("data_config must be provided")
self.task_processor = task_processor
self.max_workers = max_workers
self.logger = logger or logging.getLogger(__name__)
self.database_engine = database_engine
# Simple statistics
self.total_tasks = 0
self.completed_tasks = 0
self.failed_tasks = 0
self.start_time:datetime.datetime|None = None
# Processing report collection
self.processing_reports: List[Dict[str, Any]] = []
# Control variable
self.should_stop = False
self.data_config = data_config
self.datasource_name: str = data_config.get("datasource_name", "default")
def process_tasks(self, tasks: List[Any]) -> None:
"""Process task list - simple and effective"""
self.total_tasks = len(tasks)
self.completed_tasks = 0
self.failed_tasks = 0
self.start_time = datetime.datetime.now()
self.processing_reports = []
self.logger.info(f"Starting to process {self.total_tasks} tasks")
# Use thread pool to process tasks
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all tasks
future_to_task = {executor.submit(self._process_single_task, task): task
for task in tasks}
# Wait for tasks to complete
for future in as_completed(future_to_task):
if self.should_stop:
break
task = future_to_task[future]
try:
result = future.result()
self.completed_tasks += 1
# Record successful processing report
report:dict[str,Any] = { 'task_id': getattr(task, 'id', 'unknown'), 'status': 'success', 'message': getattr(result, 'message', 'Processing completed'), 'chunks_count': getattr(result, 'chunks_count', 0), 'processing_time': getattr(result, 'processing_time', 0) }
self.processing_reports.append(report)
                    # Log progress after each completed task
self._log_progress()
except Exception:
self.failed_tasks += 1
self.logger.error(f"Task processing failed: {traceback.format_exc()}")
# Record failed processing report
report = { 'task_id': getattr(task, 'id', 'unknown'), 'status': 'failed', 'error': traceback.format_exc(), 'processing_time': 0 }
self.processing_reports.append(report)
# Output final statistics
self.finalize_job_status_and_log()
def _process_single_task(self, task: Any) -> Any:
"""Process a single task"""
return self.task_processor.process(task)
def get_processing_reports(self) -> List[Dict[str, Any]]:
"""Get processing reports"""
return self.processing_reports
def _log_progress(self) -> None:
"""Output progress information (estimate remaining time based on average time per processed document)"""
if self.start_time is None:
return
elapsed = (datetime.datetime.now() - self.start_time).total_seconds() if self.start_time else 0
total_processed = self.completed_tasks + self.failed_tasks
remaining = self.total_tasks - total_processed
# Total processing time for processed tasks
total_processing_time = sum(r.get('processing_time', 0) for r in self.processing_reports)
avg_processing_time = (total_processing_time / total_processed) if total_processed > 0 else 0
eta = avg_processing_time * remaining
if total_processed > 0:
rate = total_processed / elapsed if elapsed > 0 else 0
self.logger.info(
f"Progress: {total_processed}/{self.total_tasks} "
f"({100.0 * total_processed / self.total_tasks:.1f}%) "
f"Success: {self.completed_tasks} Failed: {self.failed_tasks} "
f"Rate: {rate:.2f} tasks/second "
f"Average time: {avg_processing_time:.2f} seconds/task "
f"Estimated remaining: {eta / 60:.1f} minutes"
)
def finalize_job_status_and_log(self) -> None:
"""Statistics, write IndexJob status, and output all log details."""
elapsed = (datetime.datetime.now() - self.start_time).total_seconds() if self.start_time else 0
success_count = self.completed_tasks
fail_count = self.failed_tasks
total_count = self.total_tasks
success_rate = (success_count / total_count * 100) if total_count > 0 else 0.0
status = IndexJobStatus.FAILED.value
if total_count == success_count:
status = IndexJobStatus.SUCCESS.value
elif success_count > 0 and fail_count > 0:
status = IndexJobStatus.PARTIAL_SUCCESS.value
report:dict[str,Any] = {
"status": status,
"success_rate": f"{success_rate:.4f}%",
"total_tasks": total_count,
"completed": success_count,
"failed": fail_count,
"start_time": self.start_time,
"end_time": datetime.datetime.now(datetime.timezone.utc),
"processing_time": f"{elapsed:.4f} sec",
"total_elapsed": f"{elapsed / 3600:.4f} hours ",
"average_speed": f"{total_count / elapsed:.5f} tasks/sec" if elapsed > 0 else "average speed: 0 tasks/sec"
}
# Database write section
if self.database_engine:
try:
Session = sessionmaker(bind=self.database_engine)
session = Session()
try:
current_job = session.query(IndexJob).filter(and_(IndexJob.status == "processing",IndexJob.datasource_name==self.datasource_name)).order_by(IndexJob.id.desc()).first()
if current_job:
setattr(current_job, 'finished_time', report["end_time"])
                        setattr(current_job, 'success_object_count', success_count)
setattr(current_job, 'failed_object_count', fail_count)
setattr(current_job, 'detailed_message', json.dumps(report, default=custom_serializer, ensure_ascii=False))
session.commit()
self.logger.info(f"IndexJob status updated: {current_job.status}, Success: {current_job.success_object_count}, Failed: {current_job.failed_object_count}")
else:
self.logger.warning("No IndexJob record with processing status found")
finally:
session.close()
except Exception as e:
self.logger.error(f"Failed to update IndexJob status: {e}")
# Output merged report content
self.logger.info(f"Final report: {json.dumps(report, default=custom_serializer, ensure_ascii=False)}")
if self.processing_reports:
success_reports = [r for r in self.processing_reports if r['status'] == 'success']
failed_reports = [r for r in self.processing_reports if r['status'] == 'failed']
if success_reports:
total_chunks = sum(r.get('chunks_count', 0) for r in success_reports)
avg_processing_time = sum(r.get('processing_time', 0) for r in success_reports) / len(success_reports)
self.logger.info(f"Success reports: {len(success_reports)} tasks, total {total_chunks} chunks, average processing time {avg_processing_time:.2f} sec")
if failed_reports:
self.logger.error(f"Failed reports: {len(failed_reports)} tasks")
for r in failed_reports[:5]:
self.logger.error(f" - {r['task_id']}: {r['error']}")

View File

@@ -0,0 +1,78 @@
"""
Third-level heading recommendation: report the "#" depth most frequently used for third-level headings
"""
from collections import Counter
from typing import Dict, Any, List
import re
def get_third_level_hash_counts_simple(content: str) -> List[int]:
hash_counts = []
in_code_block = False
for line in content.split('\n'):
line = line.strip()
if not line:
continue
# Processing code blocks
if line.startswith('```'):
in_code_block = not in_code_block
continue
if in_code_block:
continue
# Match the title line: #+ space Content
match = re.match(r'^(#{1,6})\s+(.+)$', line)
if match:
hash_count = len(match.group(1))
title_text = match.group(2).strip()
# Check if it is a third-level heading - supports two formats:
# 1. Traditional numeric format: "1.2.3", "1 . 2 . 3", "1. 2. 3", etc.
# 2. Letter+number format: "A.1.2.3" (treat A.x.x.x as a third-level heading)
is_third_level = False
# Traditional numeric third-level format: x.x.x
if re.match(r'^\d+\s*\.\s*\d+\s*\.\s*\d+(?:\s|$|[^\d\.])', title_text):
is_third_level = True
# Letter+number third-level format: A.x.x.x (treat as third-level heading)
elif re.match(r'^[A-Z]\.\d+\.\d+\.\d+(?:\s|$|[^\d\.])', title_text):
is_third_level = True
if is_third_level:
hash_counts.append(hash_count)
return hash_counts
def get_recommended_hash_count_simple(content: str) -> Dict[str, Any]:
hash_counts = get_third_level_hash_counts_simple(content)
if not hash_counts:
return {
'recommendation': 5, # Default value
'reason': 'No third-level headings detected, using default value',
'statistics': {},
'total_count': 0
}
# Count the frequency of various # usage
usage_stats = Counter(hash_counts)
# Select the most frequently used # count
most_common = usage_stats.most_common(1)[0]
recommended_hash_count = most_common[0]
frequency = most_common[1]
total_count = len(hash_counts)
percentage = frequency / total_count * 100
return {
'recommendation': recommended_hash_count,
'reason': f'Most frequently used: {frequency}/{total_count} times ({percentage:.1f}%)',
'statistics': dict(usage_stats),
'total_count': total_count
}
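# -----------------------------------------------------------------------------
# Usage sketch (illustrative only): recommend the "#" depth for third-level
# headings in a small markdown sample.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    sample_markdown = """
# 1 Scope
## 1.1 General
### 1.1.1 Definitions
### 1.1.2 Abbreviations
#### A.1.2.3 Annex requirement
"""
    print(get_recommended_hash_count_simple(sample_markdown))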

View File

@@ -0,0 +1,334 @@
import shutil
from dataclasses import fields
import json
import os
import logging
from datetime import datetime
from decimal import Decimal
import random
from typing import Any, List, Optional, Union
import string
from PIL import Image
import tiktoken
from PIL.Image import Resampling
from entity_models import Document, FigureFlat
class TokenEstimator(object):
GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")
def estimate_tokens(self, text: str) -> int:
return len(self.GPT2_TOKENIZER.encode(text, allowed_special="all"))
def construct_tokens_with_size(self, tokens: str, numofTokens: int) -> str:
newTokens = self.GPT2_TOKENIZER.decode(
self.GPT2_TOKENIZER.encode(tokens, allowed_special="all")[:numofTokens]
)
return newTokens
TOKEN_ESTIMATOR = TokenEstimator()
def generate_random_name(length:int=12):
# Characters to use: letters and digits
characters = string.ascii_letters + string.digits
# Randomly select `length` characters
folder_name = ''.join(random.choices(characters, k=length))
return folder_name
def asdict_with_dynamic(obj:Any) -> dict[str, Any]:
"""Returns a dictionary containing dynamic attributes"""
# Use predefined fields as the basis
result = {f.name: getattr(obj, f.name) for f in fields(obj)}
# Add dynamic attributes
all_attrs = dir(obj)
predefined_attrs = [f.name for f in fields(obj)]
for attr in all_attrs:
# Skip special attributes, private attributes, methods, and predefined attributes
if (
not attr.startswith("__")
and not callable(getattr(obj, attr))
and attr not in predefined_attrs
):
result[attr] = getattr(obj, attr)
return result
def write_log(message: str):
"""Write log message (INFO level) to data_preparation logger."""
logging.getLogger("data_preparation").info(msg=message)
def init_current_data_directory(base_path:str) -> str:
"""Initialize the current data directory and return its path."""
folder_name = generate_random_name(10)
if base_path == "":
base_path = os.path.expanduser("~")
# Create the directory path
local_data_folder = os.path.join(base_path , "doc-extractor", folder_name)
os.makedirs(local_data_folder, exist_ok=True)
return local_data_folder
def write_content(content: str, directory_path: str, file_name: str):
"""Write merged content to a markdown file in the .extracted directory, and optionally upload to blob storage."""
output_folder = directory_path + "/.extracted/" + file_name
os.makedirs(f"{output_folder}", exist_ok=True)
with open(f"{output_folder}/_merged.md", "w", encoding="utf-8") as file:
file.write(content)
print(f"Merged Saved: {output_folder}/_merged.md")
def write_object(obj: Any, directory_path: str, file_name: str):
"""Write a dictionary to a JSON file in the specified directory."""
output_folder = directory_path + "/.extracted/" + file_name
os.makedirs(f"{output_folder}", exist_ok=True)
with open(f"{output_folder}/_merged.json", "w", encoding="utf-8") as file:
json.dump(obj, file, indent=4, ensure_ascii=False, default=custom_serializer)
print(f"Dict Saved: {output_folder}/_merged.json")
def write_document(documents: list[Document], file_path: str, directory_path: str, rel_file_path: str):
"""Write the parsed document list to a JSON file in the specified directory."""
chunks_save = []
for chunk_idx, chunk_doc in enumerate(documents):
chunk_doc.filepath = rel_file_path
chunk_doc.metadata = json.dumps({"chunk_id": str(chunk_idx)})
chunk_doc.image_mapping = json.dumps(chunk_doc.image_mapping) if chunk_doc.image_mapping else None
chunks_save.append(asdict_with_dynamic(chunk_doc))
output_folder = directory_path + "/.chunked"
os.makedirs(f"{output_folder}", exist_ok=True)
with open(f"{output_folder}/{rel_file_path}.json", "w", encoding="utf-8") as file:
file.write(json.dumps(chunks_save, indent=4, ensure_ascii=False))
print(f"Processed {file_path} to {len(documents)} chunks. Document Schema: {documents[0].document_schema}")
print(f"Saved Result: {output_folder}/{rel_file_path}.json")
# Custom serializer function
def custom_serializer(obj:Any)->Any:
"""Handle types that cannot be serialized by JSON"""
if isinstance(obj, datetime):
return obj.isoformat() # Convert to ISO 8601 string
elif isinstance(obj, Decimal):
return float(obj) # Decimal to float
elif hasattr(obj, '__dict__'):
return obj.__dict__ # Class object to dict
else:
raise TypeError(f"Type {type(obj)} cannot be JSON serialized")
def keep_latest(data_list: list[dict[str, Any]], id_key: str, timestamp_key: Optional[str] = None) -> list[dict[str, Any]]:
"""
Advanced method to keep the latest records
Args:
data_list: List of dictionaries containing records
id_key: Key to identify the entity
timestamp_key: Timestamp key (optional, if not provided, keep the last occurrence)
Returns:
List of the latest records for each entity
"""
latest_dict = {}
for idx, record in enumerate(data_list):
entity_id = record[id_key]
# If no timestamp, keep the last occurrence by position
if timestamp_key is None or timestamp_key not in record:
# Record index to handle same id cases
latest_dict[entity_id] = (idx, record)
continue
current_time = record[timestamp_key]
# If the current record is newer, update
if entity_id not in latest_dict or current_time > latest_dict[entity_id][1][timestamp_key]:
latest_dict[entity_id] = (idx, record)
# Sort by original position (optional)
return [record for _, record in sorted(latest_dict.values(), key=lambda x: x[0])]
def max_datetime_safe(
dt1: Union[datetime, None],
dt2: Union[datetime, None]
) -> Union[datetime, None]:
"""
Safely get the maximum of two datetimes, handling None values
Args:
dt1: First datetime (may be None)
dt2: Second datetime (may be None)
Returns:
The maximum datetime, or None if both are None
"""
if dt1 is None:
return dt2
if dt2 is None:
return dt1
return max(dt1, dt2)
def min_datetime_safe(
dt1: Union[datetime, None],
dt2: Union[datetime, None]
) -> Union[datetime, None]:
"""
Safely get the minimum of two datetimes, handling None values
Rules:
- Both datetimes are None → return None
- One datetime is None → return the other
- Both datetimes are not None → return the smaller one
Args:
dt1: First datetime (may be None)
dt2: Second datetime (may be None)
Returns:
The minimum datetime, or None if both are None
"""
if dt1 is None:
return dt2
if dt2 is None:
return dt1
return min(dt1, dt2)
def write_json_to_file(data: list[dict], filename: str):
"""Write data to a JSON file."""
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "w", encoding="utf-8") as file:
json.dump(data, file, indent=4, ensure_ascii=False, default=custom_serializer)
print(f"JSON file saved: {filename}")
def write_grouped_index_files(to_upload_dicts: list[dict[str,Any]],index_name:str, base_directory: str = ""):
"""
Write to the corresponding json file in the .index directory, grouped by the filepath field in to_upload_dicts
Args:
        to_upload_dicts: List of dictionaries to upload
        index_name: Index name used in the output file names
        base_directory: Base directory path
"""
if not to_upload_dicts:
print("No data to write.")
return
# Group by filepath field
grouped_data = {}
for item in to_upload_dicts:
filepath = item.get("filepath", "unknown")
if filepath not in grouped_data:
grouped_data[filepath] = []
grouped_data[filepath].append(item)
# Create .index directory
index_dir = os.path.join(base_directory, ".index")
os.makedirs(index_dir, exist_ok=True)
# Create corresponding json files for each filepath
for filepath, items in grouped_data.items():
# Convert filepath to a safe filename
safe_filename = filepath.replace("/", "_").replace("\\", "_").replace(":", "_")
if safe_filename.endswith(".pdf"):
safe_filename = safe_filename[:-4] # Remove .pdf extension
json_filename = f"{safe_filename}.{index_name}.json"
json_filepath = os.path.join(index_dir, json_filename)
# Write JSON file
with open(json_filepath, "w", encoding="utf-8") as file:
json.dump(items, file, indent=4, ensure_ascii=False, default=custom_serializer)
print(f"Grouped index file saved: {json_filepath} (contains {len(items)} items)")
print(f"Total {len(grouped_data)} files written to .index directory")
def replace_urls_in_content(content:str, replacements: List[FigureFlat])->str:
"""
Insert URLs from the replacement list into the specified positions in the content
:param content: Original text content
:param replacements: Replacement list, each element contains:
- 'url': Image URL
- 'offset': Offset in the original content
- 'length': Length of the text to be replaced
:return: New content with replacements
"""
if not replacements:
return content
# Sort by offset in descending order (process in reverse order)
sorted_replacements = sorted(replacements, key=lambda x: x.offset, reverse=True)
# List to store text fragments
fragments = []
current_index = len(content) # Current position (start from the end)
for item in sorted_replacements:
url = f"![{item.content}]({item.url})"
offset = item.offset
length = item.length
# Check offset validity
if offset >= current_index:
continue # Skip invalid offset
# Calculate actual end position for replacement
end_pos = min(offset + length, current_index)
# 1. Add text between current position and end of replacement
fragments.append(content[end_pos:current_index])
# 2. Add URL (replace original content)
fragments.append(url)
# Update current position to start of replacement
current_index = offset
# Add remaining head content
fragments.append(content[:current_index])
# Concatenate fragments in reverse order (since processed backwards)
return ''.join(fragments[::-1])
def resize_image(input_path: str, output_path: Optional[str] = None, max_size: int = 10000) -> str:
    """Proportionally scale a PNG image so that neither dimension exceeds max_size pixels."""
with Image.open(input_path) as img:
# Calculate the scaling ratio
ratio = min(max_size / max(img.size), 1.0)
if ratio >= 1: # No scaling required
return input_path
# Calculate new dimensions (maintain aspect ratio)
new_size = tuple(round(dim * ratio) for dim in img.size)
# Using high-quality scaling algorithm
resized_img = img.resize(new_size, Resampling.LANCZOS)
# Process the output path
if not output_path:
filename, ext = os.path.splitext(input_path)
output_path = f"{filename}_resized{ext}"
# Save the zoomed image (preserve PNG features)
resized_img.save(output_path, format="PNG", optimize=True)
print(f"Images have been scaled:{img.size}{new_size} | Save to: {output_path}")
return output_path
def file_rename(input_path:str)->str:
filename, ext = os.path.splitext(input_path)
if ext.lower() == ".doc":
new_path = f"{filename}.docx"
shutil.copy2(input_path, new_path)
print("file renamed to ", new_path)
return new_path
return input_path
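# -----------------------------------------------------------------------------
# Usage sketch (illustrative only): deduplicate records by name with
# keep_latest() and fold their timestamps with the None-safe helpers.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    records = [
        {"name": "spec-a.pdf", "updated_at": datetime(2024, 1, 1)},
        {"name": "spec-a.pdf", "updated_at": datetime(2024, 3, 1)},
        {"name": "spec-b.pdf", "updated_at": datetime(2024, 2, 1)},
    ]
    latest = keep_latest(records, id_key="name", timestamp_key="updated_at")
    print(latest)  # spec-a.pdf keeps its 2024-03-01 entry

    upper, lower = None, None
    for record in latest:
        upper = max_datetime_safe(record["updated_at"], upper)
        lower = min_datetime_safe(record["updated_at"], lower)
    print(f"range: {lower} -> {upper}")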

View File

@@ -0,0 +1,483 @@
import json
import os
import time
from typing import Any, List
import base64
from app_config import ApplicationConfig
from azure_index_service import get_cloud_api_client
from pdf2image import convert_from_path # type: ignore
import numpy as np
from PIL import Image
from langchain_openai import ChatOpenAI ,AzureChatOpenAI
from langchain.schema.messages import SystemMessage
from langchain_core.messages import AIMessage,HumanMessage,ToolMessage
from di_extractor import FigureFlat
from entity_models import DiResult, Document, UnsupportedFormatError
from resilient_http_pool import get_ai_inference_client
RETRY_COUNT = 3
def vision_extract(pdf_file_path:str, file_format:str, directory_path:str, vllm_endpoint:str, vllm_key:str) -> List[Document]:
if file_format not in ["pdf"]:
raise UnsupportedFormatError(f"Unsupported file format: {file_format}")
source_rel_file_path = os.path.relpath(pdf_file_path, directory_path)
image_dir = directory_path + "/.images/" + source_rel_file_path
print(f"Converting to images: {pdf_file_path}")
pdf_to_images(pdf_file_path, image_dir)
print(f"Converted to images: {pdf_file_path}")
image_filenames = os.listdir(image_dir)
image_filenames.sort()
rsltDocs: List[Document] = []
page_index = 0
for image_filename in image_filenames:
if image_filename.endswith(".webp"):
print(f"extracting: {image_dir}/{image_filename}")
image_path = os.path.join(image_dir, image_filename)
rsltDoc = None
if page_index == 0:
rsltDoc = extract_from_image(image_path, vllm_endpoint, vllm_key, directory_path, source_rel_file_path, page_index)
else:
rsltDoc = extract_from_image(image_path, vllm_endpoint, vllm_key, directory_path, source_rel_file_path, page_index, rsltDocs[page_index-1])
rsltDocs.append(rsltDoc)
page_index = page_index+1
return rsltDocs
def pdf_to_images(pdf_path, output_folder, dpi=250):
untrimed_folder = output_folder+"/.untrimed"
os.makedirs(untrimed_folder, exist_ok=True)
# Convert PDF to images
convert_from_path(pdf_path, dpi=dpi, output_folder=untrimed_folder,fmt="png", paths_only=True)
image_filenames = os.listdir(untrimed_folder)
image_filenames.sort()
# # clear the output folder
# for file in os.listdir(output_folder):
# os.remove(os.path.join(output_folder, file))
# Save images to the output folder
for i, image_filename in enumerate(image_filenames):
# generate index num with fixed width of 6 digits
# load image
image = Image.open(f"{untrimed_folder}/{image_filename}")
trimmed_image = trim_image(image)
index = str(i + 1).zfill(6)
image_path = f"{output_folder}/{index}.webp"
trimmed_image.save(image_path, format="WEBP")
os.remove(f"{untrimed_folder}/{image_filename}")
def trim_image(input_image: Image.Image) -> Image.Image:
"""
Trim the margins of a scanned document image, ignoring noise and small specks.
Args:
input_image (Image.Image): The input PIL Image object.
Returns:
Image.Image: The cropped PIL Image object.
"""
# Convert the image to grayscale
grayscale_image = input_image.convert("L")
# Convert grayscale to numpy array
image_array = np.array(grayscale_image)
# Apply a threshold to create a binary image
threshold = 240 # Adjust this value if needed
binary_image = (image_array < threshold).astype(np.uint8)
# Find the bounding box of the non-zero regions
rows = np.any(binary_image, axis=1)
cols = np.any(binary_image, axis=0)
if not rows.any() or not cols.any():
# If the image is completely empty or noise-free, return the original
return input_image
ymin, ymax = np.where(rows)[0][[0, -1]]
xmin, xmax = np.where(cols)[0][[0, -1]]
# Add a small margin (optional, remove if not needed)
margin = 10
ymin = max(0, ymin - margin)
ymax = min(binary_image.shape[0], ymax + margin)
xmin = max(0, xmin - margin)
xmax = min(binary_image.shape[1], xmax + margin)
# Crop the image using the calculated bounding box
cropped_image = input_image.crop((xmin, ymin, xmax + 1, ymax + 1))
return cropped_image
tips = "- The document is about standard/regulatory for a automobile industry company to refer. So prioritize extracting content about standards/regulatory/compliance carefully"
# Define the messages for the chat
SYS_MSG_Flow_Layout = f"""# Role
You are specialized in extracting content from screenshots of document.
# Rules
- You will receive a page screenshot from a multi-pages document. Extract content into a structured markdown format.
- Identify if the page is Table of Contents(目录, 目次) or empty page(after ignoring watermarks)
- If yes, just ignore the whole page, and output "[]" only
- If no, you should follow below rules to extract content
- Recognize hierarchical section header, and use appropriate markdown symbols "#" to reflect its hierarchy level.
- Detection:
    - Identify lines of section headers that begin with a hierarchical section numbering part, optionally followed by a text part. The section numbering part contains only numbers, alphabets, and dots, and is a tiered (multi-level) numbering system. For example: "2.3.17 示例标题", "1 Sample Title", "6.1.2.5", "A.14.8.9 示例标题".
- Each section header is just one line, and the section number is at the beginning of the line.
- Header Hierarchy Level Mapping:
    - The section numbering part is a tiered (multi-level) numbering system. Section numbers at each hierarchy level are separated by dots (.), so the count of dot-separated numbers reflects the section header's hierarchy level. For example, the header "4.13.2 Sample" should be considered as an H3 level.
- Use appropriate markdown symbols "#" to reflect section headers's hierarchy levels. **The number of "#" symbols should correspond to the depth of the section level.** For instance:
- "1 section Title" should be output as "# 1 section Title"
- "2.3.17 section Title" should be output as "### 2.3.17 section Title"
- "A.14.8.9 section Title" should be output as "#### A.14.8.9 section Title"
- **Table title or picture title should NOT be considered as a section header, even if it is at beginning of the page. Output them as format "[table/picture titles]", for example: "[表 1.2 示例]", "[图5 示例]")**
- IMPORTANT: The screenshot is taken from one page of a multi-page document, note that it represents only a single page, not the entire document.**The beginning area of the page may not fall under a section header. Nevertheless, ensure that you still extract content from this area, even if it is not explicitly labeled under a section header.**
- Embedded Pictures/Graphs/Diagram:
    - If the embedded picture/graph/diagram is major content and can be understood clearly, describe it as a caption, using the format: `![<caption>](picture)`
- Otherwise, just use a placeholder: `![](picture)`
# Tips
- Carefully recognize scientific symbols and formulas, and output them professionally and accurately.
- If a table is not a blank template, you should extract using markdown table markup
- Accurately recognize the content according to the screenshot, and do not speculate any content.
- Ignore any diagonally arranged watermarks present in the document.
- The page footer and header can be ignored.
{tips}
"""
SYS_MSG_Slides_Layout = f"""# Role
You are specialized in extracting content from screenshots of a slides deck like PPT.
# Rules
- You will receive a page screenshot from a multi-pages deck. Extract content into a structured markdown format.
- Recognize title headers from the page and use appropriate markdown symbols "#" to reflect their hierarchy levels. Every page should have one H1 title header.
- Embedded Pictures/Graphs/Diagram: If there are embedded pictures/figures, try your best to understand them, and describe them in caption paragraphs.
# Tips
- Carefully recognize scientific symbols and formulas, and output them professionally and accurately.
- If a table is not a blank template, you should extract using markdown table markup
- Accurately recognize the content according to the screenshot, and do not speculate any content.
- Ignore any diagonally arranged watermarks present in the document. Identify if the page is empty after ignoring watermarks. If yes, just ignore this page, and output "[]" only
{tips}
"""
SYS_MSG_Cover = f"""# Role
You are specialized in extracting content from screenshots of document.
# Rules
- You will receive the cover page from a multi-pages document. Extract content into a structured JSON format.
- Recognize what type of Document Schema it is, there are the two below types of document layout schema:
- flow: Like a page of Office Words document, mainly in flow document layout.
- slides: Like a page of Office PowerPoint document, mainly in a presenting slide layout.
    - other: Does not look like either of the above document layout schema types
- The cover page may contain the following information: main_title, sub_title, publisher, publised_date, document_code, document_category.
- Detect the primary and secondary language of the document. Use language code as their values. The default primary language is `zh-Hans`. If there are titles in secondary language, they should also be included as well.
- Whole page should be extracted as markdown string and stored in the `whole_page` field.
- The output JSON schema:
- document_schema
- main_title
- sub_title
- publisher
- publised_date
- document_code
- document_category
- main_title_sec_language
- sub_title_sec_language
- primary_language
- secondary_language
- whole_page
# Tips
- Accurately recognize the text content according to the screenshot, and do not speculate any content.
- Ignore any diagonally arranged watermarks present in the document.
- Don't use horizontal dividers ("---") or similar markdown syntax to separate the content.
{tips}
"""
USER_MSG = """# task
Recognize screenshot of this document cover page, return the result
"""
def extract_from_image(image_path, vllm_endpoint, vllm_key, directory_path, source_rel_file_path, page_index, pre_document: Document | None = None) -> Document:
encoded_image = base64.b64encode(open(image_path, 'rb').read()).decode('ascii')
file_ext = image_path.split(".")[-1]
system_msg = ""
if page_index==0:
system_msg = SYS_MSG_Cover
else:
if pre_document.document_schema == "flow":
system_msg = SYS_MSG_Flow_Layout
elif pre_document.document_schema == "slides":
system_msg = SYS_MSG_Slides_Layout
else:
raise ValueError(f"schema = {pre_document.document_schema}, not supported")
headers = {
"Content-Type": "application/json",
"api-key": vllm_key,
}
payload = {
"messages": [
{
"role": "system",
"content": [
{
"type": "text",
"text": system_msg
}
]
},
{
"role": "user",
"content": [
{
"type": "text",
"text": USER_MSG
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/{file_ext};base64,{encoded_image}"
}
}
]
}
],
"temperature": 0
}
response = None
for i in range(RETRY_COUNT):
try:
client = get_ai_inference_client()
response = client.post(vllm_endpoint, headers=headers, json=payload, timeout=180)
response.raise_for_status() # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
break
except Exception as e:
print(f"Error extract_from_image {image_path} with error={e}, retrying, current at {i + 1} retry, {RETRY_COUNT - (i + 1)} retries left")
time.sleep(15)
rslt = None
if response and response.status_code != 200:
if response.status_code == 400:
try:
rsltObj = response.json()
if rsltObj["error"]["inner_error"]["code"] == "ResponsibleAIPolicyViolation":
rslt = "[]"
print(f"Ignored: {image_path}. Error extract_from_image with status_code={response.status_code}\n {response.text}")
            except Exception:
raise Exception(f"Error extract_from_image {image_path} with status_code={response.status_code}\n {response.text}")
else:
raise Exception(f"Error extract_from_image {image_path} with status_code={response.status_code}\n {response.text}")
if rslt is None and response:
rslt = response.json()["choices"][0]["message"]["content"]
# img_tag = image_content_to_tag(caption)
# mapping = {img_tag: f"data:image/{file_ext};base64,{encoded_image}"}
# if rslt starts with ```markdown
if rslt.startswith("```"):
# remove the first line and the last line
rslt = rslt.split("\n")[1:-1]
rslt = "\n".join(rslt)
## add a page number at the first line of the result text
# rslt = f"[Page {image_filename.replace('page_', '').replace('.png', '')}]\n\n{rslt}\n\n\n\n"
page_index_output = str(page_index + 1).zfill(6)
output_folder = directory_path + "/.extracted/" + source_rel_file_path
os.makedirs(f"{output_folder}", exist_ok=True)
document = None
if page_index==0:
with open(f"{output_folder}/{page_index_output}.json", "w") as file:
file.write(rslt)
rsltObj = json.loads(rslt)
document_schema = rsltObj.get("document_schema", "flow").lower()
if document_schema == "other":
document_schema = "flow"
document = Document(
document_schema = document_schema,
main_title = rsltObj.get("main_title", "") or "",
sub_title = rsltObj.get("sub_title", "") or "",
publisher = rsltObj.get("publisher", "") or "",
document_code = rsltObj.get("document_code", "") or "",
document_category = rsltObj.get("document_category", "") or "",
main_title_sec_language = rsltObj.get("main_title_sec_language", "") or "",
sub_title_sec_language = rsltObj.get("sub_title_sec_language", "") or "",
primary_language= rsltObj.get("primary_language", ""),
secondary_language= rsltObj.get("secondary_language", ""),
)
if document.sub_title != "":
document.title = f"{document.main_title}-{document.sub_title}"
else:
document.title = document.main_title
document.doc_metadata = f"{document.main_title}, {document.sub_title}, {document.document_code}, {document.main_title_sec_language}, {document.sub_title_sec_language}"
document.filepath = source_rel_file_path
document.content = rsltObj.get("whole_page", "")
else:
with open(f"{output_folder}/{page_index_output}.md", "w") as file:
file.write(rslt)
document = Document(
document_schema = pre_document.document_schema,
main_title = pre_document.main_title,
sub_title = pre_document.sub_title,
publisher = pre_document.publisher,
document_code = pre_document.document_code,
document_category = pre_document.document_category,
main_title_sec_language = pre_document.main_title_sec_language,
sub_title_sec_language = pre_document.sub_title_sec_language,
primary_language= pre_document.primary_language,
secondary_language= pre_document.secondary_language,
title = pre_document.title,
doc_metadata = pre_document.doc_metadata,
filepath = pre_document.filepath,
)
document.content = rslt
return document
def understand_with_langchain(image: bytes, mime_type: str, captioning_model_endpoint: str, captioning_model_key: str, model: str | None, azure_deployment: str | None = None, api_version: str | None = None, language: str | None = None, prompts: dict[str, Any] | None = None):
"""
Use LangChain to automatically adapt to various model platforms for image understanding
Supports OpenAI, Azure OpenAI, Tongyi Qianwen, Bailian and other platforms
"""
    # Select the prompt based on the requested language
lang_key = "zh-Hans" if language == "zh-Hans" else "en"
if prompts is None or len(prompts) == 0:
prompts = {
"zh-Hans": { "system": "您是一个帮助用户寻找描述性字幕的字幕模型。", "user": "描述此图像就像您将其描述给看不见的人一样。" },
"en": { "system": "You are a captioning model that helps uses find descriptive captions.", "user": "Describe this image as if you were describing it to someone who can't see it." }
}
if lang_key in prompts.keys():
prompt = prompts[lang_key]
elif "en" in prompts.keys() :
prompt = prompts["en"]
else:
        prompt = prompts[next(iter(prompts))]
    # Encode the image as a base64 data URL
encoded_image = base64.b64encode(image).decode('utf-8')
image_url = f"data:image/{mime_type};base64,{encoded_image}"
http_client = get_cloud_api_client()
    # Determine the platform from the endpoint and initialize the corresponding LangChain client
llm:Any=None
for i in range(RETRY_COUNT):
try:
if "openai.azure" in captioning_model_endpoint:
llm = AzureChatOpenAI(azure_deployment=azure_deployment,api_key=captioning_model_key, azure_endpoint=captioning_model_endpoint,api_version=api_version, temperature=0, http_client=http_client)
else:
llm = ChatOpenAI(base_url=captioning_model_endpoint, api_key=captioning_model_key, model=model, temperature=0, http_client=http_client)
# Build the message
messages = [
SystemMessage(content=prompt["system"]),
HumanMessage(content=[{"type": "text", "text": prompt["user"]}, {"type": "image_url", "image_url": {"url": image_url}} ])
]
            # Invoke the model
response = llm.invoke(messages)
caption = response.content
return caption
except Exception as e:
print(f"Error getting caption with langchain (attempt {i+1}/{RETRY_COUNT}): {e}")
if i < RETRY_COUNT - 1:
time.sleep(5)
else:
# The last attempt failed
raise Exception(f"Failed to get caption after {RETRY_COUNT} attempts: {e}")
return ""
def process_document_figures(di_result:DiResult|None=None,config:ApplicationConfig|None=None) -> DiResult:
"""
Perform figure fusion on the extracted document content.
"""
# Implement figure fusion logic here
if di_result is None:
raise Exception("di_result cannot be None")
if config is None:
raise ValueError("config is None")
description_gen_max_images: int = config.caption.description_gen_max_images
vllm_endpoint:str = config.caption.model_endpoint
vllm_key:str = config.caption.model_key
captioning_model:str = config.caption.model
api_version:str = config.caption.api_version
azure_deployment:str = config.caption.azure_deployment
include_di_content: bool = config.caption.include_di_content
figures = di_result.figures or []
processed_figures:List[FigureFlat] = []
content:str = di_result.di_content
len_figures:int = len(figures)
for figure in figures:
figure_content:str= content[figure.offset:figure.offset + figure.length]
if not figure_content.lstrip().startswith("<figure>"):
continue
image_bytes = base64.b64decode(figure.image)
language = di_result.language
# Image content generation
vision_content:str = ""
if figure.understand_flag:
vision_content = figure.content
elif include_di_content:
if len_figures < description_gen_max_images:
vision_content = understand_with_langchain(image=image_bytes, mime_type="png", captioning_model_endpoint=vllm_endpoint, captioning_model_key=vllm_key, model=captioning_model,azure_deployment=azure_deployment,api_version=api_version, language=language, prompts=config.caption.prompts)
figure.understand_flag = True
else:
                vision_content = content[figure.offset:figure.offset + figure.length].strip().removeprefix("<figure>").removesuffix("</figure>").strip()
vision_content = ' '.join(line.strip() for line in vision_content.splitlines())
vision_content = f"<figcaption>{figure.caption}</figcaption>" + vision_content
if not include_di_content and figure.caption and len(figure.caption)>0:
vision_content = f"<figcaption>{figure.caption}</figcaption>"
figure.content = vision_content
processed_figures.append(figure)
return di_result
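# -----------------------------------------------------------------------------
# Usage sketch (illustrative only): exercise trim_image() on a synthetic
# "scanned page" so the cropping behaviour can be inspected without a PDF or
# any model endpoint.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    from PIL import ImageDraw

    page = Image.new("L", (800, 1000), color=255)                 # blank white page
    ImageDraw.Draw(page).rectangle((200, 300, 600, 700), fill=0)  # simulated content block
    trimmed = trim_image(page)
    print(f"original: {page.size}, trimmed: {trimmed.size}")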