init
15
vw-document-ai-indexer/.flake8
Normal file
@@ -0,0 +1,15 @@
[flake8]
ignore = W293
exclude =
    .git,
    __pycache__,
    .venv,
    venv,
    tests,
    docs,
    build,
    dist,
    *.egg-info,
    .tox,
    .mypy_cache,
    .pytest_cache
209
vw-document-ai-indexer/.gitignore
vendored
Normal file
@@ -0,0 +1,209 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.env.production
.env.development
config.json
config.prd.json
config.dev.json
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.conda/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
.idea/'

.DS_Store
web/.vscode/settings.json

# Intellij IDEA Files
.idea/*
!.idea/vcs.xml
!.idea/icon.png
.ideaDataSources/
*.iml
api/.idea

api/.env
api/storage/*

docker-legacy/volumes/app/storage/*
docker-legacy/volumes/db/data/*
docker-legacy/volumes/redis/data/*
docker-legacy/volumes/weaviate/*
docker-legacy/volumes/qdrant/*
docker-legacy/volumes/etcd/*
docker-legacy/volumes/minio/*
docker-legacy/volumes/milvus/*
docker-legacy/volumes/chroma/*
docker-legacy/volumes/opensearch/data/*
docker-legacy/volumes/pgvectors/data/*
docker-legacy/volumes/pgvector/data/*

docker/volumes/app/storage/*
docker/volumes/certbot/*
docker/volumes/db/data/*
docker/volumes/redis/data/*
docker/volumes/weaviate/*
docker/volumes/qdrant/*
docker/volumes/etcd/*
docker/volumes/minio/*
docker/volumes/milvus/*
docker/volumes/chroma/*
docker/volumes/opensearch/data/*
docker/volumes/myscale/data/*
docker/volumes/myscale/log/*
docker/volumes/unstructured/*
docker/volumes/pgvector/data/*
docker/volumes/pgvecto_rs/data/*
docker/volumes/couchbase/*
docker/volumes/oceanbase/*
!docker/volumes/oceanbase/init.d

docker/nginx/conf.d/default.conf
docker/nginx/ssl/*
!docker/nginx/ssl/.gitkeep
docker/middleware.env

sdks/python-client/build
sdks/python-client/dist
sdks/python-client/dify_client.egg-info

pyrightconfig.json
api/.vscode

.idea/

#.tmp
.tmp/
.vscode/

tests/
.playground/
.vscode/
.vs/
/version1/
/doc/
/.vibe
10
vw-document-ai-indexer/.pylintrc
Normal file
@@ -0,0 +1,10 @@
[MASTER]
ignore=tests,venv
disable=
    C0114, # missing-module-docstring
    C0115, # missing-class-docstring
    C0116, # missing-function-docstring
    C0303, # trailing-whitespace
    W1203, # logging-fstring-interpolation
    W0718, # broad-exception-caught
    W0719  # broad-exception-raised
391
vw-document-ai-indexer/Deployment.md
Normal file
@@ -0,0 +1,391 @@
# Document Extractor - Deployment Guide

This document provides a complete deployment guide for Document Extractor, covering local development, Docker containerized deployment, and Kubernetes production deployment.

## 📋 Pre-deployment Preparation

### System Requirements
- Python 3.12+
- Docker (optional, for containerized deployment)
- Kubernetes (for production deployment)
- Azure subscription and related services

### Azure Service Preparation
Ensure that you have configured the following Azure services (a quick connectivity check is sketched after this list):
- Azure Document Intelligence
- Azure AI Search
- Azure Blob Storage
- Azure OpenAI (for vector embeddings)
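
The Azure AI Search credentials can be smoke-tested before any deployment work; the other services are exercised on the first pipeline run. A minimal sketch, assuming the placeholder endpoint and key used in the `env.yaml` example below:

```python
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient

# Placeholder values; use the same endpoint and key you will put into env.yaml.
client = SearchIndexClient("https://your-search-service.search.windows.net",
                           AzureKeyCredential("your-search-admin-key"))
print("Search service reachable. Existing indexes:", list(client.list_index_names()))
```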

## 🔧 Configuration File Preparation

### 1. Environment Configuration (env.yaml)
```yaml
# Configuration file reference
config: config.yaml

# Processing settings
njobs: 8 # Number of parallel processing jobs

# Azure AI Search configuration
search_service_name: "https://your-search-service.search.windows.net"
search_admin_key: "your-search-admin-key"

# Azure OpenAI Embedding service
embedding_model_endpoint: "https://your-openai.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview"
embedding_model_key: "your-openai-key"
VECTOR_DIMENSION: 1536
FLAG_AOAI: "V3" # Azure OpenAI version
FLAG_EMBEDDING_MODEL: "AOAI" # Embedding model type: "AOAI" or "qwen3-embedding-8b"

# Document Intelligence configuration
extract_method: "di+vision-llm" # Extraction method: "di+vision-llm", "vision-llm", "di"
form_rec_resource: "https://your-di-service.cognitiveservices.azure.com/"
form_rec_key: "your-di-key"

# Document Intelligence features
di-hiRes: true # High-resolution OCR
di-Formulas: true # Mathematical expression detection
di_allow_features_ext: "pdf;jpeg;jpg;png;bmp;tiff;heif" # Supported file extensions

# Vision and captioning models
captioning_model_endpoint: "https://your-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview"
captioning_model_key: "your-openai-key"
vision_max_images: 200 # Maximum images to process per document (0 = no limit)
vision_image_method: "openai" # Image processing method: "openai" or "newapi"
FIGURE_CONTENT_CLEAR: true # Clear DI-recognized image content

# Blob storage for figures and DI results
FIGURE_BLOB_ACCOUNT_URL: "https://your-storage.blob.core.windows.net/container?sas-token"
DI_BLOB_ACCOUNT_URL: "https://your-storage.blob.core.windows.net/container?sas-token"

# Database configuration
DB_URI: "postgresql://user:password@host:port/database_name"

# Processing flags
header_fix: false # Enable/disable header fixing
```
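
Before the first run it can be useful to sanity-check this file. A minimal sketch that loads `env.yaml` and reports missing values (illustrative only; the key list mirrors the example above and is not exhaustive):

```python
import yaml

REQUIRED_KEYS = [
    "search_service_name", "search_admin_key",
    "form_rec_resource", "form_rec_key",
    "embedding_model_endpoint", "embedding_model_key",
    "DB_URI",
]

with open("env.yaml", "r", encoding="utf-8") as f:
    env = yaml.safe_load(f) or {}

missing = [key for key in REQUIRED_KEYS if not env.get(key)]
if missing:
    raise SystemExit(f"env.yaml is missing values for: {', '.join(missing)}")
print("env.yaml looks complete.")
```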

### 2. Business Configuration (config.yaml)

```yaml
# Main data configuration (array format)
- data_path: "https://your-blob-storage.blob.core.windows.net/container?sas-token"
  datasource_name: "CATOnline-cn" # Data source name
  data_dir: "" # Optional local data directory
  base_path: "/app/run_tmp" # Temporary processing directory

  # File processing limits
  process_file_num: 0 # 0 = process all files
  process_file_last_modify: "2025-06-24 00:00:00" # Only process files modified after this date

  # Chunking configuration
  chunk_size: 2048 # Maximum tokens per chunk
  token_overlap: 128 # Overlap between chunks

  # Index schemas configuration
  index_schemas:
    # Chunk-level index for search
    - index_name: "your-knowledge-chunk-index"
      data_type: ["metadata", "document", "chunk"]
      field_type: "append" # How to handle existing data
      upload_batch_size: 50 # Documents per batch upload

      # Metadata fields to include
      fields: [
        "filepath", "timestamp", "title", "publisher", "publish_date",
        "document_category", "document_code", "language_code",
        "x_Standard_Regulation_Id", "x_Attachment_Type",
        "x_Standard_Title_CN", "x_Standard_Title_EN",
        "x_Standard_Published_State", "x_Standard_Drafting_Status",
        "x_Standard_Range", "x_Standard_Kind", "x_Standard_No",
        "x_Standard_Code", "x_Standard_Technical_Committee",
        "x_Standard_Vehicle_Type", "x_Standard_Power_Type",
        "x_Standard_CCS", "x_Standard_ICS",
        "x_Standard_Published_Date", "x_Standard_Effective_Date",
        "x_Regulation_Status", "x_Regulation_Title_CN",
        "x_Regulation_Title_EN", "x_Regulation_Document_No",
        "x_Regulation_Issued_Date", "x_Classification",
        "x_Work_Group", "x_Reference_Standard",
        "x_Replaced_by", "x_Refer_To", "func_uuid",
        "update_time", "status"
      ]

      # Vector configuration
      vector_fields:
        - field: "contentVector"
          append_fields: ["content"] # Fields to vectorize for content
        - field: "full_metadata_vector"
          append_fields: ["full_headers", "doc_metadata"] # Metadata vectorization

      # Azure AI Search configuration
      semantic_config_name: "default"
      vector_config_name: "vectorSearchProfile"
      update_by_field: "filepath" # Field to use for updates
      full_metadata_vector_fields: ["full_headers", "doc_metadata"]

    # Document-level index
    - index_name: "your-knowledge-document-index"
      data_type: ["document", "metadata"]
      field_type: "full" # Replace entire documents
      key_fields: ["filepath"] # Primary key fields
      upload_batch_size: 1

      fields: [
        # Same field list as the chunk index
        "filepath", "timestamp", "title", "publisher"
        # ... (same as above)
      ]

      merge_content_fields: ["content"] # Fields to merge from chunks
      vector_fields:
        - field: "full_metadata_vector"
          append_fields: ["doc_metadata"]

      semantic_config_name: "default"
      vector_config_name: "vectorSearchProfile"
      update_by_field: "filepath"

    # Regulation-specific index
    - index_name: "your-regulation-index"
      data_type: ["metadata"]
      field_type: "full"
      key_fields: ["x_Standard_Regulation_Id"] # Regulation ID as key
      upload_batch_size: 50

      fields: [
        # Regulation-specific fields
        "x_Standard_Regulation_Id", "x_Standard_Title_CN",
        "x_Standard_Title_EN", "x_Regulation_Status"
        # ... (regulation metadata fields)
      ]

      vector_fields:
        - field: "full_metadata_vector"
          append_fields: ["doc_metadata"]

      update_by_field: "x_Standard_Regulation_Id"

  # Field merging configuration
  merge_fields:
    - key: "doc_metadata" # Combined metadata field
      fields: [
        "title", "publisher", "document_category", "document_code",
        "x_Standard_Title_CN", "x_Standard_Title_EN",
        "x_Standard_Published_State", "x_Standard_Drafting_Status"
        # ... (all metadata fields to combine)
      ]

  # Vector field configuration
  full_metadata_vector_fields: ["full_headers", "doc_metadata"]
```
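
With `chunk_size: 2048` and `token_overlap: 128`, each chunk after the first covers 1,920 new tokens, which gives a quick way to estimate index size. A rough sketch (illustrative only; the actual chunker may split on document structure and produce slightly different counts):

```python
import math

def estimate_chunks(total_tokens: int, chunk_size: int = 2048, token_overlap: int = 128) -> int:
    """Rough number of chunks produced for a document of total_tokens tokens."""
    if total_tokens <= 0:
        return 0
    if total_tokens <= chunk_size:
        return 1
    step = chunk_size - token_overlap
    return 1 + math.ceil((total_tokens - chunk_size) / step)

print(estimate_chunks(100_000))  # ~53 chunks for a 100k-token document
```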

## 🚀 Deployment Methods

### Method 1: Local Development Deployment

#### 1. Environment Preparation
```bash
# Clone the repository
git clone <repository-url>
cd document-extractor

# Create a virtual environment
python -m venv .venv

# Activate the virtual environment
# Linux/Mac:
source .venv/bin/activate
# Windows:
.venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt
```

#### 2. Configuration File Setup
```bash
# Copy the configuration templates
cp config.yaml.example config.yaml
cp env.yaml.example env.yaml

# Edit config.yaml and env.yaml with your actual configuration
```

#### 3. Run the application
```bash
# Run directly
python main.py --config config.yaml --env env.yaml
```

### Method 2: Kubernetes Production Deployment

#### 1. Build the image
```bash
docker build . -t document-ai-indexer:latest

docker tag document-ai-indexer:latest acrsales2caiprd.azurecr.cn/document-ai-indexer:latest

docker login acrsales2caiprd.azurecr.cn -u username -p password

docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:latest
```

#### 2. Prepare Configuration Files
```bash
# Create the namespace (if it does not exist)
kubectl create namespace knowledge-agent

# Create the ConfigMap
kubectl create configmap document-ai-indexer-config \
    --from-file=config.yaml \
    --from-file=env.yaml \
    -n knowledge-agent
```

#### 3. One-time Task Deployment
```bash
# Deploy the Pod
kubectl apply -f deploy/document-ai-indexer_k8s.yml -n knowledge-agent

# Check status
kubectl get pods -n knowledge-agent
kubectl logs -f document-ai-indexer -n knowledge-agent
```

#### 4. CronJob Deployment
```bash
# Deploy the CronJob
kubectl apply -f deploy/document-ai-indexer-cronjob.yml -n knowledge-agent

# Check CronJob status
kubectl get cronjobs -n knowledge-agent

# Check job history
kubectl get jobs -n knowledge-agent

# Trigger execution manually
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
```

## 📊 Deployment Architecture Diagram

```mermaid
graph TB
    subgraph "Azure Cloud Services"
        ABS[Azure Blob Storage]
        ADI[Azure Document Intelligence]
        AAS[Azure AI Search]
        AOI[Azure OpenAI]
    end

    subgraph "Kubernetes Cluster"
        subgraph "Namespace: knowledge-agent"
            CM[ConfigMap<br/>Configuration files]
            CJ[CronJob<br/>Scheduled tasks]
            POD[Pod<br/>Processing container]
        end
    end

    subgraph "Container Registry"
        ACR[Azure Container Registry<br/>acrsales2caiprd.azurecr.cn]
    end

    CM --> POD
    CJ --> POD
    ACR --> POD

    POD --> ABS
    POD --> ADI
    POD --> AAS
    POD --> AOI

    style POD fill:#e1f5fe
    style CM fill:#e8f5e8
    style CJ fill:#fff3e0
```

## 📈 Monitoring and Logging

### View logs
```bash
# Kubernetes environment
kubectl logs -f document-ai-indexer -n knowledge-agent

# Filter error logs
kubectl logs document-ai-indexer -n knowledge-agent | grep ERROR

# Check processing progress
kubectl logs document-ai-indexer -n knowledge-agent | grep "Processing"
```

#### 4. Kubernetes Deployment Issues
**Symptoms**: Pod fails to start or keeps restarting
**Solutions**:
```bash
# Check Pod status
kubectl describe pod document-ai-indexer -n knowledge-agent

# Check events
kubectl get events -n knowledge-agent

# Check the ConfigMap
kubectl get configmap document-ai-indexer-config -n knowledge-agent -o yaml
```

### Debugging Commands
```bash
# Check the configuration
kubectl exec -it document-ai-indexer -n knowledge-agent -- cat /app/config.yaml

# Enter the container for debugging
kubectl exec -it document-ai-indexer -n knowledge-agent -- /bin/bash

# Manually run processing
kubectl exec -it document-ai-indexer -n knowledge-agent -- python main.py --config config.yaml --env env.yaml
```

## 🔄 Update Deployment

### Application update
```bash
# Build the new image
docker build -t document-ai-indexer:v0.21.0 .

# Push to the registry
docker tag document-ai-indexer:v0.21.0 acrsales2caiprd.azurecr.cn/document-ai-indexer:v0.21.0
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:v0.21.0

# Update the Kubernetes deployment
kubectl set image cronjob/document-ai-indexer-cronjob \
    document-ai-indexer=acrsales2caiprd.azurecr.cn/document-ai-indexer:v0.21.0 \
    -n knowledge-agent
```

### Configuration update
```bash
# Update the ConfigMap
kubectl create configmap document-ai-indexer-config \
    --from-file=config.yaml \
    --from-file=env.yaml \
    -n knowledge-agent \
    --dry-run=client -o yaml | kubectl apply -f -

# CronJob pods pick up the updated ConfigMap on the next scheduled run.
# To apply it immediately, trigger a manual job:
kubectl create job --from=cronjob/document-ai-indexer-cronjob config-refresh -n knowledge-agent
```

---

*Last updated: August 2025*
19
vw-document-ai-indexer/Dockerfile
Normal file
@@ -0,0 +1,19 @@
FROM acraiflowlab.azurecr.io/python:3.12-bullseye

RUN echo "Asia/Shanghai" > /etc/timezone

WORKDIR /app

COPY requirements.txt /app/

RUN pip install --no-cache-dir -r requirements.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

COPY ./*.py /app/

# RUN rm -f /app/env.yaml
# RUN rm -f /app/config.yaml

ENTRYPOINT ["python", "main.py"]
260
vw-document-ai-indexer/README.md
Normal file
@@ -0,0 +1,260 @@
# Document AI Indexer

An intelligent document processing and indexing system built on Azure AI services, supporting content extraction, processing, and vectorized indexing for multiple document formats.

## Features

### 🚀 Core Features
- **Multi-format Document Support**: PDF, DOCX, image formats, etc.
- **Intelligent Content Extraction**: OCR and structured extraction using Azure Document Intelligence
- **Document Chunking**: Smart document chunking and vectorization
- **Azure AI Search Integration**: Automatically create search indexes and upload documents
- **Metadata Management**: Complete document metadata extraction and management
- **Hierarchy Structure Repair**: Automatically fix title hierarchy structure in Markdown documents

### 🔧 Technical Features
- **Asynchronous Processing**: High-performance async processing based on asyncio
- **Containerized Deployment**: Complete Docker and Kubernetes support
- **Configuration Management**: Flexible YAML configuration file management
- **Database Support**: SQLAlchemy ORM supporting multiple databases
- **Resilient Processing**: Built-in retry mechanisms and error handling
- **Monitoring & Logging**: Complete logging and progress monitoring

## System Architecture

```mermaid
graph LR
    subgraph "Data Sources"
        DS[Document Sources<br/>Blob Storage/Local]
        MD[Metadata<br/>Extraction]
    end

    subgraph "Azure AI Services"
        ADI[Azure Document<br/>Intelligence]
        AAS[Azure AI Search<br/>Index]
        EMB[Vector<br/>Embedding]
    end

    subgraph "Processing Pipeline"
        HF[Hierarchy<br/>Fix]
        CH[Content<br/>Chunking]
    end

    DS --> ADI
    MD --> HF
    ADI --> HF
    HF --> CH
    CH --> EMB
    EMB --> AAS

    style DS fill:#e1f5fe
    style ADI fill:#e8f5e8
    style AAS fill:#fff3e0
    style EMB fill:#f3e5f5
    style HF fill:#ffebee
    style CH fill:#f1f8e9
```

### Document Processing Flow

```mermaid
flowchart TD
    START([Document Input]) --> DOWNLOAD[Download Document]
    DOWNLOAD --> EXTRACT[AI Content Extraction]
    EXTRACT --> FIX[Hierarchy Structure Fix]
    FIX --> CHUNK[Content Chunking]
    CHUNK --> EMBED[Vector Embedding]
    EMBED --> INDEX[Search Index Upload]
    INDEX --> END([Processing Complete])

    style START fill:#c8e6c9
    style END fill:#c8e6c9
    style EXTRACT fill:#e1f5fe
    style FIX fill:#fff3e0
    style CHUNK fill:#f3e5f5
```

## Quick Start

### Requirements

- Python 3.12+
- Azure subscription and related services

For a detailed deployment guide, please refer to [Deployment.md](Deployment.md).

### Install Dependencies

```bash
pip install -r requirements.txt
```

### Configuration Files

The system uses two main configuration files:

- `config.yaml` - Business configuration (data source, index configuration, etc.)
- `env.yaml` - Environment variable configuration (Azure service keys, etc.)

**Quick Start Configuration:**

```yaml
# env.yaml - Essential Azure services
search_service_name: "https://your-search-service.search.windows.net"
search_admin_key: "your-search-admin-key"
form_rec_resource: "https://your-di-service.cognitiveservices.azure.com/"
form_rec_key: "your-di-key"
embedding_model_endpoint: "https://your-openai.openai.azure.com/..."
embedding_model_key: "your-openai-key"

# config.yaml - Basic data source
data_configs:
  - data_path: "https://your-blob-storage.blob.core.windows.net/container?sas-token"
    index_schemas:
      - index_name: "your-knowledge-index"
        data_type: ["metadata", "document", "chunk"]
```

📖 **Detailed configuration instructions**: see the complete configuration parameters and examples in [Deployment.md - Configuration file preparation](Deployment.md#Configuration-file-preparation).

### Run Application

```bash
# Direct execution
python main.py

# Or use predefined tasks
# (In VS Code, use Ctrl+Shift+P -> Run Task)
```

## 📚 Document Navigation

- **[Deployment Guide (Deployment.md)](Deployment.md)** - Complete deployment guide, including Docker and Kubernetes deployments
- **[Configuration instructions](Deployment.md#Configuration-file-preparation)** - Detailed configuration file description

## Project Structure

```
document-extractor/
├── main.py                       # Application entry point
├── app_config.py                 # Configuration management
├── business_layer.py             # Business logic layer
├── document_task_processor.py    # Document task processor
├── di_extractor.py               # Document Intelligence extractor
├── azure_index_service.py        # Azure Search service
├── blob_service.py               # Blob storage service
├── chunk_service.py              # Document chunking service
├── hierarchy_fix.py              # Hierarchy structure repair
├── database.py                   # Database models
├── entity_models.py              # Entity models
├── utils.py                      # Utility functions
├── config.yaml                   # Business configuration
├── env.yaml                      # Environment configuration
├── requirements.txt              # Dependencies
├── Dockerfile                    # Docker build file
├── pyproject.toml                # Project configuration
├── build-script/                 # Build scripts
│   └── document-ai-indexer.sh
├── deploy/                       # Deployment files
│   ├── document-ai-indexer.sh
│   ├── document-ai-indexer_k8s.yml
│   ├── document-ai-indexer_cronjob.yml
│   └── embedding-api-proxy_k8s.yml
└── doc/                          # Documentation
```

## Core Components

### 1. Document Processing Pipeline

- **Document Loading**: Support loading from Azure Blob Storage or the local file system
- **Content Extraction**: OCR and structured extraction using Azure Document Intelligence
- **Content Chunking**: Smart chunking algorithms maintaining semantic integrity (see the sketch after this list)
- **Vectorization**: Generate vector representations of document content
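
The chunking parameters (`chunk_size`, `token_overlap`) come from `config.yaml`. A minimal sketch of overlap-based chunking over a pre-tokenized document (illustrative only; the production logic lives in `chunk_service.py` and applies additional semantic rules):

```python
def chunk_tokens(tokens: list[str], chunk_size: int = 2048, token_overlap: int = 128) -> list[list[str]]:
    """Split a token sequence into chunks that overlap by token_overlap tokens."""
    if chunk_size <= token_overlap:
        raise ValueError("chunk_size must be larger than token_overlap")
    chunks: list[list[str]] = []
    step = chunk_size - token_overlap
    for start in range(0, len(tokens), step):
        chunks.append(tokens[start:start + chunk_size])
        if start + chunk_size >= len(tokens):
            break
    return chunks
```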

### 2. Index Management

- **Dynamic Index Creation**: Automatically create Azure AI Search indexes based on configuration
- **Batch Upload**: Efficient batch document upload (see the sketch after this list)
- **Metadata Management**: Complete document metadata indexing
- **Incremental Updates**: Support for incremental document updates
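
Uploads go to Azure AI Search in fixed-size batches (`upload_batch_size` in `config.yaml`). A minimal sketch with the `azure-search-documents` SDK (illustrative only; endpoint, key, and index name are placeholders, and the production logic in `azure_index_service.py` adds vector fields and richer error handling):

```python
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

def upload_in_batches(endpoint: str, index_name: str, admin_key: str,
                      docs: list[dict], batch_size: int = 50) -> None:
    """Upload documents to an Azure AI Search index in fixed-size batches."""
    client = SearchClient(endpoint=endpoint, index_name=index_name,
                          credential=AzureKeyCredential(admin_key))
    for i in range(0, len(docs), batch_size):
        batch = docs[i:i + batch_size]
        results = client.upload_documents(documents=batch)
        failed = [r.key for r in results if not r.succeeded]
        if failed:
            raise RuntimeError(f"Indexing failed for documents: {failed}")
```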

### 3. Data Processing

- **Hierarchy Structure Repair**: Automatically fix title hierarchy in Markdown documents
- **Metadata Extraction**: Extract structured metadata from documents and filenames
- **Format Conversion**: Unified processing support for multiple document formats

## API and Integration

### Azure Service Integration
- **Azure Document Intelligence**: Document analysis and OCR
- **Azure AI Search**: Search indexing and querying
- **Azure Blob Storage**: Document storage
- **Azure OpenAI**: Vector embedding generation

### Database Support
- PostgreSQL (recommended)
- SQLite (development and testing)
- Other SQLAlchemy-supported databases

## Monitoring and Logging

The system provides comprehensive logging capabilities:
- Processing progress monitoring
- Error logging
- Performance statistics
- Task status tracking

View logs:
```bash
# Kubernetes environment
kubectl logs -f document-ai-indexer -n knowledge-agent

# Docker environment
docker logs -f <container-id>
```

## Development

### Development Mode

```bash
# Activate the virtual environment
source .venv/bin/activate   # Linux/Mac
# or
.venv\Scripts\activate      # Windows

# Install development dependencies
pip install -e .[dev,test]

# Run code checks
mypy .
```

### Log Analysis
```bash
# View error logs
kubectl logs document-ai-indexer -n knowledge-agent | grep ERROR

# View processing progress
kubectl logs document-ai-indexer -n knowledge-agent | grep "Processing"
```

## Version Information

- **Current Version**: 0.20.4
- **Python Version**: 3.12+
- **Main Dependencies**:
  - azure-ai-documentintelligence
  - azure-search-documents
  - SQLAlchemy 2.0.41
  - openai 1.55.3

---

*Last updated: August 2025*
197
vw-document-ai-indexer/app_config.py
Normal file
@@ -0,0 +1,197 @@
"""
Refactored configuration management system
Uses dependency injection and config classes instead of global variables
"""

from dataclasses import dataclass, field
from typing import Optional, Dict, Any
import os
import yaml
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from sqlalchemy import create_engine


@dataclass
class DatabaseConfig:
    """Database configuration"""
    uri: str
    pool_size: int = 5
    max_overflow: int = 10
    pool_timeout: int = 30


@dataclass
class AzureServiceConfig:
    """Azure service configuration"""
    form_recognizer_endpoint: str
    form_recognizer_key: str
    search_service_name: str
    search_admin_key: str
    embedding_model_endpoint: Optional[str] = None
    embedding_model_key: Optional[str] = None
    captioning_model_endpoint: Optional[str] = None
    captioning_model_key: Optional[str] = None
    di_blob_account_url: Optional[str] = None
    figure_blob_account_url: Optional[str] = None


@dataclass
class CaptionServiceConfig:
    """Caption service configuration"""
    include_di_content: bool = True
    description_gen_max_images: int = 0
    model_endpoint: Optional[str] = None
    model_key: Optional[str] = None
    model: Optional[str] = None
    azure_deployment: Optional[str] = None
    api_version: Optional[str] = None
    prompts: Optional[dict[str, Any]] = None


@dataclass
class ProcessingConfig:
    """Processing configuration"""
    max_workers: int = 8
    chunk_size: int = 2048
    token_overlap: int = 128
    min_chunk_size: int = 10
    retry_count: int = 3
    retry_delay: int = 15
    tmp_directory: str = '/tmp'


@dataclass
class LoggingConfig:
    """Logging configuration"""
    level: str = "INFO"
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    file_path: Optional[str] = None
    console_output: bool = True
    console_level: str = "WARNING"  # Console only shows WARNING and above
    console_format: str = "%(message)s"  # Simplified format for console
    console_progress_only: bool = True  # Only show progress and key info in console


@dataclass
class ApplicationConfig:
    """Main application configuration"""
    database: DatabaseConfig
    azure_services: AzureServiceConfig
    processing: ProcessingConfig
    data_configs: list[Dict[str, Any]] = field(default_factory=list)
    current_tmp_directory: str = ''
    caption: Optional[CaptionServiceConfig] = None
    env_data: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_env_and_config_files(cls, config_yaml_path: str, env_yaml_path: str = "env.yaml", prompt_path: str = "prompt.yaml") -> 'ApplicationConfig':
        """Load configuration from the environment variable file and the config file."""
        # 1. Load environment variable config file first
        cls._load_env_yaml(env_yaml_path)

        # 2. Load business config file
        with open(config_yaml_path, 'r', encoding='utf-8') as f:
            config_data = yaml.safe_load(f)

        # 3. Load prompt config file (optional)
        prompt_data = None
        if os.path.exists(prompt_path):
            with open(prompt_path, 'r', encoding='utf-8') as f:
                prompt_data = yaml.safe_load(f)

        # 4. Build config object
        return cls(
            database=DatabaseConfig(
                uri=os.getenv('DB_URI', 'sqlite:///app.db'),
                pool_size=int(os.getenv('DB_POOL_SIZE', '5')),
                max_overflow=int(os.getenv('DB_MAX_OVERFLOW', '10')),
                pool_timeout=int(os.getenv('DB_POOL_TIMEOUT', '30'))
            ),
            azure_services=AzureServiceConfig(
                form_recognizer_endpoint=os.getenv('form_rec_resource', ''),
                form_recognizer_key=os.getenv('form_rec_key', ''),
                search_service_name=os.getenv('search_service_name', ''),
                search_admin_key=os.getenv('search_admin_key', ''),
                embedding_model_endpoint=os.getenv('embedding_model_endpoint'),
                embedding_model_key=os.getenv('embedding_model_key'),
                captioning_model_endpoint=os.getenv('captioning_model_endpoint'),
                captioning_model_key=os.getenv('captioning_model_key'),
                di_blob_account_url=os.getenv('DI_BLOB_ACCOUNT_URL', None),
                figure_blob_account_url=os.getenv('FIGURE_BLOB_ACCOUNT_URL', '')
            ),
            processing=ProcessingConfig(
                max_workers=int(os.getenv('njobs', '8')),
                retry_count=int(os.getenv('RETRY_COUNT', '3')),
                retry_delay=int(os.getenv('RETRY_DELAY', '15')),
                tmp_directory=os.getenv('TMP_DIRECTORY', '/tmp')
            ),
            caption=CaptionServiceConfig(
                description_gen_max_images=int(cls.env_data["figure_caption"]["description_gen_max_images"]),
                include_di_content=cls.env_data["figure_caption"]["include_di_content"],
                model_endpoint=cls.env_data["figure_caption"]["model_endpoint"],
                model_key=cls.env_data["figure_caption"]["model_key"],
                model=cls.env_data["figure_caption"]["model"],
                azure_deployment=cls.env_data["figure_caption"]["azure_deployment"],
                api_version=cls.env_data["figure_caption"]["api_version"],
                prompts=prompt_data["caption"] if prompt_data and "caption" in prompt_data else None
            ),
            data_configs=config_data if isinstance(config_data, list) else [config_data]
        )

    @classmethod
    def _load_env_yaml(cls, env_yaml_path: str):
        """Load the environment variable YAML file."""
        if not os.path.exists(env_yaml_path):
            return

        with open(env_yaml_path, 'r', encoding='utf-8') as f:
            cls.env_data = yaml.safe_load(f)

        # Export the values to the process environment
        if cls.env_data:
            for key, value in cls.env_data.items():
                if isinstance(value, bool):
                    value = str(value).lower()
                os.environ[str(key)] = str(value)

    def validate(self) -> None:
        """Validate configuration."""
        if not self.database.uri:
            raise ValueError("Database URI cannot be empty")

        if not self.azure_services.form_recognizer_endpoint:
            raise ValueError("Form Recognizer endpoint cannot be empty")

        if not self.azure_services.form_recognizer_key:
            raise ValueError("Form Recognizer key cannot be empty")

        if self.processing.max_workers < 1:
            raise ValueError("Number of worker threads must be greater than 0")


class ServiceFactory:
    """Service factory class, responsible for creating and managing service instances."""

    def __init__(self, config: ApplicationConfig):
        self.config = config
        self._form_recognizer_client = None

    def get_form_recognizer_client(self) -> DocumentAnalysisClient:
        """Get the Form Recognizer client (singleton)."""
        if self._form_recognizer_client is None:
            self._form_recognizer_client = DocumentAnalysisClient(
                endpoint=self.config.azure_services.form_recognizer_endpoint,
                credential=AzureKeyCredential(self.config.azure_services.form_recognizer_key)
            )
        return self._form_recognizer_client

    def get_database_engine(self):
        """Get the database engine."""
        return create_engine(
            self.config.database.uri,
            pool_size=self.config.database.pool_size,
            max_overflow=self.config.database.max_overflow,
            pool_timeout=self.config.database.pool_timeout
        )
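

# Usage sketch (illustrative only; the file names and call sequence below are assumptions,
# not an API contract defined by this module):
#
#   config = ApplicationConfig.from_env_and_config_files("config.yaml", "env.yaml")
#   config.validate()
#   factory = ServiceFactory(config)
#   di_client = factory.get_form_recognizer_client()
#   engine = factory.get_database_engine()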
751
vw-document-ai-indexer/azure_index_service.py
Normal file
@@ -0,0 +1,751 @@
"""
Azure AI index search service
Provides operations for Azure AI Search indexes, including creating indexes, uploading documents, and checking whether an index exists.
"""
import base64
import json
import logging
import os
import time
import uuid
from dataclasses import fields
from typing import List, Dict, Any, Optional
from tqdm import tqdm
import uuid6
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.search.documents import SearchClient, IndexDocumentsBatch
from azure.search.documents._generated.models import IndexingResult
from azure.search.documents.indexes.models import SearchIndex, SimpleField  # type: ignore
from azure.search.documents.indexes import SearchIndexClient
from resilient_http_pool import get_cloud_api_client
from entity_models import Document
from utils import asdict_with_dynamic, write_log, write_grouped_index_files
from di_extractor import retry_get_embedding


SUPPORTED_LANGUAGE_CODES = {
    "ar": "Arabic",
    "hy": "Armenian",
    "eu": "Basque",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "zh-Hans": "Chinese Simplified",
    "zh-Hant": "Chinese Traditional",
    "cs": "Czech",
    "da": "Danish",
    "nl": "Dutch",
    "en": "English",
    "fi": "Finnish",
    "fr": "French",
    "gl": "Galician",
    "de": "German",
    "el": "Greek",
    "hi": "Hindi",
    "hu": "Hungarian",
    "id": "Indonesian (Bahasa)",
    "ga": "Irish",
    "it": "Italian",
    "ja": "Japanese",
    "ko": "Korean",
    "lv": "Latvian",
    "no": "Norwegian",
    "fa": "Persian",
    "pl": "Polish",
    "pt-Br": "Portuguese (Brazil)",
    "pt-Pt": "Portuguese (Portugal)",
    "ro": "Romanian",
    "ru": "Russian",
    "es": "Spanish",
    "sv": "Swedish",
    "th": "Thai",
    "tr": "Turkish"
}


def index_init(data_config: dict[str, Any], search_admin_key: str, search_service_name: str) -> None:

    index_schemas: list[dict[str, Any]] = data_config.get("index_schemas") if data_config else None  # type: ignore

    admin_key = search_admin_key if search_admin_key else None
    service_name = search_service_name
    for schema in index_schemas:
        language = data_config.get("language", None)

        if language and language not in SUPPORTED_LANGUAGE_CODES:
            raise Exception(f"ERROR: Ingestion does not support {language} documents. "
                            f"Please use one of {SUPPORTED_LANGUAGE_CODES}. "
                            f"The language is set as a two-letter code, e.g. 'en' for English. "
                            f"If you do not want to set a language, remove this config entry or set it to None.")

        # Basic index structure initialization
        create_or_update_search_index(service_name=service_name, index_name=schema["index_name"],
                                      semantic_config_name=schema["semantic_config_name"],
                                      vector_config_name=schema["vector_config_name"],
                                      language=language, admin_key=admin_key,
                                      meta_fields=schema["fields"])


def create_or_update_search_index(service_name: str | None, index_name: str | None, semantic_config_name: str = "default", vector_config_name: str = "", language: str = "", admin_key: str = "", meta_fields: list[str] | None = None):
    url = f"{service_name}/indexes/{index_name}?api-version=2024-11-01-Preview"
    headers: dict[str, str] = {"Content-Type": "application/json", "api-key": admin_key}

    body: dict[str, Any] = {
        "fields": [
            {"name": "session_id", "type": "Edm.String", "searchable": True, "sortable": False, "facetable": False, "filterable": True},
            {"name": "id", "type": "Edm.String", "searchable": True, "key": True},
            {"name": "content", "type": "Edm.String", "searchable": True, "sortable": False, "facetable": False, "filterable": False, "analyzer": f"{language}.lucene" if language else None},
            {"name": "title", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": False, "analyzer": f"{language}.lucene" if language else None},
            {"name": "filepath", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "url", "type": "Edm.String", "searchable": True, "sortable": True, "filterable": True},
            {"name": "metadata", "type": "Edm.String", "searchable": True, "filterable": True},
            {"name": "image_mapping", "type": "Edm.String", "searchable": False, "sortable": False, "facetable": False, "filterable": True},
            {"name": "doc_metadata", "type": "Edm.String", "searchable": True, "sortable": False, "facetable": False, "filterable": False},
            {"name": "document_schema", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "main_title", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "sub_title", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "publisher", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "document_code", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "document_category", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "main_title_sec_language", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "sub_title_sec_language", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "primary_language", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "secondary_language", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "full_headers", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "h1", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "h2", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "h3", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "h4", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "h5", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "h6", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "timestamp", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": True, "filterable": True},
            {"name": "publish_date", "type": "Edm.String", "searchable": True, "sortable": True, "facetable": False, "filterable": True},
            {"name": "description", "type": "Edm.String", "searchable": True, "sortable": False, "facetable": False, "filterable": True}
        ],
        "suggesters": [],
        "scoringProfiles": [],
        "semantic": {
            "configurations": [
                {
                    "name": semantic_config_name,
                    "prioritizedFields": {
                        "titleField": {"fieldName": "title"},
                        "prioritizedContentFields": [{"fieldName": "content"}],
                        "prioritizedKeywordsFields": [{"fieldName": "full_headers"}, {"fieldName": "doc_metadata"}],
                    },
                }
            ]
        },
    }

    if vector_config_name:
        body["fields"].append({
            "name": "contentVector",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "retrievable": True,
            "stored": True,
            "dimensions": int(os.getenv("VECTOR_DIMENSION", "1536")),
            "vectorSearchProfile": vector_config_name
        })

        body["fields"].append({
            "name": "full_metadata_vector",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "retrievable": True,
            "stored": True,
            "dimensions": int(os.getenv("VECTOR_DIMENSION", "1536")),
            "vectorSearchProfile": vector_config_name
        })

        body["vectorSearch"] = {
            "algorithms": [
                {
                    "name": "my-hnsw-config-1",
                    "kind": "hnsw",
                    "hnswParameters": {
                        "m": 4,
                        "efConstruction": 400,
                        "efSearch": 500,
                        "metric": "cosine"
                    }
                }
            ],
            "profiles": [
                {
                    "name": "vectorSearchProfile",
                    "algorithm": "my-hnsw-config-1",
                    # "vectorizer": "azure_vectorizer"
                }
            ],
        }

        if os.getenv("AOAI_EMBEDDING_ENDPOINT"):
            body["vectorSearch"]["profiles"][0]["vectorizer"] = "azure_vectorizer"
            body["vectorSearch"]["vectorizers"] = [
                {
                    "name": "azure_vectorizer",
                    "kind": "azureOpenAI",
                    "azureOpenAIParameters": {
                        "resourceUri": os.getenv("AOAI_EMBEDDING_ENDPOINT"),
                        "deploymentId": os.getenv("AOAI_EMBEDDING_DEPLOYMENT"),
                        "apiKey": os.getenv("AOAI_EMBEDDING_KEY"),
                        "modelName": os.getenv("AOAI_EMBEDDING_MODEL")
                    }
                }
            ]

    for field in (meta_fields if meta_fields is not None else []):
        if not any(str(item["name"]) == field for item in body['fields']):
            sortable: bool = True
            facetable: bool = True
            filterable: bool = True
            if field in ["x_Standard_Range"]:
                sortable = False
                facetable = False
                filterable = False
            body["fields"].append({
                "name": field,
                "type": "Edm.String",
                "searchable": True,
                "sortable": sortable,
                "facetable": facetable,
                "filterable": filterable
            })

    client = get_cloud_api_client()
    response = client.put(url, json=body, headers=headers)
    if response.status_code == 201:
        print(f"Created search index {index_name}")
    elif response.status_code == 204:
        print(f"Updated existing search index {index_name}")
    else:
        raise Exception(f"Failed to create search index. Status Code: {response.status_code}, Error: {response.text}")

    return True


def upload_documents_to_index(service_name: str, index_name: str, docs, upload_batch_size: int = 50, admin_key: str | None = None):
    if admin_key is None:
        raise ValueError("admin_key cannot be None")

    to_upload_dicts = []

    for d in docs:
        # Get dynamically added attributes
        if type(d) is not dict:
            d = asdict_with_dynamic(d)

        # add id to documents
        d.update({"@search.action": "upload", "id": d["id"]})
        if "contentVector" in d and d["contentVector"] is None:
            del d["contentVector"]
        if "full_metadata_vector" in d and d["full_metadata_vector"] is None:
            del d["full_metadata_vector"]
        to_upload_dicts.append(d)

    # endpoint = "https://{}.search.windows.net/".format(service_name)
    endpoint: str = service_name

    search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(admin_key))

    # Upload the documents in batches of upload_batch_size
    for i in tqdm(range(0, len(to_upload_dicts), upload_batch_size), desc="Indexing Chunks..."):
        batch = to_upload_dicts[i: i + upload_batch_size]
        results = search_client.upload_documents(documents=batch)
        num_failures = 0
        errors = set()
        for result in results:
            if not result.succeeded:
                print(f"Indexing Failed for {result.key} with ERROR: {result.error_message}")
                num_failures += 1
                errors.add(result.error_message)
        if num_failures > 0:
            raise Exception(f"INDEXING FAILED for {num_failures} documents. Please recreate the index. "
                            f"To Debug: PLEASE CHECK chunk_size and upload_batch_size. \n Error Messages: {list(errors)}")


def upload_merge_index(index_config: Any, docs: list[dict[str, Any]], merge_fields: list[dict[str, Any]] | None = None, current_tmp_directory: str = '') -> bool:
    """
    Merge chunk information and upload it to the AI Search index
    """
    index_name: str = index_config["index_name"]
    embedding_endpoint: str = os.environ.get("embedding_model_endpoint", '')
    embedding_model_key: str = os.environ.get("embedding_model_key", '')

    fields_meta: Any = index_config["fields"] or []
    merge_content_fields: Any = index_config["merge_content_fields"] if "merge_content_fields" in index_config.keys() else []
    key_fields: Any = index_config["key_fields"] if "key_fields" in index_config.keys() else []

    all_fields = list(dict.fromkeys(["id"] + fields_meta + merge_content_fields + key_fields + [f.name for f in fields(Document)]))
    upload_batch_size = index_config["upload_batch_size"] if "upload_batch_size" in index_config.keys() else 1

    original_to_upload_dicts: list[Any] = []

    for d in docs:
        # Get dynamically added attributes
        if type(d) is not dict:
            d = asdict_with_dynamic(d)

        for key in list(d.keys()):
            if key not in all_fields:
                del d[key]

        if ("contentVector" in d) and (d["contentVector"] is None or "contentVector" not in all_fields):
            del d["contentVector"]
        if ("full_metadata_vector" in d) and (
                d["full_metadata_vector"] is None or "full_metadata_vector" not in all_fields):
            del d["full_metadata_vector"]

        # Default id primary key assignment: merge key_fields content and base64-encode it
        id_value = d["id"] if "id" in d else ""
        if "key_fields" in index_config.keys():
            id_value = '_'.join(str(d[k]) for k in key_fields if k in d)

        if id_value is None or id_value == "":
            continue

        # Select certain fields and concatenate them into another field
        for merge_field in (merge_fields or []):
            d[merge_field["key"]] = json.dumps({field: d[field] for field in merge_field["fields"] if field in d and (value := d[field]) is not None and value != ""}, ensure_ascii=False)

        d["id"] = base64.urlsafe_b64encode(id_value.encode('utf-8')).decode('utf-8')
        # add id to documents
        d.update({"@search.action": "upload", "id": d["id"]})
        d.update({"session_id": str(uuid6.uuid7())})
        original_to_upload_dicts.append(d)

    to_upload_dicts = original_to_upload_dicts
    current_object_key = to_upload_dicts[0]["filepath"] if len(to_upload_dicts) > 0 and "filepath" in to_upload_dicts[0] else ''

    # Calculate vector data based on the configured fields
    for vector_config in index_config["vector_fields"] if "vector_fields" in index_config.keys() else []:
        for i in tqdm(range(0, len(to_upload_dicts), 1), desc=f"{current_object_key} vector {vector_config['field']} embedding..."):
            d = to_upload_dicts[i: i + 1][0]
            vector_dict = {}
            for field in vector_config["append_fields"]:
                if isinstance(d[field], dict):
                    vector_dict |= d[field]
                elif isinstance(d[field], str):
                    vector_dict[field] = d[field]
            vector_str = str(vector_dict) if vector_dict else ""
            embedding = retry_get_embedding(text=vector_str, embedding_model_key=embedding_model_key, embedding_endpoint=embedding_endpoint)
            if embedding:
                d[vector_config["field"]] = embedding

    # Group to_upload_dicts by the "filepath" field and write each group to a JSON file under the .index directory
    write_grouped_index_files(to_upload_dicts, index_name=index_name, base_directory=current_tmp_directory)

    results: list[bool] = []
    # Upload the documents in batches of upload_batch_size
    for i in tqdm(range(0, len(to_upload_dicts), upload_batch_size), desc=f"Indexing {index_name} Chunks..."):
        batch = to_upload_dicts[i: i + upload_batch_size]
        results.append(upload_and_ensure(index_name=index_name, docs=batch, key_field="session_id"))
    return all(results)


def merge_dicts(data_list, key_fields, merge_fields, separator='\n'):
    """
    Merge a list of dictionaries based on the specified fields

    Arguments:
    data_list -- Original dictionary list
    key_fields -- Fields used for deduplication (e.g., ['title', 'filepath'])
    merge_fields -- Fields to be merged (e.g., ['content'])
    separator -- Separator used when merging fields (default is newline)

    Returns:
    New dictionary list after merging
    """
    merged_dict = {}

    for item in data_list:
        # Create a unique key - a tuple of all key fields
        key = tuple(item.get(field) for field in key_fields)

        if key in merged_dict:
            # Merge fields
            existing = merged_dict[key]
            for field in merge_fields:
                # Merge the new value with the old value
                existing[field] = separator.join([
                    existing.get(field, ''),
                    item.get(field, '')
                ]).strip(separator)
        else:
            # Create a new record
            merged_dict[key] = {
                **item,  # Copy original fields
                # Pre-initialize merged fields
                **{field: item.get(field, '') for field in merge_fields}
            }

    return list(merged_dict.values())
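
# Example (illustrative):
#   merge_dicts(
#       [{"filepath": "a.md", "content": "part 1"}, {"filepath": "a.md", "content": "part 2"}],
#       key_fields=["filepath"], merge_fields=["content"])
#   -> [{"filepath": "a.md", "content": "part 1\npart 2"}]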
|
||||
|
||||
def validate_index(service_name:str, index_name:str, admin_key:str=None):
|
||||
api_version = "2024-11-01-Preview"
|
||||
headers = {"Content-Type": "application/json", "api-key": admin_key}
|
||||
params = {"api-version": api_version}
|
||||
url = f"{service_name}/indexes/{index_name}/stats"
|
||||
client = get_cloud_api_client()
|
||||
for retry_count in range(5):
|
||||
response = client.get(url, headers=headers, params=params)
|
||||
if response.status_code == 200:
|
||||
response_data = response.json()
|
||||
num_chunks = response_data['documentCount']
|
||||
if num_chunks == 0 and retry_count < 10:
|
||||
print("Index is empty. Waiting 20 seconds to check again...")
|
||||
time.sleep(20)
|
||||
elif num_chunks == 0 and retry_count == 10:
|
||||
print("Index is empty. Please investigate and re-index.")
|
||||
else:
|
||||
print(f"The index contains {num_chunks} chunks.")
|
||||
average_chunk_size = response_data['storageSize'] / num_chunks
|
||||
print(f"The average chunk size of the index is {average_chunk_size} bytes.")
|
||||
break
|
||||
else:
|
||||
if response.status_code == 404:
|
||||
print("The index does not seem to exist. Please make sure the index was created correctly, and that you are using the correct service and index names")
|
||||
elif response.status_code == 403:
|
||||
print("Authentication Failure: Make sure you are using the correct key")
|
||||
else:
|
||||
print(f"Request failed. Please investigate. Status code: {response.status_code}")
|
||||
break
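
# Hedged usage sketch: poll index statistics after an indexing run. The index name is a
# placeholder; the service name and admin key are assumed to come from the same
# environment variables used by the other helpers in this module.
def _validate_index_example() -> None:
    validate_index(
        service_name=os.getenv("search_service_name", ""),
        index_name="my-chunk-index",  # placeholder index name
        admin_key=os.getenv("search_admin_key", ""),
    )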
|
||||
|
||||
|
||||
def index_exists(index_name: str) -> bool:
|
||||
try:
|
||||
search_service_name = os.getenv("search_service_name", "")
|
||||
search_admin_key = os.getenv("search_admin_key", "")
|
||||
|
||||
|
||||
endpoint = search_service_name
|
||||
credential = AzureKeyCredential(search_admin_key)
|
||||
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
|
||||
|
||||
index_client.get_index(index_name)
|
||||
return True
|
||||
except Exception as e:
|
||||
write_log(f"Index '{index_name}' does not exist: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def create_index(index_name:str, index_fields: list[dict[str, Any]], suggesters: Optional[list[dict[str, Any]]] = None) -> None:
|
||||
search_service_name = os.getenv("search_service_name", "")
|
||||
search_admin_key = os.getenv("search_admin_key", "")
|
||||
|
||||
|
||||
endpoint = search_service_name
|
||||
credential = AzureKeyCredential(search_admin_key)
|
||||
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
|
||||
|
||||
if index_exists(index_name=index_name):
|
||||
write_log(f"Index '{index_name}' already exists.")
|
||||
return
|
||||
search_fields = [SimpleField(**field) for field in index_fields]
|
||||
index = SearchIndex(name=index_name, fields=search_fields, suggesters=suggesters or [])
|
||||
index_client.create_index(index)
|
||||
write_log(f"Index '{index_name}' created.")
|
||||
|
||||
|
||||
def upload_documents(index_name:str, documents: List[Dict[str, Any]]) -> None:
|
||||
search_service_name = os.getenv("search_service_name", "")
|
||||
search_admin_key = os.getenv("search_admin_key", "")
|
||||
|
||||
|
||||
endpoint = search_service_name
|
||||
credential = AzureKeyCredential(search_admin_key)
|
||||
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
|
||||
batch = IndexDocumentsBatch()
|
||||
batch.add_merge_or_upload_actions(documents) #type: ignore
|
||||
results = search_client.index_documents(batch)
|
||||
|
||||
write_log(f"Uploaded {len(documents)} documents to index '{index_name}'. Result: {results}")
|
||||
|
||||
def delete_index(index_name:str) -> None:
|
||||
|
||||
search_service_name = os.getenv("search_service_name", "")
|
||||
search_admin_key = os.getenv("search_admin_key", "")
|
||||
|
||||
endpoint = search_service_name
|
||||
credential = AzureKeyCredential(search_admin_key)
|
||||
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
|
||||
|
||||
if index_exists(index_name=index_name):
|
||||
index_client.delete_index(index_name)
|
||||
write_log(f"Index '{index_name}' deleted.")
|
||||
else:
|
||||
write_log(f"Index '{index_name}' does not exist.")
|
||||
|
||||
|
||||
def search(index_name, search_text: str, **kwargs) -> Any:
|
||||
endpoint = os.getenv("search_service_name","")
|
||||
credential = AzureKeyCredential(os.getenv("search_admin_key",""))
|
||||
index_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
|
||||
return index_client.search(search_text, **kwargs)
|
||||
|
||||
|
||||
def documents_with_field_value_exist(index_name:str, field_name: str, value: Any) -> bool:
|
||||
"""
|
||||
Check if there are documents in the index where a specific field equals the given value.
|
||||
"""
|
||||
|
||||
endpoint = os.getenv("search_service_name", "")
|
||||
credential = AzureKeyCredential(os.getenv("search_admin_key", ""))
|
||||
index_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
|
||||
|
||||
filter_query = f"{field_name} eq '{value}'" if isinstance(value, str) else f"{field_name} eq {value}"
|
||||
results: Any = index_client.search("*", filter=filter_query, top=1)
|
||||
for _ in results:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
||||
def delete_documents_by_field(index_name:str,field_name: str, value: Any) -> bool:
|
||||
"""
|
||||
Delete all documents where the specified field equals the given value.
|
||||
"""
|
||||
search_service_name = os.getenv("search_service_name", "")
|
||||
search_admin_key = os.getenv("search_admin_key", "")
|
||||
|
||||
search_client = SearchClient(endpoint=search_service_name, index_name=index_name, credential=AzureKeyCredential(search_admin_key))
|
||||
# Step 1: Retrieve the documents whose field matches the given value
filter_query = f"{field_name} eq '{value}'" if isinstance(value, str) else f"{field_name} eq {value}"
results: Any = search_client.search(search_text="*", select=["id"], filter=filter_query)

# Step 2: Extract the primary keys (id) of the documents to be deleted
# (the paged result object is always truthy, so check the materialized key list instead)
keys_to_delete = [doc['id'] for doc in results]
if not keys_to_delete:
    return True
|
||||
|
||||
# Step 3: Delete the documents that meet the criteria
|
||||
if keys_to_delete:
|
||||
# Use batch delete API to remove documents
|
||||
delete_results:list[IndexingResult] = search_client.delete_documents(documents=[{'id': key} for key in keys_to_delete])#type: ignore
|
||||
|
||||
logging.getLogger().info(f"Deleted documents with keys: {keys_to_delete}")
|
||||
return all(result.succeeded for result in delete_results)
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
|
||||
def query_by_field( index_name: str, field_name: str, value: Any, top: int = 99999) -> list[dict[Any,Any]]:
|
||||
"""
|
||||
Query documents in the index where a specific field equals the given value.
|
||||
:param field_name: The field to filter on.
|
||||
:param value: The value to match.
|
||||
:param top: Maximum number of results to return.
|
||||
:return: List of matching documents.
|
||||
"""
|
||||
search_service_name = os.getenv("search_service_name", "")
|
||||
search_admin_key = os.getenv("search_admin_key", "")
|
||||
|
||||
|
||||
search_client = SearchClient(endpoint = search_service_name, index_name=index_name,credential=AzureKeyCredential(search_admin_key))
|
||||
filter_query = f"{field_name} eq '{value}'" if isinstance(value, str) else f"{field_name} eq {value}"
|
||||
results:Any = search_client.search("*", filter=filter_query, top=top)
|
||||
return [doc for doc in results]
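
# Hedged usage sketch: fetch every indexed document belonging to one source file via its
# filepath field. The index name and filepath value are placeholders.
def _query_by_field_example() -> None:
    chunks = query_by_field(index_name="my-chunk-index", field_name="filepath", value="a.pdf")
    print(f"Found {len(chunks)} documents for a.pdf")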
|
||||
|
||||
|
||||
|
||||
def upload_and_ensure(index_name:str, docs: list[dict[Any, Any]], key_field="session_id", delay_seconds:int=5, max_retries:int=5) -> bool:
|
||||
search_service_name = os.getenv("search_service_name", "")
|
||||
search_admin_key = os.getenv("search_admin_key", "")
|
||||
|
||||
endpoint = search_service_name
|
||||
api_key = search_admin_key
|
||||
|
||||
client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
|
||||
|
||||
# Step 1: Batch submit MergeOrUpload
|
||||
batch = IndexDocumentsBatch()
|
||||
batch.add_merge_or_upload_actions(docs) # type: ignore
|
||||
results = client.index_documents(batch)
|
||||
|
||||
# Step 2: Check status of each document
|
||||
failed = [r.key for r in results if not r.succeeded]
|
||||
if failed:
|
||||
raise Exception(f"Initial submission failed for documents: {failed}")
|
||||
|
||||
|
||||
return True
|
||||
# # Step 3: Delay waiting for background index
|
||||
# time.sleep(delay_seconds)
|
||||
|
||||
# # Step 4: Verify and retry
|
||||
# keys: list[str] = [doc[key_field] for doc in docs]
|
||||
# return verify_and_retry(client, keys, docs, key_field, delay_seconds, max_retries)
|
||||
|
||||
|
||||
def verify_and_retry(client: SearchClient, keys: list[str], docs, key_field, delay_seconds, max_retries) -> bool:
|
||||
attempt = 0
|
||||
session_id = str(uuid.uuid4())
|
||||
|
||||
while attempt <= max_retries:
|
||||
missing = find_missing(client, keys, session_id)
|
||||
if not missing:
|
||||
return True
|
||||
|
||||
attempt += 1
|
||||
print(f"Retry {attempt}, missing: {missing}")
|
||||
|
||||
to_retry = [doc for doc in docs if doc[key_field] in missing]
|
||||
|
||||
batch = IndexDocumentsBatch()
batch.add_merge_or_upload_actions(to_retry)

client.index_documents(batch)
|
||||
|
||||
time.sleep(delay_seconds)
|
||||
|
||||
# Final check
|
||||
missing = find_missing(client, keys, session_id)
|
||||
if missing:
|
||||
raise Exception(f"Index verification failed, the following documents were not indexed: {missing}")
|
||||
return True
|
||||
|
||||
|
||||
def find_missing(client: SearchClient, keys: list[str], session_id: str) -> list[str]:
|
||||
missing: list[str] = []
|
||||
for key in keys:
|
||||
try:
|
||||
results = client.search(filter=f"session_id eq '{key}'", top=1)
|
||||
if not any(results):
|
||||
missing.append(key)
|
||||
except HttpResponseError:
|
||||
missing.append(key)
|
||||
return missing
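
# Hedged sketch tying the helpers above together: submit a batch with upload_and_ensure,
# then double-check the keys with find_missing. The index name, sample document and
# literal values are assumptions, not part of the module's contract.
def _upload_and_verify_example() -> None:
    docs = [{"session_id": "s-001", "content": "hello"}]
    upload_and_ensure(index_name="my-chunk-index", docs=docs, key_field="session_id")
    client = SearchClient(
        endpoint=os.getenv("search_service_name", ""),
        index_name="my-chunk-index",
        credential=AzureKeyCredential(os.getenv("search_admin_key", "")),
    )
    print(find_missing(client, keys=["s-001"], session_id=str(uuid.uuid4())))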
|
||||
150
vw-document-ai-indexer/blob_service.py
Normal file
@@ -0,0 +1,150 @@
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from azure.storage.blob import ContainerClient, BlobProperties
|
||||
|
||||
from utils import custom_serializer, keep_latest
|
||||
|
||||
|
||||
def check_files(blob_url:str, doc_time:datetime|None) -> list[dict[str, Any]]:
|
||||
# If blob, get blob properties; if local file, get system modification time
|
||||
container_client = ContainerClient.from_container_url(blob_url)
|
||||
updated_files: list[dict[str, Any]] = []
|
||||
blobs: list[BlobProperties] = list(container_client.list_blobs())
|
||||
# Sort by modification time ascending
|
||||
blobs_by_last_modified = sorted(blobs, key=lambda b: b.last_modified) #datetime.fromisoformat()
|
||||
|
||||
for blob in blobs_by_last_modified:
|
||||
if blob.name.endswith('.doc_metadata.json'):
|
||||
continue
|
||||
else:
|
||||
last_modified: datetime = blob.last_modified.replace(tzinfo=None) #datetime.fromisoformat(blob.last_modified)
|
||||
name = blob.name
|
||||
|
||||
if doc_time is None or last_modified > doc_time:
|
||||
updated_files.append({"name": name, "doc_upper_time": last_modified})
|
||||
|
||||
return updated_files
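
# Hedged usage sketch: list blobs modified after the last processed timestamp. The SAS
# URL and cutoff date below are placeholders.
def _check_files_example() -> None:
    changed = check_files(
        blob_url="https://account.blob.core.windows.net/container?sas-token",
        doc_time=datetime(2025, 6, 24),
    )
    for item in changed:
        print(item["name"], item["doc_upper_time"])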
|
||||
|
||||
def load_metadata(blob_url:str, directory_path: str, data_directory: str) -> list[Any]:
|
||||
"""Download .doc_metadata.json file from blob_url and return the parsed metadata list."""
|
||||
downloadToLocalFolder(blob_url, data_directory, directory_path, ".doc_metadata.json")
|
||||
if not os.path.exists(f"{directory_path}/.doc_metadata.json"):
|
||||
return []
|
||||
#raise FileNotFoundError(f"Metadata file not found in {directory_path}")
|
||||
|
||||
with open(f"{directory_path}/.doc_metadata.json", "rb") as doc_metadata_file:
|
||||
doc_metadata = json.load(doc_metadata_file)
|
||||
sorted_list = sorted(doc_metadata["doc_metadata"], key=lambda x: x["timestamp"], reverse=True)
|
||||
# Normalize metadata keys: replace '-' with '_'
for dic in sorted_list:
    for k in list(dic.keys()):
        if "-" in k:
            dic[k.replace("-", "_")] = dic.pop(k)
|
||||
return sorted_list
|
||||
|
||||
def check_meta(blob_url:str, meta_upper_time:Any, current_tmp_directory: str, data_dir: str) -> list[dict[Any,Any]]:
|
||||
"""Check .doc_metadata.json records under blob_url and compare with processed meta_upper_time, return updated metadata list."""
|
||||
sorted_list = load_metadata(blob_url, current_tmp_directory, data_directory=data_dir)
|
||||
filter_list = filter(lambda x: meta_upper_time is None or datetime.fromisoformat(x["timestamp"]).replace(tzinfo=None) > meta_upper_time, sorted_list)
|
||||
updated_metas: list[dict[str,Any]] = []
|
||||
for item in filter_list:
|
||||
# Parse string to datetime object
|
||||
dt = datetime.fromisoformat(item["timestamp"]).replace(tzinfo=None)
|
||||
# Keep the latest meta_upper_time data
|
||||
updated_metas.append({"name": item["filepath"], "meta_upper_time": dt})
|
||||
return keep_latest(updated_metas, "name", "meta_upper_time")
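
# Hedged usage sketch: pick up metadata records newer than the last processed
# meta_upper_time (None returns every record). URL and path values are placeholders.
def _check_meta_example() -> None:
    updated = check_meta(
        blob_url="https://account.blob.core.windows.net/container?sas-token",
        meta_upper_time=None,
        current_tmp_directory="/tmp/run",
        data_dir="",
    )
    print(f"{len(updated)} metadata records need re-indexing")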
|
||||
|
||||
def downloadToLocalFolder(blob_url:str, data_dir:str, local_folder: str, name_starts_with:str) -> list[str]:
|
||||
"""Check if .doc_metadata.json exists in the directory, download if not."""
|
||||
# Skip the download if the target already exists locally
if os.path.exists(f"{local_folder}/{name_starts_with}"):
|
||||
return []
|
||||
path = data_dir
|
||||
if path and not path.endswith('/'):
|
||||
path = path + '/'
|
||||
container_client = ContainerClient.from_container_url(blob_url)
|
||||
last_destination_folder = None
|
||||
destination_paths: list[str] = []
|
||||
for blob in container_client.list_blobs(name_starts_with=name_starts_with):
|
||||
relative_path = blob.name[len(path):]
|
||||
destination_path = os.path.join(local_folder, relative_path)
|
||||
destination_folder = os.path.dirname(destination_path)
|
||||
if destination_folder != last_destination_folder:
|
||||
os.makedirs(destination_folder, exist_ok=True)
|
||||
last_destination_folder = destination_folder
|
||||
blob_client = container_client.get_blob_client(blob.name)
|
||||
with open(file=destination_path, mode='wb') as local_file:
|
||||
stream = blob_client.download_blob()
|
||||
local_file.write(stream.readall())
|
||||
destination_paths.append(destination_path)
|
||||
return destination_paths
|
||||
|
||||
def blob_upload_content(blob_sas_url: str, file_name: str, content: str, retry_count: int = 3) -> str:
|
||||
for i in range(retry_count):
|
||||
try:
|
||||
# Upload file to Azure blob
|
||||
container_client: ContainerClient = ContainerClient.from_container_url(blob_sas_url)
|
||||
container_client.upload_blob(name=file_name, data=content, overwrite=True) # type: ignore
|
||||
return f"{blob_sas_url}/{file_name}"
|
||||
except Exception as e:
|
||||
print(f"Error uploading content for {file_name} with error={e}, retrying, currently at {i + 1} retry, {retry_count - (i + 1)} retries left")
|
||||
time.sleep(5)
|
||||
raise Exception(f"Error uploading content for: {file_name}")
|
||||
|
||||
def blob_upload_object(blob_sas_url: str, file_name: str, obj: Any, retry_count: int = 3) -> str:
|
||||
|
||||
if not blob_sas_url:
|
||||
return ''
|
||||
|
||||
content = json.dumps(obj, default=custom_serializer,ensure_ascii=False, indent=4)
|
||||
|
||||
for i in range(retry_count):
|
||||
try:
|
||||
# Upload file to Azure blob
|
||||
container_client: ContainerClient = ContainerClient.from_container_url(blob_sas_url)
|
||||
container_client.upload_blob(name=file_name, data=content, overwrite=True) # type: ignore
|
||||
return f"{blob_sas_url}/{file_name}"
|
||||
except Exception as e:
|
||||
print(f"Error uploading content for {file_name} with error={e}, retrying, currently at {i + 1} retry, {retry_count - (i + 1)} retries left")
|
||||
time.sleep(5)
|
||||
raise Exception(f"Error uploading content for: {file_name}")
|
||||
|
||||
def blob_exists(blob_sas_url: str, file_name: str) -> bool:
|
||||
"""Check if a blob exists in the container."""
|
||||
try:
|
||||
container_client = ContainerClient.from_container_url(blob_sas_url)
|
||||
blob_client = container_client.get_blob_client(file_name)
|
||||
return blob_client.exists()
|
||||
except Exception as e:
|
||||
print(f"Error checking existence of blob {file_name}: {e}")
|
||||
return False
|
||||
|
||||
def load_content(blob_sas_url: str, file_name: str, retry_count: int = 3) -> str:
|
||||
"""Download the file from blob storage."""
|
||||
for i in range(retry_count):
|
||||
try:
|
||||
container_client = ContainerClient.from_container_url(blob_sas_url)
|
||||
blob_client = container_client.get_blob_client(file_name)
|
||||
# Download blob content as bytes and decode to string
|
||||
blob_data = blob_client.download_blob().readall() # type: ignore
|
||||
# Try to decode as UTF-8 first, fallback to other encodings if needed
|
||||
try:
|
||||
return blob_data.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
# Try other common encodings
|
||||
for encoding in ['gbk', 'latin-1', 'cp1252']:
|
||||
try:
|
||||
return blob_data.decode(encoding)
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
# If all encodings fail, return with error replacement
|
||||
return blob_data.decode('utf-8', errors='replace')
|
||||
except Exception as e:
|
||||
print(f"Error loading content from {file_name} with error={e}, retrying, currently at {i + 1} retry, {retry_count - (i + 1)} retries left")
|
||||
if i < retry_count - 1:
|
||||
time.sleep(5)
|
||||
|
||||
# If all retries fail, raise exception
|
||||
raise Exception(f"Error loading content from blob: {file_name} after {retry_count} retries")
|
||||
|
||||
623
vw-document-ai-indexer/business_layer.py
Normal file
@@ -0,0 +1,623 @@
|
||||
""" business_layer.py
|
||||
This module contains the business logic for document processing."""
|
||||
|
||||
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Optional, Dict, Any
|
||||
from dataclasses import dataclass
|
||||
import traceback
|
||||
import datetime
|
||||
from collections import Counter
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.core.credentials import AzureKeyCredential
|
||||
from azure.core.pipeline.policies import RetryPolicy
|
||||
from app_config import ApplicationConfig, ServiceFactory
|
||||
from chunk_service import chunk_di_doc
|
||||
from entity_models import Document, ChunkingResult,DiResult
|
||||
from database import DatabaseInterface, IndexObject, IndexObjectStatus,LegacyDatabaseAdapter
|
||||
|
||||
from di_extractor import di_extract
|
||||
from blob_service import blob_exists, blob_upload_content, blob_upload_object, downloadToLocalFolder, load_content
|
||||
from utils import replace_urls_in_content, write_content,write_document,asdict_with_dynamic
|
||||
|
||||
|
||||
from azure_index_service import upload_merge_index, delete_documents_by_field,query_by_field
|
||||
from vllm_extractor import process_document_figures
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class SingletonFormRecognizerClient:
|
||||
instance = None
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if not cls.instance:
|
||||
extract_method = os.environ.get("extract_method", "default")
|
||||
if extract_method == "vision-llm":
|
||||
cls.instance = object() # dummy object
|
||||
else:
|
||||
url = os.getenv("form_rec_resource")
|
||||
key = os.getenv("form_rec_key")
|
||||
if url and key:
|
||||
print("SingletonFormRecognizerClient: Creating instance of Form recognizer per process")
|
||||
|
||||
retry = RetryPolicy(total_retries=5,connect_retries=3,read_retries=3,backoff_factor=0.8,retry_backoff_max=60)
|
||||
|
||||
cls.instance = DocumentIntelligenceClient(endpoint=url, credential=AzureKeyCredential(key), retry_policy=retry, connection_timeout=1200,read_timeout=1200)
|
||||
else:
|
||||
print("SingletonFormRecognizerClient: Skipping since credentials not provided. Assuming NO form recognizer extensions(like .pdf) in directory")
|
||||
cls.instance = object() # dummy object
|
||||
return cls.instance
|
||||
|
||||
def __getstate__(self)->tuple[Any,Any]:
|
||||
return self.url, self.key
|
||||
|
||||
def __setstate__(self, state):
|
||||
url, key = state
|
||||
|
||||
retry = RetryPolicy(total_retries=5,connect_retries=3,read_retries=3,backoff_factor=0.8,retry_backoff_max=60)
|
||||
self.instance = DocumentIntelligenceClient(endpoint=url, credential=AzureKeyCredential(key), retry_policy=retry, connection_timeout=1200,read_timeout=1200)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcessingContext:
|
||||
"""Processing Context"""
|
||||
object_key: str
|
||||
data_config: Dict[str, Any]
|
||||
metadata: Dict[str, Any]
|
||||
retry_count: int = 0
|
||||
error_message: Optional[str] = None
|
||||
current_tmp_directory: str = ""
|
||||
datasource_name: str = ""
|
||||
config: ApplicationConfig | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcessingResult:
|
||||
"""Processing Result"""
|
||||
status: IndexObjectStatus
|
||||
object_key: str
|
||||
message: str
|
||||
processing_time: float
|
||||
chunks_count: int = 0
|
||||
error: Optional[Exception] = None
|
||||
|
||||
|
||||
# Keep only the DocumentRepository interface, other services directly use the specific implementation
|
||||
class DocumentRepository(ABC):
|
||||
"""Document Repository Interface"""
|
||||
|
||||
@abstractmethod
|
||||
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
|
||||
"""Get index object"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def save_index_object(self, index_object: IndexObject) -> None:
|
||||
"""Save index object"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def update_processing_status(self, object_key: str, datasource_name: str, status: IndexObjectStatus, message: Optional[str] = None) -> None:
|
||||
"""Update processing status"""
|
||||
pass
|
||||
|
||||
|
||||
# Application service layer
|
||||
class DocumentProcessingOrchestrator:
|
||||
"""Document Processing Orchestrator (Application Service Layer)"""
|
||||
|
||||
def __init__(self,
|
||||
extraction_service: 'DocumentIntelligenceExtractionService',
|
||||
chunking_service: 'DefaultDocumentChunkingService',
|
||||
indexing_service: 'AzureSearchIndexingService',
|
||||
metadata_service: 'BlobMetadataService',
|
||||
repository: DocumentRepository):
|
||||
self.extraction_service = extraction_service
|
||||
self.chunking_service = chunking_service
|
||||
self.indexing_service = indexing_service
|
||||
self.metadata_service = metadata_service
|
||||
self.repository = repository
|
||||
|
||||
def process_document(self, context: ProcessingContext) -> ProcessingResult:
|
||||
"""Main process for handling a single document"""
|
||||
start_time = datetime.datetime.now()
|
||||
|
||||
# 1. Get or create index object
|
||||
index_object = self._get_or_create_index_object(context)
|
||||
# if not index_object:
|
||||
# raise ValueError(f"Failed to create or retrieve index object for {context.object_key}")
|
||||
|
||||
try:
|
||||
|
||||
# 2. Check retry count
|
||||
# If the document or metadata modification time has changed since the last failure, reset the retry count and continue; the comparison uses the last failed document modification time and the last failed metadata modification time
|
||||
if index_object.last_fail_doc_modifed_time != context.metadata.get("doc_modified_time") or index_object.last_fail_metadata_modifed_time != context.metadata.get("metadata_modified_time"):
|
||||
index_object.try_count = 0
|
||||
|
||||
|
||||
if index_object.status in ["processing", "failed"]:
|
||||
# Check if the maximum retry count has been reached
|
||||
if index_object.try_count >= 3:
|
||||
return ProcessingResult(status=IndexObjectStatus.FAILED, object_key=context.object_key, message=f"Object has been retried {index_object.try_count} times, skipping processing", processing_time=0)
|
||||
|
||||
# Increase the retry count and save immediately
|
||||
index_object.try_count += 1
|
||||
|
||||
# Immediately save the retry count update
|
||||
self.repository.save_index_object(index_object)
|
||||
|
||||
# 3. Update status to processing
|
||||
self.repository.update_processing_status(context.object_key,context.datasource_name, IndexObjectStatus.PROCESSING)
|
||||
|
||||
# 4. Check if processing is needed (metadata and document modification times)
|
||||
meta_update_flag = self._should_process_metadata(index_object, context)
|
||||
doc_update_flag = self._should_process_document(index_object, context)
|
||||
|
||||
chunks_count = 0
|
||||
|
||||
# 5. Process metadata index (if update is needed)
|
||||
if meta_update_flag:
|
||||
self._process_metadata_indexes(context)
|
||||
|
||||
# 6. Process document and chunk indexes (Important: Only process when meta_update_flag OR doc_update_flag=True)
|
||||
if meta_update_flag or doc_update_flag:
|
||||
chunks_count = self._process_document_and_chunks(context, doc_update_flag)
|
||||
|
||||
# 7. Update the modification time of the index object
|
||||
if meta_update_flag:
|
||||
index_object.metadata_modifed_time = context.metadata.get("metadata_modified_time")
|
||||
if doc_update_flag:
|
||||
index_object.doc_modifed_time = context.metadata.get("doc_modified_time")
|
||||
|
||||
index_object.status = IndexObjectStatus.SUCCESS.value
|
||||
|
||||
|
||||
if index_object.metadata_modifed_time is None:
|
||||
index_object.metadata_modifed_time = context.metadata.get("metadata_modified_time")
|
||||
|
||||
self.repository.save_index_object(index_object)
|
||||
|
||||
processing_time = (datetime.datetime.now() - start_time).total_seconds()
|
||||
return ProcessingResult(status=IndexObjectStatus.SUCCESS, object_key=context.object_key, message=f"Successfully processed {chunks_count} chunks", processing_time=processing_time, chunks_count=chunks_count)
|
||||
|
||||
except Exception as e:
|
||||
error_message:str = traceback.format_exc()
|
||||
index_object.status = IndexObjectStatus.FAILED.value
|
||||
index_object.last_fail_doc_modifed_time = context.metadata.get("doc_modified_time")
|
||||
index_object.last_fail_metadata_modifed_time = context.metadata.get("metadata_modified_time")
|
||||
self.repository.save_index_object(index_object)
|
||||
processing_time = (datetime.datetime.now() - start_time).total_seconds()
|
||||
return ProcessingResult(status=IndexObjectStatus.FAILED, object_key=context.object_key, message=f"Processing failed: {error_message}", processing_time=processing_time, error=e )
|
||||
|
||||
|
||||
|
||||
def _get_or_create_index_object(self, context: ProcessingContext) -> IndexObject:
|
||||
"""Get or create index object"""
|
||||
index_object = self.repository.get_index_object(context.object_key,context.datasource_name)
|
||||
if not index_object:
|
||||
index_object = IndexObject(
|
||||
object_key=context.object_key,
|
||||
type="document",
|
||||
status=IndexObjectStatus.PROCESSING.value,
|
||||
datasource_name=context.datasource_name
|
||||
)
|
||||
self.repository.save_index_object(index_object)
|
||||
return index_object
|
||||
|
||||
def _should_process(self, index_object: IndexObject, context: ProcessingContext) -> bool:
|
||||
"""Determine whether processing is needed (keep the original logic for backward compatibility)"""
|
||||
return self._should_process_metadata(index_object, context) or self._should_process_document(index_object, context)
|
||||
|
||||
def _should_process_metadata(self, index_object: IndexObject, context: ProcessingContext) -> bool:
|
||||
"""Determine whether metadata processing is needed"""
|
||||
if 'metadata_modified_time' in context.metadata:
|
||||
metadata_modified_time = context.metadata['metadata_modified_time']
|
||||
if index_object.metadata_modifed_time is None:
|
||||
return True
|
||||
if metadata_modified_time is not None and metadata_modified_time > index_object.metadata_modifed_time:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _should_process_document(self, index_object: IndexObject, context: ProcessingContext) -> bool:
|
||||
"""Determine whether document processing is needed"""
|
||||
if 'doc_modified_time' in context.metadata:
|
||||
doc_modified_time = context.metadata['doc_modified_time']
|
||||
if index_object.doc_modifed_time is None:
|
||||
return True
|
||||
if doc_modified_time is not None and doc_modified_time > index_object.doc_modifed_time:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _process_metadata_indexes(self, context: ProcessingContext) -> None:
|
||||
"""Process metadata index"""
|
||||
|
||||
# Push metadata index - only process index with data_type of ["metadata"]
|
||||
meta_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata"])]
|
||||
if not any(meta_index_schemas):
|
||||
return
|
||||
|
||||
# Get metadata - from metadata service
|
||||
doc_meta = self.metadata_service.get_metadata(context.object_key)
|
||||
# Metadata must not be empty, use empty dictionary as default value
|
||||
if not doc_meta:
|
||||
raise ValueError(f"Metadata for object {context.object_key} not found")
|
||||
|
||||
for meta_index_schema in meta_index_schemas:
|
||||
self.indexing_service.index_metadata(doc_meta, meta_index_schema, context)
|
||||
|
||||
def _process_document_and_chunks(self, context: ProcessingContext, doc_update_flag: bool) -> int:
|
||||
"""Process document and chunk indexes, return the number of processed chunks"""
|
||||
|
||||
doc_dict = {}
|
||||
chunk_dict = []
|
||||
chunks_count = 0
|
||||
# Fetch the document metadata from the metadata service
doc_meta = self.metadata_service.get_metadata(context.object_key)
# TODO: decide whether to raise or skip when no doc_meta configuration file exists; for now fall back to an empty dict
if not doc_meta:
    doc_meta = {}
language_code = doc_meta.get("language_code", "zh-Hans")  # Default to "zh-Hans" if not specified
|
||||
|
||||
|
||||
# If the document needs to be updated, re-extract and chunk
|
||||
if doc_update_flag:
|
||||
# Extract document
|
||||
document = self.extraction_service.extract_document(context, language_code)
|
||||
document.title = os.path.splitext(context.object_key)[0]
|
||||
|
||||
# Chunk processing
|
||||
chunking_result = self.chunking_service.chunk_document(document, context)
|
||||
chunks_count = len(chunking_result.chunks)
|
||||
|
||||
# Convert to dictionary format
|
||||
doc_dict = self._convert_document_to_dict(document)
|
||||
chunk_dict = [self._convert_document_to_dict(chunk) for chunk in chunking_result.chunks]
|
||||
|
||||
# Process document index - data_type is ["metadata","document"]
|
||||
document_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata","document"]) or Counter(schema["data_type"]) == Counter(["document"])]
|
||||
|
||||
for document_index_schema in document_index_schemas:
|
||||
if not doc_update_flag:
|
||||
# Get existing document data from Azure Search Index
|
||||
existing_docs = self.indexing_service.get_existing_document_data(
|
||||
context.object_key, document_index_schema["index_name"],
|
||||
document_index_schema["update_by_field"]
|
||||
)
|
||||
if existing_docs:
|
||||
doc_dict = existing_docs
|
||||
|
||||
doc_dict.update({k: doc_meta[k] for k in document_index_schema["fields"] if k in doc_meta})
|
||||
|
||||
# Upload document index
|
||||
self.indexing_service.index_document_with_schema(doc_dict, document_index_schema, context)
|
||||
|
||||
# Process chunk index - data_type is ["metadata","document","chunk"]
|
||||
chunk_index_schemas = [schema for schema in context.data_config["index_schemas"] if Counter(schema["data_type"]) == Counter(["metadata","document","chunk"]) or Counter(schema["data_type"]) == Counter(["chunk"])]
|
||||
|
||||
for index_schema in chunk_index_schemas:
|
||||
current_chunk_dict = chunk_dict # Use existing chunk_dict
|
||||
current_chunks_count = chunks_count # Use existing chunks_count
|
||||
if not doc_update_flag:
|
||||
# Get existing chunk data from Azure Search Index
|
||||
current_chunk_dict = self.indexing_service.get_existing_chunk_data(context.object_key, index_schema["index_name"], index_schema["update_by_field"])
|
||||
current_chunks_count = len(current_chunk_dict) if current_chunk_dict else 0
|
||||
|
||||
# Update the total chunks_count (for return value)
|
||||
chunks_count = current_chunks_count
|
||||
|
||||
for chunk in current_chunk_dict if current_chunk_dict else []:
|
||||
chunk.update({k: doc_meta[k] for k in index_schema["fields"] if k in doc_meta})
|
||||
|
||||
# Delete old chunk data
|
||||
self.indexing_service.delete_chunks_by_field(index_schema["index_name"], index_schema["update_by_field"], doc_dict.get(index_schema["update_by_field"], context.object_key))
|
||||
|
||||
# Upload new chunk data
|
||||
if current_chunk_dict:
|
||||
self.indexing_service.index_chunks_with_schema(current_chunk_dict, index_schema, context)
|
||||
|
||||
return chunks_count
|
||||
|
||||
def _convert_document_to_dict(self, document:Document) -> Dict[str, Any]:
|
||||
"""Convert Document object to dictionary"""
|
||||
|
||||
try:
|
||||
# Use the original asdict_with_dynamic function to maintain compatibility
|
||||
return asdict_with_dynamic(document)
|
||||
except Exception:
|
||||
# If asdict_with_dynamic fails, use the fallback method
|
||||
if hasattr(document, '__dict__'):
|
||||
return document.__dict__.copy()
|
||||
elif hasattr(document, 'to_dict'):
|
||||
return document.to_dict()
|
||||
else:
|
||||
# If all fails, return empty dictionary
|
||||
return {}
|
||||
|
||||
|
||||
# Infrastructure layer implementation
|
||||
class SqlAlchemyDocumentRepository(DocumentRepository):
|
||||
"""SQLAlchemy-based document repository implementation"""
|
||||
|
||||
def __init__(self, database_interface: DatabaseInterface):
|
||||
self.database_interface = database_interface
|
||||
|
||||
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
|
||||
"""Get index object"""
|
||||
return self.database_interface.get_index_object(object_key,datasource_name)
|
||||
|
||||
def save_index_object(self, index_object: IndexObject) -> None:
|
||||
"""Save index object"""
|
||||
self.database_interface.save_index_object(index_object)
|
||||
|
||||
def update_processing_status(self, object_key: str, datasource_name: str, status: IndexObjectStatus, message: Optional[str] = None) -> None:
|
||||
"""Update processing status"""
|
||||
|
||||
# Convert business layer status to database status
|
||||
self.database_interface.update_processing_status(object_key,datasource_name, status, message)
|
||||
|
||||
|
||||
# Concrete implementation class
|
||||
class DocumentIntelligenceExtractionService:
|
||||
"""Document extraction service based on Document Intelligence"""
|
||||
|
||||
def __init__(self, form_recognizer_client: DocumentIntelligenceClient, vllm_endpoint, vllm_key, tmp_directory, data_directory=None,di_sas_url=None, figure_sas_url=None):
|
||||
self.form_recognizer_client: DocumentIntelligenceClient = form_recognizer_client
|
||||
self.vllm_endpoint: str = vllm_endpoint
|
||||
self.vllm_key: str = vllm_key
|
||||
self.tmp_directory: str = tmp_directory
|
||||
self.data_directory: str = data_directory or ""
|
||||
self.di_sas_url: str = di_sas_url
|
||||
self.figure_sas_url: str = figure_sas_url
|
||||
|
||||
def extract_document(self, context: ProcessingContext,language:str) -> Document:
|
||||
"""Extract document content using Document Intelligence"""
|
||||
|
||||
# Get data_dir config, use instance variable if not present
|
||||
data_dir = context.data_config.get("data_dir", self.data_directory)
|
||||
|
||||
# Download document file - use correct parameter order
|
||||
local_file_paths = downloadToLocalFolder(blob_url=context.data_config["data_path"], data_dir=data_dir, local_folder=self.tmp_directory, name_starts_with=context.object_key)
|
||||
|
||||
if not local_file_paths or len(local_file_paths) == 0:
|
||||
raise ValueError(f"File {context.object_key} not found in blob storage")
|
||||
|
||||
di_blob_file_name = context.object_key + str(context.metadata["doc_modified_time"]) + ".json"
|
||||
di_result: Optional[DiResult] = None
# Try to download a cached DI result from blob storage; if it exists, skip di_extract
|
||||
if self.di_sas_url and blob_exists(self.di_sas_url, di_blob_file_name):
|
||||
content:str = load_content(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name)
|
||||
if content:
|
||||
di_result = DiResult.from_json(content) # type: ignore
|
||||
if not di_result:
|
||||
di_result = di_extract(source_file_path=local_file_paths.pop(), di_client=self.form_recognizer_client, directory_path=self.tmp_directory, figure_sas_url=self.figure_sas_url, language=language)
|
||||
try:
|
||||
process_document_figures(di_result=di_result,config=context.config)
|
||||
except Exception as e:
|
||||
print(f"Error processing document figures: {e}")
|
||||
finally:
|
||||
# The result after understanding is written directly to the blob to prevent subsequent reprocessing
|
||||
blob_upload_object(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name, obj=di_result)
|
||||
|
||||
under_image_content = replace_urls_in_content(content=di_result.di_content, replacements=di_result.figures)
|
||||
# Save extracted content to local file (same as original logic)
|
||||
write_content(content=under_image_content, directory_path=self.tmp_directory, file_name=context.object_key)
|
||||
|
||||
blob_upload_content(blob_sas_url=self.di_sas_url, file_name=di_blob_file_name+".md", content=under_image_content)
|
||||
|
||||
return Document(content=under_image_content, filepath=context.object_key)
|
||||
|
||||
|
||||
class DefaultDocumentChunkingService:
|
||||
"""Default document chunking service"""
|
||||
|
||||
def __init__(self, tmp_directory: str = None):
|
||||
self.tmp_directory = tmp_directory
|
||||
|
||||
def chunk_document(self, document: Document, context: ProcessingContext) -> ChunkingResult:
|
||||
"""Chunk document"""
|
||||
|
||||
# Call the original chunking method
|
||||
chunking_result = chunk_di_doc(document, data_config=context.data_config, tmp_path=context.current_tmp_directory)
|
||||
|
||||
# If tmp_directory is configured, save chunk result to local file
|
||||
if self.tmp_directory:
|
||||
write_document( chunking_result.chunks, file_path=context.object_key, directory_path=self.tmp_directory, rel_file_path=context.object_key )
|
||||
|
||||
return chunking_result
|
||||
|
||||
|
||||
class AzureSearchIndexingService:
|
||||
"""Azure Search-based indexing service"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def index_document(self, document: Document, context: ProcessingContext) -> bool:
|
||||
"""Index document"""
|
||||
|
||||
# Get document index schema
|
||||
document_schemas = [schema for schema in context.data_config["index_schemas"]
|
||||
if set(schema["data_type"]) == {"metadata", "document"}]
|
||||
|
||||
doc_dict = asdict_with_dynamic(document)
|
||||
doc_dict.update(context.metadata)
|
||||
|
||||
for schema in document_schemas:
|
||||
if not upload_merge_index(index_config=schema, docs=[doc_dict], merge_fields=context.data_config["merge_fields"], current_tmp_directory=context.current_tmp_directory):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def index_chunks(self, chunks: List[Document], context: ProcessingContext) -> bool:
|
||||
"""Index document chunks"""
|
||||
|
||||
# Get chunk index schema
|
||||
chunk_schemas = [schema for schema in context.data_config["index_schemas"]
|
||||
if set(schema["data_type"]) == {"metadata", "document", "chunk"}]
|
||||
|
||||
chunk_dict = [asdict_with_dynamic(chunk) for chunk in chunks]
|
||||
|
||||
for schema in chunk_schemas:
|
||||
# First delete old chunk data
|
||||
delete_documents_by_field(schema["index_name"], schema["update_by_field"], context.object_key)
|
||||
|
||||
# Add metadata to each chunk
|
||||
for chunk in chunk_dict:
|
||||
chunk.update(context.metadata)
|
||||
|
||||
# Upload new chunk data
|
||||
if not upload_merge_index(
|
||||
index_config=schema,
|
||||
docs=chunk_dict,
|
||||
merge_fields=context.data_config["merge_fields"],
|
||||
current_tmp_directory=context.current_tmp_directory
|
||||
):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def get_existing_document_data(self, object_key: str, index_name: str, field_name: str) -> Optional[dict[str,Any]]:
|
||||
"""Get existing document data from Azure Search Index"""
|
||||
|
||||
results = query_by_field(
|
||||
index_name=index_name,
|
||||
field_name=field_name,
|
||||
value=object_key
|
||||
)
|
||||
|
||||
return results[0] if results else None
|
||||
|
||||
def get_existing_chunk_data(self, object_key: str, index_name: str, field_name: str) -> List[dict[str,Any]]:
|
||||
"""Get existing chunk data from Azure Search Index"""
|
||||
|
||||
results = query_by_field( index_name=index_name, field_name=field_name, value=object_key )
|
||||
|
||||
return results if results else []
|
||||
|
||||
def index_metadata(self, metadata: dict[str,Any], schema: Any, context: ProcessingContext) -> bool:
|
||||
"""Index metadata"""
|
||||
|
||||
return upload_merge_index(index_config=schema, docs=[metadata], merge_fields=context.data_config["merge_fields"], current_tmp_directory=context.current_tmp_directory )
|
||||
|
||||
def index_document_with_schema(self, doc_dict: Dict[str,Any], schema: Any, context: ProcessingContext) -> bool:
|
||||
"""Index document using specified schema"""
|
||||
return upload_merge_index(
|
||||
index_config=schema,
|
||||
docs=[doc_dict],
|
||||
merge_fields=context.data_config["merge_fields"],
|
||||
current_tmp_directory=context.current_tmp_directory
|
||||
)
|
||||
|
||||
def index_chunks_with_schema(self, chunk_dict: List[Dict[str,Any]], schema: Any, context: ProcessingContext) -> bool:
|
||||
"""Index chunks using specified schema"""
|
||||
|
||||
return upload_merge_index(
|
||||
index_config=schema,
|
||||
docs=chunk_dict,
|
||||
merge_fields=context.data_config["merge_fields"],
|
||||
current_tmp_directory=context.current_tmp_directory
|
||||
)
|
||||
|
||||
def delete_chunks_by_field(self, index_name: str, field_name: str, field_value: str) -> bool:
|
||||
"""Delete chunks by field"""
|
||||
|
||||
try:
|
||||
delete_documents_by_field(index_name, field_name, field_value)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
class BlobMetadataService:
|
||||
"""Metadata service based on Blob storage"""
|
||||
|
||||
def __init__(self, datasource: Dict[str, Any]):
|
||||
self.datasource = datasource
|
||||
|
||||
def get_metadata(self, object_key: str) -> Dict[str, Any]:
|
||||
"""Get metadata"""
|
||||
if "metadata" not in self.datasource:
|
||||
return {}
|
||||
|
||||
return self.datasource["metadata"].get(object_key, {})
|
||||
|
||||
|
||||
# Update the factory class with specific implementations
|
||||
class DocumentProcessingFactory:
|
||||
"""Document processing factory class"""
|
||||
|
||||
def __init__(self, service_factory: ServiceFactory, tmp_directory:str, datasource: Optional[Dict[str, Any]] = None, config:ApplicationConfig = None):
|
||||
"""
|
||||
Initialize factory
|
||||
Args:
|
||||
service_factory: Service factory (used to get database engine)
|
||||
datasource: Data source configuration
|
||||
"""
|
||||
self.service_factory: ServiceFactory = service_factory
|
||||
self.datasource = datasource or {}
|
||||
self.shared_tmp_directory = tmp_directory
|
||||
self.config:ApplicationConfig = config
|
||||
|
||||
def create_orchestrator(self) -> DocumentProcessingOrchestrator:
|
||||
"""Create document processing orchestrator"""
|
||||
extraction_service = self._create_extraction_service()
|
||||
chunking_service = self._create_chunking_service()
|
||||
indexing_service = self._create_indexing_service()
|
||||
metadata_service = self._create_metadata_service()
|
||||
repository = self._create_repository()
|
||||
|
||||
return DocumentProcessingOrchestrator(
|
||||
extraction_service=extraction_service,
|
||||
chunking_service=chunking_service,
|
||||
indexing_service=indexing_service,
|
||||
metadata_service=metadata_service,
|
||||
repository=repository
|
||||
)
|
||||
|
||||
def _create_extraction_service(self) -> 'DocumentIntelligenceExtractionService':
|
||||
"""Create document extraction service"""
|
||||
|
||||
|
||||
# Use the factory shared temporary directory (same as original app.py logic)
|
||||
tmp_directory = self.shared_tmp_directory
|
||||
|
||||
# Get configuration from environment variables (same as original worker.py logic)
|
||||
vllm_endpoint = os.environ.get("captioning_model_endpoint", "")
|
||||
vllm_key = os.environ.get("captioning_model_key", "")
|
||||
|
||||
form_recognizer_client = SingletonFormRecognizerClient()
|
||||
return DocumentIntelligenceExtractionService(
|
||||
form_recognizer_client=form_recognizer_client,
|
||||
vllm_endpoint=vllm_endpoint,
|
||||
vllm_key=vllm_key,
|
||||
tmp_directory=tmp_directory,
|
||||
data_directory="", # Will be dynamically fetched from data_config
|
||||
di_sas_url=self.config.azure_services.di_blob_account_url,
|
||||
figure_sas_url=self.config.azure_services.figure_blob_account_url
|
||||
)
|
||||
|
||||
def _create_chunking_service(self) -> 'DefaultDocumentChunkingService':
|
||||
"""Create document chunking service"""
|
||||
|
||||
# Use the factory shared temporary directory
|
||||
tmp_directory = self.shared_tmp_directory
|
||||
|
||||
return DefaultDocumentChunkingService(tmp_directory=tmp_directory)
|
||||
|
||||
def _create_indexing_service(self) -> 'AzureSearchIndexingService':
|
||||
"""Create indexing service"""
|
||||
return AzureSearchIndexingService()
|
||||
|
||||
def _create_metadata_service(self) -> 'BlobMetadataService':
|
||||
"""Create metadata service"""
|
||||
return BlobMetadataService(self.datasource)
|
||||
|
||||
def _create_repository(self) -> DocumentRepository:
|
||||
"""Create document repository"""
|
||||
database_interface = LegacyDatabaseAdapter(self.service_factory.get_database_engine())
|
||||
return SqlAlchemyDocumentRepository(database_interface)
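
# Hedged end-to-end sketch: build the factory, create the orchestrator, and process one
# document. The ServiceFactory/ApplicationConfig instances, the data_config/metadata
# dictionaries and every literal value are assumptions - the real wiring lives in the
# application entry point.
def _orchestrator_example(service_factory: ServiceFactory, config: ApplicationConfig,
                          data_config: Dict[str, Any], metadata: Dict[str, Any]) -> None:
    factory = DocumentProcessingFactory(
        service_factory=service_factory,
        tmp_directory="/tmp/run",
        datasource={"metadata": {}},
        config=config,
    )
    orchestrator = factory.create_orchestrator()
    context = ProcessingContext(
        object_key="docs/sample.pdf",
        data_config=data_config,
        metadata=metadata,
        current_tmp_directory="/tmp/run",
        datasource_name="demo-vw-03",
        config=config,
    )
    result = orchestrator.process_document(context)
    print(result.status, result.message)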
|
||||
|
||||
|
||||
177
vw-document-ai-indexer/chunk_service.py
Normal file
@@ -0,0 +1,177 @@
|
||||
import json
|
||||
import os
|
||||
from os import makedirs
|
||||
import re
|
||||
import time
|
||||
from typing import Any, List
|
||||
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter, RecursiveCharacterTextSplitter
|
||||
|
||||
from entity_models import Document, ChunkingResult
|
||||
from hierarchy_fix import HierarchyFixer
|
||||
from third_level_service import get_recommended_hash_count_simple
|
||||
|
||||
from utils import TOKEN_ESTIMATOR, custom_serializer
|
||||
|
||||
# Compile once for efficiency
|
||||
_specific_comments = re.compile(
|
||||
r"""<!--\s* # opening
|
||||
(?:PageFooter="[^"]*" # PageFooter="…"
|
||||
|PageNumber="[^"]*" # PageNumber="…"
|
||||
|PageBreak # PageBreak
|
||||
|PageHeader="[^"]*") # PageHeader="…"
|
||||
\s*--> # closing
|
||||
""",
|
||||
flags=re.VERBOSE
|
||||
)
|
||||
|
||||
|
||||
def remove_specific_comments(text: str) -> str:
|
||||
return _specific_comments.sub('', text)
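
# Illustrative sketch: the regex strips Document Intelligence page markers while leaving
# ordinary markdown untouched. The sample string is made up.
def _remove_specific_comments_example() -> None:
    text = '# Title\n<!-- PageHeader="Spec" --><!-- PageNumber="1" -->\nBody<!-- PageBreak -->'
    print(remove_specific_comments(text))
    # prints "# Title", then an empty line, then "Body"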
|
||||
|
||||
|
||||
def infer_level_from_number():
|
||||
pass
|
||||
|
||||
def chunk_docs_by_section(extracted_doc: Document, num_tokens:int, token_overlap:int,tmp_path:str) -> List[Document]:
|
||||
headers_to_split_on = [
|
||||
("#", "h1"),
|
||||
("##", "h2"),
|
||||
("###", "h3"),
|
||||
("####", "h4"),
|
||||
("#####", "h5"),
|
||||
("######", "h6")
|
||||
]
|
||||
filepath:str = extracted_doc.filepath if extracted_doc.filepath else ""
|
||||
extracted_content:str = extracted_doc.content or ""
|
||||
merged_content:str = extracted_content
|
||||
if os.getenv("header_fix","false").lower() == "true":
|
||||
#merge content of all extracted_docs into one string
|
||||
fixer = HierarchyFixer()
|
||||
fix_result:dict[str,Any] = fixer.fix_hierarchy(content=extracted_content)
|
||||
# If a fix exists, the fix report is saved by file
|
||||
merged_content = fix_result["fixed_content"]
|
||||
|
||||
makedirs(tmp_path + f"/.extracted/{filepath}", exist_ok=True)
|
||||
if tmp_path and fix_result["fixes_applied"] > 0:
|
||||
with open(tmp_path + f"/.extracted/{filepath}/hierarchy_fix_log.json", "a", encoding="utf-8") as log_file:
|
||||
json.dump(fix_result, log_file, default=custom_serializer, ensure_ascii=False)
|
||||
|
||||
# Dynamically get the number of # for level 3 headers
|
||||
third_level_counts:int = get_recommended_hash_count_simple(merged_content)['recommendation']
|
||||
headers_to_split_on = [( "#" * i, f"h{i}") for i in range(1, third_level_counts + 1)]
|
||||
|
||||
with open(tmp_path + f"/.extracted/{filepath}/get_recommended_hash_count.txt", "a", encoding="utf-8") as md_file:
|
||||
md_file.write(str(headers_to_split_on))
|
||||
|
||||
with open(tmp_path + f"/.extracted/{filepath}/new_merged_hierarchy.md", "a", encoding="utf-8") as md_file:
|
||||
md_file.write(merged_content)
|
||||
|
||||
# MD splits
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False )
|
||||
md_header_splits = markdown_splitter.split_text(merged_content)
|
||||
|
||||
chunk_size = num_tokens
|
||||
chunk_overlap = token_overlap
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=chunk_size, chunk_overlap=chunk_overlap
|
||||
)
|
||||
|
||||
splits = text_splitter.split_documents(md_header_splits)
|
||||
|
||||
pre_document = extracted_doc
|
||||
|
||||
chunked_docs: List[Document] = []
|
||||
for i, split in enumerate(splits):
|
||||
|
||||
if TOKEN_ESTIMATOR.estimate_tokens(split.page_content) < num_tokens * 1.5:
|
||||
chunked_doc = Document(
|
||||
document_schema=pre_document.document_schema,
|
||||
main_title=pre_document.main_title,
|
||||
sub_title=pre_document.sub_title,
|
||||
publisher=pre_document.publisher,
|
||||
document_code=pre_document.document_code,
|
||||
document_category=pre_document.document_category,
|
||||
main_title_sec_language=pre_document.main_title_sec_language,
|
||||
sub_title_sec_language=pre_document.sub_title_sec_language,
|
||||
primary_language=pre_document.primary_language,
|
||||
secondary_language=pre_document.secondary_language,
|
||||
title=pre_document.title,
|
||||
doc_metadata=pre_document.doc_metadata,
|
||||
filepath=pre_document.filepath,
|
||||
|
||||
)
|
||||
chunked_doc.copy_dynamic_attrs(pre_document)
|
||||
chunked_doc.content = split.page_content
|
||||
chunked_doc.h1 = split.metadata.get("h1", "")
|
||||
chunked_doc.h2 = split.metadata.get("h2", "")
|
||||
chunked_doc.h3 = split.metadata.get("h3", "")
|
||||
chunked_doc.h4 = split.metadata.get("h4", "")
|
||||
chunked_doc.h5 = split.metadata.get("h5", "")
|
||||
chunked_doc.h6 = split.metadata.get("h6", "")
|
||||
chunked_doc.h7 = split.metadata.get("h7", "")
|
||||
|
||||
# chunked_doc.h4 =split.metadata.get("h4", "")
|
||||
chunked_doc.full_headers = "||".join(h for h in [chunked_doc.h6, chunked_doc.h5, chunked_doc.h4, chunked_doc.h3, chunked_doc.h2, chunked_doc.h1] if h)
|
||||
|
||||
chunked_doc.id = chunked_doc.filepath + f"_{i}"
|
||||
|
||||
chunked_docs.append(chunked_doc)
|
||||
|
||||
else:
|
||||
splitter = MarkdownTextSplitter.from_tiktoken_encoder(
|
||||
chunk_size=num_tokens, chunk_overlap=token_overlap)
|
||||
chunked_content_list = splitter.split_text(
|
||||
split.page_content)
|
||||
# chunk the original content
|
||||
for j, chunked_content in enumerate(chunked_content_list):
|
||||
chunked_doc = Document(
|
||||
document_schema=pre_document.document_schema,
|
||||
main_title=pre_document.main_title,
|
||||
sub_title=pre_document.sub_title,
|
||||
publisher=pre_document.publisher,
|
||||
document_code=pre_document.document_code,
|
||||
document_category=pre_document.document_category,
|
||||
main_title_sec_language=pre_document.main_title_sec_language,
|
||||
sub_title_sec_language=pre_document.sub_title_sec_language,
|
||||
primary_language=pre_document.primary_language,
|
||||
secondary_language=pre_document.secondary_language,
|
||||
title=pre_document.title,
|
||||
doc_metadata=pre_document.doc_metadata,
|
||||
filepath=pre_document.filepath
|
||||
)
|
||||
chunked_doc.copy_dynamic_attrs(pre_document)
|
||||
chunked_doc.content = chunked_content
|
||||
chunked_doc.h1 = split.metadata.get("h1", "")
|
||||
chunked_doc.h2 = split.metadata.get("h2", "")
|
||||
chunked_doc.h3 = split.metadata.get("h3", "")
|
||||
chunked_doc.h4 = split.metadata.get("h4", "")
|
||||
chunked_doc.h5 = split.metadata.get("h5", "")
|
||||
chunked_doc.h6 = split.metadata.get("h6", "")
|
||||
chunked_doc.h7 = split.metadata.get("h7", "")
|
||||
|
||||
chunked_doc.full_headers = "||".join(h for h in [chunked_doc.h6, chunked_doc.h5, chunked_doc.h4, chunked_doc.h3, chunked_doc.h2, chunked_doc.h1] if h)
|
||||
|
||||
chunked_doc.id = chunked_doc.filepath + f"_{i}_{j}"
|
||||
|
||||
chunked_docs.append(chunked_doc)
|
||||
|
||||
return chunked_docs
|
||||
|
||||
|
||||
def chunk_di_doc(extracted_doc: Document, data_config: dict[str, Any], tmp_path: str) -> ChunkingResult:
|
||||
"""
|
||||
Chunk the document.
|
||||
Args:
|
||||
extracted_doc: The document object to be processed.
|
||||
data_config: Processing configuration.
|
||||
Returns:
|
||||
ChunkingResult: The result containing the list of chunks and total files.
|
||||
"""
|
||||
num_tokens: int = data_config.get("chunk_size", 1024)
token_overlap: int = data_config.get("token_overlap", 128)
|
||||
|
||||
print({"index_name":extracted_doc.filepath , "num_tokens": num_tokens, "token_overlap": token_overlap})
|
||||
extracted_doc.content = remove_specific_comments(text=extracted_doc.content or "")
|
||||
chunked_docs: List[Document] = chunk_docs_by_section(extracted_doc= extracted_doc,num_tokens=num_tokens, token_overlap=token_overlap,tmp_path=tmp_path)
|
||||
time.sleep(0.1)
|
||||
return ChunkingResult(chunks=chunked_docs, total_files=1)
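
# Hedged usage sketch: chunk an already-extracted document with an in-memory config. The
# chunk_size/token_overlap values mirror config.yaml; the Document content and filepath
# are placeholders.
def _chunk_di_doc_example() -> None:
    doc = Document(content="# Title\n\nSome extracted text...", filepath="sample.pdf")
    result = chunk_di_doc(doc, data_config={"chunk_size": 2048, "token_overlap": 128}, tmp_path="/tmp/run")
    print(f"Produced {len(result.chunks)} chunks from {result.total_files} file")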
|
||||
27
vw-document-ai-indexer/config.yaml
Normal file
@@ -0,0 +1,27 @@
|
||||
- data_path: "blob sas url"
|
||||
datasource_name: "demo-vw-03"
|
||||
data_dir: ""
|
||||
base_path: "D:\\tmp\\"
|
||||
process_file_num: 0
|
||||
process_file_last_modify: "2025-06-24 00:00:00"
|
||||
chunk_size: 2048
|
||||
token_overlap: 128
|
||||
index_schemas:
|
||||
- index_name: "index-dev-figure-01-chunk"
|
||||
data_type: ["metadata", "document", "chunk"]
|
||||
field_type: "append"
|
||||
upload_batch_size: 50
|
||||
fields: ["filepath", "title"]
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
vector_fields:
|
||||
- field: "contentVector"
|
||||
append_fields: ["content"]
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["full_headers", "doc_metadata"]
|
||||
merge_fields:
|
||||
- key: "doc_metadata"
|
||||
fields: ["title"]
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
|
||||
109
vw-document-ai-indexer/config.yaml.example
Normal file
@@ -0,0 +1,109 @@
|
||||
# Main data configuration (array format)
|
||||
- data_path: "https://your-blob-storage.blob.core.windows.net/container?sas-token"
|
||||
datasource_name: "CATOnline-cn" # data source name
|
||||
data_dir: "" # Optional local data directory
|
||||
base_path: "/app/run_tmp" # Temporary processing directory
|
||||
|
||||
# File processing limits
|
||||
process_file_num: 0 # 0 = process all files
|
||||
process_file_last_modify: "2025-06-24 00:00:00" # Only process files modified after this date
|
||||
|
||||
# Chunking configuration
|
||||
chunk_size: 2048 # Maximum tokens per chunk
|
||||
token_overlap: 128 # Overlap between chunks
|
||||
|
||||
# Index schemas configuration
|
||||
index_schemas:
|
||||
# Chunk-level index for search
|
||||
- index_name: "your-knowledge-chunk-index"
|
||||
data_type: ["metadata", "document", "chunk"]
|
||||
field_type: "append" # How to handle existing data
|
||||
upload_batch_size: 50 # Documents per batch upload
|
||||
|
||||
# Metadata fields to include
|
||||
fields: [
|
||||
"filepath", "timestamp", "title", "publisher", "publish_date",
|
||||
"document_category", "document_code", "language_code",
|
||||
"x_Standard_Regulation_Id", "x_Attachment_Type",
|
||||
"x_Standard_Title_CN", "x_Standard_Title_EN",
|
||||
"x_Standard_Published_State", "x_Standard_Drafting_Status",
|
||||
"x_Standard_Range", "x_Standard_Kind", "x_Standard_No",
|
||||
"x_Standard_Code", "x_Standard_Technical_Committee",
|
||||
"x_Standard_Vehicle_Type", "x_Standard_Power_Type",
|
||||
"x_Standard_CCS", "x_Standard_ICS",
|
||||
"x_Standard_Published_Date", "x_Standard_Effective_Date",
|
||||
"x_Regulation_Status", "x_Regulation_Title_CN",
|
||||
"x_Regulation_Title_EN", "x_Regulation_Document_No",
|
||||
"x_Regulation_Issued_Date", "x_Classification",
|
||||
"x_Work_Group", "x_Reference_Standard",
|
||||
"x_Replaced_by", "x_Refer_To", "func_uuid",
|
||||
"update_time", "status"
|
||||
]
|
||||
|
||||
# Vector configuration
|
||||
vector_fields:
|
||||
- field: "contentVector"
|
||||
append_fields: ["content"] # Fields to vectorize for content
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["full_headers", "doc_metadata"] # Metadata vectorization
|
||||
|
||||
# Azure AI Search configuration
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath" # Field to use for updates
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
|
||||
|
||||
# Document-level index
|
||||
- index_name: "your-knowledge-document-index"
|
||||
data_type: ["document", "metadata"]
|
||||
field_type: "full" # Replace entire documents
|
||||
key_fields: ["filepath"] # Primary key fields
|
||||
upload_batch_size: 1
|
||||
|
||||
fields: [
|
||||
# Same field list as chunk index
|
||||
"filepath", "timestamp", "title", "publisher"
|
||||
# ... (same as above)
|
||||
]
|
||||
|
||||
merge_content_fields: ["content"] # Fields to merge from chunks
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
|
||||
# Regulation-specific index
|
||||
- index_name: "your-regulation-index"
|
||||
data_type: ["metadata"]
|
||||
field_type: "full"
|
||||
key_fields: ["x_Standard_Regulation_Id"] # Regulation ID as key
|
||||
upload_batch_size: 50
|
||||
|
||||
fields: [
|
||||
# Regulation-specific fields
|
||||
"x_Standard_Regulation_Id", "x_Standard_Title_CN",
|
||||
"x_Standard_Title_EN", "x_Regulation_Status"
|
||||
# ... (regulation metadata fields)
|
||||
]
|
||||
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
|
||||
update_by_field: "x_Standard_Regulation_Id"
|
||||
|
||||
# Field merging configuration
|
||||
merge_fields:
|
||||
- key: "doc_metadata" # Combined metadata field
|
||||
fields: [
|
||||
"title", "publisher", "document_category", "document_code",
|
||||
"x_Standard_Title_CN", "x_Standard_Title_EN",
|
||||
"x_Standard_Published_State", "x_Standard_Drafting_Status"
|
||||
# ... (all metadata fields to combine)
|
||||
]
|
||||
|
||||
# Vector field configuration
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
|
||||
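The merge_fields block above folds several metadata columns into a single doc_metadata value, which is then vectorized via full_metadata_vector_fields. A minimal sketch of that merge step (illustrative only; the real indexer may format the combined field differently):

from typing import Any, Dict, List

def build_merged_field(record: Dict[str, Any], fields: List[str]) -> str:
    """Concatenate the configured metadata fields into one searchable string."""
    parts = []
    for name in fields:
        value = record.get(name)
        if value not in (None, ""):
            parts.append(f"{name}: {value}")
    return "\n".join(parts)

# record["doc_metadata"] = build_merged_field(record, schema_cfg["merge_fields"][0]["fields"])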
189
vw-document-ai-indexer/database.py
Normal file
189
vw-document-ai-indexer/database.py
Normal file
@@ -0,0 +1,189 @@
|
||||
from enum import Enum
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, Dict, Any
|
||||
import datetime
|
||||
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
|
||||
from sqlalchemy.orm import Mapped, declarative_base, mapped_column
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
class IndexJobStatus(Enum):
|
||||
"""Enumeration for index job status"""
|
||||
PENDING = 'pending' # todo
|
||||
PROCESSING = 'processing'
|
||||
SUCCESS = 'success'
|
||||
PARTIAL_SUCCESS = 'partial_success'
|
||||
FAILED = 'failed'
|
||||
|
||||
class IndexObjectStatus(Enum):
|
||||
"""Enumeration for index object status"""
|
||||
SUCCESS = 'success'
|
||||
PROCESSING = 'processing'
|
||||
FAILED = 'failed'
|
||||
|
||||
class IndexJob(Base): # type: ignore
|
||||
"""Index job model, represents a single index run"""
|
||||
__tablename__ = 'index_run'
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
start_time = Column(DateTime, nullable=True)
|
||||
finished_time = Column(DateTime)
|
||||
status: Mapped[str] = mapped_column(String(20), default=IndexJobStatus.PENDING.value)
|
||||
detailed_message = Column(Text,nullable=True)
|
||||
doc_lower_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
doc_upper_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
metadata_lower_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
metadata_upper_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
total_process_count = Column(Integer)
|
||||
success_object_count = Column(Integer, default=0)
|
||||
failed_object_count = Column(Integer, default=0)
|
||||
datasource_name: Mapped[str] = mapped_column(String(255), nullable=False)
|
||||
|
||||
class IndexObject(Base):
|
||||
"""Index object model, represents a document or metadata file to be processed"""
|
||||
__tablename__ = 'index_object'
|
||||
object_key: Mapped[str] = mapped_column(String(255), primary_key=True)
|
||||
type = Column(String(20), nullable=False)
|
||||
doc_modifed_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
metadata_modifed_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
status: Mapped[str] = mapped_column(String(20), default=IndexObjectStatus.PROCESSING.value)
|
||||
try_count: Mapped[int] = mapped_column(Integer, default=0)
|
||||
last_run_id = Column(Integer)
|
||||
last_start_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
last_finished_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
detailed_message: Mapped[str] = mapped_column(Text,nullable=True)
|
||||
last_fail_doc_modifed_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
last_fail_metadata_modifed_time:Mapped[Optional[datetime.datetime]] = mapped_column(DateTime)
|
||||
datasource_name: Mapped[str] = mapped_column(String(255), primary_key=True)
|
||||
|
||||
def init_database(database_uri: str = '') -> Any:
|
||||
engine = create_engine(database_uri)
|
||||
Base.metadata.create_all(engine)
|
||||
return engine
|
||||
|
||||
|
||||
|
||||
class DatabaseInterface(ABC):
|
||||
"""Database interface for the refactored system"""
|
||||
|
||||
@abstractmethod
|
||||
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
|
||||
"""Get index object by key"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def save_index_object(self, index_object: IndexObject) -> None:
|
||||
"""Save index object"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus,
|
||||
message: str = None) -> None:
|
||||
"""Update processing status"""
|
||||
pass
|
||||
|
||||
|
||||
class InMemoryDatabase(DatabaseInterface):
|
||||
"""In-memory database implementation for testing"""
|
||||
|
||||
def __init__(self):
|
||||
self._objects: Dict[str, IndexObject] = {}
|
||||
|
||||
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
|
||||
"""Get index object by key"""
|
||||
return self._objects.get(object_key)
|
||||
|
||||
def save_index_object(self, index_object: IndexObject) -> None:
|
||||
"""Save index object"""
|
||||
index_object.updated_at = datetime.datetime.now()
|
||||
if index_object.created_at is None:
|
||||
index_object.created_at = datetime.datetime.now()
|
||||
self._objects[index_object.object_key] = index_object
|
||||
|
||||
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus,
|
||||
message: str = None) -> None:
|
||||
"""Update processing status"""
|
||||
if object_key in self._objects:
|
||||
self._objects[object_key].status = status.value
|
||||
self._objects[object_key].detailed_message = message
|
||||
self._objects[object_key].updated_at = datetime.datetime.now()
|
||||
else:
|
||||
# Create new object if it doesn't exist
|
||||
obj = IndexObject(
object_key=object_key,
datasource_name=datasource_name,
status=status.value,
detailed_message=message,
)
|
||||
self._objects[object_key] = obj
|
||||
|
||||
|
||||
class LegacyDatabaseAdapter(DatabaseInterface):
|
||||
"""Adapter to bridge the old database module with the new interface"""
|
||||
|
||||
def __init__(self, database_engine):
|
||||
self.database_engine = database_engine
|
||||
self._session_factory = None
|
||||
|
||||
def _get_session_factory(self):
|
||||
"""Get session factory (lazy initialization)"""
|
||||
if self._session_factory is None:
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
self._session_factory = sessionmaker(bind=self.database_engine)
|
||||
return self._session_factory
|
||||
|
||||
def get_index_object(self, object_key: str,datasource_name:str) -> Optional[IndexObject]:
|
||||
"""Get index object by key"""
|
||||
|
||||
session_factory = self._get_session_factory()
|
||||
with session_factory() as session:
|
||||
return session.query(IndexObject).get({"object_key":object_key,"datasource_name":datasource_name})
|
||||
|
||||
|
||||
def save_index_object(self, index_object: IndexObject) -> None:
|
||||
"""Save index object"""
|
||||
object_key = index_object.object_key
|
||||
datasource_name = index_object.datasource_name
|
||||
|
||||
session_factory = self._get_session_factory()
|
||||
with session_factory() as session:
|
||||
old_obj = session.query(IndexObject).get({"object_key":object_key,"datasource_name":datasource_name})
|
||||
if old_obj:
|
||||
# Update existing
|
||||
old_obj.doc_modifed_time = index_object.doc_modifed_time
|
||||
old_obj.metadata_modifed_time = index_object.metadata_modifed_time
|
||||
old_obj.try_count = index_object.try_count
|
||||
old_obj.status = index_object.status
|
||||
old_obj.last_fail_doc_modifed_time = index_object.last_fail_doc_modifed_time
|
||||
old_obj.last_fail_metadata_modifed_time = index_object.last_fail_metadata_modifed_time
|
||||
old_obj.datasource_name = index_object.datasource_name
|
||||
# Note: legacy IndexObject might not have all fields
|
||||
else:
|
||||
# Create new
|
||||
old_obj = IndexObject(
|
||||
object_key=index_object.object_key,
|
||||
type=index_object.type,
|
||||
doc_modifed_time=index_object.doc_modifed_time,
|
||||
metadata_modifed_time=index_object.metadata_modifed_time,
|
||||
try_count=index_object.try_count,
|
||||
status=index_object.status,
|
||||
last_fail_doc_modifed_time=index_object.last_fail_doc_modifed_time,
|
||||
last_fail_metadata_modifed_time=index_object.last_fail_metadata_modifed_time,
|
||||
datasource_name=index_object.datasource_name
|
||||
)
|
||||
|
||||
session.add(old_obj)
|
||||
session.commit()
|
||||
|
||||
def update_processing_status(self, object_key: str,datasource_name:str, status: IndexObjectStatus, message: str = None) -> None:
|
||||
"""Update processing status"""
|
||||
|
||||
session_factory = self._get_session_factory()
|
||||
with session_factory() as session:
|
||||
old_obj = session.query(IndexObject).get({"object_key":object_key,"datasource_name":datasource_name})
|
||||
if old_obj:
|
||||
old_obj.status = status.value
|
||||
old_obj.detailed_message = message
|
||||
session.commit()
|
||||
|
||||
|
||||
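A minimal usage sketch for the models above, run against a local SQLite file instead of the production DB_URI (the SQLite path and the sample values are assumptions for illustration):

import datetime
from database import IndexObject, IndexObjectStatus, LegacyDatabaseAdapter, init_database

engine = init_database("sqlite:///indexer_state.db")  # creates the index_run / index_object tables
db = LegacyDatabaseAdapter(engine)

obj = IndexObject(
    object_key="docs/sample.pdf",
    datasource_name="demo-vw-03",
    type="document",
    doc_modifed_time=datetime.datetime.now(),
)
db.save_index_object(obj)

db.update_processing_status("docs/sample.pdf", "demo-vw-03", IndexObjectStatus.SUCCESS, message="indexed 12 chunks")
print(db.get_index_object("docs/sample.pdf", "demo-vw-03").status)  # -> "success"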
50
vw-document-ai-indexer/deploy/dev-abroad/config.yaml
Normal file
50
vw-document-ai-indexer/deploy/dev-abroad/config.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-abroad-prd?sp=rl&st=2025-08-02T08:25:56Z&se=2125-08-02T16:40:56Z&spr=https&sv=2024-11-04&sr=c&sig=lJui2%2BOs8V%2BdzCkjchQCR7ITWT28tJ0HAq8bIhkkM%2Bk%3D"
|
||||
datasource_name: "cat-standard-regulation-oversea"
|
||||
data_dir: ""
|
||||
base_path: "/app/run_tmp"
|
||||
process_file_num: 0
|
||||
process_file_last_modify: "2025-06-24 00:00:00"
|
||||
chunk_size: 2048
|
||||
token_overlap: 256
|
||||
index_schemas:
|
||||
- index_name: "index-catonline-chunk-oversea"
|
||||
data_type: ["metadata", "document", "chunk"]
|
||||
upload_batch_size: 50
|
||||
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
vector_fields:
|
||||
- field: "contentVector"
|
||||
append_fields: ["content"]
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["full_headers", "doc_metadata"]
|
||||
- index_name: "index-catonline-document-oversea"
|
||||
data_type: ["document", "metadata"]
|
||||
key_fields: ["filepath"]
|
||||
upload_batch_size: 1
|
||||
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
|
||||
merge_content_fields: ["content"]
|
||||
full_metadata_vector_fields: ["doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
- index_name: "index-catonline-standard-regulation-oversea"
|
||||
data_type: ["metadata"]
|
||||
key_fields: ["standard_Id"]
|
||||
upload_batch_size: 1
|
||||
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
full_metadata_vector_fields: ["doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
update_by_field: "standard_Id"
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
merge_fields:
|
||||
- key: "doc_metadata"
|
||||
fields: ["file_Name","entity_Attribute","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","version_Name","version_Parent_Name","technical_Series_No","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str " ]
|
||||
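The upload_batch_size and update_by_field settings above control how chunks are pushed to the search index in batches. A minimal sketch of that batched merge-or-upload step (assumes the azure-search-documents SDK; endpoint and key are placeholders):

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

def upload_in_batches(docs, index_name, batch_size=50):
    """Merge-or-upload documents in batches of upload_batch_size."""
    client = SearchClient(
        endpoint="https://<search-service>.search.azure.cn",
        index_name=index_name,
        credential=AzureKeyCredential("<search_admin_key>"),
    )
    for i in range(0, len(docs), batch_size):
        results = client.merge_or_upload_documents(documents=docs[i:i + batch_size])
        failed = [r.key for r in results if not r.succeeded]
        if failed:
            print("failed keys:", failed)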
50
vw-document-ai-indexer/deploy/dev-abroad/deploy.sh
Normal file
50
vw-document-ai-indexer/deploy/dev-abroad/deploy.sh
Normal file
@@ -0,0 +1,50 @@
|
||||
|
||||
# docker build
|
||||
docker login acrsales2caiprd.azurecr.cn -u username -p password
|
||||
docker build . -t document-ai-indexer:2.0.1
|
||||
docker tag document-ai-indexer:2.0.1 acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
|
||||
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
|
||||
|
||||
|
||||
# login AKS
|
||||
# az cloud set --name AzureCloud # Switch CLI to Azure cloud
|
||||
# az login # Log in to Azure China account (browser or device code flow)
|
||||
# az account set -s 079d8bd8-b4cc-4892-9307-aa6dedf890e9 #! set subs
|
||||
# az aks get-credentials -g rg-aiflow-lab -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
|
||||
kubectl config use-context aks-sales2c-ai-prd
|
||||
kubectl config current-context
|
||||
|
||||
# Create Azure Files Volume
|
||||
# kubectl create secret generic azure-files-cred \
|
||||
# --from-literal=azurestorageaccountname=saaisearchlab \
|
||||
# --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
|
||||
# -n knowledge-agent
|
||||
|
||||
# kubectl delete configmap document-ai-indexer-config -n knowledge-agent
|
||||
|
||||
# Deploy ConfigMap
|
||||
kubectl delete configmap document-ai-indexer-config -n knowledge-agent
|
||||
kubectl create configmap document-ai-indexer-config -n knowledge-agent --from-file=.\deploy\prd\env.yaml --from-file=.\deploy\prd\config.yaml
|
||||
|
||||
# Deploy Pod
|
||||
# kubectl create namespace knowledge-agent
|
||||
# kubectl delete pod document-ai-indexer -n knowledge-agent
|
||||
kubectl apply -f document-ai-indexer_k8s.yml -n knowledge-agent
|
||||
|
||||
# Monitor Pod
|
||||
kubectl logs -f document-ai-indexer -n knowledge-agent
|
||||
|
||||
# Deploy CronJob
|
||||
kubectl apply -f ./deploy/prd/document-ai-indexer-cronjob.yml --namespace knowledge-agent
|
||||
|
||||
# Check CronJob Status
|
||||
kubectl get cronjobs -n knowledge-agent --namespace knowledge-agent
|
||||
# Check Job Execution History
|
||||
kubectl get jobs -n knowledge-agent --namespace knowledge-agent
|
||||
|
||||
###########
|
||||
# Manually trigger a job (for testing)
|
||||
kubectl delete job manual-test -n knowledge-agent
|
||||
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
|
||||
# Check Job Logs
|
||||
kubectl logs -f job/manual-test -n knowledge-agent
|
||||
@@ -0,0 +1,64 @@
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: document-ai-indexer-cronjob
|
||||
spec:
|
||||
# Scheduling configuration - execute every 10 minutes
|
||||
schedule: "*/10 * * * *"
|
||||
|
||||
# Concurrency policy: Disable concurrent execution. If the previous job is still running, new execution will be skipped.
|
||||
concurrencyPolicy: Forbid
|
||||
|
||||
# Successful jobs history limit: Keep the last 10 successful job records.
|
||||
successfulJobsHistoryLimit: 10
|
||||
|
||||
# Failed jobs history limit: Keep the last 10 failed job records.
|
||||
failedJobsHistoryLimit: 10
|
||||
|
||||
# Job template
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: document-ai-indexer
|
||||
job-type: cronjob
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
volumes:
|
||||
# 1. ConfigMap volume
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: document-ai-indexer-config
|
||||
items:
|
||||
- key: env.yaml
|
||||
path: env.yaml
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
|
||||
# 2. Azure File Share volume
|
||||
- name: data-volume
|
||||
azureFile:
|
||||
secretName: azure-files-cred # Reference the Secret you created
|
||||
shareName: fs-document-ai-indexer # Your file share name
|
||||
readOnly: false # Write permission
|
||||
|
||||
containers:
|
||||
- name: document-ai-indexer
|
||||
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.2
|
||||
imagePullPolicy: Always
|
||||
# Mount the volume into the container
|
||||
volumeMounts:
|
||||
# ConfigMap Mount
|
||||
- name: config-volume
|
||||
mountPath: /app/env.yaml
|
||||
subPath: env.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/config.yaml
|
||||
subPath: config.yaml
|
||||
|
||||
# Azure File Shared mount
|
||||
- name: data-volume
|
||||
mountPath: /app/run_tmp # Program write/read directory
|
||||
@@ -0,0 +1,42 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: document-ai-indexer
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
volumes:
|
||||
# 1. ConfigMap volume
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: document-ai-indexer-config
|
||||
items:
|
||||
- key: env.yaml
|
||||
path: env.yaml
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
|
||||
# 2. Azure File Share volume
|
||||
- name: data-volume
|
||||
azureFile:
|
||||
secretName: azure-files-cred # Reference the Secret you created
|
||||
shareName: fs-document-ai-indexer # Your file share name
|
||||
readOnly: false
|
||||
|
||||
containers:
|
||||
- name: document-ai-indexer
|
||||
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.2
|
||||
imagePullPolicy: Always
|
||||
# Mount the volume into the container
|
||||
volumeMounts:
|
||||
# ConfigMap Mount
|
||||
- name: config-volume
|
||||
mountPath: /app/env.yaml
|
||||
subPath: env.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/config.yaml
|
||||
subPath: config.yaml
|
||||
|
||||
# Azure File Share Mount
|
||||
- name: data-volume
|
||||
mountPath: /app/run_tmp # Directory for program read/write
|
||||
@@ -0,0 +1,10 @@
|
||||
# login AKS
|
||||
# az cloud set -n AzureChinaCloud
|
||||
# az login
|
||||
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c
|
||||
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
|
||||
kubectl config use-context aks-sales2c-ai-prd
|
||||
kubectl config current-context
|
||||
|
||||
# kubectl create namespace knowledge-agent
|
||||
kubectl apply -f embedding-api-proxy_k8s.yml -n knowledge-agent
|
||||
@@ -0,0 +1,39 @@
|
||||
# Service resource: map the external domain to an in-cluster Service
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: itpai-backend
|
||||
spec:
|
||||
type: ExternalName
|
||||
externalName: itpai.infer.api.vgcserv.com.cn
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
targetPort: 443
|
||||
---
|
||||
# Ingress resource: proxy the /v1-openai path to the Service above
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: itpai-proxy
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
|
||||
nginx.ingress.kubernetes.io/proxy-ssl-server-name: "on"
|
||||
nginx.ingress.kubernetes.io/proxy-ssl-verify: "off"
|
||||
nginx.ingress.kubernetes.io/upstream-vhost: "itpai.infer.api.vgcserv.com.cn"
|
||||
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
|
||||
nginx.ingress.kubernetes.io/proxy-send-timeout: "30"
|
||||
nginx.ingress.kubernetes.io/proxy-connect-timeout: "5"
|
||||
spec:
|
||||
rules:
|
||||
- host: sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn
|
||||
http:
|
||||
paths:
|
||||
- path: /v1-openai
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: itpai-backend
|
||||
port:
|
||||
number: 443
|
||||
40
vw-document-ai-indexer/deploy/dev-abroad/env.yaml
Normal file
40
vw-document-ai-indexer/deploy/dev-abroad/env.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
config: config.yaml
|
||||
njobs: 12
|
||||
|
||||
search_service_name: https://search-sales2c-ai-prd.search.azure.cn
|
||||
search_admin_key: ev6B0OtF66WkDmQKJBa4n1Haa8e8p8N3zdaEBnbWtoAzSeAMWSid
|
||||
|
||||
|
||||
embedding_model_endpoint: http://sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn/v1-openai/embeddings
|
||||
embedding_model_key: gpustack_0e3d5b35adaf239b_99adacd6f540c7d81006365c8030b16c
|
||||
VECTOR_DIMENSION: 4096
|
||||
FLAG_AOAI: "V3"
|
||||
FLAG_EMBEDDING_MODEL: qwen3-embedding-8b
|
||||
|
||||
|
||||
extract_method: di+vision-llm
|
||||
form_rec_resource: https://di-sales2c-ai-prd.cognitiveservices.azure.cn/
|
||||
form_rec_key: G0vhH3twd5K3YYCgfnttf5V6XTMMU4PMdVvRHsgaTb8kZDoU8ZHjJQQJ99BDAEHpCsCfT1gyAAALACOGmOcn
|
||||
di-Formulas: true
|
||||
di-hiRes: true
|
||||
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
|
||||
|
||||
FIGURE_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/extracted-image-cat-prd?sp=racwdl&st=2025-08-04T06:34:42Z&se=2035-08-04T14:49:42Z&spr=https&sv=2024-11-04&sr=c&sig=t0DTjfht%2FNaPlXUtxhKr40NzZY5kWovgNxJUeAepvgA%3D
|
||||
|
||||
DI_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/di-result-cat-prd?sp=racwdl&st=2025-08-04T06:34:11Z&se=2035-08-04T14:49:11Z&spr=https&sv=2024-11-04&sr=c&sig=26wxy5M9lcIO2o9zzr6jOtdw2gQTZnGmampHx5EyXbo%3D
|
||||
|
||||
DB_URI: postgresql://pgadmin:vwb54pSQDp8vYkusKms@pg-sales2c-ai-prd.postgres.database.chinacloudapi.cn/document-ai-indexer
|
||||
|
||||
|
||||
# Image understanding
|
||||
figure_caption:
|
||||
include_di_content: false
|
||||
description_gen_max_images: 0
|
||||
model_endpoint: null
|
||||
model_key: null
|
||||
model: null # leave null for Azure
|
||||
azure_deployment: gpt-4o # Azure deployment name; leave empty for other platforms
|
||||
api_version: 2024-08-01-preview # Azure API version; leave empty for other platforms
|
||||
|
||||
|
||||
header_fix: true
|
||||
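The embedding settings above point at an OpenAI-compatible /v1-openai/embeddings endpoint serving qwen3-embedding-8b with 4096-dimensional vectors. A minimal sketch of calling it (assumes the proxy follows the OpenAI embeddings request/response shape, as the path suggests):

import requests

def embed(texts, endpoint, api_key, model="qwen3-embedding-8b"):
    """Return one embedding vector per input text from the proxied embedding service."""
    resp = requests.post(
        endpoint,
        headers={"Authorization": f"Bearer {api_key}"},
        json={"model": model, "input": texts},
        timeout=120,
    )
    resp.raise_for_status()
    return [item["embedding"] for item in resp.json()["data"]]

# vectors = embed(["chunk text"], endpoint=env["embedding_model_endpoint"], api_key=env["embedding_model_key"])
# each vector should have VECTOR_DIMENSION (4096) entries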
46
vw-document-ai-indexer/deploy/dev/deploy.sh
Normal file
46
vw-document-ai-indexer/deploy/dev/deploy.sh
Normal file
@@ -0,0 +1,46 @@
|
||||
|
||||
# login AKS
|
||||
# az cloud set --name AzureCloud # Switch CLI to Azure cloud
|
||||
# az login # Log in to Azure China account (browser or device code flow)
|
||||
# az account set -s 079d8bd8-b4cc-4892-9307-aa6dedf890e9 #! set subs
|
||||
# az aks get-credentials -g rg-aiflow-lab -n aks-aiflow-lab --overwrite-existing --file ~/.kube/config
|
||||
kubectl config use-context aks-aiflow-lab
|
||||
kubectl config current-context
|
||||
|
||||
# kubectl create secret generic azure-files-cred \
|
||||
# --from-literal=azurestorageaccountname=saaisearchlab \
|
||||
# --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
|
||||
# -n knowledge-agent
|
||||
|
||||
# kubectl delete configmap document-ai-indexer-config -n knowledge-agent
|
||||
|
||||
docker build . -t document-ai-indexer:2.0.2
|
||||
docker tag document-ai-indexer:2.0.2 acraiflowlab.azurecr.io/document-ai-indexer:2.0.2
|
||||
docker push acraiflowlab.azurecr.io/document-ai-indexer:2.0.2
|
||||
|
||||
|
||||
# dev
|
||||
kubectl delete configmap document-ai-indexer-config -n knowledge-agent
|
||||
kubectl create configmap document-ai-indexer-config -n knowledge-agent --from-file=env.yaml --from-file=config.yaml
|
||||
|
||||
# kubectl create namespace knowledge-agent
|
||||
|
||||
# # kubectl delete pod document-ai-indexer -n knowledge-agent
|
||||
# kubectl apply -f document-ai-indexer_k8s.yml -n knowledge-agent
|
||||
|
||||
# kubectl logs -f document-ai-indexer -n knowledge-agent
|
||||
|
||||
# Deploy CronJob
|
||||
kubectl apply -f deploy/dev/document-ai-indexer-cronjob.yml --namespace knowledge-agent
|
||||
|
||||
# Check CronJob Status
|
||||
kubectl get cronjobs -n knowledge-agent --namespace knowledge-agent
|
||||
# Check Job Execution History
|
||||
kubectl get jobs -n knowledge-agent --namespace knowledge-agent
|
||||
|
||||
###########
|
||||
# Manually trigger a job (for testing)
|
||||
kubectl delete job manual-test -n knowledge-agent
|
||||
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
|
||||
# Check Job Logs
|
||||
kubectl logs -f job/manual-test -n knowledge-agent
|
||||
@@ -0,0 +1,64 @@
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: document-ai-indexer-cronjob
|
||||
spec:
|
||||
# Scheduling configuration - execute every 10 minutes
|
||||
schedule: "*/10 * * * *"
|
||||
|
||||
# Concurrency policy: Disable concurrent execution. If the previous job is still running, new execution will be skipped.
|
||||
concurrencyPolicy: Forbid
|
||||
|
||||
# Successful jobs history limit: Keep the last 10 successful job records.
|
||||
successfulJobsHistoryLimit: 10
|
||||
|
||||
# Failed jobs history limit: Keep the last 10 failed job records.
|
||||
failedJobsHistoryLimit: 10
|
||||
|
||||
# Job template
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: document-ai-indexer
|
||||
job-type: cronjob
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
volumes:
|
||||
# 1. ConfigMap volume
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: document-ai-indexer-config
|
||||
items:
|
||||
- key: env.yaml
|
||||
path: env.yaml
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
|
||||
# 2. Azure File Share volume
|
||||
- name: data-volume
|
||||
azureFile:
|
||||
secretName: azure-files-cred # Reference the Secret you created
|
||||
shareName: fs-document-ai-indexer # Your file share name
|
||||
readOnly: false # Write permission
|
||||
|
||||
containers:
|
||||
- name: document-ai-indexer
|
||||
image: acraiflowlab.azurecr.io/document-ai-indexer:2.0.1
|
||||
imagePullPolicy: Always
|
||||
# Mount the volume into the container
|
||||
volumeMounts:
|
||||
# ConfigMap Mount
|
||||
- name: config-volume
|
||||
mountPath: /app/env.yaml
|
||||
subPath: env.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/config.yaml
|
||||
subPath: config.yaml
|
||||
|
||||
# Azure File Shared mount
|
||||
- name: data-volume
|
||||
mountPath: /app/run_tmp # Program write/read directory
|
||||
@@ -0,0 +1,42 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: document-ai-indexer
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
volumes:
|
||||
# 1. Existing ConfigMap volume
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: document-ai-indexer-config
|
||||
items:
|
||||
- key: env.yaml
|
||||
path: env.yaml
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
|
||||
# 2. Azure File Share volume
|
||||
- name: data-volume
|
||||
azureFile:
|
||||
secretName: azure-files-cred # Reference the Secret you created
|
||||
shareName: fs-document-ai-indexer # Your file share name
|
||||
readOnly: false # Write permission
|
||||
|
||||
containers:
|
||||
- name: document-ai-indexer
|
||||
image: acraiflowlab.azurecr.io/document-ai-indexer:2.0.1
|
||||
imagePullPolicy: Always
|
||||
# Mount the volumes into the container
|
||||
volumeMounts:
|
||||
# ConfigMap mount
|
||||
- name: config-volume
|
||||
mountPath: /app/env.yaml
|
||||
subPath: env.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/config.yaml
|
||||
subPath: config.yaml
|
||||
|
||||
# Azure File Share mount
|
||||
- name: data-volume
|
||||
mountPath: /app/run_tmp # Program read/write directory
|
||||
27
vw-document-ai-indexer/deploy/prd-usermanual/config.yaml
Normal file
27
vw-document-ai-indexer/deploy/prd-usermanual/config.yaml
Normal file
@@ -0,0 +1,27 @@
|
||||
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-usermanual-prd?sp=racwdl&st=2025-08-27T06:26:11Z&se=2035-08-27T14:41:11Z&spr=https&sv=2024-11-04&sr=c&sig=7GVqfbWPM5VDRW8crTeR06KsSPX%2BuuDLjN7ceqBuLCE%3D"
|
||||
datasource_name: "cat-usermanual-prd"
|
||||
data_dir: ""
|
||||
base_path: "/app/run_tmp"
|
||||
process_file_num: 0
|
||||
process_file_last_modify: "2025-06-24 00:00:00"
|
||||
chunk_size: 2048
|
||||
token_overlap: 128
|
||||
index_schemas:
|
||||
- index_name: "index-cat-usermanual-chunk-prd"
|
||||
data_type: ["chunk"]
|
||||
field_type: "append"
|
||||
upload_batch_size: 50
|
||||
fields: ["filepath", "title"]
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
vector_fields:
|
||||
- field: "contentVector"
|
||||
append_fields: ["content"]
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["full_headers", "doc_metadata"]
|
||||
merge_fields:
|
||||
- key: "doc_metadata"
|
||||
fields: ["title"]
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
|
||||
50
vw-document-ai-indexer/deploy/prd-usermanual/deploy.sh
Normal file
50
vw-document-ai-indexer/deploy/prd-usermanual/deploy.sh
Normal file
@@ -0,0 +1,50 @@
|
||||
|
||||
# docker build
|
||||
docker login acrsales2caiprd.azurecr.cn -u username -p password
|
||||
docker build . -t document-ai-indexer:2.0.4
|
||||
docker tag document-ai-indexer:2.0.4 acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
|
||||
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
|
||||
|
||||
|
||||
# login AKS
|
||||
# az cloud set --name AzureCloud # Switch CLI to Azure cloud
|
||||
# az login # Log in to Azure China account (browser or device code flow)
|
||||
# az account set -s 079d8bd8-b4cc-4892-9307-aa6dedf890e9 #! set subs
|
||||
# az aks get-credentials -g rg-aiflow-lab -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
|
||||
kubectl config use-context aks-sales2c-ai-prd
|
||||
kubectl config current-context
|
||||
|
||||
# Create Azure Files Volume
|
||||
# kubectl create secret generic azure-files-cred \
|
||||
# --from-literal=azurestorageaccountname=saaisearchlab \
|
||||
# --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
|
||||
# -n knowledge-agent
|
||||
|
||||
# kubectl delete configmap document-ai-indexer-usermanual-config -n knowledge-agent
|
||||
|
||||
# Deploy ConfigMap
|
||||
kubectl delete configmap document-ai-indexer-usermanual-config -n knowledge-agent
|
||||
kubectl create configmap document-ai-indexer-usermanual-config -n knowledge-agent --from-file=.\deploy\prd-usermanual\env.yaml --from-file=.\deploy\prd-usermanual\config.yaml --from-file=prompt.yaml
|
||||
|
||||
# Deploy Pod
|
||||
# kubectl create namespace knowledge-agent
|
||||
# kubectl delete pod document-ai-indexer-usermanual -n knowledge-agent
|
||||
kubectl apply -f .\deploy\prd-usermanual\document-ai-indexer-usermanual.yml -n knowledge-agent
|
||||
|
||||
# Monitor Pod
|
||||
kubectl logs -f document-ai-indexer-usermanual -n knowledge-agent
|
||||
|
||||
# Deploy CronJob
|
||||
kubectl apply -f deploy/prd-usermanual/document-ai-indexer-cronjob.yml --namespace knowledge-agent
|
||||
|
||||
# Check CronJob Status
|
||||
kubectl get cronjobs -n knowledge-agent --namespace knowledge-agent
|
||||
# Check Job Execution History
|
||||
kubectl get jobs -n knowledge-agent --namespace knowledge-agent
|
||||
|
||||
###########
|
||||
# Manually trigger a job (for testing)
|
||||
kubectl delete job manual-test -n knowledge-agent
|
||||
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
|
||||
# Check Job Logs
|
||||
kubectl logs -f job/manual-test -n knowledge-agent
|
||||
@@ -0,0 +1,64 @@
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: document-ai-indexer-cronjob
|
||||
spec:
|
||||
# Scheduling configuration - execute every 10 minutes
|
||||
schedule: "*/10 * * * *"
|
||||
|
||||
# Concurrency policy: Disable concurrent execution. If the previous job is still running, new execution will be skipped.
|
||||
concurrencyPolicy: Forbid
|
||||
|
||||
# Successful jobs history limit: Keep the last 10 successful job records.
|
||||
successfulJobsHistoryLimit: 10
|
||||
|
||||
# Failed jobs history limit: Keep the last 10 failed job records.
|
||||
failedJobsHistoryLimit: 10
|
||||
|
||||
# Job template
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: document-ai-indexer
|
||||
job-type: cronjob
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
volumes:
|
||||
# 1. ConfigMap volume
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: document-ai-indexer-config
|
||||
items:
|
||||
- key: env.yaml
|
||||
path: env.yaml
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
|
||||
# 2. Azure File Share volume
|
||||
- name: data-volume
|
||||
azureFile:
|
||||
secretName: azure-files-cred # Reference the Secret you created
|
||||
shareName: fs-document-ai-indexer # Your file share name
|
||||
readOnly: false # Write permission
|
||||
|
||||
containers:
|
||||
- name: document-ai-indexer
|
||||
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
|
||||
imagePullPolicy: Always
|
||||
# Mount the volume into the container
|
||||
volumeMounts:
|
||||
# ConfigMap Mount
|
||||
- name: config-volume
|
||||
mountPath: /app/env.yaml
|
||||
subPath: env.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/config.yaml
|
||||
subPath: config.yaml
|
||||
|
||||
# Azure File Shared mount
|
||||
- name: data-volume
|
||||
mountPath: /app/run_tmp # Program write/read directory
|
||||
@@ -0,0 +1,47 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: document-ai-indexer-usermanual
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
volumes:
|
||||
# 1. ConfigMap volume
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: document-ai-indexer-usermanual-config
|
||||
items:
|
||||
- key: env.yaml
|
||||
path: env.yaml
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
- key: prompt.yaml
|
||||
path: prompt.yaml
|
||||
|
||||
# 2. Azure File Share volume
|
||||
- name: data-volume
|
||||
azureFile:
|
||||
secretName: azure-files-cred # Reference the Secret you created
|
||||
shareName: fs-document-ai-indexer # Your file share name
|
||||
readOnly: false
|
||||
|
||||
containers:
|
||||
- name: document-ai-indexer-usermanual
|
||||
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
|
||||
imagePullPolicy: Always
|
||||
# Mount the volume into the container
|
||||
volumeMounts:
|
||||
# ConfigMap Mount
|
||||
- name: config-volume
|
||||
mountPath: /app/env.yaml
|
||||
subPath: env.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/config.yaml
|
||||
subPath: config.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/prompt.yaml
|
||||
subPath: prompt.yaml
|
||||
|
||||
# Azure File Share Mount
|
||||
- name: data-volume
|
||||
mountPath: /app/run_tmp # Directory for program read/write
|
||||
@@ -0,0 +1,10 @@
|
||||
# login AKS
|
||||
# az cloud set -n AzureChinaCloud
|
||||
# az login
|
||||
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c
|
||||
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
|
||||
kubectl config use-context aks-sales2c-ai-prd
|
||||
kubectl config current-context
|
||||
|
||||
# kubectl create namespace knowledge-agent
|
||||
kubectl apply -f embedding-api-proxy_k8s.yml -n knowledge-agent
|
||||
@@ -0,0 +1,39 @@
|
||||
# Service resource: map the external domain to an in-cluster Service
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: itpai-backend
|
||||
spec:
|
||||
type: ExternalName
|
||||
externalName: itpai.infer.api.vgcserv.com.cn
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
targetPort: 443
|
||||
---
|
||||
# Ingress resource: proxy the /v1-openai path to the Service above
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: itpai-proxy
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
|
||||
nginx.ingress.kubernetes.io/proxy-ssl-server-name: "on"
|
||||
nginx.ingress.kubernetes.io/proxy-ssl-verify: "off"
|
||||
nginx.ingress.kubernetes.io/upstream-vhost: "itpai.infer.api.vgcserv.com.cn"
|
||||
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
|
||||
nginx.ingress.kubernetes.io/proxy-send-timeout: "30"
|
||||
nginx.ingress.kubernetes.io/proxy-connect-timeout: "5"
|
||||
spec:
|
||||
rules:
|
||||
- host: sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn
|
||||
http:
|
||||
paths:
|
||||
- path: /v1-openai
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: itpai-backend
|
||||
port:
|
||||
number: 443
|
||||
42
vw-document-ai-indexer/deploy/prd-usermanual/env.yaml
Normal file
42
vw-document-ai-indexer/deploy/prd-usermanual/env.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
config: config.yaml
|
||||
njobs: 12
|
||||
|
||||
search_service_name: https://search-sales2c-ai-prd.search.azure.cn
|
||||
search_admin_key: ev6B0OtF66WkDmQKJBa4n1Haa8e8p8N3zdaEBnbWtoAzSeAMWSid
|
||||
|
||||
|
||||
embedding_model_endpoint: http://sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn/v1-openai/embeddings
|
||||
embedding_model_key: gpustack_0e3d5b35adaf239b_99adacd6f540c7d81006365c8030b16c
|
||||
VECTOR_DIMENSION: 4096
|
||||
FLAG_AOAI: "V3"
|
||||
FLAG_EMBEDDING_MODEL: qwen3-embedding-8b
|
||||
|
||||
|
||||
extract_method: di+vision-llm
|
||||
form_rec_resource: https://di-sales2c-ai-prd.cognitiveservices.azure.cn/
|
||||
form_rec_key: G0vhH3twd5K3YYCgfnttf5V6XTMMU4PMdVvRHsgaTb8kZDoU8ZHjJQQJ99BDAEHpCsCfT1gyAAALACOGmOcn
|
||||
di-Formulas: false
|
||||
di-hiRes: true
|
||||
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
|
||||
|
||||
|
||||
FIGURE_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/extracted-image-cat-prd?sp=racwdl&st=2025-08-04T06:34:42Z&se=2035-08-04T14:49:42Z&spr=https&sv=2024-11-04&sr=c&sig=t0DTjfht%2FNaPlXUtxhKr40NzZY5kWovgNxJUeAepvgA%3D
|
||||
|
||||
|
||||
DI_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/di-result-cat-prd?sp=racwdl&st=2025-08-04T06:34:11Z&se=2035-08-04T14:49:11Z&spr=https&sv=2024-11-04&sr=c&sig=26wxy5M9lcIO2o9zzr6jOtdw2gQTZnGmampHx5EyXbo%3D
|
||||
|
||||
|
||||
DB_URI: postgresql://pgadmin:vwb54pSQDp8vYkusKms@pg-sales2c-ai-prd.postgres.database.chinacloudapi.cn/document-ai-indexer
|
||||
|
||||
# Image understanding
|
||||
figure_caption:
|
||||
include_di_content: false # Whether to include the DI-extracted content for figures
|
||||
description_gen_max_images: 0 # The maximum number of images to be described. 0 means no description
|
||||
model_endpoint: null
|
||||
model_key: null
|
||||
model: null # leave null for Azure OpenAI
|
||||
azure_deployment: gpt-4o # Azure OpenAI deployment name; leave empty for other platforms
|
||||
api_version: 2024-08-01-preview # Azure OpenAI API version; leave empty for other platforms
|
||||
|
||||
|
||||
header_fix: true
|
||||
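The figure_caption block above selects a vision model for describing extracted figures; with azure_deployment and api_version set, the Azure OpenAI flavour of the client is used (description generation is disabled here since description_gen_max_images is 0). A minimal sketch of what a caption call could look like when enabled (assumes the openai Python SDK; endpoint and key are placeholders):

import base64
from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint="https://<your-aoai-resource>.openai.azure.com",
    api_key="<model_key>",
    api_version="2024-08-01-preview",
)

def describe_figure(image_path: str) -> str:
    """Ask the deployed vision model for a short caption of an extracted figure."""
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    resp = client.chat.completions.create(
        model="gpt-4o",  # the azure_deployment name from env.yaml
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this figure in one sentence."},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
            ],
        }],
    )
    return resp.choices[0].message.content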
103
vw-document-ai-indexer/deploy/prd/config.yaml
Normal file
103
vw-document-ai-indexer/deploy/prd/config.yaml
Normal file
@@ -0,0 +1,103 @@
|
||||
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-prd?sp=rl&st=2025-08-02T08:25:56Z&se=2125-08-02T16:40:56Z&spr=https&sv=2024-11-04&sr=c&sig=lJui2%2BOs8V%2BdzCkjchQCR7ITWT28tJ0HAq8bIhkkM%2Bk%3D"
|
||||
datasource_name: "cat-standard-regulation-prd"
|
||||
data_dir: ""
|
||||
base_path: "/app/run_tmp"
|
||||
process_file_num: 0
|
||||
process_file_last_modify: "2025-06-24 00:00:00"
|
||||
chunk_size: 2048
|
||||
token_overlap: 256
|
||||
index_schemas:
|
||||
- index_name: "index-catonline-chunk-v2-prd"
|
||||
data_type: ["metadata", "document", "chunk"]
|
||||
# field_type: "append"
|
||||
upload_batch_size: 50
|
||||
fields: ["doc_metadata", "full_metadata_vector", "filepath", "timestamp", "title", "publisher", "publish_date", "document_category", "document_code", "language_code", "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status", "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code", "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type", "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date", "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN", "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date", "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by", "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN","x_Replaced_Standard"]
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"] #todo check
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
vector_fields:
|
||||
- field: "contentVector"
|
||||
append_fields: ["content"]
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["full_headers", "doc_metadata"]
|
||||
- index_name: "index-catonline-document-v2-prd"
|
||||
data_type: ["document", "metadata"]
|
||||
# field_type: "full"
|
||||
key_fields: ["filepath"]
|
||||
upload_batch_size: 1
|
||||
fields: ["doc_metadata", "full_metadata_vector", "url", "metadata", "image_mapping", "document_schema", "main_title", "filepath", "timestamp", "title", "publisher", "publish_date", "document_category", "document_code", "language_code", "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status", "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code", "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type", "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date", "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN", "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date", "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by", "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN","x_Replaced_Standard"]
|
||||
merge_content_fields: ["content"]
|
||||
full_metadata_vector_fields: ["doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
- index_name: "index-catonline-standard-regulation-v2-prd"
|
||||
data_type: ["metadata"]
|
||||
# field_type: "full"
|
||||
key_fields: ["x_Standard_Regulation_Id"]
|
||||
upload_batch_size: 1
|
||||
fields: ["doc_metadata", "full_metadata_vector", "filepath", "timestamp", "title", "publisher", "publish_date", "document_category", "document_code", "language_code", "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status", "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code", "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type", "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date", "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN", "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date", "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by", "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN","x_Replaced_Standard"]
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
full_metadata_vector_fields: ["doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
update_by_field: "x_Standard_Regulation_Id"
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
merge_fields:
|
||||
- key: "doc_metadata"
|
||||
fields: ["title", "publisher", "document_category", "document_code", "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Kind", "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type", "x_Standard_CCS", "x_Standard_ICS", "x_Regulation_Status", "x_Regulation_Title_CN", "x_Regulation_Title_EN", "x_Classification", "x_Work_Group", "status", "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN","x_Replaced_Standard"]
|
||||
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-abroad-prd?sp=rl&st=2025-09-08T05:32:13Z&se=2099-09-08T13:47:13Z&sv=2024-11-04&sr=c&sig=ebYoiKrSwCk12cRnQqov197LvuBv7m%2FxNoQv4VDMY5o%3D"
|
||||
datasource_name: "cat-standard-regulation-oversea"
|
||||
data_dir: ""
|
||||
base_path: "/app/run_tmp"
|
||||
process_file_num: 0
|
||||
process_file_last_modify: "2025-06-24 00:00:00"
|
||||
chunk_size: 2048
|
||||
token_overlap: 256
|
||||
index_schemas:
|
||||
- index_name: "index-catonline-chunk-oversea"
|
||||
data_type: ["metadata", "document", "chunk"]
|
||||
upload_batch_size: 50
|
||||
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
vector_fields:
|
||||
- field: "contentVector"
|
||||
append_fields: ["content"]
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["full_headers", "doc_metadata"]
|
||||
- index_name: "index-catonline-document-oversea"
|
||||
data_type: ["document", "metadata"]
|
||||
key_fields: ["filepath"]
|
||||
upload_batch_size: 1
|
||||
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
|
||||
merge_content_fields: ["content"]
|
||||
full_metadata_vector_fields: ["doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
- index_name: "index-catonline-standard-regulation-oversea"
|
||||
data_type: ["metadata"]
|
||||
key_fields: ["standard_Id"]
|
||||
upload_batch_size: 1
|
||||
fields: ["filepath","timestamp","file_Name","file_Url","file_Type","entity_Attribute","standard_Id","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","publish_Date","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","create_Time","update_Time","version_Id","version_Name","version_Parent_id","version_Parent_Name","technical_Series_No","implementation_Date","version_Publish_Date","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str"]
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
full_metadata_vector_fields: ["doc_metadata"]
|
||||
semantic_config_name: "default"
|
||||
update_by_field: "standard_Id"
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
merge_fields:
|
||||
- key: "doc_metadata"
|
||||
fields: ["file_Name","entity_Attribute","standard_Code","standard_Title_Cn","standard_Title_En","domain_Name","standard_State_Name","type_Name","draft_type_name","qc_Abroad_Professional_Fields","applicable_Models","standard_Type_Name","technical_Field_Name","version_Name","version_Parent_Name","technical_Series_No","newFlag_State","publish_Status_Text","implementation_Status_Text","new_Car_Implementation_Status_Text","production_Car_Implementation_Status_Text","production_Car_Implementation_Str " ]
|
||||
50
vw-document-ai-indexer/deploy/prd/deploy.sh
Normal file
50
vw-document-ai-indexer/deploy/prd/deploy.sh
Normal file
@@ -0,0 +1,50 @@
|
||||
|
||||
# docker build
|
||||
docker login acrsales2caiprd.azurecr.cn -u username -p password
|
||||
docker build . -t document-ai-indexer:2.0.1
|
||||
docker tag document-ai-indexer:2.0.1 acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
|
||||
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
|
||||
|
||||
|
||||
# login AKS
|
||||
# az cloud set --name AzureCloud # Switch CLI to Azure cloud
|
||||
# az login # Log in to Azure China account (browser or device code flow)
|
||||
# az account set -s 079d8bd8-b4cc-4892-9307-aa6dedf890e9 #! set subs
|
||||
# az aks get-credentials -g rg-aiflow-lab -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
|
||||
kubectl config use-context aks-sales2c-ai-prd
|
||||
kubectl config current-context
|
||||
|
||||
# Create Azure Files Volume
|
||||
# kubectl create secret generic azure-files-cred \
|
||||
# --from-literal=azurestorageaccountname=saaisearchlab \
|
||||
# --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
|
||||
# -n knowledge-agent
|
||||
|
||||
# kubectl delete configmap document-ai-indexer-config -n knowledge-agent
|
||||
|
||||
# Deploy ConfigMap
|
||||
kubectl delete configmap document-ai-indexer-config -n knowledge-agent
|
||||
kubectl create configmap document-ai-indexer-config -n knowledge-agent --from-file=.\deploy\prd\env.yaml --from-file=.\deploy\prd\config.yaml --from-file=prompt.yaml
|
||||
|
||||
# Deploy Pod
|
||||
# kubectl create namespace knowledge-agent
|
||||
# kubectl delete pod document-ai-indexer -n knowledge-agent
|
||||
kubectl apply -f document-ai-indexer_k8s.yml -n knowledge-agent
|
||||
|
||||
# Monitor Pod
|
||||
kubectl logs -f document-ai-indexer -n knowledge-agent
|
||||
|
||||
# Deploy CronJob
|
||||
kubectl apply -f ./deploy/prd/document-ai-indexer-cronjob.yml --namespace knowledge-agent
|
||||
|
||||
# Check CronJob Status
|
||||
kubectl get cronjobs -n knowledge-agent --namespace knowledge-agent
|
||||
# Check Job Execution History
|
||||
kubectl get jobs -n knowledge-agent --namespace knowledge-agent
|
||||
|
||||
###########
|
||||
# Manually trigger a job (for testing)
|
||||
kubectl delete job manual-test -n knowledge-agent
|
||||
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent
|
||||
# Check Job Logs
|
||||
kubectl logs -f job/manual-test -n knowledge-agent
|
||||
@@ -0,0 +1,69 @@
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: document-ai-indexer-cronjob
|
||||
spec:
|
||||
# Scheduling configuration - execute every 10 minutes
|
||||
schedule: "*/10 * * * *"
|
||||
|
||||
# Concurrency policy: Disable concurrent execution. If the previous job is still running, new execution will be skipped.
|
||||
concurrencyPolicy: Forbid
|
||||
|
||||
# Successful jobs history limit: Keep the last 10 successful job records.
|
||||
successfulJobsHistoryLimit: 10
|
||||
|
||||
# Failed jobs history limit: Keep the last 10 failed job records.
|
||||
failedJobsHistoryLimit: 10
|
||||
|
||||
# Job template
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: document-ai-indexer
|
||||
job-type: cronjob
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
volumes:
|
||||
# 1. ConfigMap volume
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: document-ai-indexer-config
|
||||
items:
|
||||
- key: env.yaml
|
||||
path: env.yaml
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
- key: prompt.yaml
|
||||
path: prompt.yaml
|
||||
|
||||
# 2. Azure File Share volume
|
||||
- name: data-volume
|
||||
azureFile:
|
||||
secretName: azure-files-cred # Reference the Secret you created
|
||||
shareName: fs-document-ai-indexer # Your file share name
|
||||
readOnly: false # Write permission
|
||||
|
||||
containers:
|
||||
- name: document-ai-indexer
|
||||
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
|
||||
imagePullPolicy: Always
|
||||
# Mount the volume into the container
|
||||
volumeMounts:
|
||||
# ConfigMap Mount
|
||||
- name: config-volume
|
||||
mountPath: /app/env.yaml
|
||||
subPath: env.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/config.yaml
|
||||
subPath: config.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/prompt.yaml
|
||||
subPath: prompt.yaml
|
||||
|
||||
# Azure File Shared mount
|
||||
- name: data-volume
|
||||
mountPath: /app/run_tmp # Program write/read directory
|
||||
@@ -0,0 +1,47 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: document-ai-indexer
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
|
||||
volumes:
|
||||
# 1. ConfigMap volume
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: document-ai-indexer-config
|
||||
items:
|
||||
- key: env.yaml
|
||||
path: env.yaml
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
- key: prompt.yaml
|
||||
path: prompt.yaml
|
||||
|
||||
# 2. Azure File Share volume
|
||||
- name: data-volume
|
||||
azureFile:
|
||||
secretName: azure-files-cred # References the Secret you created
|
||||
shareName: fs-document-ai-indexer # Your file share name
|
||||
readOnly: false
|
||||
|
||||
containers:
|
||||
- name: document-ai-indexer
|
||||
image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.2
|
||||
imagePullPolicy: Always
|
||||
# Mount the volume into the container
|
||||
volumeMounts:
|
||||
# ConfigMap Mount
|
||||
- name: config-volume
|
||||
mountPath: /app/env.yaml
|
||||
subPath: env.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/config.yaml
|
||||
subPath: config.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app/prompt.yaml
|
||||
subPath: prompt.yaml
|
||||
|
||||
# Azure File Share Mount
|
||||
- name: data-volume
|
||||
mountPath: /app/run_tmp # Directory for program read/write
|
||||
10
vw-document-ai-indexer/deploy/prd/embedding-api-proxy.sh
Normal file
@@ -0,0 +1,10 @@
|
||||
# login AKS
|
||||
# az cloud set -n AzureChinaCloud
|
||||
# az login
|
||||
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c
|
||||
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
|
||||
kubectl config use-context aks-sales2c-ai-prd
|
||||
kubectl config current-context
|
||||
|
||||
# kubectl create namespace knowledge-agent
|
||||
kubectl apply -f embedding-api-proxy_k8s.yml -n knowledge-agent
|
||||
@@ -0,0 +1,39 @@
|
||||
# Service resource: maps the external domain name to an in-cluster Service
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: itpai-backend
|
||||
spec:
|
||||
type: ExternalName
|
||||
externalName: itpai.infer.api.vgcserv.com.cn
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
targetPort: 443
|
||||
---
|
||||
# Ingress resource: proxies the /v1-openai path to the Service above
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: itpai-proxy
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
|
||||
nginx.ingress.kubernetes.io/proxy-ssl-server-name: "on"
|
||||
nginx.ingress.kubernetes.io/proxy-ssl-verify: "off"
|
||||
nginx.ingress.kubernetes.io/upstream-vhost: "itpai.infer.api.vgcserv.com.cn"
|
||||
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
|
||||
nginx.ingress.kubernetes.io/proxy-send-timeout: "30"
|
||||
nginx.ingress.kubernetes.io/proxy-connect-timeout: "5"
|
||||
spec:
|
||||
rules:
|
||||
- host: sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn
|
||||
http:
|
||||
paths:
|
||||
- path: /v1-openai
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: itpai-backend
|
||||
port:
|
||||
number: 443
|
||||
36
vw-document-ai-indexer/deploy/prd/env.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
config: config.yaml
|
||||
njobs: 12
|
||||
|
||||
search_service_name: https://search-sales2c-ai-prd.search.azure.cn
|
||||
search_admin_key: ev6B0OtF66WkDmQKJBa4n1Haa8e8p8N3zdaEBnbWtoAzSeAMWSid
|
||||
|
||||
embedding_model_endpoint: http://sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn/v1-openai/embeddings
|
||||
embedding_model_key: gpustack_0e3d5b35adaf239b_99adacd6f540c7d81006365c8030b16c
|
||||
VECTOR_DIMENSION: 4096
|
||||
FLAG_AOAI: "V3"
|
||||
FLAG_EMBEDDING_MODEL: qwen3-embedding-8b
|
||||
|
||||
extract_method: di+vision-llm
|
||||
form_rec_resource: https://di-sales2c-ai-prd.cognitiveservices.azure.cn/
|
||||
form_rec_key: G0vhH3twd5K3YYCgfnttf5V6XTMMU4PMdVvRHsgaTb8kZDoU8ZHjJQQJ99BDAEHpCsCfT1gyAAALACOGmOcn
|
||||
di-Formulas: true
|
||||
di-hiRes: true
|
||||
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
|
||||
|
||||
FIGURE_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/extracted-image-cat-prd?sp=racwdl&st=2025-08-04T06:34:42Z&se=2035-08-04T14:49:42Z&spr=https&sv=2024-11-04&sr=c&sig=t0DTjfht%2FNaPlXUtxhKr40NzZY5kWovgNxJUeAepvgA%3D
|
||||
|
||||
DI_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/di-result-cat-prd?sp=racwdl&st=2025-08-04T06:34:11Z&se=2035-08-04T14:49:11Z&spr=https&sv=2024-11-04&sr=c&sig=26wxy5M9lcIO2o9zzr6jOtdw2gQTZnGmampHx5EyXbo%3D
|
||||
|
||||
DB_URI: postgresql://pgadmin:vwb54pSQDp8vYkusKms@pg-sales2c-ai-prd.postgres.database.chinacloudapi.cn/document-ai-indexer
|
||||
|
||||
# Image understanding
|
||||
figure_caption:
|
||||
include_di_content: false # Whether to include the figure content extracted by DI
description_gen_max_images: 0 # Maximum number of images to describe; 0 disables image description
|
||||
model_endpoint: null
|
||||
model_key: null
|
||||
model: null # Set to null when using Azure OpenAI
azure_deployment: null # Azure OpenAI deployment name; set to null for other platforms
api_version: null # Azure OpenAI API version; set to null for other platforms
|
||||
|
||||
header_fix: true
|
||||
215
vw-document-ai-indexer/di_extractor.py
Normal file
@@ -0,0 +1,215 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
import base64
|
||||
import uuid
|
||||
from openai import AzureOpenAI
|
||||
from azure.storage.blob import ContainerClient
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import DocumentContentFormat, AnalyzeResult, \
|
||||
DocumentAnalysisFeature, AnalyzeOutputOption, DocumentSpan
|
||||
from entity_models import DiResult, Document, FigureFlat
|
||||
from utils import TOKEN_ESTIMATOR, custom_serializer, resize_image, file_rename
|
||||
from resilient_http_pool import get_ai_inference_client
|
||||
|
||||
|
||||
def di_extract(source_file_path:str, di_client: DocumentIntelligenceClient, directory_path:str, figure_sas_url:str, language:str="zh-Hans") -> DiResult:
|
||||
di_features:list[str|DocumentAnalysisFeature] = []
|
||||
allow_features_exts: list[str] = os.getenv("di_allow_features_ext", "").lower().split(';')
|
||||
|
||||
# get file name from source_file_path without extension
|
||||
file_name = os.path.basename(source_file_path)
|
||||
di_source_file_path = source_file_path
|
||||
# PDF
|
||||
# JPEG / JPG、PNG、BMP、TIFF、HEIF
|
||||
|
||||
|
||||
file_ext: str = (source_file_path.split('.')[-1] if '.' in source_file_path.split('/')[-1] else '' ).lower()
|
||||
|
||||
if file_ext in ['jpg', 'jpeg', 'jpe', 'jfif', 'pjpeg', 'pjp', 'png', 'gif', 'webp', 'tif', 'tiff', 'bmp', 'dib', 'heif', 'heic', 'avif', 'apng', 'svg']:
|
||||
di_source_file_path = resize_image(source_file_path)
|
||||
|
||||
# doc to docx
|
||||
di_source_file_path = file_rename(di_source_file_path)
|
||||
|
||||
if os.getenv("di-hiRes",'').lower() == "true" and file_ext in allow_features_exts:
|
||||
di_features.append(DocumentAnalysisFeature.OCR_HIGH_RESOLUTION)
|
||||
if os.getenv("di-Formulas",'').lower() == "true" and file_ext in allow_features_exts:
|
||||
di_features.append(DocumentAnalysisFeature.FORMULAS)
|
||||
|
||||
|
||||
print(f"di_features: {di_features},file_path:{file_name}")
|
||||
with open(di_source_file_path, "rb") as file:
|
||||
poller = di_client.begin_analyze_document(model_id="prebuilt-layout", body=file,
|
||||
features=di_features, output_content_format=DocumentContentFormat.MARKDOWN, output=[AnalyzeOutputOption.FIGURES]) # type: ignore
|
||||
|
||||
result: AnalyzeResult = poller.result()
|
||||
extracted_doc = Document()
|
||||
|
||||
source_rel_file_path = os.path.relpath(source_file_path, directory_path)
|
||||
extracted_doc.filepath = source_rel_file_path
|
||||
|
||||
result_content: str = result.content
|
||||
# The operation id is required to later query individual figures
|
||||
operation_id: str = str(poller.details.get("operation_id"))
|
||||
|
||||
output_folder = directory_path + "/.extracted/" + file_name
|
||||
os.makedirs(f"{output_folder}", exist_ok=True)
|
||||
extracted_doc.content = result_content
|
||||
|
||||
with open(f"{output_folder}/_merged_origin.md", "w", encoding="utf-8") as doc_meta_file:
|
||||
doc_meta_file.write(result_content)
|
||||
|
||||
# Download and process images
|
||||
figures = extract_figures(di_client, result, operation_id, directory_path, file_name, figure_sas_url)
|
||||
di_result:DiResult = DiResult(
|
||||
figures = figures,
|
||||
di_content = result_content,
|
||||
filepath= source_rel_file_path,
|
||||
language=language
|
||||
)
|
||||
return di_result
|
||||
|
||||
|
||||
|
||||
def extract_figures(di_client: DocumentIntelligenceClient, result:AnalyzeResult, result_id:str, directory_path:str, file_name:str, figure_sas_url:str)->list[FigureFlat]:
|
||||
"""Extracts figures and their metadata from the analyzed result."""
|
||||
figures:list[FigureFlat] = []
|
||||
|
||||
base_path: Path = Path(os.path.join(directory_path, ".extracted", file_name, ".images"))
|
||||
base_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(f"{base_path}/result.json", "w", encoding="utf-8") as figures_file:
|
||||
json.dump(result, figures_file, default=custom_serializer, ensure_ascii=False, indent=4)
|
||||
|
||||
for figure in result.figures if result.figures is not None else []:
|
||||
if not any(figure.spans):
|
||||
continue
|
||||
|
||||
span:DocumentSpan = figure.spans[0]
|
||||
|
||||
# Image extraction
|
||||
stream = di_client.get_analyze_result_figure(model_id=result.model_id, result_id=result_id, figure_id=figure.id)
|
||||
image_bytes = b"".join(list(stream))
|
||||
path_image: Path = Path(os.path.join(base_path, f"figure_{figure.id}.png"))
|
||||
path_image.write_bytes(image_bytes)
|
||||
|
||||
blob_url = upload_figure(figure_sas_url,f"figure_{figure.id}.png", image_bytes)
|
||||
image_str:str = base64.b64encode(image_bytes).decode('utf-8')
|
||||
figures.append(FigureFlat(offset=span.offset, length=span.length, url=blob_url, content="",image=image_str,understand_flag=False,caption = figure.caption.content if figure.caption else ""))
|
||||
return figures
|
||||
|
||||
|
||||
|
||||
# Compile once for efficiency
|
||||
_specific_comments = re.compile(
|
||||
r"""<!--\s* # opening
|
||||
(?:PageFooter="[^"]*" # PageFooter="…"
|
||||
|PageNumber="[^"]*" # PageNumber="…"
|
||||
|PageBreak # PageBreak
|
||||
|PageHeader="[^"]*") # PageHeader="…"
|
||||
\s*--> # closing
|
||||
""",
|
||||
flags=re.VERBOSE
|
||||
)
|
||||
|
||||
|
||||
def remove_specific_comments(text: str) -> str:
|
||||
return _specific_comments.sub('', text)
|
||||
|
||||
def retry_get_embedding(text: str, embedding_model_key:str, embedding_endpoint:str,min_chunk_size:int=10,retry_num:int = 3):
|
||||
""" Retries getting embedding for the provided text until it succeeds or reaches the retry limit."""
|
||||
full_metadata_size = TOKEN_ESTIMATOR.estimate_tokens(text)
|
||||
if full_metadata_size >= min_chunk_size:
|
||||
for i in range(retry_num):
|
||||
try:
|
||||
return get_embedding(text, embedding_model_key=embedding_model_key,embedding_model_endpoint=embedding_endpoint)
|
||||
except Exception as e:
|
||||
print(f"Error getting embedding for full_metadata_vector with error={e}, retrying, currently at {i + 1} retry, {retry_num - (i + 1)} retries left")
|
||||
time.sleep(10)
|
||||
raise Exception(f"Error getting embedding for full_metadata_vector={text}")
|
||||
|
||||
return None
|
||||
|
||||
def get_embedding(text:str, embedding_model_endpoint:str="", embedding_model_key:str="", azure_credential=None):
|
||||
endpoint = embedding_model_endpoint if embedding_model_endpoint else os.environ.get("EMBEDDING_MODEL_ENDPOINT")
|
||||
|
||||
FLAG_EMBEDDING_MODEL = os.getenv("FLAG_EMBEDDING_MODEL", "AOAI")
|
||||
FLAG_COHERE = os.getenv("FLAG_COHERE", "ENGLISH")
|
||||
FLAG_AOAI = os.getenv("FLAG_AOAI", "V3")
|
||||
|
||||
if azure_credential is None and (endpoint is None or embedding_model_key is None):
|
||||
raise Exception("EMBEDDING_MODEL_ENDPOINT and EMBEDDING_MODEL_KEY are required for embedding")
|
||||
|
||||
try:
|
||||
if FLAG_EMBEDDING_MODEL == "AOAI":
|
||||
endpoint_parts = endpoint.split("/openai/deployments/")
|
||||
base_url = endpoint_parts[0]
|
||||
deployment_id = endpoint_parts[1].split("/embeddings")[0]
|
||||
api_version = endpoint_parts[1].split("api-version=")[1].split("&")[0]
|
||||
if azure_credential is not None:
|
||||
api_key = azure_credential.get_token("https://cognitiveservices.azure.com/.default").token
|
||||
else:
|
||||
api_key = embedding_model_key if embedding_model_key else os.getenv("AZURE_OPENAI_API_KEY")
|
||||
|
||||
client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, api_key=api_key)
|
||||
if FLAG_AOAI == "V2":
|
||||
embeddings = client.embeddings.create(model=deployment_id, input=text, timeout=120)
|
||||
elif FLAG_AOAI == "V3":
|
||||
embeddings = client.embeddings.create(model=deployment_id,
|
||||
input=text,
|
||||
dimensions=int(os.getenv("VECTOR_DIMENSION", 1536)), timeout=120)
|
||||
|
||||
return embeddings.model_dump()['data'][0]['embedding']
|
||||
|
||||
if FLAG_EMBEDDING_MODEL == "COHERE":
|
||||
raise Exception("COHERE is not supported for now")
|
||||
# if FLAG_COHERE == "MULTILINGUAL":
|
||||
# key = embedding_model_key if embedding_model_key else os.getenv("COHERE_MULTILINGUAL_API_KEY")
|
||||
# elif FLAG_COHERE == "ENGLISH":
|
||||
# key = embedding_model_key if embedding_model_key else os.getenv("COHERE_ENGLISH_API_KEY")
|
||||
# data, headers = get_payload_and_headers_cohere(text, key)
|
||||
|
||||
# with httpx.Client() as client:
|
||||
# response = client.post(endpoint, json=data, headers=headers)
|
||||
# result_content = response.json()
|
||||
|
||||
# return result_content["embeddings"][0]
|
||||
|
||||
if FLAG_EMBEDDING_MODEL:
|
||||
headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {embedding_model_key}' }
|
||||
data = { "model": FLAG_EMBEDDING_MODEL, "input": text }
|
||||
|
||||
client = get_ai_inference_client()
|
||||
response = client.post(endpoint, json=data, headers=headers)
|
||||
result_content = response.json()
|
||||
|
||||
return result_content["data"][0]["embedding"]
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting embeddings with endpoint={endpoint} with error={e}")
|
||||
raise Exception(f"Error getting embeddings with endpoint={endpoint} with error={e}")
|
||||
|
||||
|
||||
def upload_figure(blob_sas_url: str, origin_file_name: str, data: bytes) -> str:
|
||||
for i in range(3):
|
||||
try:
|
||||
# Upload image to Azure Blob
|
||||
fileName = generate_filename()
|
||||
container_client = ContainerClient.from_container_url(blob_sas_url)
|
||||
blob = container_client.upload_blob(name=f"{fileName}.png", data=data)
|
||||
return urlunparse(urlparse(blob.url)._replace(query='', fragment=''))
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Error uploading figure with error={e}, retrying, currently at {i + 1} retry, {3 - (i + 1)} retries left")
|
||||
time.sleep(3)
|
||||
raise Exception(f"Error uploading figure for: {orgin_file_name}")
|
||||
|
||||
def generate_filename(length:int=8):
|
||||
"""Generate a unique 10-character ID using UUID"""
|
||||
t = int(time.time() * 1000) % 1000000
|
||||
base = uuid.uuid4().hex[:length]
|
||||
return f"{t:06x}{base}"
|
||||
841
vw-document-ai-indexer/docs/design.md
Normal file
@@ -0,0 +1,841 @@
|
||||
# Document AI Indexer - Design Document
|
||||
|
||||
## Overview
|
||||
|
||||
The Document AI Indexer is an intelligent document processing and indexing system built on Azure AI services. It provides comprehensive document extraction, processing, and vectorized indexing capabilities for multiple document formats, enabling advanced search and retrieval functionality.
|
||||
|
||||
### Design Philosophy
|
||||
|
||||
The system is designed with several key principles in mind:
|
||||
|
||||
**Modularity and Separation of Concerns**: The architecture follows a layered approach with clear separation between application logic, business logic, service layer, and data access. This ensures maintainability and allows for easy testing and modification of individual components.
|
||||
|
||||
**Scalability and Performance**: Built with asynchronous processing capabilities and horizontal scaling in mind. The system can handle large volumes of documents through configurable parallel processing and efficient resource utilization.
|
||||
|
||||
**Resilience and Fault Tolerance**: Implements comprehensive error handling, retry mechanisms, and graceful degradation to ensure reliable operation even when external services experience issues.
|
||||
|
||||
**Configuration-Driven Architecture**: Utilizes YAML-based configuration management that allows for flexible deployment across different environments without code changes.
|
||||
|
||||
**Cloud-Native Design**: Leverages Azure services for AI processing, storage, and search capabilities while maintaining vendor independence through abstraction layers.
|
||||
|
||||
## Features
|
||||
|
||||
### 🚀 Core Features
|
||||
|
||||
- **Multi-format Document Support**: Handles PDF, DOCX, images (JPEG, PNG, TIFF, etc.), and other document formats
|
||||
- **Intelligent Content Extraction**: Leverages Azure Document Intelligence for OCR and structured data extraction
|
||||
- **Smart Document Chunking**: Implements hierarchy-aware chunking with configurable token limits and overlap
|
||||
- **Vector Search Integration**: Automatic Azure AI Search index creation and document vectorization
|
||||
- **Metadata Management**: Complete extraction and management of document metadata and custom fields
|
||||
- **Hierarchy Structure Repair**: Automatic correction of title hierarchy structure in Markdown documents
|
||||
- **Figure and Formula Extraction**: Advanced extraction of visual elements and mathematical formulas
|
||||
|
||||
### 🔧 Technical Features
|
||||
|
||||
- **Asynchronous Processing**: High-performance async processing using asyncio and task queues
|
||||
- **Containerized Deployment**: Complete Docker and Kubernetes support with configurable environments
|
||||
- **Configuration Management**: Flexible YAML-based configuration for different deployment scenarios
|
||||
- **Database Support**: SQLAlchemy ORM with support for multiple database backends
|
||||
- **Resilient Processing**: Built-in retry mechanisms, error handling, and fault tolerance
|
||||
- **Monitoring & Logging**: Comprehensive logging, progress monitoring, and processing statistics
|
||||
- **Scalable Architecture**: Horizontal scaling support through containerization and task distribution
|
||||
|
||||
## System Architecture
|
||||
|
||||
The Document AI Indexer follows a multi-layered architecture designed for scalability, maintainability, and robust error handling. The system processes documents through a well-defined pipeline that transforms raw documents into searchable, vectorized content.
|
||||
|
||||
### Architectural Patterns
|
||||
|
||||
**Service Factory Pattern**: The system uses a centralized ServiceFactory to manage dependencies and service creation. This pattern ensures consistent configuration across all services and enables easy testing through dependency injection.
|
||||
|
||||
**Repository Pattern**: Data access is abstracted through repository interfaces, allowing for different storage backends and simplified testing with mock implementations.
|
||||
|
||||
**Command Pattern**: Document processing tasks are encapsulated as commands that can be queued, retried, and executed asynchronously.
|
||||
|
||||
**Pipeline Pattern**: The document processing workflow follows a clear pipeline with distinct stages: extraction, hierarchy fixing, chunking, vectorization, and indexing.
|
||||
|
||||
### High-Level Architecture
|
||||
|
||||
The high-level architecture represents a distributed, service-oriented system designed for scalable document processing and intelligent content extraction. The architecture emphasizes separation of concerns, fault tolerance, and cloud-native principles to handle enterprise-scale document processing workloads.
|
||||
|
||||
#### Architectural Overview
|
||||
|
||||
**Multi-Layered Design**: The system is organized into distinct functional layers that separate data ingestion, processing logic, AI services, and storage concerns. This layered approach enables independent scaling, testing, and maintenance of different system components.
|
||||
|
||||
**Service-Oriented Architecture**: Each major functional area is implemented as a distinct service or component group, enabling independent deployment, scaling, and maintenance. Services communicate through well-defined interfaces and can be replaced or upgraded independently.
|
||||
|
||||
**Cloud-Native Integration**: The architecture leverages Azure cloud services for AI processing, storage, and search capabilities while maintaining abstraction layers that enable portability and testing flexibility.
|
||||
|
||||
**Event-Driven Processing**: The system follows an event-driven model where document processing is triggered by events (new documents, configuration changes, etc.) and progresses through a series of processing stages with clear state transitions.
|
||||
|
||||
#### System Components and Responsibilities
|
||||
|
||||
**Data Sources Layer**: Manages document ingestion from various sources including Azure Blob Storage and local file systems. This layer handles authentication, access control, and metadata extraction from source systems. It provides a unified interface for document discovery regardless of the underlying storage mechanism.
|
||||
|
||||
**Processing Engine Layer**: Orchestrates the entire document processing workflow through a hierarchical task management system. The Main Application serves as the central coordinator, while the Task Processor manages work distribution and the Document Task Processor handles individual document processing operations with full state tracking and error recovery.
|
||||
|
||||
**AI Services Layer**: Provides intelligent document processing capabilities through integration with Azure AI services and optional Vision LLM systems. These services handle complex operations like OCR, layout analysis, content extraction, and embedding generation. The modular design allows for easy integration of additional AI services or replacement of existing ones.
|
||||
|
||||
**Processing Pipeline Layer**: Implements the core document transformation logic through a series of processing stages. Each stage has specific responsibilities: content extraction converts raw documents to structured text, hierarchy fixing normalizes document structure, chunking creates manageable content segments, and vector generation produces searchable embeddings.
|
||||
|
||||
**Storage & Search Layer**: Manages persistent data storage and search capabilities through a combination of relational database storage for metadata and state management, Azure AI Search for vector-based content search, and blob storage for processed content and temporary files.
|
||||
|
||||
#### Data Flow and Integration Patterns
|
||||
|
||||
**Asynchronous Processing Flow**: Documents flow through the system asynchronously, enabling high throughput and efficient resource utilization. Each processing stage can operate independently, with clear handoff points and state persistence between stages.
|
||||
|
||||
**Fault-Tolerant Design**: The architecture includes comprehensive error handling and recovery mechanisms at every level. Failed operations are tracked, logged, and can be retried with exponential backoff. The system maintains processing state to enable recovery from failures without losing work.
|
||||
|
||||
**Scalability Patterns**: The architecture supports both vertical and horizontal scaling through stateless processing components, connection pooling, and queue-based work distribution. Different components can be scaled independently based on their specific resource requirements and bottlenecks.
|
||||
|
||||
**Configuration-Driven Behavior**: The system behavior is largely controlled through configuration rather than code changes, enabling flexible deployment across different environments and use cases without requiring code modifications or redeployment.
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Data Sources"
|
||||
DS[Document Sources<br/>Azure Blob Storage/Local Files]
|
||||
META[Metadata<br/>Configuration]
|
||||
end
|
||||
|
||||
subgraph "Processing Engine"
|
||||
MAIN[Main Application<br/>Orchestrator]
|
||||
TP[Task Processor<br/>Queue Management]
|
||||
DTP[Document Task<br/>Processor]
|
||||
end
|
||||
|
||||
subgraph "AI Services"
|
||||
ADI[Azure Document<br/>Intelligence]
|
||||
EMBED[Embedding<br/>Service]
|
||||
VLLM[Vision LLM<br/>Optional]
|
||||
end
|
||||
|
||||
subgraph "Processing Pipeline"
|
||||
EXTRACT[Content<br/>Extraction]
|
||||
HIERARCHY[Hierarchy<br/>Fix]
|
||||
CHUNK[Document<br/>Chunking]
|
||||
VECTOR[Vector<br/>Generation]
|
||||
end
|
||||
|
||||
subgraph "Storage & Search"
|
||||
DB[(Database<br/>SQLAlchemy)]
|
||||
AAS[Azure AI Search<br/>Index]
|
||||
BLOB[Azure Blob<br/>Storage]
|
||||
end
|
||||
|
||||
DS --> MAIN
|
||||
META --> MAIN
|
||||
MAIN --> TP
|
||||
TP --> DTP
|
||||
DTP --> EXTRACT
|
||||
|
||||
EXTRACT --> ADI
|
||||
EXTRACT --> VLLM
|
||||
ADI --> HIERARCHY
|
||||
HIERARCHY --> CHUNK
|
||||
CHUNK --> VECTOR
|
||||
VECTOR --> EMBED
|
||||
|
||||
DTP --> DB
|
||||
VECTOR --> AAS
|
||||
EXTRACT --> BLOB
|
||||
|
||||
style DS fill:#e1f5fe
|
||||
style ADI fill:#f3e5f5
style AAS fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Component Architecture
|
||||
|
||||
The component architecture illustrates the internal structure and dependencies between different layers of the system. Each layer has specific responsibilities and communicates through well-defined interfaces.
|
||||
|
||||
**Application Layer**: Handles application initialization, configuration loading, and high-level orchestration. The ApplicationContext manages the overall application state and provides access to configuration and services.
|
||||
|
||||
**Business Layer**: Contains the core business logic for document processing. The DocumentProcessingOrchestrator coordinates the entire processing workflow, while the DocumentProcessor handles individual document processing tasks.
|
||||
|
||||
**Service Layer**: Provides abstracted access to external services and resources. The ServiceFactory manages service creation and configuration, ensuring consistent behavior across the application.
|
||||
|
||||
**Data Layer**: Manages data persistence and retrieval through repository patterns and entity models. This layer abstracts database operations and provides a clean interface for data access.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "Application Layer"
|
||||
APP[DocumentProcessingApplication]
|
||||
CTX[ApplicationContext]
|
||||
CONFIG[ApplicationConfig]
|
||||
end
|
||||
|
||||
subgraph "Business Layer"
|
||||
BL[Business Layer]
|
||||
ORCH[DocumentProcessingOrchestrator]
|
||||
PROC[DocumentProcessor]
|
||||
FACTORY[DocumentProcessingFactory]
|
||||
end
|
||||
|
||||
subgraph "Service Layer"
|
||||
SF[ServiceFactory]
|
||||
DI[DocumentIntelligenceService]
|
||||
CHUNK[ChunkService]
|
||||
INDEX[AzureIndexService]
|
||||
BLOB[BlobService]
|
||||
end
|
||||
|
||||
subgraph "Data Layer"
|
||||
DB[DatabaseInterface]
|
||||
REPO[DocumentRepository]
|
||||
MODELS[Entity Models]
|
||||
end
|
||||
|
||||
APP --> BL
|
||||
CTX --> CONFIG
|
||||
APP --> CTX
|
||||
|
||||
BL --> SF
|
||||
ORCH --> PROC
|
||||
FACTORY --> ORCH
|
||||
|
||||
SF --> DI
|
||||
SF --> CHUNK
|
||||
SF --> INDEX
|
||||
SF --> BLOB
|
||||
|
||||
PROC --> DB
|
||||
DB --> REPO
|
||||
REPO --> MODELS
|
||||
|
||||
style APP fill:#bbdefb
|
||||
style BL fill:#c8e6c9
|
||||
style SF fill:#ffecb3
|
||||
style DB fill:#f8bbd9
|
||||
```
|
||||
|
||||
## Workflow
|
||||
|
||||
The document processing workflow is designed to handle large-scale document processing with fault tolerance and efficient resource utilization. The system processes documents asynchronously through a task-based architecture.
|
||||
|
||||
### Processing Strategy
|
||||
|
||||
**Asynchronous Task Processing**: Documents are processed as individual tasks that can be executed in parallel. This approach maximizes throughput and allows for efficient resource utilization across multiple processing nodes.
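
A minimal sketch of the bounded-concurrency idea is shown below. It uses asyncio with a semaphore so that only a fixed number of documents are processed at once; the `process_document` coroutine and `run_tasks` name are illustrative placeholders, not the project's actual task API, and the default parallelism of 12 simply mirrors the `njobs: 12` setting seen in env.yaml.

```python
import asyncio
from typing import Awaitable, Callable

async def run_tasks(doc_keys: list[str],
                    process_document: Callable[[str], Awaitable[str]],
                    max_parallel: int = 12) -> list[str]:
    """Process documents concurrently while capping the number of in-flight tasks."""
    semaphore = asyncio.Semaphore(max_parallel)

    async def worker(key: str) -> str:
        async with semaphore:  # limit concurrent documents
            return await process_document(key)

    # return_exceptions keeps per-task failures without cancelling the whole batch
    results = await asyncio.gather(*(worker(k) for k in doc_keys),
                                   return_exceptions=True)
    return [r for r in results if not isinstance(r, Exception)]
```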
|
||||
|
||||
**Stateful Processing**: Each document's processing state is tracked in the database, enabling recovery from failures and preventing duplicate processing. The system maintains detailed status information and processing history.
|
||||
|
||||
**Batch Operations**: Where possible, operations are batched to improve efficiency. This is particularly important for operations like embedding generation and search index uploads.
|
||||
|
||||
**Retry Logic**: Failed operations are automatically retried with exponential backoff. The system distinguishes between transient failures (which should be retried) and permanent failures (which should be logged and skipped).
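
To make the retry behavior concrete, here is a minimal sketch of exponential backoff with jitter. The `TransientError` class, delay values, and retry count are illustrative assumptions rather than the project's actual exception hierarchy or settings.

```python
import random
import time

class TransientError(Exception):
    """Errors worth retrying (timeouts, throttling, brief outages); assumed name."""

def with_backoff(operation, max_retries: int = 3, base_delay: float = 2.0):
    """Retry `operation` with exponential backoff; permanent failures propagate immediately."""
    for attempt in range(max_retries + 1):
        try:
            return operation()
        except TransientError:
            if attempt == max_retries:
                raise
            # exponential backoff with jitter: ~2s, ~4s, ~8s, ...
            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 1))
        # any other exception type is treated as permanent and is not caught here
```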
|
||||
|
||||
### Document Processing Workflow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant USER as User/Scheduler
|
||||
participant MAIN as Main App
|
||||
participant TP as Task Processor
|
||||
participant DTP as Document Task Processor
|
||||
participant ORCH as Orchestrator
|
||||
participant ADI as Azure DI
|
||||
participant CHUNK as Chunk Service
|
||||
participant INDEX as Index Service
|
||||
participant DB as Database
|
||||
|
||||
USER->>MAIN: Start Processing
|
||||
MAIN->>MAIN: Initialize Configuration
|
||||
MAIN->>DB: Initialize Database
|
||||
MAIN->>TP: Create Task Processor
|
||||
|
||||
loop For Each Document
|
||||
MAIN->>TP: Submit Document Task
|
||||
TP->>DTP: Process Task
|
||||
DTP->>DB: Create/Update IndexObject
|
||||
DTP->>ORCH: Execute Processing
|
||||
|
||||
ORCH->>ADI: Extract Document Content
|
||||
ADI-->>ORCH: Return Extracted Content
|
||||
|
||||
ORCH->>ORCH: Fix Hierarchy
|
||||
ORCH->>CHUNK: Chunk Document
|
||||
CHUNK-->>ORCH: Return Chunks
|
||||
|
||||
ORCH->>INDEX: Generate Embeddings
|
||||
INDEX-->>ORCH: Return Vectors
|
||||
|
||||
ORCH->>INDEX: Upload to Search Index
|
||||
INDEX-->>ORCH: Confirm Upload
|
||||
|
||||
ORCH-->>DTP: Return Processing Result
|
||||
DTP->>DB: Update IndexObject Status
|
||||
DTP-->>TP: Return Result
|
||||
end
|
||||
|
||||
TP-->>MAIN: Processing Complete
|
||||
MAIN-->>USER: Return Statistics
|
||||
```
|
||||
|
||||
### Data Flow Architecture
|
||||
|
||||
The data flow architecture represents the end-to-end processing pipeline from document ingestion to search index publication. This design emphasizes fault tolerance, scalability, and efficient resource utilization throughout the processing lifecycle.
|
||||
|
||||
#### Design Principles for Data Flow
|
||||
|
||||
**Pipeline-Based Processing**: The data flow follows a clear pipeline pattern where each stage has specific responsibilities and well-defined inputs and outputs. This design enables parallel processing, easier debugging, and modular testing of individual stages.
|
||||
|
||||
**Decision Points and Routing**: The architecture includes intelligent decision points that route documents through appropriate processing paths based on their characteristics. This ensures optimal processing strategies for different document types while maintaining a unified interface.
|
||||
|
||||
**State Management**: Processing state is carefully managed throughout the pipeline, with persistent state stored in the database and transient state maintained in memory. This approach enables recovery from failures at any point in the pipeline.
|
||||
|
||||
**Resource Optimization**: The flow is designed to minimize resource usage through efficient batching, connection reuse, and memory management. Processing stages are optimized to balance throughput with resource consumption.
|
||||
|
||||
#### Processing Flow Stages
|
||||
|
||||
**Initialization Phase**: The system performs comprehensive initialization including configuration validation, database connectivity checks, and service authentication. This phase ensures that all dependencies are available before processing begins.
|
||||
|
||||
**Discovery and Task Creation**: Document sources are scanned to identify new or modified documents that require processing. Tasks are created based on configured criteria such as file modification dates and processing history.
|
||||
|
||||
**Format Detection and Routing**: Documents are analyzed to determine their format and complexity, enabling the system to select the most appropriate extraction method. This intelligent routing ensures optimal processing quality and efficiency.
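
The routing decision can be pictured as a small dispatch function like the sketch below. The extension sets and path names ("vision-llm", "document-intelligence", "direct") are illustrative assumptions that mirror the decision diagram later in this document, not the exact routing table used by the indexer.

```python
import os

# Illustrative extension sets; the real configuration may differ.
IMAGE_EXTS = {"jpg", "jpeg", "png", "bmp", "tif", "tiff", "heif"}
TEXT_EXTS = {"txt", "md", "csv"}

def choose_extraction_path(file_path: str, vision_mode: bool = False) -> str:
    """Return which extraction path a document should take."""
    ext = os.path.splitext(file_path)[1].lstrip(".").lower()
    if vision_mode and ext in IMAGE_EXTS | {"pdf"}:
        return "vision-llm"
    if ext == "pdf" or ext in IMAGE_EXTS:
        return "document-intelligence"
    if ext in TEXT_EXTS:
        return "direct"
    return "document-intelligence"  # default to the most capable extractor
```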
|
||||
|
||||
**Content Extraction**: Multiple extraction paths are available depending on document characteristics. The system can leverage Azure Document Intelligence for complex documents, Vision LLM for advanced image analysis, or direct processing for simple text documents.
|
||||
|
||||
**Content Enhancement**: Extracted content undergoes enhancement through hierarchy fixing and structure normalization. This stage ensures that the processed content maintains logical structure and is suitable for effective chunking.
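
The core idea behind hierarchy fixing can be sketched as clamping markdown heading levels so a heading never jumps more than one level below its parent. This is only an illustration of the principle (it ignores fenced code blocks, for example) and is not the project's actual `header_fix` implementation.

```python
import re

_HEADING = re.compile(r"^(#{1,6})\s+(.*)$")

def fix_header_hierarchy(markdown: str) -> str:
    """Clamp heading levels so no heading skips levels; the first heading becomes level 1."""
    fixed_lines: list[str] = []
    prev_level = 0
    for line in markdown.splitlines():
        match = _HEADING.match(line)
        if match:
            level = len(match.group(1))
            # a jump from "#" straight to "####" is reduced to "##"
            level = min(level, prev_level + 1) if prev_level else 1
            prev_level = level
            fixed_lines.append("#" * level + " " + match.group(2))
        else:
            fixed_lines.append(line)
    return "\n".join(fixed_lines)
```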
|
||||
|
||||
**Vectorization and Indexing**: The final stages convert processed content into searchable vectors and upload them to the search index. These operations are batched for efficiency and include comprehensive error handling and retry logic.
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
START([Start Processing]) --> INIT[Initialize Application]
|
||||
INIT --> LOAD_CONFIG[Load Configuration]
|
||||
LOAD_CONFIG --> INIT_DB[Initialize Database]
|
||||
INIT_DB --> SCAN_DOCS[Scan Document Sources]
|
||||
|
||||
SCAN_DOCS --> CREATE_TASKS[Create Processing Tasks]
|
||||
CREATE_TASKS --> PROCESS_TASK{Process Each Task}
|
||||
|
||||
PROCESS_TASK --> EXTRACT[Extract Content]
|
||||
EXTRACT --> CHECK_FORMAT{Check Document Format}
|
||||
|
||||
CHECK_FORMAT -->|PDF/Images| USE_DI[Use Azure Document Intelligence]
|
||||
CHECK_FORMAT -->|Vision Mode| USE_VLLM[Use Vision LLM]
|
||||
CHECK_FORMAT -->|Text| DIRECT_PROCESS[Direct Processing]
|
||||
|
||||
USE_DI --> EXTRACT_RESULT[Content + Metadata]
|
||||
USE_VLLM --> EXTRACT_RESULT
|
||||
DIRECT_PROCESS --> EXTRACT_RESULT
|
||||
|
||||
EXTRACT_RESULT --> FIX_HIERARCHY[Fix Document Hierarchy]
|
||||
FIX_HIERARCHY --> CHUNK_DOC[Chunk Document]
|
||||
CHUNK_DOC --> GENERATE_VECTORS[Generate Embeddings]
|
||||
GENERATE_VECTORS --> UPLOAD_INDEX[Upload to Search Index]
|
||||
|
||||
UPLOAD_INDEX --> UPDATE_DB[Update Database Status]
|
||||
UPDATE_DB --> MORE_TASKS{More Tasks?}
|
||||
|
||||
MORE_TASKS -->|Yes| PROCESS_TASK
|
||||
MORE_TASKS -->|No| COMPLETE[Processing Complete]
|
||||
|
||||
COMPLETE --> STATS[Generate Statistics]
|
||||
STATS --> END([End])
|
||||
|
||||
style START fill:#c8e6c9
|
||||
style END fill:#ffcdd2
|
||||
style EXTRACT fill:#fff3e0
|
||||
style GENERATE_VECTORS fill:#e1f5fe
|
||||
style UPLOAD_INDEX fill:#f3e5f5
|
||||
```
|
||||
|
||||
## Functional Logic
|
||||
|
||||
The functional logic of the Document AI Indexer encompasses three main processing areas: document extraction, content chunking, and search indexing. Each area implements sophisticated algorithms to ensure high-quality output.
|
||||
|
||||
### Design Principles for Document Processing
|
||||
|
||||
**Format-Agnostic Processing**: The system handles multiple document formats through a unified interface. Different extractors are used based on document type, but all produce a standardized Document object.
|
||||
|
||||
**Intelligent Content Analysis**: Before processing, the system analyzes document structure to determine the optimal processing strategy. This includes detecting header hierarchies, identifying figures and tables, and understanding document layout.
|
||||
|
||||
**Quality Assurance**: Each processing stage includes validation and quality checks. For example, the hierarchy fixer validates that document structure is logical and coherent before proceeding to chunking.
|
||||
|
||||
**Metadata Preservation**: Throughout the processing pipeline, important metadata is preserved and enriched. This includes document properties, processing timestamps, and structural information.
|
||||
|
||||
### Document Extraction Logic
|
||||
|
||||
The document extraction logic is the foundation of the processing pipeline. It handles the complex task of converting various document formats into structured, searchable content while preserving important layout and formatting information.
|
||||
|
||||
**Multi-Modal Processing**: The system supports both traditional OCR-based extraction and advanced vision-language model processing. The choice of extraction method depends on document complexity and available resources.
|
||||
|
||||
**Feature Detection**: Azure Document Intelligence features are selectively enabled based on document characteristics and configuration. This includes high-resolution OCR for detailed documents, formula extraction for technical content, and figure extraction for visual elements.
|
||||
|
||||
**Content Structure Preservation**: The extraction process maintains document structure through markdown formatting, preserving headers, lists, tables, and other formatting elements that provide context for the content.
|
||||
|
||||
**Error Handling and Fallbacks**: If advanced extraction features fail, the system falls back to basic extraction methods to ensure that content is not lost due to processing errors.
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
DOC[Document Input] --> DETECT[Detect Format]
|
||||
|
||||
DETECT --> PDF{PDF?}
|
||||
DETECT --> IMG{Image?}
|
||||
DETECT --> OFFICE{Office Doc?}
|
||||
DETECT --> TEXT{Text File?}
|
||||
|
||||
PDF -->|Yes| DI_PDF[Azure DI Layout Model]
|
||||
IMG -->|Yes| RESIZE[Resize if Needed]
|
||||
OFFICE -->|Yes| CONVERT[Convert to Supported Format]
|
||||
TEXT -->|Yes| DIRECT[Direct Content Read]
|
||||
|
||||
RESIZE --> DI_IMG[Azure DI OCR + Layout]
|
||||
CONVERT --> DI_OFFICE[Azure DI Document Analysis]
|
||||
|
||||
DI_PDF --> FEATURES[Apply DI Features]
|
||||
DI_IMG --> FEATURES
|
||||
DI_OFFICE --> FEATURES
|
||||
|
||||
FEATURES --> HIGH_RES{High Resolution OCR?}
|
||||
FEATURES --> FORMULAS{Extract Formulas?}
|
||||
FEATURES --> FIGURES{Extract Figures?}
|
||||
|
||||
HIGH_RES -->|Yes| ENABLE_HIRES[Enable High-Res OCR]
|
||||
FORMULAS -->|Yes| ENABLE_FORMULAS[Enable Formula Extraction]
|
||||
FIGURES -->|Yes| ENABLE_FIGURES[Enable Figure Extraction]
|
||||
|
||||
ENABLE_HIRES --> PROCESS_DI[Process with Azure DI]
|
||||
ENABLE_FORMULAS --> PROCESS_DI
|
||||
ENABLE_FIGURES --> PROCESS_DI
|
||||
HIGH_RES -->|No| PROCESS_DI
|
||||
FORMULAS -->|No| PROCESS_DI
|
||||
FIGURES -->|No| PROCESS_DI
|
||||
|
||||
DIRECT --> EXTRACT_META[Extract Metadata]
|
||||
PROCESS_DI --> EXTRACT_CONTENT[Extract Content + Structure]
|
||||
|
||||
EXTRACT_CONTENT --> EXTRACT_META
|
||||
EXTRACT_META --> RESULT[Document Object]
|
||||
|
||||
style DOC fill:#e3f2fd
|
||||
style RESULT fill:#c8e6c9
|
||||
style PROCESS_DI fill:#fff3e0
|
||||
```
|
||||
|
||||
### Chunking Strategy
|
||||
|
||||
The chunking strategy is critical for creating meaningful, searchable segments from large documents. The system implements intelligent chunking that respects document structure while maintaining optimal chunk sizes for search and retrieval.
|
||||
|
||||
**Hierarchy-Aware Chunking**: The system analyzes document structure and uses markdown headers to create logical chunks. This ensures that related content stays together and that chunks maintain contextual coherence.
|
||||
|
||||
**Adaptive Chunking**: Chunk boundaries are determined by both content structure and token limits. The system balances the need for complete thoughts with search engine constraints.
|
||||
|
||||
**Overlap Strategy**: Configurable token overlap between chunks ensures that important information at chunk boundaries is not lost during retrieval operations.
|
||||
|
||||
**Token Management**: Precise token counting using tiktoken ensures that chunks stay within specified limits while maximizing content density.
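
As a rough illustration of token-bounded splitting, the sketch below uses tiktoken with the chunk size and overlap shown in the diagram that follows (2048 tokens with a 128-token overlap). The encoding name is an assumption, and the real chunker additionally respects header boundaries rather than splitting on raw token positions.

```python
import tiktoken

def chunk_by_tokens(text: str, chunk_size: int = 2048, overlap: int = 128) -> list[str]:
    """Split text into token-bounded chunks with a fixed token overlap."""
    enc = tiktoken.get_encoding("cl100k_base")  # encoding name is an assumption
    tokens = enc.encode(text)
    chunks: list[str] = []
    step = chunk_size - overlap
    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(enc.decode(window))
        if start + chunk_size >= len(tokens):
            break
    return chunks
```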
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
CONTENT[Extracted Content] --> HIERARCHY_FIX{Apply Hierarchy Fix?}
|
||||
|
||||
HIERARCHY_FIX -->|Yes| FIX[Fix Header Hierarchy]
|
||||
HIERARCHY_FIX -->|No| CHUNK_STRATEGY[Determine Chunking Strategy]
|
||||
|
||||
FIX --> ANALYZE[Analyze Document Structure]
|
||||
ANALYZE --> CHUNK_STRATEGY
|
||||
|
||||
CHUNK_STRATEGY --> MARKDOWN{Markdown Headers?}
|
||||
CHUNK_STRATEGY --> RECURSIVE{Use Recursive Split?}
|
||||
|
||||
MARKDOWN -->|Yes| HEADER_SPLIT[Markdown Header Splitter]
|
||||
MARKDOWN -->|No| RECURSIVE
|
||||
RECURSIVE -->|Yes| CHAR_SPLIT[Recursive Character Splitter]
|
||||
|
||||
HEADER_SPLIT --> CONFIG[Apply Chunk Configuration]
|
||||
CHAR_SPLIT --> CONFIG
|
||||
|
||||
CONFIG --> SIZE[Chunk Size: 2048 tokens]
|
||||
CONFIG --> OVERLAP[Token Overlap: 128]
|
||||
|
||||
SIZE --> SPLIT[Split Document]
|
||||
OVERLAP --> SPLIT
|
||||
|
||||
SPLIT --> VALIDATE[Validate Chunk Sizes]
|
||||
VALIDATE --> METADATA[Add Chunk Metadata]
|
||||
|
||||
METADATA --> RESULT[Chunked Documents]
|
||||
|
||||
style CONTENT fill:#e3f2fd
|
||||
style RESULT fill:#c8e6c9
|
||||
style FIX fill:#fff3e0
|
||||
style SPLIT fill:#f3e5f5
|
||||
```
|
||||
|
||||
### Indexing and Search Integration
|
||||
|
||||
The indexing and search integration component handles the final stage of the processing pipeline, converting processed documents into searchable vector representations and uploading them to Azure AI Search.
|
||||
|
||||
**Vector Generation**: The system generates high-quality embeddings using Azure OpenAI services. Multiple vector fields can be configured to support different search scenarios (content-based, metadata-based, etc.).
|
||||
|
||||
**Batch Processing**: Documents are processed in configurable batches to optimize upload performance and manage API rate limits effectively.
|
||||
|
||||
**Schema Management**: The system automatically creates and manages search index schemas based on configuration files, ensuring that all required fields and vector configurations are properly set up.
|
||||
|
||||
**Error Recovery**: Failed uploads are tracked and retried, with detailed logging to help diagnose and resolve issues. The system can recover from partial batch failures without losing processed content.
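
A hedged sketch of batched uploading with per-batch retry is shown below, using the azure-search-documents SDK and the batch size of 50 shown in the diagram that follows. Index name, retry counts, and error handling details are illustrative assumptions.

```python
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

def upload_in_batches(endpoint: str, index_name: str, key: str,
                      docs: list[dict], batch_size: int = 50, max_retries: int = 3) -> None:
    """Upload documents in fixed-size batches, retrying a batch on failure."""
    client = SearchClient(endpoint, index_name, AzureKeyCredential(key))
    for i in range(0, len(docs), batch_size):
        batch = docs[i:i + batch_size]
        for attempt in range(max_retries):
            try:
                results = client.upload_documents(documents=batch)
                failed = [r.key for r in results if not r.succeeded]
                if not failed:
                    break
                print(f"Batch {i // batch_size}: {len(failed)} documents failed, retrying")
            except Exception as exc:
                print(f"Batch {i // batch_size} upload error: {exc}, retrying")
        else:
            raise RuntimeError(f"Batch starting at {i} failed after {max_retries} retries")
```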
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
CHUNKS[Document Chunks] --> EMBED[Generate Embeddings]
|
||||
|
||||
EMBED --> OPENAI[Azure OpenAI API]
|
||||
OPENAI --> VECTORS[Vector Embeddings]
|
||||
|
||||
VECTORS --> PREPARE[Prepare Index Documents]
|
||||
PREPARE --> METADATA[Add Metadata Fields]
|
||||
|
||||
METADATA --> CUSTOM[Add Custom Fields]
|
||||
CUSTOM --> BATCH[Create Upload Batches]
|
||||
|
||||
BATCH --> SIZE[Batch Size: 50 docs]
|
||||
SIZE --> UPLOAD[Upload to Azure AI Search]
|
||||
|
||||
UPLOAD --> SUCCESS{Upload Successful?}
|
||||
SUCCESS -->|Yes| UPDATE_STATUS[Update Success Status]
|
||||
SUCCESS -->|No| RETRY[Retry Upload]
|
||||
|
||||
RETRY --> MAX_RETRIES{Max Retries Reached?}
|
||||
MAX_RETRIES -->|No| UPLOAD
|
||||
MAX_RETRIES -->|Yes| ERROR[Mark as Failed]
|
||||
|
||||
UPDATE_STATUS --> NEXT_BATCH{More Batches?}
|
||||
NEXT_BATCH -->|Yes| BATCH
|
||||
NEXT_BATCH -->|No| COMPLETE[Index Complete]
|
||||
|
||||
ERROR --> LOG[Log Error Details]
|
||||
LOG --> COMPLETE
|
||||
|
||||
style CHUNKS fill:#e3f2fd
|
||||
style COMPLETE fill:#c8e6c9
|
||||
style EMBED fill:#fff3e0
|
||||
style UPLOAD fill:#f3e5f5
|
||||
style ERROR fill:#ffcdd2
|
||||
```
|
||||
|
||||
## Database Schema
|
||||
|
||||
The database schema is designed to support scalable document processing operations while maintaining data integrity and enabling efficient querying. The schema tracks processing state, manages job coordination, and provides audit trails.
|
||||
|
||||
### Design Rationale
|
||||
|
||||
**Composite Primary Keys**: The IndexObject table uses composite primary keys (object_key, datasource_name) to support multi-tenant scenarios where the same document might exist in different data sources.
|
||||
|
||||
**State Tracking**: Detailed status tracking allows the system to resume processing after failures and provides visibility into processing progress and issues.
|
||||
|
||||
**Audit Trail**: Comprehensive timestamp tracking and detailed message logging provide full audit trails for compliance and debugging purposes.
|
||||
|
||||
**Job Coordination**: The IndexJob table enables coordination of processing jobs across multiple instances and provides reporting on job completion and success rates.
|
||||
|
||||
### Core Entities
|
||||
|
||||
```mermaid
|
||||
erDiagram
|
||||
IndexObject {
|
||||
string object_key PK
|
||||
string datasource_name PK
|
||||
string type
|
||||
string status
|
||||
datetime created_time
|
||||
datetime updated_time
|
||||
datetime last_start_time
|
||||
datetime last_finished_time
|
||||
int try_count
|
||||
int last_run_id
|
||||
text detailed_message
|
||||
text error_message
|
||||
text last_message
|
||||
}
|
||||
|
||||
IndexJob {
|
||||
int id PK
|
||||
string datasource_name
|
||||
string status
|
||||
datetime start_time
|
||||
datetime end_time
|
||||
int total_files
|
||||
int processed_files
|
||||
int failed_files
|
||||
int skipped_files
|
||||
text config_snapshot
|
||||
text error_message
|
||||
}
|
||||
|
||||
IndexObject }o--|| IndexJob : belongs_to
|
||||
```
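
The composite-key design can be sketched in SQLAlchemy as follows. Column lengths, defaults, and the subset of fields shown are assumptions for illustration; the project's actual entity models may differ.

```python
from datetime import datetime
from sqlalchemy import Column, DateTime, Integer, String, Text
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class IndexObject(Base):
    """Per-document processing state; (object_key, datasource_name) is the composite primary key."""
    __tablename__ = "index_object"

    object_key = Column(String(512), primary_key=True)
    datasource_name = Column(String(128), primary_key=True)
    type = Column(String(32))
    status = Column(String(32))
    created_time = Column(DateTime, default=datetime.utcnow)
    updated_time = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    try_count = Column(Integer, default=0)
    last_run_id = Column(Integer)
    error_message = Column(Text)
```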
|
||||
|
||||
## Configuration Management
|
||||
|
||||
The configuration management system is designed to support flexible deployment across different environments while maintaining security and ease of management. The system separates business configuration from sensitive credentials and provides environment-specific overrides.
|
||||
|
||||
### Configuration Strategy
|
||||
|
||||
**Separation of Concerns**: Business logic configuration (data sources, processing parameters) is separated from sensitive credentials (API keys, connection strings) to enable secure deployment practices.
|
||||
|
||||
**Environment-Specific Configuration**: The system supports multiple configuration files that can be combined to create environment-specific deployments without duplicating common settings.
|
||||
|
||||
**Validation and Defaults**: Configuration values are validated at startup, and sensible defaults are provided to minimize required configuration while ensuring the system operates correctly.
|
||||
|
||||
**Dynamic Reconfiguration**: Many configuration parameters can be modified without requiring application restarts, enabling operational flexibility and optimization.
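
A minimal sketch of the layered loading idea is shown below: business settings come from config.yaml, environment-specific values and credentials from env.yaml, and required keys are validated at startup. The file names match the deployment files shown earlier; the merge order and the particular required keys are assumptions.

```python
import os
import yaml

REQUIRED_KEYS = ("search_service_name", "search_admin_key", "embedding_model_endpoint")

def load_settings(config_path: str = "config.yaml", env_path: str = "env.yaml") -> dict:
    """Merge business config with environment overrides and validate required keys."""
    settings: dict = {}
    for path in (config_path, env_path):  # later files override earlier ones
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as fh:
                settings.update(yaml.safe_load(fh) or {})
    missing = [k for k in REQUIRED_KEYS if not settings.get(k)]
    if missing:
        raise ValueError(f"Missing required configuration keys: {missing}")
    return settings
```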
|
||||
|
||||
### Configuration Structure
|
||||
|
||||
```mermaid
|
||||
mindmap
|
||||
root((Configuration))
|
||||
Data Sources
|
||||
Blob Storage
|
||||
SAS Tokens
|
||||
Container Paths
|
||||
Local Files
|
||||
Directory Paths
|
||||
File Filters
|
||||
Processing
|
||||
Chunk Size
|
||||
Token Overlap
|
||||
Batch Sizes
|
||||
Retry Limits
|
||||
AI Services
|
||||
Azure Document Intelligence
|
||||
Endpoint
|
||||
API Key
|
||||
Features
|
||||
Azure OpenAI
|
||||
Endpoint
|
||||
API Key
|
||||
Model Settings
|
||||
Database
|
||||
Connection String
|
||||
Connection Pool
|
||||
Index Schemas
|
||||
Field Mappings
|
||||
Vector Configurations
|
||||
Search Index Settings
|
||||
```
|
||||
|
||||
## Deployment Architecture
|
||||
|
||||
The deployment architecture is designed for cloud-native operations with support for both batch processing and continuous operation modes. The system leverages Kubernetes for orchestration and scaling while maintaining compatibility with various deployment scenarios.
|
||||
|
||||
### Cloud-Native Design Principles
|
||||
|
||||
**Containerization**: The application is fully containerized, enabling consistent deployment across different environments and easy scaling based on demand.
|
||||
|
||||
**Stateless Processing**: Processing pods are designed to be stateless, with all persistent state managed through external databases and storage services. This enables horizontal scaling and fault tolerance.
|
||||
|
||||
**Configuration Externalization**: All configuration is externalized through ConfigMaps and Secrets, allowing for environment-specific configuration without rebuilding container images.
|
||||
|
||||
**Resource Management**: The deployment configuration includes resource limits and requests to ensure proper resource allocation and prevent resource contention in multi-tenant environments.
|
||||
|
||||
### Scaling Strategy
|
||||
|
||||
**Horizontal Pod Autoscaling**: The system can automatically scale the number of processing pods based on CPU utilization, memory usage, or custom metrics like queue depth.
|
||||
|
||||
**Job-Based Processing**: For batch operations, the system uses Kubernetes Jobs and CronJobs to ensure processing completion and automatic cleanup of completed jobs.
|
||||
|
||||
**Load Distribution**: Multiple pods process documents in parallel, with work distribution managed through the database-backed task queue system.
|
||||
|
||||
### Kubernetes Deployment
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Kubernetes Cluster"
|
||||
subgraph "Namespace: document-ai"
|
||||
POD1[Document Processor Pod 1]
|
||||
POD2[Document Processor Pod 2]
|
||||
POD3[Document Processor Pod N]
|
||||
|
||||
CM[ConfigMap<br/>config.yaml]
|
||||
SECRET[Secret<br/>env.yaml]
|
||||
|
||||
PVC[PersistentVolumeClaim<br/>Temp Storage]
|
||||
end
|
||||
|
||||
subgraph "Services"
|
||||
SVC[LoadBalancer Service]
|
||||
CRON[CronJob Controller]
|
||||
end
|
||||
end
|
||||
|
||||
subgraph "External Services"
|
||||
AZURE_DI[Azure Document Intelligence]
|
||||
AZURE_OPENAI[Azure OpenAI]
|
||||
AZURE_SEARCH[Azure AI Search]
|
||||
AZURE_STORAGE[Azure Blob Storage]
|
||||
DATABASE[(Database)]
|
||||
end
|
||||
|
||||
CM --> POD1
|
||||
CM --> POD2
|
||||
CM --> POD3
|
||||
|
||||
SECRET --> POD1
|
||||
SECRET --> POD2
|
||||
SECRET --> POD3
|
||||
|
||||
PVC --> POD1
|
||||
PVC --> POD2
|
||||
PVC --> POD3
|
||||
|
||||
SVC --> POD1
|
||||
SVC --> POD2
|
||||
SVC --> POD3
|
||||
|
||||
CRON --> POD1
|
||||
|
||||
POD1 --> AZURE_DI
|
||||
POD1 --> AZURE_OPENAI
|
||||
POD1 --> AZURE_SEARCH
|
||||
POD1 --> AZURE_STORAGE
|
||||
POD1 --> DATABASE
|
||||
|
||||
POD2 --> AZURE_DI
|
||||
POD2 --> AZURE_OPENAI
|
||||
POD2 --> AZURE_SEARCH
|
||||
POD2 --> AZURE_STORAGE
|
||||
POD2 --> DATABASE
|
||||
|
||||
POD3 --> AZURE_DI
|
||||
POD3 --> AZURE_OPENAI
|
||||
POD3 --> AZURE_SEARCH
|
||||
POD3 --> AZURE_STORAGE
|
||||
POD3 --> DATABASE
|
||||
|
||||
style POD1 fill:#e1f5fe
|
||||
style POD2 fill:#e1f5fe
|
||||
style POD3 fill:#e1f5fe
|
||||
style CM fill:#fff3e0
|
||||
style SECRET fill:#ffebee
|
||||
```
|
||||
|
||||
## Performance and Scalability
|
||||
|
||||
The system is designed to handle large-scale document processing operations efficiently while maintaining high quality output. Performance optimization occurs at multiple levels: application design, resource utilization, and operational practices.
|
||||
|
||||
### Performance Optimization Strategies
|
||||
|
||||
**Asynchronous Processing**: All I/O-bound operations are implemented asynchronously to maximize throughput and resource utilization. This is particularly important for operations involving external API calls and database operations.
|
||||
|
||||
**Connection Pooling**: Database and HTTP connections are pooled and reused to minimize connection overhead and improve response times.
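
For the database side, a pooled SQLAlchemy engine is the typical mechanism; the sketch below reads the connection string from the DB_URI setting shown in env.yaml. The pool sizes and recycle interval are illustrative, not the project's actual values.

```python
import os
from sqlalchemy import create_engine

# Pool sizes are illustrative; tune them to the actual workload.
engine = create_engine(
    os.environ["DB_URI"],   # e.g. the DB_URI value provided in env.yaml
    pool_size=10,           # persistent connections kept in the pool
    max_overflow=20,        # extra connections allowed under burst load
    pool_pre_ping=True,     # validate connections before use
    pool_recycle=1800,      # recycle connections every 30 minutes
)
```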
|
||||
|
||||
**Caching Strategies**: Frequently accessed configuration data and metadata are cached in memory to reduce database load and improve response times.
|
||||
|
||||
**Batch Operations**: Operations that can be batched (such as database writes and API calls) are grouped together to reduce overhead and improve efficiency.
|
||||
|
||||
### Scalability Considerations
|
||||
|
||||
**Horizontal Scaling**: The stateless design of processing components enables horizontal scaling by adding more processing instances without architectural changes.
|
||||
|
||||
**Database Optimization**: Database operations are optimized through proper indexing, connection pooling, and efficient query patterns to support high-concurrency operations.
|
||||
|
||||
**Rate Limiting and Throttling**: The system implements rate limiting and throttling mechanisms to respect external service limits while maintaining optimal throughput.
|
||||
|
||||
**Resource Monitoring**: Comprehensive monitoring of resource utilization enables proactive scaling decisions and performance optimization.
|
||||
|
||||
### Processing Pipeline Performance
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "Performance Metrics"
|
||||
TPS[Throughput<br/>Documents/Second]
|
||||
LAT[Latency<br/>Processing Time]
|
||||
ERR[Error Rate<br/>Failed Documents]
|
||||
RES[Resource Usage<br/>CPU/Memory]
|
||||
end
|
||||
|
||||
subgraph "Optimization Strategies"
|
||||
ASYNC[Async Processing]
|
||||
BATCH[Batch Operations]
|
||||
CACHE[Caching Layer]
|
||||
RETRY[Retry Logic]
|
||||
end
|
||||
|
||||
subgraph "Scaling Options"
|
||||
HSCALE[Horizontal Scaling<br/>More Pods]
|
||||
VSCALE[Vertical Scaling<br/>Larger Pods]
|
||||
QUEUE[Queue Management<br/>Task Distribution]
|
||||
end
|
||||
|
||||
TPS --> ASYNC
|
||||
LAT --> BATCH
|
||||
ERR --> RETRY
|
||||
RES --> CACHE
|
||||
|
||||
ASYNC --> HSCALE
|
||||
BATCH --> QUEUE
|
||||
CACHE --> VSCALE
|
||||
|
||||
style TPS fill:#c8e6c9
|
||||
style LAT fill:#fff3e0
|
||||
style ERR fill:#ffcdd2
|
||||
style RES fill:#e1f5fe
|
||||
```
|
||||
|
||||
## Error Handling and Monitoring
|
||||
|
||||
The error handling and monitoring system is designed to provide comprehensive visibility into system operations while implementing robust recovery mechanisms. The system distinguishes between different types of errors and responds appropriately to each.
|
||||
|
||||
### Error Classification and Response
|
||||
|
||||
**Transient Errors**: Network timeouts, temporary service unavailability, and rate limiting are handled through exponential backoff retry mechanisms. These errors are expected in distributed systems and are handled automatically.
|
||||
|
||||
**Configuration Errors**: Invalid configuration values, missing credentials, and similar issues are detected at startup and cause immediate failure with clear error messages to facilitate quick resolution.
|
||||
|
||||
**Resource Errors**: Insufficient disk space, memory exhaustion, and similar resource constraints are detected and handled gracefully, often by pausing processing until resources become available.
|
||||
|
||||
**Service Errors**: Failures in external services (Azure Document Intelligence, Azure OpenAI, etc.) are handled through fallback mechanisms where possible, or graceful degradation when fallbacks are not available.
|
||||
|
||||
### Monitoring and Observability
|
||||
|
||||
**Structured Logging**: All log messages follow a structured format that enables efficient searching and analysis. Log levels are used appropriately to balance information content with log volume.
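
One common way to achieve this is a JSON formatter on the standard logging module, as in the sketch below; the field names and logger name are illustrative and not necessarily what the indexer emits.

```python
import json
import logging

class JsonFormatter(logging.Formatter):
    """Render log records as single-line JSON for easy searching and aggregation."""
    def format(self, record: logging.LogRecord) -> str:
        return json.dumps({
            "time": self.formatTime(record),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }, ensure_ascii=False)

handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logging.basicConfig(level=logging.INFO, handlers=[handler])
logging.getLogger("document-ai-indexer").info("processing started")
```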
|
||||
|
||||
**Processing Metrics**: Key performance indicators such as processing rates, error rates, and resource utilization are tracked and can be exported to monitoring systems.
|
||||
|
||||
**Health Checks**: The system implements health check endpoints that can be used by orchestration systems to determine system health and restart unhealthy instances.
|
||||
|
||||
**Audit Trails**: Complete audit trails of document processing operations are maintained for compliance and debugging purposes.
|
||||
|
||||
### Error Handling Strategy
|
||||
|
||||

```mermaid
flowchart TD
    ERROR[Error Detected] --> CLASSIFY[Classify Error Type]

    CLASSIFY --> TRANSIENT{Transient Error?}
    CLASSIFY --> CONFIG{Configuration Error?}
    CLASSIFY --> RESOURCE{Resource Error?}
    CLASSIFY --> SERVICE{Service Error?}

    TRANSIENT -->|Yes| RETRY[Retry with Backoff]
    CONFIG -->|Yes| LOG_FATAL[Log Fatal Error]
    RESOURCE -->|Yes| WAIT[Wait for Resources]
    SERVICE -->|Yes| CHECK_SERVICE[Check Service Status]

    RETRY --> MAX_RETRY{Max Retries?}
    MAX_RETRY -->|No| ATTEMPT[Retry Attempt]
    MAX_RETRY -->|Yes| MARK_FAILED[Mark as Failed]

    ATTEMPT --> SUCCESS{Success?}
    SUCCESS -->|Yes| UPDATE_SUCCESS[Update Success]
    SUCCESS -->|No| RETRY

    WAIT --> RESOURCE_CHECK{Resources Available?}
    RESOURCE_CHECK -->|Yes| RETRY
    RESOURCE_CHECK -->|No| WAIT

    CHECK_SERVICE --> SERVICE_OK{Service OK?}
    SERVICE_OK -->|Yes| RETRY
    SERVICE_OK -->|No| ESCALATE[Escalate Error]

    LOG_FATAL --> STOP[Stop Processing]
    MARK_FAILED --> LOG_ERROR[Log Detailed Error]
    ESCALATE --> LOG_ERROR

    UPDATE_SUCCESS --> CONTINUE[Continue Processing]
    LOG_ERROR --> CONTINUE

    style ERROR fill:#ffcdd2
    style UPDATE_SUCCESS fill:#c8e6c9
    style CONTINUE fill:#e8f5e8
```
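
To make the flowchart concrete, a classification step could map exceptions to the four categories before choosing a response. The sketch below is a hypothetical mapping; the exception types (and the `ConfigurationError` class) are stand-ins, not the indexer's actual exception hierarchy.

```python
from enum import Enum


class ErrorCategory(Enum):
    TRANSIENT = "transient"
    CONFIGURATION = "configuration"
    RESOURCE = "resource"
    SERVICE = "service"


class ConfigurationError(Exception):
    """Hypothetical exception raised at startup when required settings are missing or invalid."""


def classify_error(exc: Exception) -> ErrorCategory:
    """Map an exception to one of the error categories used in the flowchart above."""
    if isinstance(exc, (TimeoutError, ConnectionError)):
        return ErrorCategory.TRANSIENT      # retried with exponential backoff
    if isinstance(exc, ConfigurationError):
        return ErrorCategory.CONFIGURATION  # logged as fatal, processing stops
    if isinstance(exc, (MemoryError, OSError)):
        return ErrorCategory.RESOURCE       # wait until resources become available
    return ErrorCategory.SERVICE            # check the external service, escalate if it stays down
```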

## Conclusion

The Document AI Indexer provides a comprehensive, scalable solution for intelligent document processing and indexing. Its modular architecture, robust error handling, and integration with Azure AI services make it suitable for enterprise-scale document processing workflows. The system's flexibility allows for easy customization and extension to meet specific business requirements while maintaining high performance and reliability.
103
vw-document-ai-indexer/document_task_processor.py
Normal file
103
vw-document-ai-indexer/document_task_processor.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""
|
||||
Document Task Processor
|
||||
Integrates business logic and database operations
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Optional
|
||||
|
||||
from sqlalchemy import and_
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from app_config import ServiceFactory
|
||||
from task_processor import Task, TaskProcessorInterface
|
||||
from business_layer import ApplicationConfig, DocumentProcessingFactory, ProcessingContext
|
||||
from database import IndexObject, IndexObjectStatus, IndexJob
|
||||
from utils import custom_serializer
|
||||
|
||||
|
||||
class DocumentTaskProcessor(TaskProcessorInterface):
|
||||
"""Document task processor"""
|
||||
|
||||
def __init__(self, config: ApplicationConfig, service_factory: ServiceFactory, tmp_directory: str, database_engine: Any, logger: Optional[logging.Logger], datasource: dict[str, Any], data_config: dict[str, Any]):
|
||||
self.config = config
|
||||
self.service_factory = service_factory
|
||||
self.database_engine = database_engine
|
||||
self.logger = logger or logging.getLogger(__name__)
|
||||
self.datasource = datasource or {}
|
||||
self.processing_factory = DocumentProcessingFactory(service_factory=service_factory, tmp_directory=tmp_directory, datasource=datasource, config=config)
|
||||
self.data_config: dict[str, Any] = data_config
|
||||
self.datasource_name: str = data_config.get("datasource_name", "default")
|
||||
|
||||
def process(self, task: Task) -> Any:
|
||||
"""Process document task"""
|
||||
if not isinstance(task.payload, ProcessingContext):
|
||||
raise ValueError(f"Expected ProcessingContext, got {type(task.payload)}")
|
||||
|
||||
context = task.payload
|
||||
detailed_message:dict[str,Any] = {}
|
||||
detailed_message["start_time"] = datetime.datetime.now(datetime.timezone.utc)
|
||||
|
||||
Session = sessionmaker(bind=self.database_engine)
|
||||
session = Session()
|
||||
|
||||
try:
|
||||
# 1. Query or create IndexObject record
|
||||
index_object_db = session.query(IndexObject).get({"object_key":context.object_key,"datasource_name":context.datasource_name})
|
||||
if not index_object_db:
|
||||
self.logger.info(f"Creating new IndexObject entry for {context.object_key}")
|
||||
index_object_db = IndexObject(
|
||||
object_key=context.object_key,
|
||||
type="document",
|
||||
status=IndexObjectStatus.PROCESSING.value,
|
||||
try_count=0,
|
||||
datasource_name=context.datasource_name
|
||||
)
|
||||
session.add(index_object_db)
|
||||
session.commit()
|
||||
|
||||
# 2. Only update task-related fields, no longer update business fields
|
||||
index_object_db.last_start_time = datetime.datetime.now(datetime.timezone.utc)
|
||||
current_job = session.query(IndexJob).filter(and_(IndexJob.status == "processing",IndexJob.datasource_name== context.datasource_name)).order_by(IndexJob.id.desc()).first()
|
||||
if current_job:
|
||||
index_object_db.last_run_id = current_job.id
|
||||
|
||||
session.commit()
|
||||
|
||||
# 3. Execute business processing
|
||||
self.logger.info(f"Processing document: {context.object_key}")
|
||||
orchestrator = self.processing_factory.create_orchestrator()
|
||||
result = orchestrator.process_document(context)
|
||||
|
||||
# 4. Only update task-related fields, no longer update business fields
|
||||
detailed_message["success"] = result.status == IndexObjectStatus.SUCCESS
|
||||
detailed_message["chunks_count"] = result.chunks_count
|
||||
detailed_message["processing_time"] = result.processing_time
|
||||
detailed_message["message"] = result.message
|
||||
if result.status != IndexObjectStatus.SUCCESS:
|
||||
self.logger.error(f"Failed to process {context.object_key}: {result.message}")
|
||||
detailed_message["error"] = result.message
|
||||
if result.error:
|
||||
detailed_message["error_details"] = str(result.error)
|
||||
else:
|
||||
self.logger.info(f"Successfully processed {context.object_key}")
|
||||
|
||||
index_object_db.last_finished_time = datetime.datetime.now(datetime.timezone.utc)
|
||||
detailed_message["end_time"] = datetime.datetime.now(datetime.timezone.utc)
|
||||
index_object_db.detailed_message = json.dumps(detailed_message, default=custom_serializer, ensure_ascii=False)
|
||||
session.commit()
|
||||
|
||||
# If processing failed, raise exception to trigger retry mechanism
|
||||
if result.status == IndexObjectStatus.FAILED:
|
||||
raise Exception(result.message)
|
||||
|
||||
return result
|
||||
except Exception as e:
|
||||
# Handle exceptions - only update database in case of unexpected exceptions
|
||||
# Business logic failures are already handled above
|
||||
self.logger.error(f"Error processing {context.object_key}: {e}")
|
||||
raise
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
132
vw-document-ai-indexer/entity_models.py
Normal file
132
vw-document-ai-indexer/entity_models.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass, fields
|
||||
from dataclasses_json import dataclass_json
|
||||
|
||||
|
||||
@dataclass_json
|
||||
@dataclass
|
||||
class DiResult:
|
||||
"""Data class for storing"""
|
||||
figures: List['FigureFlat']
|
||||
di_content: str
|
||||
filepath:str
|
||||
language:str
|
||||
|
||||
|
||||
@dataclass_json
|
||||
@dataclass
|
||||
class FigureFlat:
|
||||
offset: int
|
||||
length: int
|
||||
url: str
|
||||
content: str
|
||||
image: str
|
||||
understand_flag:bool
|
||||
caption:str
|
||||
|
||||
|
||||
def dict_to_str(v):
|
||||
return v if isinstance(v, str) else str(v)
|
||||
|
||||
@dataclass
|
||||
class Document(object):
|
||||
"""A data class for storing documents
|
||||
|
||||
Attributes:
|
||||
content (str): The content of the document.
|
||||
id (Optional[str]): The id of the document.
|
||||
title (Optional[str]): The title of the document.
|
||||
filepath (Optional[str]): The filepath of the document.
|
||||
url (Optional[str]): The url of the document.
|
||||
metadata (Optional[Dict]): The metadata of the document.
|
||||
"""
|
||||
|
||||
content: Optional[str] = None
|
||||
id: Optional[str] = None
|
||||
title: Optional[str] = None
|
||||
filepath: Optional[str] = None
|
||||
|
||||
url: Optional[str] = None
|
||||
metadata: Optional[Dict] = None
|
||||
image_mapping: Optional[Dict] = None
|
||||
|
||||
doc_metadata: Optional[str] = None
|
||||
document_schema: Optional[str] = None
|
||||
main_title: Optional[str] = None
|
||||
sub_title: Optional[str] = None
|
||||
publisher: Optional[str] = None
|
||||
document_code: Optional[str] = None
|
||||
document_category: Optional[str] = None
|
||||
main_title_sec_language: Optional[str] = None
|
||||
sub_title_sec_language: Optional[str] = None
|
||||
primary_language: Optional[str] = None
|
||||
secondary_language: Optional[str] = None
|
||||
|
||||
full_headers: Optional[str] = None
|
||||
h1: Optional[str] = None
|
||||
h2: Optional[str] = None
|
||||
h3: Optional[str] = None
|
||||
h4: Optional[str] = None
|
||||
h5: Optional[str] = None
|
||||
h6: Optional[str] = None
|
||||
|
||||
contentVector: Optional[List[float]] = None
|
||||
full_metadata_vector: Optional[List[float]] = None
|
||||
|
||||
|
||||
def __setattr__(self, key, value) -> None:
|
||||
# If the attribute is a list or dictionary, convert it to a string for storage
|
||||
if key =="doc_metadata" and value is not None and isinstance(value, (list, dict)):
|
||||
value = dict_to_str(value)
|
||||
# Avoid infinite recursion of __setattr__ calls
|
||||
object.__setattr__(self, key, value)
|
||||
|
||||
def __setitem__(self, key, value) -> None:
|
||||
# Store the attribute directly in the instance's __dict__
|
||||
self.__dict__[key] = value
|
||||
|
||||
|
||||
def __getitem__(self, key) -> Any:
|
||||
# Retrieve the attribute from the instance's __dict__
|
||||
return self.__dict__[key]
|
||||
|
||||
def copy_dynamic_attrs(self, source) -> None:
|
||||
"""Copy dynamic attributes from the source object to the current object"""
|
||||
predefined = {f.name for f in fields(source)}
|
||||
for attr in dir(source):
|
||||
# Filter dynamic attributes
|
||||
if (attr not in predefined and
|
||||
not attr.startswith('__') and
|
||||
not callable(getattr(source, attr))):
|
||||
value = getattr(source, attr)
|
||||
setattr(self, attr, value)
|
||||
|
||||
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChunkingResult:
|
||||
"""Data model for chunking result
|
||||
|
||||
Attributes:
|
||||
chunks (List[Document]): List of chunks.
|
||||
total_files (int): Total number of files.
|
||||
num_unsupported_format_files (int): Number of files with unsupported format.
|
||||
num_files_with_errors (int): Number of files with errors.
|
||||
skipped_chunks (int): Number of chunks skipped due to too few tokens.
|
||||
"""
|
||||
chunks: List[Document]
|
||||
total_files: int
|
||||
num_unsupported_format_files: int = 0
|
||||
num_files_with_errors: int = 0
|
||||
# some chunks might be skipped due to too few tokens
|
||||
skipped_chunks: int = 0
|
||||
failed_files = None
|
||||
|
||||
|
||||
|
||||
class UnsupportedFormatError(Exception):
|
||||
"""Exception raised when a format is not supported by a parser."""
|
||||
|
||||
pass
|
||||
|
||||
51
vw-document-ai-indexer/env.yaml
Normal file
51
vw-document-ai-indexer/env.yaml
Normal file
@@ -0,0 +1,51 @@
|
||||
config: config.yaml
|
||||
njobs: 1
|
||||
|
||||
|
||||
search_service_name: https://<resource name>.search.windows.net
|
||||
search_admin_key:
|
||||
|
||||
embedding_model_endpoint: https://<resource name>.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview
|
||||
embedding_model_key:
|
||||
VECTOR_DIMENSION: 1536
|
||||
|
||||
extract_method: di+vision-llm
|
||||
|
||||
# extract_method=vision-llm
|
||||
|
||||
form_rec_resource: https://<resource name>.cognitiveservices.azure.cn/
|
||||
form_rec_key:
|
||||
|
||||
# Perform OCR at a higher resolution to handle documents with fine print
|
||||
di-hiRes: true
|
||||
# Enable the detection of mathematical expressions in the document.
|
||||
di-Formulas: true
|
||||
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
|
||||
|
||||
|
||||
|
||||
# Figure understanding (image captioning)
|
||||
figure_caption:
|
||||
include_di_content: false
|
||||
description_gen_max_images: 0
|
||||
model_endpoint: null
|
||||
model_key: null
|
||||
model: null # leave empty for Azure
|
||||
azure_deployment: gpt-4o # Azure deployment name; leave empty for models on other platforms
|
||||
api_version: 2024-08-01-preview # Azure API version; leave empty for other platforms
|
||||
|
||||
|
||||
FLAG_AOAI: "V3"
|
||||
#FLAG_EMBEDDING_MODEL: "qwen3-embedding-8b"
|
||||
FLAG_EMBEDDING_MODEL: "AOAI"
|
||||
|
||||
|
||||
FIGURE_BLOB_ACCOUNT_URL: https://blob sas url
|
||||
|
||||
DI_BLOB_ACCOUNT_URL: https://blob sas url
|
||||
|
||||
DB_URI: postgresql+psycopg2://user:passwords@localhost:5433/document_indexer
|
||||
|
||||
header_fix: true
|
||||
|
||||
|
||||
43
vw-document-ai-indexer/env.yaml.example
Normal file
43
vw-document-ai-indexer/env.yaml.example
Normal file
@@ -0,0 +1,43 @@
|
||||
# Configuration file reference
|
||||
config: config.yaml
|
||||
|
||||
# Processing settings
|
||||
njobs: 8 # Number of parallel processing jobs
|
||||
|
||||
# Azure AI Search configuration
|
||||
search_service_name: "https://your-search-service.search.windows.net"
|
||||
search_admin_key: "your-search-admin-key"
|
||||
|
||||
# Azure OpenAI Embedding service
|
||||
embedding_model_endpoint: "https://your-openai.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview"
|
||||
embedding_model_key: "your-openai-key"
|
||||
VECTOR_DIMENSION: 1536
|
||||
FLAG_AOAI: "V3" # Azure OpenAI version
|
||||
FLAG_EMBEDDING_MODEL: "AOAI" # Embedding model type: "AOAI" or "qwen3-embedding-8b"
|
||||
|
||||
# Document Intelligence configuration
|
||||
extract_method: "di+vision-llm" # Extraction method: "di+vision-llm", "vision-llm", "di"
|
||||
form_rec_resource: "https://your-di-service.cognitiveservices.azure.com/"
|
||||
form_rec_key: "your-di-key"
|
||||
|
||||
# Document Intelligence features
|
||||
di-hiRes: true # High resolution OCR
|
||||
di-Formulas: true # Mathematical expression detection
|
||||
di_allow_features_ext: "pdf;jpeg;jpg;png;bmp;tiff;heif" # Supported file extensions
|
||||
|
||||
# Vision and captioning models
|
||||
captioning_model_endpoint: "https://your-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview"
|
||||
captioning_model_key: "your-openai-key"
|
||||
vision_max_images: 200 # Maximum images to process per document (0 = no limit)
|
||||
vision_image_method: "openai" # Image processing method: "openai"
|
||||
|
||||
|
||||
# Blob storage for figures and DI results
|
||||
FIGURE_BLOB_ACCOUNT_URL: "https://your-storage.blob.core.windows.net/container?sas-token"
|
||||
DI_BLOB_ACCOUNT_URL: "https://your-storage.blob.core.windows.net/container?sas-token"
|
||||
|
||||
# Database configuration
|
||||
DB_URI: "postgresql://user:password@host:port/database_name"
|
||||
|
||||
# Processing flags
|
||||
header_fix: false # Enable/disable header fixing
|
||||
473
vw-document-ai-indexer/hierarchy_fix.py
Normal file
473
vw-document-ai-indexer/hierarchy_fix.py
Normal file
@@ -0,0 +1,473 @@
|
||||
"""
|
||||
Fixes mismatched heading hierarchies in Markdown documents, e.g. cases where a child heading's '#' count has been raised to the same level as, or above, its parent heading's.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Any, List, Dict, Optional
|
||||
|
||||
class HeaderInfo:
|
||||
"""Title information"""
|
||||
def __init__(self, line_number: int, original_line: str, hash_count: int,
|
||||
level: int, number_pattern: str, title_text: str):
|
||||
self.line_number = line_number
|
||||
self.original_line = original_line
|
||||
self.hash_count = hash_count
|
||||
self.level = level
|
||||
self.number_pattern = number_pattern
|
||||
self.title_text = title_text
|
||||
self.correct_hash_count = hash_count # Will be updated by Fixer
|
||||
|
||||
class HierarchyFixer:
|
||||
"""Special fixer for title hierarchy # number mismatch issues"""
|
||||
|
||||
def __init__(self):
|
||||
# Number pattern matching - supports both formats with and without trailing dots
|
||||
self.number_patterns = [
|
||||
r'^(\d+)\.?$', # 1 or 1.
|
||||
r'^(\d+)\.(\d+)\.?$', # 1.1 or 1.1.
|
||||
r'^(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1 or 1.1.1.
|
||||
r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1 or 1.1.1.1.
|
||||
r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1.1 or 1.1.1.1.1.
|
||||
r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1.1.1 or 1.1.1.1.1.1.
|
||||
]
|
||||
|
||||
# Letter+number pattern matching - supports both "A.x.x.x" and "C. x.x.x" formats
|
||||
self.letter_number_patterns = [
|
||||
# Single letter: A, B, C (followed by space or end)
|
||||
(r'^([A-Z])(?:\s|$)', 1),
|
||||
|
||||
# Letter + space + numbers: "C. 1", "A. 2"
|
||||
(r'^([A-Z])\.\s+(\d+)(?:\s|$)', 2),
|
||||
(r'^([A-Z])\.\s+(\d+)\.(\d+)(?:\s|$)', 3), # C. 1.1, A. 2.3
|
||||
(r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)(?:\s|$)', 4), # C. 1.1.1, A. 2.3.4
|
||||
(r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$)', 5), # C. 1.1.1.1, A. 2.3.4.5
|
||||
(r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$)', 6), # C. 1.1.1.1.1, A. 2.3.4.5.6
|
||||
|
||||
# Compact format (no space): A.1, A.1.2, A.1.2.3 etc.
|
||||
(r'^([A-Z])\.(\d+)(?:\s|$|[^\d\.])', 2), # A.1, A.2
|
||||
(r'^([A-Z])\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 3), # A.1.2, A.1.3
|
||||
(r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 4), # A.1.2.3
|
||||
(r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 5), # A.1.2.3.4
|
||||
(r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 6), # A.1.2.3.4.5
|
||||
]
|
||||
|
||||
def detect_headers(self, content: str) -> List[HeaderInfo]:
|
||||
"""Detect all headers and determine their logical levels"""
|
||||
lines = content.split('\n')
|
||||
headers: List[HeaderInfo] = []
|
||||
|
||||
for line_num, line in enumerate(lines):
|
||||
if line.strip().startswith('#'):
|
||||
header_info = self._parse_header_line(line_num, line)
|
||||
if header_info:
|
||||
headers.append(header_info)
|
||||
|
||||
return headers
|
||||
|
||||
def _parse_header_line(self, line_num: int, line: str) -> Optional[HeaderInfo]:
|
||||
"""Analyze the title line"""
|
||||
line = line.strip()
|
||||
|
||||
# Count the number of # characters
|
||||
hash_count = 0
|
||||
for char in line:
|
||||
if char == '#':
|
||||
hash_count += 1
|
||||
else:
|
||||
break
|
||||
|
||||
if hash_count == 0:
|
||||
return None
|
||||
|
||||
# Extract title content
|
||||
title_content = line[hash_count:].strip()
|
||||
|
||||
# Try to match number pattern
|
||||
level = 1
|
||||
number_pattern = ""
|
||||
|
||||
# Check for letter+number patterns first (A.1.2.3 format)
|
||||
for pattern, expected_level in self.letter_number_patterns:
|
||||
match = re.match(pattern, title_content)
|
||||
if match:
|
||||
level = expected_level
|
||||
# Extract the complete matched numbering pattern
|
||||
matched_text = match.group(0)
|
||||
|
||||
# For space-separated patterns like "C. 1.1", we need to extract the full pattern
|
||||
if '. ' in matched_text:
|
||||
# This is a space-separated pattern like "C. 1.1"
|
||||
# The match already contains the complete pattern we want
|
||||
number_pattern = matched_text.rstrip() # Remove trailing space if any
|
||||
else:
|
||||
# This is a compact pattern like "A.1.2.3"
|
||||
number_pattern = matched_text
|
||||
|
||||
return HeaderInfo(
|
||||
line_number=line_num,
|
||||
original_line=line,
|
||||
hash_count=hash_count,
|
||||
level=level,
|
||||
number_pattern=number_pattern,
|
||||
title_text=title_content
|
||||
)
|
||||
|
||||
# If no letter+number pattern, try traditional number patterns
|
||||
if title_content:
|
||||
# First, try to identify and extract the complete numbering part
|
||||
# Look for patterns like "1.2.3", "1 . 2 . 3", "1. 2. 3", etc.
|
||||
words = title_content.split()
|
||||
numbering_words = []
|
||||
|
||||
# Collect words that could be part of the numbering (digits, dots, spaces)
|
||||
for word in words:
|
||||
if re.match(r'^[\d\.]+$', word) or word == '.':
|
||||
numbering_words.append(word)
|
||||
else:
|
||||
break # Stop at first non-numbering word
|
||||
|
||||
if numbering_words:
|
||||
# Join and normalize the numbering part
|
||||
numbering_text = ' '.join(numbering_words)
|
||||
# Normalize: "1 . 2 . 3" -> "1.2.3", "1. 2. 3" -> "1.2.3"
|
||||
normalized = re.sub(r'\s*\.\s*', '.', numbering_text)
|
||||
normalized = re.sub(r'\.+$', '', normalized) # Remove trailing dots
|
||||
normalized = normalized.strip()
|
||||
|
||||
# Try to match the normalized pattern
|
||||
for i, pattern in enumerate(self.number_patterns, 1):
|
||||
match = re.match(pattern, normalized)
|
||||
if match:
|
||||
level = i
|
||||
number_pattern = normalized
|
||||
break
|
||||
else:
|
||||
# If no numbering pattern found in separate words, try the first word directly
|
||||
first_word = words[0] if words else ""
|
||||
for i, pattern in enumerate(self.number_patterns, 1):
|
||||
match = re.match(pattern, first_word)
|
||||
if match:
|
||||
level = i
|
||||
number_pattern = match.group(0).rstrip('.')
|
||||
break
|
||||
|
||||
# If no number pattern is found, infer level from # count
|
||||
if not number_pattern:
|
||||
level = hash_count
|
||||
|
||||
return HeaderInfo(
|
||||
line_number=line_num,
|
||||
original_line=line,
|
||||
hash_count=hash_count,
|
||||
level=level,
|
||||
number_pattern=number_pattern,
|
||||
title_text=title_content
|
||||
)
|
||||
|
||||
def find_hierarchy_problems(self, headers: List[HeaderInfo]) -> List[Dict]:
|
||||
"""Find problems with mismatched # counts using adaptive analysis"""
|
||||
problems = []
|
||||
|
||||
# First, analyze the document's adaptive level-to-hash mapping
|
||||
level_hash_mapping = self._analyze_document_hash_pattern(headers)
|
||||
|
||||
# 1. Check for level-hash mismatch based on adaptive mapping
|
||||
for header in headers:
|
||||
if header.number_pattern: # Only check numbered headers
|
||||
expected_hash_count = level_hash_mapping.get(header.level, header.level)
|
||||
if header.hash_count != expected_hash_count:
|
||||
problems.append({
|
||||
'type': 'level_hash_mismatch',
|
||||
'line': header.line_number + 1,
|
||||
'level': header.level,
|
||||
'current_hash': header.hash_count,
|
||||
'expected_hash': expected_hash_count,
|
||||
'title': header.title_text[:50],
|
||||
'pattern': header.number_pattern,
|
||||
'problem': f"Level {header.level} header '{header.number_pattern}' uses {header.hash_count} #, but document pattern suggests {expected_hash_count} #"
|
||||
})
|
||||
|
||||
# 2. Check for parent-child hierarchy issues
|
||||
for i in range(len(headers) - 1):
|
||||
current = headers[i]
|
||||
next_header = headers[i + 1]
|
||||
|
||||
# Only consider headers with a clear number pattern
|
||||
if current.number_pattern and next_header.number_pattern:
|
||||
# Check if the child header's # count is less than or equal to the parent header's
|
||||
if next_header.level > current.level: # Child header
|
||||
expected_parent_hash = level_hash_mapping.get(current.level, current.level)
|
||||
expected_child_hash = level_hash_mapping.get(next_header.level, next_header.level)
|
||||
|
||||
if next_header.hash_count <= current.hash_count:
|
||||
problems.append({
|
||||
'type': 'hierarchy_violation',
|
||||
'parent_line': current.line_number + 1,
|
||||
'parent_level': current.level,
|
||||
'parent_hash': current.hash_count,
|
||||
'parent_title': current.title_text[:50],
|
||||
'child_line': next_header.line_number + 1,
|
||||
'child_level': next_header.level,
|
||||
'child_hash': next_header.hash_count,
|
||||
'child_title': next_header.title_text[:50],
|
||||
'problem': f"Child header ({next_header.level} level) # count ({next_header.hash_count}) should be greater than parent header ({current.level} level, {current.hash_count} #). Expected pattern: parent {expected_parent_hash}#, child {expected_child_hash}#"
|
||||
})
|
||||
|
||||
# 3. Check for significant inconsistency within same level (now less strict)
|
||||
same_level_problems = self._find_same_level_inconsistency(headers)
|
||||
problems.extend(same_level_problems)
|
||||
|
||||
return problems
|
||||
|
||||
def _find_same_level_inconsistency(self, headers: List[HeaderInfo]) -> List[Dict]:
|
||||
"""Check the problem of inconsistent number of titles # numbers at the same level"""
|
||||
problems = []
|
||||
|
||||
# Group by level, only numbered titles
|
||||
level_groups = {}
|
||||
for header in headers:
|
||||
if header.number_pattern: # Only numbered titles
|
||||
if header.level not in level_groups:
|
||||
level_groups[header.level] = []
|
||||
level_groups[header.level].append(header)
|
||||
|
||||
# Check the consistency of # numbers within each level
|
||||
for level, group_headers in level_groups.items():
|
||||
if len(group_headers) < 2:
|
||||
continue # Only one header, no need to check
|
||||
|
||||
# Count the usage of different # numbers within the same level
|
||||
hash_count_stats = {}
|
||||
for header in group_headers:
|
||||
hash_count = header.hash_count
|
||||
if hash_count not in hash_count_stats:
|
||||
hash_count_stats[hash_count] = []
|
||||
hash_count_stats[hash_count].append(header)
|
||||
|
||||
# If there are different # numbers in the same level
|
||||
if len(hash_count_stats) > 1:
|
||||
# Find the most common # number as the standard
|
||||
most_common_hash_count = max(hash_count_stats.keys(),
|
||||
key=lambda x: len(hash_count_stats[x]))
|
||||
|
||||
# Report titles that do not meet the standard
|
||||
for hash_count, headers_with_this_count in hash_count_stats.items():
|
||||
if hash_count != most_common_hash_count:
|
||||
for header in headers_with_this_count:
|
||||
problems.append({
|
||||
'type': 'same_level_inconsistency',
|
||||
'line': header.line_number + 1,
|
||||
'level': header.level,
|
||||
'current_hash': header.hash_count,
|
||||
'expected_hash': most_common_hash_count,
|
||||
'title': header.title_text[:50],
|
||||
'pattern': header.number_pattern,
|
||||
'problem': f"{header.level} level header uses {header.hash_count} #, but the majority of siblings use {most_common_hash_count} #"
|
||||
})
|
||||
|
||||
return problems
|
||||
|
||||
def fix_hierarchy(self, content: str) -> Dict[str,Any]:
|
||||
"""Fix hierarchy issues"""
|
||||
headers = self.detect_headers(content)
|
||||
|
||||
if not headers:
|
||||
return {
|
||||
'fixed_content': content,
|
||||
'problems_found': [],
|
||||
'fixes_applied': 0,
|
||||
'message': 'No headers detected'
|
||||
}
|
||||
|
||||
# Check for problems
|
||||
problems = self.find_hierarchy_problems(headers)
|
||||
|
||||
if not problems:
|
||||
return {
|
||||
'fixed_content': content,
|
||||
'problems_found': [],
|
||||
'fixes_applied': 0,
|
||||
'message': 'No hierarchy issues found'
|
||||
}
|
||||
|
||||
# Apply fixes
|
||||
lines = content.split('\n')
|
||||
fixes_applied = 0
|
||||
|
||||
# To ensure child headers have more # than parent headers, we need to recalculate the # count for each header
|
||||
fixed_headers = self._calculate_correct_hash_counts(headers)
|
||||
|
||||
# Apply fixes
|
||||
for header in fixed_headers:
|
||||
if header.hash_count != header.correct_hash_count:
|
||||
old_line = lines[header.line_number]
|
||||
new_hash = '#' * header.correct_hash_count
|
||||
# Replace # part
|
||||
new_line = re.sub(r'^#+', new_hash, old_line)
|
||||
lines[header.line_number] = new_line
|
||||
fixes_applied += 1
|
||||
|
||||
fixed_content = '\n'.join(lines)
|
||||
|
||||
return {
|
||||
'fixed_content': fixed_content,
|
||||
'original_content': content,
|
||||
'problems_found': problems,
|
||||
'fixes_applied': fixes_applied,
|
||||
'fixed_headers': [(h.line_number + 1, h.hash_count, h.correct_hash_count, h.title_text[:30])
|
||||
for h in fixed_headers if h.hash_count != h.correct_hash_count]
|
||||
}
|
||||
|
||||
def _calculate_correct_hash_counts(self, headers: List[HeaderInfo]) -> List[HeaderInfo]:
|
||||
"""Calculate the correct number of #'s based on adaptive analysis of the document"""
|
||||
if not headers:
|
||||
return []
|
||||
|
||||
# 1. Analyze the '#' usage pattern of each level in the document (adaptive analysis)
|
||||
level_hash_mapping = self._analyze_document_hash_pattern(headers)
|
||||
|
||||
# Create copies with the correct number of #'s
|
||||
fixed_headers: list[HeaderInfo] = []
|
||||
|
||||
for header in headers:
|
||||
# Copy original information
|
||||
fixed_header = HeaderInfo(
|
||||
line_number=header.line_number,
|
||||
original_line=header.original_line,
|
||||
hash_count=header.hash_count,
|
||||
level=header.level,
|
||||
number_pattern=header.number_pattern,
|
||||
title_text=header.title_text
|
||||
)
|
||||
|
||||
if fixed_header.number_pattern:
|
||||
# For numbered headers, use the adaptive mapping
|
||||
if fixed_header.level in level_hash_mapping:
|
||||
fixed_header.correct_hash_count = level_hash_mapping[fixed_header.level]
|
||||
else:
|
||||
# Fallback: extrapolate from existing pattern
|
||||
fixed_header.correct_hash_count = self._extrapolate_hash_count(
|
||||
fixed_header.level, level_hash_mapping)
|
||||
else:
|
||||
# For non-numbered headers, keep the original # count
|
||||
fixed_header.correct_hash_count = fixed_header.hash_count
|
||||
|
||||
fixed_headers.append(fixed_header)
|
||||
|
||||
return fixed_headers
|
||||
|
||||
def _analyze_document_hash_pattern(self, headers: List[HeaderInfo]) -> Dict[int, int]:
|
||||
"""Analyze the document's # pattern to determine the adaptive mapping"""
|
||||
# Count the number of #'s used at each level
|
||||
level_hash_stats = {}
|
||||
for header in headers:
|
||||
if header.number_pattern: # Only numbered titles are considered
|
||||
level = header.level
|
||||
hash_count = header.hash_count
|
||||
|
||||
if level not in level_hash_stats:
|
||||
level_hash_stats[level] = {}
|
||||
if hash_count not in level_hash_stats[level]:
|
||||
level_hash_stats[level][hash_count] = 0
|
||||
level_hash_stats[level][hash_count] += 1
|
||||
|
||||
# Find the most commonly used '#' count for each level
|
||||
level_hash_mapping = {}
|
||||
for level, hash_stats in level_hash_stats.items():
|
||||
most_common_hash = max(hash_stats.keys(), key=lambda x: hash_stats[x])
|
||||
level_hash_mapping[level] = most_common_hash
|
||||
|
||||
# Verify and adjust the mapping so that deeper levels always use more '#' characters
|
||||
level_hash_mapping = self._ensure_monotonic_mapping(level_hash_mapping)
|
||||
|
||||
return level_hash_mapping
|
||||
|
||||
def _ensure_monotonic_mapping(self, level_hash_mapping: Dict[int, int]) -> Dict[int, int]:
|
||||
"""Ensure that the level mapping is monotonically increasing (higher level = more #'s)"""
|
||||
if not level_hash_mapping:
|
||||
return level_hash_mapping
|
||||
|
||||
# Sort by level
|
||||
sorted_levels = sorted(level_hash_mapping.keys())
|
||||
adjusted_mapping = {}
|
||||
|
||||
# Ensure that the # count for each level is at least 1 more than the previous level
|
||||
for i, level in enumerate(sorted_levels):
|
||||
current_hash = level_hash_mapping[level]
|
||||
|
||||
if i == 0:
|
||||
# First level, use as is
|
||||
adjusted_mapping[level] = current_hash
|
||||
else:
|
||||
# Ensure at least 1 more # than the previous level
|
||||
prev_level = sorted_levels[i-1]
|
||||
min_required_hash = adjusted_mapping[prev_level] + 1
|
||||
adjusted_mapping[level] = max(current_hash, min_required_hash)
|
||||
|
||||
return adjusted_mapping
|
||||
|
||||
def _extrapolate_hash_count(self, level: int, level_hash_mapping: Dict[int, int]) -> int:
|
||||
"""Infer the number of # numbers for the hierarchy that have not appeared"""
|
||||
if not level_hash_mapping:
|
||||
return level # Fallback to simple 1:1 mapping
|
||||
|
||||
sorted_levels = sorted(level_hash_mapping.keys())
|
||||
|
||||
if level < sorted_levels[0]:
|
||||
# Smaller than the minimum level, infer forward
|
||||
diff = sorted_levels[0] - level
|
||||
return max(1, level_hash_mapping[sorted_levels[0]] - diff)
|
||||
elif level > sorted_levels[-1]:
|
||||
# Larger than the maximum level, infer backward
|
||||
diff = level - sorted_levels[-1]
|
||||
return level_hash_mapping[sorted_levels[-1]] + diff
|
||||
else:
|
||||
# Between known levels, interpolation inference
|
||||
for i in range(len(sorted_levels) - 1):
|
||||
if sorted_levels[i] < level < sorted_levels[i + 1]:
|
||||
# Simple linear interpolation
|
||||
lower_level = sorted_levels[i]
|
||||
upper_level = sorted_levels[i + 1]
|
||||
lower_hash = level_hash_mapping[lower_level]
|
||||
upper_hash = level_hash_mapping[upper_level]
|
||||
|
||||
# Linear interpolation
|
||||
ratio = (level - lower_level) / (upper_level - lower_level)
|
||||
return int(lower_hash + ratio * (upper_hash - lower_hash))
|
||||
|
||||
return level # Fallback
|
||||
|
||||
def _fix_same_level_inconsistency(self, headers: List[HeaderInfo]) -> None:
|
||||
"""Fix inconsistency of # count at the same level"""
|
||||
# Group by level, only process headers with a numbering pattern
|
||||
level_groups = {}
|
||||
for header in headers:
|
||||
if header.number_pattern: # Only process headers with a numbering pattern
|
||||
if header.level not in level_groups:
|
||||
level_groups[header.level] = []
|
||||
level_groups[header.level].append(header)
|
||||
|
||||
# Fix inconsistency of # count within each level
|
||||
for level, group_headers in level_groups.items():
|
||||
if len(group_headers) < 2:
|
||||
continue # Only one header, no need to fix
|
||||
|
||||
# Count the usage of different # counts within the same level
|
||||
hash_count_stats = {}
|
||||
for header in group_headers:
|
||||
hash_count = header.correct_hash_count
|
||||
if hash_count not in hash_count_stats:
|
||||
hash_count_stats[hash_count] = []
|
||||
hash_count_stats[hash_count].append(header)
|
||||
|
||||
# If different # counts exist at the same level
|
||||
if len(hash_count_stats) > 1:
|
||||
# Find the most common # count as the standard
|
||||
most_common_hash_count = max(hash_count_stats.keys(),
|
||||
key=lambda x: len(hash_count_stats[x]))
|
||||
|
||||
# Normalize all headers at this level to the most common '#' count
|
||||
for header in group_headers:
|
||||
header.correct_hash_count = most_common_hash_count
|
||||
|
||||
|
||||
370
vw-document-ai-indexer/main.py
Normal file
370
vw-document-ai-indexer/main.py
Normal file
@@ -0,0 +1,370 @@
|
||||
"""Main application entry point for document processing."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
import traceback
|
||||
from typing import Optional, List, Dict, Any
|
||||
from contextlib import asynccontextmanager
|
||||
from dataclasses import dataclass
|
||||
import argparse
|
||||
import datetime
|
||||
from sqlalchemy import and_
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
|
||||
from app_config import ApplicationConfig, ServiceFactory
|
||||
from business_layer import ProcessingContext
|
||||
from document_task_processor import DocumentTaskProcessor
|
||||
from task_processor import ProcessingStats, Task, TaskProcessor
|
||||
|
||||
from database import init_database, IndexObject, IndexJob
|
||||
from utils import custom_serializer, init_current_data_directory, max_datetime_safe, min_datetime_safe
|
||||
from blob_service import check_files, check_meta,load_metadata
|
||||
from azure_index_service import index_init
|
||||
|
||||
@dataclass
|
||||
class ApplicationContext:
|
||||
"""Application context."""
|
||||
config: ApplicationConfig
|
||||
service_factory: ServiceFactory
|
||||
database_engine: Any
|
||||
logger: logging.Logger
|
||||
|
||||
class DocumentProcessingApplication:
|
||||
"""Main class for document processing application."""
|
||||
def __init__(self, config_path: str, env_path: str = "env.yaml"):
|
||||
self.config_path = config_path
|
||||
self.env_path = env_path
|
||||
self.context: ApplicationContext = None # type: ignore
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.console_logger = logging.getLogger("data_preparation")
|
||||
async def initialize(self):
|
||||
"""Initialize the application."""
|
||||
try:
|
||||
# Load config - load environment and business config separately
|
||||
config = ApplicationConfig.from_env_and_config_files(config_yaml_path=self.config_path, env_yaml_path=self.env_path)
|
||||
config.validate()
|
||||
# Set up logging
|
||||
self._setup_app_logging()
|
||||
# Create service factory
|
||||
service_factory = ServiceFactory(config)
|
||||
# Initialize database (create tables)
|
||||
database_engine = init_database(config.database.uri)
|
||||
self.logger.info("Database initialized successfully")
|
||||
# Validate database engine
|
||||
service_engine = service_factory.get_database_engine()
|
||||
if database_engine.url != service_engine.url:
|
||||
self.logger.warning("Database engines have different URLs, using init_database result")
|
||||
database_engine = service_engine
|
||||
# Create application context
|
||||
self.context = ApplicationContext(config=config, service_factory=service_factory, database_engine=database_engine, logger=self.logger)
|
||||
# Initialize task processor
|
||||
self._initialize_task_processor()
|
||||
|
||||
self.console_logger.info("Application initialized successfully")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to initialize application: {e}")
|
||||
raise
|
||||
def _setup_app_logging(self):
|
||||
self.console_logger.handlers = []
|
||||
self.console_logger.setLevel(logging.DEBUG)
|
||||
self.console_logger.propagate = False
|
||||
# Console output - only show progress and key info
|
||||
console_handler = logging.StreamHandler(sys.stdout)
|
||||
console_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
console_handler.setFormatter(console_formatter)
|
||||
console_handler.setLevel(logging.DEBUG)
|
||||
self.console_logger.addHandler(console_handler)
|
||||
|
||||
def _setup_logging(self, log_file: str = '~'):
|
||||
"""Set up logging configuration."""
|
||||
root_logger = logging.getLogger()
|
||||
root_logger.setLevel(logging.INFO)
|
||||
# Remove existing handlers
|
||||
for handler in root_logger.handlers[:]:
|
||||
root_logger.removeHandler(handler)
|
||||
file_path = f"{log_file}/.chunked/.run.log"
|
||||
# File output - log all details
|
||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||
file_handler = logging.FileHandler(file_path, encoding='utf-8')
|
||||
file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
file_handler.setFormatter(file_formatter)
|
||||
file_handler.setLevel(logging.INFO)
|
||||
root_logger.addHandler(file_handler)
|
||||
self.console_logger.addHandler(file_handler)
|
||||
|
||||
|
||||
async def _initialize_datasource(self, data_config: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Initialize datasource."""
|
||||
try:
|
||||
self.console_logger.info("Loading metadata from blob storage...")
|
||||
sorted_list = await asyncio.to_thread(load_metadata, data_config["data_path"], self.context.config.current_tmp_directory, data_config["data_dir"])
|
||||
doc_metadata_map: dict[str, dict[str, Any]] = {}
|
||||
for item in sorted_list:
|
||||
key = item["filepath"]
|
||||
# Assume there is a timestamp field, keep the latest
|
||||
if key not in doc_metadata_map or item.get("timestamp", 0) > doc_metadata_map[key].get("timestamp", 0):
|
||||
doc_metadata_map[key] = item
|
||||
datasource = {"metadata": doc_metadata_map}
|
||||
self.console_logger.info(f"Loaded {len(doc_metadata_map)} metadata entries")
|
||||
return datasource
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error initializing datasource: {e}")
|
||||
raise
|
||||
def _initialize_task_processor(self):
|
||||
"""Initialize task processor (basic init only)."""
|
||||
if not self.context:
|
||||
raise RuntimeError("Application context not initialized")
|
||||
# Basic task processor config, actual processor will be created per data config
|
||||
self.logger.info("Task processor configuration initialized")
|
||||
|
||||
|
||||
async def run(self):
|
||||
"""Run the application."""
|
||||
if not self.context:
|
||||
raise RuntimeError("Application not initialized")
|
||||
try:
|
||||
self.console_logger.info("Starting document processing application")
|
||||
for i, data_config in enumerate(self.context.config.data_configs, 1):
|
||||
self.console_logger.info(f"Processing data source {i}/{len(self.context.config.data_configs)}")
|
||||
await self._process_data_config(data_config)
|
||||
self.console_logger.info("Document processing application completed")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Application error: {e}")
|
||||
raise
|
||||
|
||||
async def _process_data_config(self, data_config: Dict[str, Any]):
|
||||
"""Process a single data config."""
|
||||
data_path = data_config.get('data_path', '/')
|
||||
self.console_logger.info(f"Processing data source: {data_path}")
|
||||
if not self.context:
|
||||
raise RuntimeError("Application context not initialized")
|
||||
try:
|
||||
base_path: str = data_config.get('base_path', '')
|
||||
|
||||
self.context.config.current_tmp_directory = init_current_data_directory(base_path)
|
||||
self._setup_logging(self.context.config.current_tmp_directory)
|
||||
# 1. Initialize datasource (load metadata)
|
||||
datasource = await self._initialize_datasource(data_config)
|
||||
# 2. Get objects to process
|
||||
objects_to_process = await self._get_objects_to_process(data_config)
|
||||
if not objects_to_process:
|
||||
self.console_logger.info("No new documents to process")
|
||||
return
|
||||
self.console_logger.info(f"Found {len(objects_to_process)} documents to process")
|
||||
|
||||
# 3. Initialize search index schema (ensure search index is created and configured)
|
||||
await self._initialize_search_index(data_config, self.context.config)
|
||||
|
||||
# 4. Create task processor with datasource
|
||||
task_processor_impl = DocumentTaskProcessor(config=self.context.config, service_factory=self.context.service_factory, tmp_directory=self.context.config.current_tmp_directory, database_engine=self.context.database_engine, logger=self.logger, datasource=datasource,data_config=data_config)
|
||||
|
||||
# 5. Task processor
|
||||
simple_processor = TaskProcessor(task_processor=task_processor_impl, max_workers=self.context.config.processing.max_workers, logger=self.console_logger, database_engine=self.context.database_engine,data_config=data_config)
|
||||
# Create tasks
|
||||
tasks = self._create_tasks(objects_to_process, data_config,self.context.config)
|
||||
self.console_logger.info(f"Starting processing of {len(tasks)} tasks")
|
||||
# Synchronously process all tasks
|
||||
await asyncio.to_thread(simple_processor.process_tasks, tasks)
|
||||
|
||||
# Get processing stats
|
||||
stats = ProcessingStats(total_tasks=simple_processor.total_tasks, completed_tasks=simple_processor.completed_tasks, failed_tasks=simple_processor.failed_tasks, start_time=simple_processor.start_time or datetime.datetime.now())
|
||||
self.console_logger.info(json.dumps(stats, ensure_ascii=False, default=custom_serializer))
|
||||
|
||||
# Update job status
|
||||
datasource_name = data_config.get("datasource_name", "default")
|
||||
await self._update_index_job_status(stats, datasource_name)
|
||||
except Exception as e:
|
||||
self.console_logger.error(f"Error processing data config: {traceback.format_exc()}")
|
||||
self.console_logger.error(f"Error processing data config: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
async def _get_objects_to_process(self, data_config: Dict[str, Any]) -> List[IndexObject]:
|
||||
"""Get objects to process."""
|
||||
try:
|
||||
# 1. Get last successful processing time from database
|
||||
datasource_name = data_config.get("datasource_name", "default")
|
||||
Session = sessionmaker(bind=self.context.database_engine)
|
||||
session = Session()
|
||||
try:
|
||||
last_success_doc_job = session.query(IndexJob).filter(
|
||||
and_(
|
||||
IndexJob.status == "success",
|
||||
IndexJob.doc_upper_time.is_not(None),
|
||||
IndexJob.datasource_name == datasource_name
|
||||
)
|
||||
).order_by(IndexJob.id.desc()).first()
|
||||
|
||||
last_success_meta_job = session.query(IndexJob).filter(
|
||||
and_(
|
||||
IndexJob.status == "success",
|
||||
IndexJob.metadata_upper_time.is_not(None),
|
||||
IndexJob.datasource_name == datasource_name
|
||||
)
|
||||
).order_by(IndexJob.id.desc()).first()
|
||||
|
||||
doc_upper_time = last_success_doc_job.doc_upper_time if last_success_doc_job and last_success_doc_job.doc_upper_time else None
|
||||
metadata_upper_time = last_success_meta_job.metadata_upper_time if last_success_meta_job and last_success_meta_job.metadata_upper_time else None
|
||||
self.console_logger.info(f"Checking for updates in datasource '{datasource_name}' since doc: {doc_upper_time}, metadata: {metadata_upper_time}")
|
||||
finally:
|
||||
session.close()
|
||||
# 2. Check file updates (only get files updated after baseline)
|
||||
new_files = await asyncio.to_thread(check_files, data_config["data_path"], doc_upper_time)
|
||||
# 3. Check metadata updates (only get metadata updated after baseline)
|
||||
new_metas:list[dict[Any, Any]] = await asyncio.to_thread(check_meta, data_config["data_path"], metadata_upper_time, self.context.config.current_tmp_directory, data_config["data_dir"])
|
||||
self.console_logger.info(f"Found {len(new_files)} updated files and {len(new_metas)} updated metadata entries")
|
||||
|
||||
# When process_file_num > 0, keep only the files that have a matching metadata entry (by name), truncate to process_file_num files, then keep only the metadata entries for those remaining files
|
||||
if data_config["process_file_num"]>0:
|
||||
new_files = [file_info for file_info in new_files if file_info["name"] in {meta["name"] for meta in new_metas}]
|
||||
if len(new_files) > data_config["process_file_num"]:
|
||||
new_files = new_files[:data_config["process_file_num"]]
|
||||
# Filter new_metas according to the latest number of new_files
|
||||
new_metas = [meta_info for meta_info in new_metas if meta_info["name"] in {file_info["name"] for file_info in new_files}]
|
||||
|
||||
self.console_logger.info(f"After filtering, {len(new_files)} files and {len(new_metas)} metadata entries to process")
|
||||
|
||||
# 4. Merge file and metadata info, create processing objects
|
||||
objects_to_process:list[IndexObject] = []
|
||||
for file_info in new_files:
|
||||
index_object = IndexObject(object_key=file_info["name"], type="document", doc_modifed_time=file_info.get("doc_upper_time"))
|
||||
objects_to_process.append(index_object)
|
||||
for meta_info in new_metas:
|
||||
existing_obj = next((obj for obj in objects_to_process if obj.object_key == meta_info["name"]), None)
|
||||
if existing_obj:
|
||||
existing_obj.metadata_modifed_time = meta_info.get("meta_upper_time")
|
||||
else:
|
||||
index_object = IndexObject(object_key=meta_info["name"], type="document", metadata_modifed_time=meta_info.get("meta_upper_time"))
|
||||
objects_to_process.append(index_object)
|
||||
# 5. If there are objects to process, create a new job record
|
||||
if objects_to_process:
|
||||
await self._create_index_job(objects_to_process, data_config.get("datasource_name", "default"))
|
||||
return objects_to_process
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error getting objects to process: {e}")
|
||||
raise
|
||||
|
||||
|
||||
async def _create_index_job(self, objects_to_process: List[IndexObject], datasource_name: str):
|
||||
"""Create index job record."""
|
||||
try:
|
||||
Session = sessionmaker(bind=self.context.database_engine)
|
||||
session = Session()
|
||||
try:
|
||||
index_job_db = IndexJob(
|
||||
start_time=datetime.datetime.now(datetime.timezone.utc),
|
||||
status="processing",
|
||||
total_process_count=len(objects_to_process),
|
||||
datasource_name=datasource_name
|
||||
)
|
||||
for index_object in objects_to_process:
|
||||
index_job_db.doc_upper_time = max_datetime_safe(index_object.doc_modifed_time, index_job_db.doc_upper_time)
|
||||
index_job_db.doc_lower_time = min_datetime_safe(index_object.doc_modifed_time, index_job_db.doc_lower_time)
|
||||
index_job_db.metadata_upper_time = max_datetime_safe(index_object.metadata_modifed_time, index_job_db.metadata_upper_time)
|
||||
index_job_db.metadata_lower_time = min_datetime_safe(index_object.metadata_modifed_time, index_job_db.metadata_lower_time)
|
||||
# Set datasource_name for each index object
|
||||
index_object.datasource_name = datasource_name
|
||||
session.add(index_job_db)
|
||||
session.commit()
|
||||
self.console_logger.info(f"Created processing job for {len(objects_to_process)} objects in datasource: {datasource_name}")
|
||||
finally:
|
||||
session.close()
|
||||
except Exception as e:
|
||||
self.console_logger.error(f"Error creating index job: {e}")
|
||||
raise
|
||||
|
||||
|
||||
async def _update_index_job_status(self, stats: ProcessingStats, datasource_name: str = "default"):
|
||||
"""Update index job status."""
|
||||
try:
|
||||
Session = sessionmaker(bind=self.context.database_engine)
|
||||
session = Session()
|
||||
try:
|
||||
current_job = session.query(IndexJob).filter(
|
||||
and_(
|
||||
IndexJob.status == "processing",
|
||||
IndexJob.datasource_name == datasource_name
|
||||
)
|
||||
).order_by(IndexJob.id.desc()).first()
|
||||
if current_job:
|
||||
if stats.failed_tasks == 0 and stats.completed_tasks == stats.total_tasks:
|
||||
current_job.status = "success"
|
||||
elif stats.completed_tasks > 0 and stats.failed_tasks > 0:
|
||||
current_job.status = "partial_success"
|
||||
else:
|
||||
current_job.status = "failed"
|
||||
current_job.end_time = datetime.datetime.now(datetime.timezone.utc)
|
||||
current_job.success_count = stats.completed_tasks
|
||||
current_job.failed_count = stats.failed_tasks
|
||||
session.commit()
|
||||
self.console_logger.info(f"Job completed for datasource '{datasource_name}': {current_job.status}")
|
||||
finally:
|
||||
session.close()
|
||||
except Exception as e:
|
||||
self.console_logger.error(f"Error updating job status: {e}")
|
||||
|
||||
def _create_tasks(self, objects: List[IndexObject], data_config: Dict[str, Any], config: ApplicationConfig) -> List[Task]:
|
||||
"""Create task list."""
|
||||
tasks:list[Task] = []
|
||||
datasource_name = data_config.get("datasource_name", "default")
|
||||
for obj in objects:
|
||||
context = ProcessingContext(
|
||||
object_key=obj.object_key,
|
||||
data_config=data_config,
|
||||
metadata={
|
||||
"doc_modified_time": obj.doc_modifed_time,
|
||||
"metadata_modified_time": obj.metadata_modifed_time
|
||||
},
|
||||
current_tmp_directory=self.context.config.current_tmp_directory,
|
||||
datasource_name=datasource_name,
|
||||
config=config
|
||||
)
|
||||
task = Task(id = obj.object_key , payload=context, priority=0)
|
||||
tasks.append(task)
|
||||
return tasks
|
||||
async def shutdown(self):
|
||||
"""Shutdown application."""
|
||||
self.console_logger.info("Application shutdown completed")
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def application_context(self):
|
||||
"""Application context manager."""
|
||||
await self.initialize()
|
||||
try:
|
||||
yield self
|
||||
finally:
|
||||
await self.shutdown()
|
||||
async def _initialize_search_index(self, data_config: Dict[str, Any],applicationconfig: ApplicationConfig):
|
||||
"""Initialize search index schema, ensure search index is created and configured."""
|
||||
try:
|
||||
self.console_logger.info("Initializing search index schema...")
|
||||
await asyncio.to_thread(index_init, data_config, applicationconfig.azure_services.search_admin_key, applicationconfig.azure_services.search_service_name)
|
||||
self.console_logger.info("Search index schema initialized successfully")
|
||||
except Exception as e:
|
||||
self.console_logger.error(f"Error initializing search index: {e}")
|
||||
raise
|
||||
|
||||
async def main():
|
||||
"""Main function."""
|
||||
parser = argparse.ArgumentParser(description="Document Processing Application (Refactored)")
|
||||
parser.add_argument("--config", type=str, default="config.yaml", help="Business configuration file path")
|
||||
parser.add_argument("--env", type=str, default="env.yaml", help="Environment variables file path")
|
||||
parser.add_argument("--log-level", type=str, default="INFO", help="Log level")
|
||||
args = parser.parse_args()
|
||||
app = DocumentProcessingApplication(args.config, args.env)
|
||||
try:
|
||||
async with app.application_context():
|
||||
await app.run()
|
||||
except KeyboardInterrupt:
|
||||
print("Application interrupted by user")
|
||||
except Exception as e:
|
||||
print(f"Application error: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
9
vw-document-ai-indexer/prompt.yaml
Normal file
9
vw-document-ai-indexer/prompt.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# Prompt
|
||||
caption:
|
||||
en:
|
||||
system: "yaml You are a captioning model that helps uses find descriptive captions."
|
||||
user: "yaml Describe this image as if you were describing it to someone who can't see it."
|
||||
"zh-Hans":
|
||||
system: "yaml 您是一个帮助用户寻找描述性字幕的字幕模型。"
|
||||
user: "yaml 描述此图像就像您将其描述给看不见的人一样。"
|
||||
|
||||
37
vw-document-ai-indexer/pyproject.toml
Normal file
37
vw-document-ai-indexer/pyproject.toml
Normal file
@@ -0,0 +1,37 @@
|
||||
[project]
|
||||
name = "data preparation"
|
||||
version = "0.1.0"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"azure-identity == 1.15.0",
|
||||
"openai == 1.55.3",
|
||||
"azure-search-documents == 11.4.0b6",
|
||||
"azure-storage-blob == 12.17.0",
|
||||
"python-dotenv == 1.0.0",
|
||||
"httpx",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-ai-formrecognizer == 3.3.0",
|
||||
"markdown",
|
||||
"tqdm",
|
||||
"PyMuPDF",
|
||||
"tiktoken",
|
||||
"langchain",
|
||||
"bs4",
|
||||
"urllib3",
|
||||
"six",
|
||||
"pdf2image",
|
||||
"opencv-python",
|
||||
"Pillow",
|
||||
"chardet",
|
||||
"SQLAlchemy == 2.0.41",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
test = ["pytest", "pytest-asyncio"]
|
||||
dev = []
|
||||
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
asyncio_mode = "auto"
|
||||
30
vw-document-ai-indexer/requirements.txt
Normal file
30
vw-document-ai-indexer/requirements.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
azure-identity==1.15.0
|
||||
openai==1.55.3
|
||||
azure-search-documents==11.5.0
|
||||
azure-storage-blob==12.17.0
|
||||
python-dotenv==1.0.0
|
||||
httpx
|
||||
azure-ai-documentintelligence
|
||||
azure-ai-formrecognizer==3.3.0
|
||||
markdown
|
||||
tqdm
|
||||
PyMuPDF
|
||||
tiktoken
|
||||
langchain
|
||||
langchain-openai
|
||||
langchain-core
|
||||
langchain-community
|
||||
bs4
|
||||
urllib3
|
||||
pytest
|
||||
pytest-asyncio
|
||||
six
|
||||
pdf2image
|
||||
opencv-python
|
||||
Pillow
|
||||
chardet
|
||||
SQLAlchemy==2.0.41
|
||||
psycopg2==2.9.10
|
||||
pyyaml==6.0.2
|
||||
uuid6==2025.0.1
|
||||
dataclasses-json==0.6.7
|
||||
209
vw-document-ai-indexer/resilient_http_pool.py
Normal file
209
vw-document-ai-indexer/resilient_http_pool.py
Normal file
@@ -0,0 +1,209 @@
|
||||
"""
|
||||
Resilient HTTP Connection Pool Manager
|
||||
"""
|
||||
import atexit
|
||||
|
||||
import threading
|
||||
from contextlib import contextmanager
|
||||
from typing import Dict, Generator
|
||||
import httpx
|
||||
|
||||
class ResilientConnectionManager:
|
||||
"""
|
||||
Elastic Connection Manager
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._connection_pools: Dict[str, httpx.Client] = {}
|
||||
self._pool_lock = threading.Lock()
|
||||
self._is_closed = False
|
||||
|
||||
# Register cleanup of all connection pools to run when the program exits
|
||||
atexit.register(self._cleanup_all_pools)
|
||||
|
||||
def get_persistent_client(self, service_profile: str = "standard") -> httpx.Client:
|
||||
"""
|
||||
Get persistent client - main interface
|
||||
|
||||
Args:
|
||||
service_profile: Service configuration file
|
||||
- "standard": General API (60s timeout)
|
||||
- "cloud_api": Cloud API (120s timeout, suitable for Azure)
|
||||
- "ai_inference": AI Reasoning Services (180s timeout, suitable for OpenAI/VLLM)
|
||||
- "batch_processing": Batch Processing Services (300s timeout)
|
||||
"""
|
||||
if self._is_closed:
|
||||
raise RuntimeError("Connection manager is closed")
|
||||
|
||||
if service_profile not in self._connection_pools:
|
||||
with self._pool_lock:
|
||||
# Double-checked locking pattern
|
||||
if service_profile not in self._connection_pools:
|
||||
self._connection_pools[service_profile] = self._create_optimized_client(service_profile)
|
||||
|
||||
return self._connection_pools[service_profile]
|
||||
|
||||
def _create_optimized_client(self, service_profile: str) -> httpx.Client:
|
||||
"""Create an optimized client based on the service profile"""
|
||||
|
||||
# Service profile mapping
|
||||
profile_configs = {
|
||||
"standard": {
|
||||
"timeout": 60.0,
|
||||
"max_keepalive": 10,
|
||||
"max_connections": 50,
|
||||
"description": "General API Services"
|
||||
},
|
||||
"cloud_api": {
|
||||
"timeout": 120.0,
|
||||
"max_keepalive": 8,
|
||||
"max_connections": 25,
|
||||
"description": "Cloud API Services (Azure Search, Storage ...)"
|
||||
},
|
||||
"ai_inference": {
|
||||
"timeout": 180.0,
|
||||
"max_keepalive": 5,
|
||||
"max_connections": 15,
|
||||
"description": "AI Reasoning Services (OpenAI, VLLM ...)"
|
||||
},
|
||||
"batch_processing": {
|
||||
"timeout": 300.0,
|
||||
"max_keepalive": 3,
|
||||
"max_connections": 10,
|
||||
"description": "Batch processing and long-term tasks"
|
||||
}
|
||||
}
|
||||
|
||||
config = profile_configs.get(service_profile, profile_configs["standard"])
|
||||
|
||||
return httpx.Client(
|
||||
timeout=config["timeout"],
|
||||
limits=httpx.Limits(
|
||||
max_keepalive_connections=config["max_keepalive"],
|
||||
max_connections=config["max_connections"],
|
||||
keepalive_expiry=300 # 5 minutes to keep alive
|
||||
),
|
||||
follow_redirects=True,
|
||||
verify=False  # NOTE: TLS certificate verification is disabled for these pooled clients
|
||||
)
|
||||
|
||||
@contextmanager
|
||||
def resilient_session(self, service_profile: str = "standard"):
|
||||
"""
|
||||
Resilient session context manager - recommended for retry scenarios
|
||||
|
||||
Example of usage:
|
||||
with connection_manager.resilient_session("ai_inference") as client:
|
||||
for retry in range(3):
|
||||
response = client.post(...)
|
||||
"""
|
||||
client = self.get_persistent_client(service_profile)
|
||||
# Directly return the client without using the with statement
|
||||
# Because the client is already managed in the connection pool, no additional context management is needed
|
||||
try:
|
||||
yield client
|
||||
finally:
|
||||
# Do not close the client here, keep the connection pool alive
|
||||
pass
|
||||
|
||||
def get_pool_statistics(self) -> Dict[str, Dict]:
|
||||
"""Get connection pool statistics - for monitoring"""
|
||||
stats = {}
|
||||
with self._pool_lock:
|
||||
for profile, client in self._connection_pools.items():
|
||||
try:
|
||||
# httpx internal connection pool information
|
||||
pool_info = {
|
||||
"is_closed": client.is_closed,
|
||||
"timeout": str(client.timeout),
|
||||
"max_connections": client._transport._pool._pool_factory.limits.max_connections, # type: ignore
|
||||
"profile": profile
|
||||
}
|
||||
stats[profile] = pool_info
|
||||
except Exception:
|
||||
stats[profile] = {"error": "Statistical information cannot be obtained"}
|
||||
return stats
|
||||
|
||||
def force_refresh_pool(self, service_profile: str):
|
||||
"""Force refresh the specified connection pool - for fault recovery"""
|
||||
with self._pool_lock:
|
||||
if service_profile in self._connection_pools:
|
||||
try:
|
||||
self._connection_pools[service_profile].close()
|
||||
except Exception:
|
||||
pass
|
||||
del self._connection_pools[service_profile]
|
||||
|
||||
def _cleanup_all_pools(self):
|
||||
"""Clean all connection pools - Memory security"""
|
||||
with self._pool_lock:
|
||||
if not self._is_closed:
|
||||
for profile, client in list(self._connection_pools.items()):
|
||||
try:
|
||||
client.close()
|
||||
except Exception:
|
||||
pass # Ignore errors during cleaning
|
||||
|
||||
self._connection_pools.clear()
|
||||
self._is_closed = True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Global instances and convenient interfaces
|
||||
# =============================================================================
|
||||
|
||||
# Global resilient connection manager
|
||||
_resilient_manager = ResilientConnectionManager()
|
||||
|
||||
# Main public interface
|
||||
def get_persistent_http_client(service_profile: str = "standard") -> httpx.Client:
|
||||
"""
|
||||
Get persistent HTTP client - main interface
|
||||
|
||||
Recommended service configuration profiles:
|
||||
- "standard": generic API
|
||||
- "cloud_api": Azure/cloud service API
|
||||
- "ai_inference": OpenAI/VLLM etc. AI services
|
||||
- "batch_processing": long-term batch processing tasks
|
||||
"""
|
||||
return _resilient_manager.get_persistent_client(service_profile)
|
||||
|
||||
def resilient_http_session(service_profile: str = "standard"):
|
||||
"""
|
||||
Resilient HTTP session context manager - recommended for retry logic
|
||||
|
||||
Example of usage:
|
||||
with resilient_http_session("ai_inference") as client:
|
||||
for retry in range(3):
|
||||
response = client.post(endpoint, json=data)
|
||||
"""
|
||||
return _resilient_manager.resilient_session(service_profile)
|
||||
|
||||
def get_connection_pool_stats() -> Dict[str, Dict]:
|
||||
"""Get connection pool statistics"""
|
||||
return _resilient_manager.get_pool_statistics()
|
||||
|
||||
def refresh_connection_pool(service_profile: str):
|
||||
"""Refresh the specified connection pool"""
|
||||
_resilient_manager.force_refresh_pool(service_profile)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Convenient dedicated client interfaces - more intuitive naming
|
||||
# =============================================================================
|
||||
|
||||
def get_standard_client() -> httpx.Client:
|
||||
"""Get the standard client (generic HTTP request)"""
|
||||
return get_persistent_http_client("standard")
|
||||
|
||||
def get_cloud_api_client() -> httpx.Client:
|
||||
"""Get dedicated cloud API clients (Azure Search, Storage, etc.)"""
|
||||
return get_persistent_http_client("cloud_api")
|
||||
|
||||
def get_ai_inference_client() -> httpx.Client:
|
||||
"""Get AI Inference Dedicated Clients (OpenAI, VLLM, etc.)"""
|
||||
return get_persistent_http_client("ai_inference")
|
||||
|
||||
def get_batch_processing_client() -> httpx.Client:
|
||||
"""Get a batch-specific client (long-term task)"""
|
||||
return get_persistent_http_client("batch_processing")
|
||||
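A minimal usage sketch for the pool above, assuming the module is imported as resilient_http_pool; the endpoint URL and the retry/backoff policy are placeholders, not part of this module:

import time

import httpx

from resilient_http_pool import resilient_http_session


def post_with_retries(payload: dict, retries: int = 3) -> httpx.Response:
    # Reuse the pooled "ai_inference" client across attempts instead of
    # opening a new connection per retry.
    with resilient_http_session("ai_inference") as client:
        last_error: Exception | None = None
        for attempt in range(retries):
            try:
                response = client.post("https://vllm.example.internal/v1/chat/completions", json=payload)
                response.raise_for_status()
                return response
            except Exception as exc:  # timeouts, connection errors, 4xx/5xx
                last_error = exc
                time.sleep(2 ** attempt)  # assumed exponential backoff
        raise RuntimeError(f"Request failed after {retries} attempts") from last_error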
243
vw-document-ai-indexer/task_processor.py
Normal file
243
vw-document-ai-indexer/task_processor.py
Normal file
@@ -0,0 +1,243 @@
|
||||
import time
|
||||
from typing import List, Any, Optional, Dict
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
import json
|
||||
import datetime
|
||||
import traceback
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from abc import ABC, abstractmethod
|
||||
from sqlalchemy import and_
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from database import IndexJobStatus, IndexJob
|
||||
|
||||
from utils import custom_serializer
|
||||
|
||||
|
||||
@dataclass
|
||||
class Task:
|
||||
"""Task object"""
|
||||
id: str
|
||||
payload: Any
|
||||
priority: int = 0
|
||||
status: IndexJobStatus = IndexJobStatus.PENDING
|
||||
created_at: float = field(default_factory=time.time)
|
||||
started_at: Optional[float] = None
|
||||
completed_at: Optional[float] = None
|
||||
error: Optional[Exception] = None
|
||||
result: Any = None
|
||||
|
||||
def __lt__(self, other):
|
||||
"""Used for priority queue sorting"""
|
||||
return self.priority > other.priority
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcessingStats:
|
||||
"""Processing statistics information"""
|
||||
total_tasks: int = 0
|
||||
completed_tasks: int = 0
|
||||
failed_tasks: int = 0
|
||||
cancelled_tasks: int = 0
|
||||
average_processing_time: float = 0.0
|
||||
throughput: float = 0.0 # Number of tasks processed per second
|
||||
start_time: datetime.datetime = field(default_factory=datetime.datetime.now)  # evaluated per instance, not at class definition
|
||||
|
||||
@property
|
||||
def success_rate(self) -> float:
|
||||
"""Success rate"""
|
||||
if self.total_tasks == 0:
|
||||
return 0.0
|
||||
return self.completed_tasks / self.total_tasks
|
||||
|
||||
@property
|
||||
def pending_tasks(self) -> int:
|
||||
"""Number of pending tasks"""
|
||||
return self.total_tasks - self.completed_tasks - self.failed_tasks - self.cancelled_tasks
|
||||
|
||||
@property
|
||||
def elapsed_time(self) -> float:
|
||||
"""Elapsed time"""
|
||||
time_diff = datetime.datetime.now() - self.start_time
|
||||
return time_diff.total_seconds()
|
||||
|
||||
@property
|
||||
def eta(self) -> float:
|
||||
"""Estimated remaining time"""
|
||||
if self.completed_tasks == 0:
|
||||
return 0.0
|
||||
rate = self.completed_tasks / self.elapsed_time
|
||||
if rate == 0:
|
||||
return 0.0
|
||||
return self.pending_tasks / rate
|
||||
|
||||
class TaskProcessorInterface(ABC):
|
||||
@abstractmethod
|
||||
def process(self, task: Task) -> Any:
|
||||
pass
|
||||
|
||||
class TaskProcessor:
|
||||
"""Task processor"""
|
||||
|
||||
def __init__(self,
|
||||
task_processor: TaskProcessorInterface,
|
||||
max_workers: int = 4,
|
||||
logger: Optional[logging.Logger] = None,
|
||||
database_engine: Optional[Any] = None,
|
||||
data_config:Optional[dict[str,Any]] = None):
|
||||
|
||||
if data_config is None:
|
||||
raise ValueError("data_config must be provided")
|
||||
|
||||
self.task_processor = task_processor
|
||||
self.max_workers = max_workers
|
||||
self.logger = logger or logging.getLogger(__name__)
|
||||
self.database_engine = database_engine
|
||||
|
||||
# Simple statistics
|
||||
self.total_tasks = 0
|
||||
self.completed_tasks = 0
|
||||
self.failed_tasks = 0
|
||||
self.start_time:datetime.datetime|None = None
|
||||
|
||||
# Processing report collection
|
||||
self.processing_reports: List[Dict[str, Any]] = []
|
||||
|
||||
# Control variable
|
||||
self.should_stop = False
|
||||
|
||||
self.data_config = data_config
|
||||
self.datasource_name: str = data_config.get("datasource_name", "default")
|
||||
|
||||
def process_tasks(self, tasks: List[Any]) -> None:
|
||||
"""Process task list - simple and effective"""
|
||||
self.total_tasks = len(tasks)
|
||||
self.completed_tasks = 0
|
||||
self.failed_tasks = 0
|
||||
self.start_time = datetime.datetime.now()
|
||||
self.processing_reports = []
|
||||
|
||||
self.logger.info(f"Starting to process {self.total_tasks} tasks")
|
||||
|
||||
# Use thread pool to process tasks
|
||||
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
||||
# Submit all tasks
|
||||
future_to_task = {executor.submit(self._process_single_task, task): task
|
||||
for task in tasks}
|
||||
|
||||
# Wait for tasks to complete
|
||||
for future in as_completed(future_to_task):
|
||||
if self.should_stop:
|
||||
break
|
||||
|
||||
task = future_to_task[future]
|
||||
try:
|
||||
result = future.result()
|
||||
self.completed_tasks += 1
|
||||
|
||||
# Record successful processing report
|
||||
report: dict[str, Any] = {
    'task_id': getattr(task, 'id', 'unknown'),
    'status': 'success',
    'message': getattr(result, 'message', 'Processing completed'),
    'chunks_count': getattr(result, 'chunks_count', 0),
    'processing_time': getattr(result, 'processing_time', 0)
}
|
||||
|
||||
self.processing_reports.append(report)
|
||||
|
||||
# Log progress after each completed task
|
||||
self._log_progress()
|
||||
|
||||
except Exception:
|
||||
self.failed_tasks += 1
|
||||
self.logger.error(f"Task processing failed: {traceback.format_exc()}")
|
||||
# Record failed processing report
|
||||
report = {
    'task_id': getattr(task, 'id', 'unknown'),
    'status': 'failed',
    'error': traceback.format_exc(),
    'processing_time': 0
}
|
||||
self.processing_reports.append(report)
|
||||
# Output final statistics
|
||||
self.finalize_job_status_and_log()
|
||||
|
||||
def _process_single_task(self, task: Any) -> Any:
|
||||
"""Process a single task"""
|
||||
return self.task_processor.process(task)
|
||||
|
||||
def get_processing_reports(self) -> List[Dict[str, Any]]:
|
||||
"""Get processing reports"""
|
||||
return self.processing_reports
|
||||
|
||||
def _log_progress(self) -> None:
|
||||
"""Output progress information (estimate remaining time based on average time per processed document)"""
|
||||
if self.start_time is None:
|
||||
return
|
||||
elapsed = (datetime.datetime.now() - self.start_time).total_seconds() if self.start_time else 0
|
||||
total_processed = self.completed_tasks + self.failed_tasks
|
||||
remaining = self.total_tasks - total_processed
|
||||
# Total processing time for processed tasks
|
||||
total_processing_time = sum(r.get('processing_time', 0) for r in self.processing_reports)
|
||||
avg_processing_time = (total_processing_time / total_processed) if total_processed > 0 else 0
|
||||
eta = avg_processing_time * remaining
|
||||
if total_processed > 0:
|
||||
rate = total_processed / elapsed if elapsed > 0 else 0
|
||||
self.logger.info(
|
||||
f"Progress: {total_processed}/{self.total_tasks} "
|
||||
f"({100.0 * total_processed / self.total_tasks:.1f}%) "
|
||||
f"Success: {self.completed_tasks} Failed: {self.failed_tasks} "
|
||||
f"Rate: {rate:.2f} tasks/second "
|
||||
f"Average time: {avg_processing_time:.2f} seconds/task "
|
||||
f"Estimated remaining: {eta / 60:.1f} minutes"
|
||||
)
|
||||
|
||||
def finalize_job_status_and_log(self) -> None:
|
||||
"""Statistics, write IndexJob status, and output all log details."""
|
||||
elapsed = (datetime.datetime.now() - self.start_time).total_seconds() if self.start_time else 0
|
||||
success_count = self.completed_tasks
|
||||
fail_count = self.failed_tasks
|
||||
total_count = self.total_tasks
|
||||
success_rate = (success_count / total_count * 100) if total_count > 0 else 0.0
|
||||
status = IndexJobStatus.FAILED.value
|
||||
if total_count == success_count:
|
||||
status = IndexJobStatus.SUCCESS.value
|
||||
elif success_count > 0 and fail_count > 0:
|
||||
status = IndexJobStatus.PARTIAL_SUCCESS.value
|
||||
|
||||
report:dict[str,Any] = {
|
||||
"status": status,
|
||||
"success_rate": f"{success_rate:.4f}%",
|
||||
"total_tasks": total_count,
|
||||
"completed": success_count,
|
||||
"failed": fail_count,
|
||||
"start_time": self.start_time,
|
||||
"end_time": datetime.datetime.now(datetime.timezone.utc),
|
||||
"processing_time": f"{elapsed:.4f} sec",
|
||||
"total_elapsed": f"{elapsed / 3600:.4f} hours ",
|
||||
"average_speed": f"{total_count / elapsed:.5f} tasks/sec" if elapsed > 0 else "average speed: 0 tasks/sec"
|
||||
}
|
||||
# Database write section
|
||||
if self.database_engine:
|
||||
try:
|
||||
Session = sessionmaker(bind=self.database_engine)
|
||||
session = Session()
|
||||
try:
|
||||
current_job = (
    session.query(IndexJob)
    .filter(and_(IndexJob.status == "processing", IndexJob.datasource_name == self.datasource_name))
    .order_by(IndexJob.id.desc())
    .first()
)
|
||||
if current_job:
|
||||
setattr(current_job, 'finished_time', report["end_time"])
|
||||
setattr(current_job, 'success_object_count', success_count)
|
||||
setattr(current_job, 'failed_object_count', fail_count)
|
||||
setattr(current_job, 'detailed_message', json.dumps(report, default=custom_serializer, ensure_ascii=False))
|
||||
session.commit()
|
||||
self.logger.info(f"IndexJob status updated: {current_job.status}, Success: {current_job.success_object_count}, Failed: {current_job.failed_object_count}")
|
||||
else:
|
||||
self.logger.warning("No IndexJob record with processing status found")
|
||||
finally:
|
||||
session.close()
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to update IndexJob status: {e}")
|
||||
# Output merged report content
|
||||
self.logger.info(f"Final report: {json.dumps(report, default=custom_serializer, ensure_ascii=False)}")
|
||||
if self.processing_reports:
|
||||
success_reports = [r for r in self.processing_reports if r['status'] == 'success']
|
||||
failed_reports = [r for r in self.processing_reports if r['status'] == 'failed']
|
||||
if success_reports:
|
||||
total_chunks = sum(r.get('chunks_count', 0) for r in success_reports)
|
||||
avg_processing_time = sum(r.get('processing_time', 0) for r in success_reports) / len(success_reports)
|
||||
self.logger.info(f"Success reports: {len(success_reports)} tasks, total {total_chunks} chunks, average processing time {avg_processing_time:.2f} sec")
|
||||
if failed_reports:
|
||||
self.logger.error(f"Failed reports: {len(failed_reports)} tasks")
|
||||
for r in failed_reports[:5]:
|
||||
self.logger.error(f" - {r['task_id']}: {r['error']}")
|
||||
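For context, a minimal sketch of plugging a concrete processor into TaskProcessor; EchoProcessor and the "demo" datasource name are illustrative assumptions, not part of this module:

from types import SimpleNamespace

from task_processor import Task, TaskProcessor, TaskProcessorInterface


class EchoProcessor(TaskProcessorInterface):
    def process(self, task: Task):
        # Return an object exposing the attributes the report collection reads
        # via getattr(): message, chunks_count, processing_time.
        return SimpleNamespace(message=f"processed {task.id}", chunks_count=1, processing_time=0.01)


tasks = [Task(id=str(i), payload={"n": i}) for i in range(5)]
processor = TaskProcessor(
    task_processor=EchoProcessor(),
    max_workers=2,
    data_config={"datasource_name": "demo"},  # required; database_engine omitted in this sketch
)
processor.process_tasks(tasks)
print(processor.get_processing_reports())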
78
vw-document-ai-indexer/third_level_service.py
Normal file
78
vw-document-ai-indexer/third_level_service.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""
|
||||
Third-level heading recommendation algorithm - counts only the most frequently used number of '#' characters
|
||||
"""
|
||||
|
||||
from collections import Counter
|
||||
from typing import Dict, Any, List
|
||||
import re
|
||||
|
||||
def get_third_level_hash_counts_simple(content: str) -> List[int]:
|
||||
hash_counts = []
|
||||
in_code_block = False
|
||||
|
||||
for line in content.split('\n'):
|
||||
line = line.strip()
|
||||
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Processing code blocks
|
||||
if line.startswith('```'):
|
||||
in_code_block = not in_code_block
|
||||
continue
|
||||
|
||||
if in_code_block:
|
||||
continue
|
||||
|
||||
# Match a heading line: one or more '#' followed by a space and the heading text
|
||||
match = re.match(r'^(#{1,6})\s+(.+)$', line)
|
||||
if match:
|
||||
hash_count = len(match.group(1))
|
||||
title_text = match.group(2).strip()
|
||||
|
||||
# Check if it is a third-level heading - supports two formats:
|
||||
# 1. Traditional numeric format: "1.2.3", "1 . 2 . 3", "1. 2. 3", etc.
|
||||
# 2. Letter+number format: "A.1.2.3" (treat A.x.x.x as a third-level heading)
|
||||
|
||||
is_third_level = False
|
||||
|
||||
# Traditional numeric third-level format: x.x.x
|
||||
if re.match(r'^\d+\s*\.\s*\d+\s*\.\s*\d+(?:\s|$|[^\d\.])', title_text):
|
||||
is_third_level = True
|
||||
|
||||
# Letter+number third-level format: A.x.x.x (treat as third-level heading)
|
||||
elif re.match(r'^[A-Z]\.\d+\.\d+\.\d+(?:\s|$|[^\d\.])', title_text):
|
||||
is_third_level = True
|
||||
|
||||
if is_third_level:
|
||||
hash_counts.append(hash_count)
|
||||
|
||||
return hash_counts
|
||||
|
||||
def get_recommended_hash_count_simple(content: str) -> Dict[str, Any]:
|
||||
hash_counts = get_third_level_hash_counts_simple(content)
|
||||
|
||||
if not hash_counts:
|
||||
return {
|
||||
'recommendation': 5, # Default value
|
||||
'reason': 'No third-level headings detected, using default value',
|
||||
'statistics': {},
|
||||
'total_count': 0
|
||||
}
|
||||
|
||||
# Count the frequency of various # usage
|
||||
usage_stats = Counter(hash_counts)
|
||||
|
||||
# Select the most frequently used # count
|
||||
most_common = usage_stats.most_common(1)[0]
|
||||
recommended_hash_count = most_common[0]
|
||||
frequency = most_common[1]
|
||||
total_count = len(hash_counts)
|
||||
percentage = frequency / total_count * 100
|
||||
|
||||
return {
|
||||
'recommendation': recommended_hash_count,
|
||||
'reason': f'Most frequently used: {frequency}/{total_count} times ({percentage:.1f}%)',
|
||||
'statistics': dict(usage_stats),
|
||||
'total_count': total_count
|
||||
}
|
||||
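An illustrative call showing the shape of the recommendation returned above (the sample markdown is made up):

from third_level_service import get_recommended_hash_count_simple

sample = """# 1 Scope
## 1.1 General
### 1.1.1 Terms
### 1.1.2 Definitions
#### A.1.2.3 Annex item
"""

print(get_recommended_hash_count_simple(sample))
# -> {'recommendation': 3, 'reason': 'Most frequently used: 2/3 times (66.7%)',
#     'statistics': {3: 2, 4: 1}, 'total_count': 3}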
334
vw-document-ai-indexer/utils.py
Normal file
334
vw-document-ai-indexer/utils.py
Normal file
@@ -0,0 +1,334 @@
|
||||
import shutil
|
||||
from dataclasses import fields
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
import random
|
||||
from typing import Any, List, Optional, Union
|
||||
import string
|
||||
from PIL import Image
|
||||
import tiktoken
|
||||
from PIL.Image import Resampling
|
||||
|
||||
from entity_models import Document, FigureFlat
|
||||
|
||||
|
||||
class TokenEstimator(object):
|
||||
GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")
|
||||
|
||||
def estimate_tokens(self, text: str) -> int:
|
||||
|
||||
return len(self.GPT2_TOKENIZER.encode(text, allowed_special="all"))
|
||||
|
||||
def construct_tokens_with_size(self, tokens: str, numofTokens: int) -> str:
|
||||
newTokens = self.GPT2_TOKENIZER.decode(
|
||||
self.GPT2_TOKENIZER.encode(tokens, allowed_special="all")[:numofTokens]
|
||||
)
|
||||
return newTokens
|
||||
|
||||
TOKEN_ESTIMATOR = TokenEstimator()
|
||||
|
||||
|
||||
def generate_random_name(length:int=12):
|
||||
# Characters to use: letters and digits
|
||||
characters = string.ascii_letters + string.digits
|
||||
# Randomly select `length` characters
|
||||
folder_name = ''.join(random.choices(characters, k=length))
|
||||
return folder_name
|
||||
|
||||
def asdict_with_dynamic(obj:Any) -> dict[str, Any]:
|
||||
"""Returns a dictionary containing dynamic attributes"""
|
||||
# Use predefined fields as the basis
|
||||
result = {f.name: getattr(obj, f.name) for f in fields(obj)}
|
||||
# Add dynamic attributes
|
||||
all_attrs = dir(obj)
|
||||
predefined_attrs = [f.name for f in fields(obj)]
|
||||
for attr in all_attrs:
|
||||
# Skip special attributes, private attributes, methods, and predefined attributes
|
||||
if (
|
||||
not attr.startswith("__")
|
||||
and not callable(getattr(obj, attr))
|
||||
and attr not in predefined_attrs
|
||||
):
|
||||
result[attr] = getattr(obj, attr)
|
||||
return result
|
||||
|
||||
|
||||
|
||||
def write_log(message: str):
|
||||
"""Write log message (INFO level) to data_preparation logger."""
|
||||
logging.getLogger("data_preparation").info(msg=message)
|
||||
|
||||
def init_current_data_directory(base_path:str) -> str:
|
||||
"""Initialize the current data directory and return its path."""
|
||||
folder_name = generate_random_name(10)
|
||||
if base_path == "":
|
||||
base_path = os.path.expanduser("~")
|
||||
# Create the directory path
|
||||
local_data_folder = os.path.join(base_path , "doc-extractor", folder_name)
|
||||
os.makedirs(local_data_folder, exist_ok=True)
|
||||
return local_data_folder
|
||||
|
||||
def write_content(content: str, directory_path: str, file_name: str):
|
||||
"""Write merged content to a markdown file in the .extracted directory, and optionally upload to blob storage."""
|
||||
output_folder = directory_path + "/.extracted/" + file_name
|
||||
os.makedirs(f"{output_folder}", exist_ok=True)
|
||||
with open(f"{output_folder}/_merged.md", "w", encoding="utf-8") as file:
|
||||
file.write(content)
|
||||
|
||||
print(f"Merged Saved: {output_folder}/_merged.md")
|
||||
|
||||
def write_object(obj: Any, directory_path: str, file_name: str):
|
||||
"""Write a dictionary to a JSON file in the specified directory."""
|
||||
output_folder = directory_path + "/.extracted/" + file_name
|
||||
os.makedirs(f"{output_folder}", exist_ok=True)
|
||||
with open(f"{output_folder}/_merged.json", "w", encoding="utf-8") as file:
|
||||
json.dump(obj, file, indent=4, ensure_ascii=False, default=custom_serializer)
|
||||
print(f"Dict Saved: {output_folder}/_merged.json")
|
||||
|
||||
def write_document(documents: list[Document], file_path: str, directory_path: str, rel_file_path: str):
|
||||
"""Write the parsed document list to a JSON file in the specified directory."""
|
||||
chunks_save = []
|
||||
for chunk_idx, chunk_doc in enumerate(documents):
|
||||
chunk_doc.filepath = rel_file_path
|
||||
chunk_doc.metadata = json.dumps({"chunk_id": str(chunk_idx)})
|
||||
chunk_doc.image_mapping = json.dumps(chunk_doc.image_mapping) if chunk_doc.image_mapping else None
|
||||
chunks_save.append(asdict_with_dynamic(chunk_doc))
|
||||
|
||||
output_folder = directory_path + "/.chunked"
|
||||
os.makedirs(f"{output_folder}", exist_ok=True)
|
||||
with open(f"{output_folder}/{rel_file_path}.json", "w", encoding="utf-8") as file:
|
||||
file.write(json.dumps(chunks_save, indent=4, ensure_ascii=False))
|
||||
print(f"Processed {file_path} to {len(documents)} chunks. Document Schema: {documents[0].document_schema}")
|
||||
print(f"Saved Result: {output_folder}/{rel_file_path}.json")
|
||||
|
||||
|
||||
# Custom serializer function
|
||||
def custom_serializer(obj:Any)->Any:
|
||||
"""Handle types that cannot be serialized by JSON"""
|
||||
if isinstance(obj, datetime):
|
||||
return obj.isoformat() # Convert to ISO 8601 string
|
||||
elif isinstance(obj, Decimal):
|
||||
return float(obj) # Decimal to float
|
||||
elif hasattr(obj, '__dict__'):
|
||||
return obj.__dict__ # Class object to dict
|
||||
else:
|
||||
raise TypeError(f"Type {type(obj)} cannot be JSON serialized")
|
||||
|
||||
|
||||
def keep_latest(data_list: list[dict[str, Any]], id_key: str, timestamp_key: Optional[str] = None) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Advanced method to keep the latest records
|
||||
|
||||
Args:
|
||||
data_list: List of dictionaries containing records
|
||||
id_key: Key to identify the entity
|
||||
timestamp_key: Timestamp key (optional, if not provided, keep the last occurrence)
|
||||
|
||||
Returns:
|
||||
List of the latest records for each entity
|
||||
"""
|
||||
latest_dict = {}
|
||||
|
||||
for idx, record in enumerate(data_list):
|
||||
entity_id = record[id_key]
|
||||
|
||||
# If no timestamp, keep the last occurrence by position
|
||||
if timestamp_key is None or timestamp_key not in record:
|
||||
# Record index to handle same id cases
|
||||
latest_dict[entity_id] = (idx, record)
|
||||
continue
|
||||
|
||||
current_time = record[timestamp_key]
|
||||
|
||||
# If the current record is newer, update
|
||||
if entity_id not in latest_dict or current_time > latest_dict[entity_id][1][timestamp_key]:
|
||||
latest_dict[entity_id] = (idx, record)
|
||||
|
||||
# Sort by original position (optional)
|
||||
return [record for _, record in sorted(latest_dict.values(), key=lambda x: x[0])]
|
||||
|
||||
|
||||
def max_datetime_safe(
|
||||
dt1: Union[datetime, None],
|
||||
dt2: Union[datetime, None]
|
||||
) -> Union[datetime, None]:
|
||||
"""
|
||||
Safely get the maximum of two datetimes, handling None values
|
||||
|
||||
Args:
|
||||
dt1: First datetime (may be None)
|
||||
dt2: Second datetime (may be None)
|
||||
|
||||
Returns:
|
||||
The maximum datetime, or None if both are None
|
||||
"""
|
||||
if dt1 is None:
|
||||
return dt2
|
||||
if dt2 is None:
|
||||
return dt1
|
||||
return max(dt1, dt2)
|
||||
|
||||
|
||||
def min_datetime_safe(
|
||||
dt1: Union[datetime, None],
|
||||
dt2: Union[datetime, None]
|
||||
) -> Union[datetime, None]:
|
||||
"""
|
||||
Safely get the minimum of two datetimes, handling None values
|
||||
|
||||
Rules:
|
||||
- Both datetimes are None → return None
|
||||
- One datetime is None → return the other
|
||||
- Both datetimes are not None → return the smaller one
|
||||
|
||||
Args:
|
||||
dt1: First datetime (may be None)
|
||||
dt2: Second datetime (may be None)
|
||||
|
||||
Returns:
|
||||
The minimum datetime, or None if both are None
|
||||
"""
|
||||
if dt1 is None:
|
||||
return dt2
|
||||
if dt2 is None:
|
||||
return dt1
|
||||
return min(dt1, dt2)
|
||||
|
||||
|
||||
def write_json_to_file(data: list[dict], filename: str):
|
||||
"""Write data to a JSON file."""
|
||||
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
||||
with open(filename, "w", encoding="utf-8") as file:
|
||||
json.dump(data, file, indent=4, ensure_ascii=False, default=custom_serializer)
|
||||
print(f"JSON file saved: {filename}")
|
||||
|
||||
|
||||
def write_grouped_index_files(to_upload_dicts: list[dict[str,Any]],index_name:str, base_directory: str = ""):
|
||||
"""
|
||||
Write to the corresponding json file in the .index directory, grouped by the filepath field in to_upload_dicts
|
||||
|
||||
Args:
|
||||
to_upload_dicts: List of dictionaries to upload
|
||||
base_directory: Basic directory path
|
||||
"""
|
||||
if not to_upload_dicts:
|
||||
print("No data to write.")
|
||||
return
|
||||
|
||||
# Group by filepath field
|
||||
grouped_data = {}
|
||||
for item in to_upload_dicts:
|
||||
filepath = item.get("filepath", "unknown")
|
||||
if filepath not in grouped_data:
|
||||
grouped_data[filepath] = []
|
||||
grouped_data[filepath].append(item)
|
||||
|
||||
# Create .index directory
|
||||
index_dir = os.path.join(base_directory, ".index")
|
||||
os.makedirs(index_dir, exist_ok=True)
|
||||
|
||||
# Create corresponding json files for each filepath
|
||||
for filepath, items in grouped_data.items():
|
||||
# Convert filepath to a safe filename
|
||||
safe_filename = filepath.replace("/", "_").replace("\\", "_").replace(":", "_")
|
||||
if safe_filename.endswith(".pdf"):
|
||||
safe_filename = safe_filename[:-4] # Remove .pdf extension
|
||||
|
||||
json_filename = f"{safe_filename}.{index_name}.json"
|
||||
json_filepath = os.path.join(index_dir, json_filename)
|
||||
|
||||
# Write JSON file
|
||||
with open(json_filepath, "w", encoding="utf-8") as file:
|
||||
json.dump(items, file, indent=4, ensure_ascii=False, default=custom_serializer)
|
||||
|
||||
print(f"Grouped index file saved: {json_filepath} (contains {len(items)} items)")
|
||||
|
||||
print(f"Total {len(grouped_data)} files written to .index directory")
|
||||
|
||||
|
||||
|
||||
def replace_urls_in_content(content:str, replacements: List[FigureFlat])->str:
|
||||
"""
|
||||
Insert URLs from the replacement list into the specified positions in the content
|
||||
|
||||
:param content: Original text content
|
||||
:param replacements: Replacement list, each element contains:
|
||||
- 'url': Image URL
|
||||
- 'offset': Offset in the original content
|
||||
- 'length': Length of the text to be replaced
|
||||
:return: New content with replacements
|
||||
"""
|
||||
if not replacements:
|
||||
return content
|
||||
|
||||
# Sort by offset in descending order (process in reverse order)
|
||||
sorted_replacements = sorted(replacements, key=lambda x: x.offset, reverse=True)
|
||||
|
||||
# List to store text fragments
|
||||
fragments = []
|
||||
current_index = len(content) # Current position (start from the end)
|
||||
|
||||
for item in sorted_replacements:
|
||||
url = f""
|
||||
offset = item.offset
|
||||
length = item.length
|
||||
|
||||
# Check offset validity
|
||||
if offset >= current_index:
|
||||
continue # Skip invalid offset
|
||||
|
||||
# Calculate actual end position for replacement
|
||||
end_pos = min(offset + length, current_index)
|
||||
|
||||
# 1. Add text between current position and end of replacement
|
||||
fragments.append(content[end_pos:current_index])
|
||||
|
||||
# 2. Add URL (replace original content)
|
||||
fragments.append(url)
|
||||
|
||||
# Update current position to start of replacement
|
||||
current_index = offset
|
||||
|
||||
# Add remaining head content
|
||||
fragments.append(content[:current_index])
|
||||
|
||||
# Concatenate fragments in reverse order (since processed backwards)
|
||||
return ''.join(fragments[::-1])
|
||||
|
||||
|
||||
def resize_image(input_path: str, output_path: Optional[str] = None, max_size: int = 10000) -> str:
|
||||
"""Scaling PNG pictures in an equal ratio to ensure that the length and width do not exceed max_size pixels"""
|
||||
with Image.open(input_path) as img:
|
||||
# Calculate the scaling ratio
|
||||
ratio = min(max_size / max(img.size), 1.0)
|
||||
|
||||
if ratio >= 1: # No scaling required
|
||||
return input_path
|
||||
|
||||
# Calculate new dimensions (maintain aspect ratio)
|
||||
new_size = tuple(round(dim * ratio) for dim in img.size)
|
||||
|
||||
# Using high-quality scaling algorithm
|
||||
resized_img = img.resize(new_size, Resampling.LANCZOS)
|
||||
|
||||
# Process the output path
|
||||
if not output_path:
|
||||
filename, ext = os.path.splitext(input_path)
|
||||
output_path = f"{filename}_resized{ext}"
|
||||
|
||||
# Save the zoomed image (preserve PNG features)
|
||||
resized_img.save(output_path, format="PNG", optimize=True)
|
||||
print(f"Images have been scaled:{img.size} → {new_size} | Save to: {output_path}")
|
||||
return output_path
|
||||
|
||||
def file_rename(input_path:str)->str:
|
||||
filename, ext = os.path.splitext(input_path)
|
||||
if ext.lower() == ".doc":
|
||||
new_path = f"{filename}.docx"
|
||||
shutil.copy2(input_path, new_path)
|
||||
print("file renamed to ", new_path)
|
||||
return new_path
|
||||
return input_path
|
||||
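A small illustrative example of the de-duplication and None-safe datetime helpers above (the record data is made up):

from datetime import datetime

from utils import keep_latest, max_datetime_safe

records = [
    {"doc_id": "a", "updated": datetime(2024, 1, 1)},
    {"doc_id": "b", "updated": datetime(2024, 2, 1)},
    {"doc_id": "a", "updated": datetime(2024, 3, 1)},
]
# Keeps the newest record per doc_id: "a" from March and "b" from February.
print(keep_latest(records, id_key="doc_id", timestamp_key="updated"))
# None-safe maximum returns the non-None side.
print(max_datetime_safe(None, datetime(2024, 3, 1)))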
483
vw-document-ai-indexer/vllm_extractor.py
Normal file
483
vw-document-ai-indexer/vllm_extractor.py
Normal file
@@ -0,0 +1,483 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import Any, List, Optional
|
||||
import base64
|
||||
|
||||
from app_config import ApplicationConfig
|
||||
from azure_index_service import get_cloud_api_client
|
||||
from pdf2image import convert_from_path # type: ignore
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from langchain_openai import ChatOpenAI, AzureChatOpenAI
|
||||
from langchain.schema.messages import SystemMessage
|
||||
from langchain_core.messages import AIMessage,HumanMessage,ToolMessage
|
||||
|
||||
from di_extractor import FigureFlat
|
||||
from entity_models import DiResult, Document, UnsupportedFormatError
|
||||
from resilient_http_pool import get_ai_inference_client
|
||||
|
||||
RETRY_COUNT = 3
|
||||
|
||||
def vision_extract(pdf_file_path:str, file_format:str, directory_path:str, vllm_endpoint:str, vllm_key:str) -> List[Document]:
|
||||
if file_format not in ["pdf"]:
|
||||
raise UnsupportedFormatError(f"Unsupported file format: {file_format}")
|
||||
|
||||
source_rel_file_path = os.path.relpath(pdf_file_path, directory_path)
|
||||
|
||||
image_dir = directory_path + "/.images/" + source_rel_file_path
|
||||
|
||||
print(f"Converting to images: {pdf_file_path}")
|
||||
pdf_to_images(pdf_file_path, image_dir)
|
||||
print(f"Converted to images: {pdf_file_path}")
|
||||
|
||||
image_filenames = os.listdir(image_dir)
|
||||
image_filenames.sort()
|
||||
|
||||
rsltDocs: List[Document] = []
|
||||
page_index = 0
|
||||
for image_filename in image_filenames:
|
||||
if image_filename.endswith(".webp"):
|
||||
print(f"extracting: {image_dir}/{image_filename}")
|
||||
image_path = os.path.join(image_dir, image_filename)
|
||||
|
||||
rsltDoc = None
|
||||
if page_index == 0:
|
||||
rsltDoc = extract_from_image(image_path, vllm_endpoint, vllm_key, directory_path, source_rel_file_path, page_index)
|
||||
else:
|
||||
rsltDoc = extract_from_image(image_path, vllm_endpoint, vllm_key, directory_path, source_rel_file_path, page_index, rsltDocs[page_index-1])
|
||||
rsltDocs.append(rsltDoc)
|
||||
page_index = page_index+1
|
||||
|
||||
return rsltDocs
|
||||
|
||||
|
||||
def pdf_to_images(pdf_path, output_folder, dpi=250):
|
||||
|
||||
untrimed_folder = output_folder+"/.untrimed"
|
||||
os.makedirs(untrimed_folder, exist_ok=True)
|
||||
|
||||
# Convert PDF to images
|
||||
convert_from_path(pdf_path, dpi=dpi, output_folder=untrimed_folder,fmt="png", paths_only=True)
|
||||
|
||||
image_filenames = os.listdir(untrimed_folder)
|
||||
image_filenames.sort()
|
||||
# # clear the output folder
|
||||
# for file in os.listdir(output_folder):
|
||||
# os.remove(os.path.join(output_folder, file))
|
||||
|
||||
# Save images to the output folder
|
||||
for i, image_filename in enumerate(image_filenames):
|
||||
# generate index num with fixed width of 6 digits
|
||||
# load image
|
||||
image = Image.open(f"{untrimed_folder}/{image_filename}")
|
||||
trimmed_image = trim_image(image)
|
||||
|
||||
index = str(i + 1).zfill(6)
|
||||
image_path = f"{output_folder}/{index}.webp"
|
||||
trimmed_image.save(image_path, format="WEBP")
|
||||
os.remove(f"{untrimed_folder}/{image_filename}")
|
||||
|
||||
|
||||
|
||||
def trim_image(input_image: Image.Image) -> Image.Image:
|
||||
"""
|
||||
Trim the margins of a scanned document image, ignoring noise and small specks.
|
||||
|
||||
Args:
|
||||
input_image (Image.Image): The input PIL Image object.
|
||||
|
||||
Returns:
|
||||
Image.Image: The cropped PIL Image object.
|
||||
"""
|
||||
# Convert the image to grayscale
|
||||
grayscale_image = input_image.convert("L")
|
||||
|
||||
# Convert grayscale to numpy array
|
||||
image_array = np.array(grayscale_image)
|
||||
|
||||
# Apply a threshold to create a binary image
|
||||
threshold = 240 # Adjust this value if needed
|
||||
binary_image = (image_array < threshold).astype(np.uint8)
|
||||
|
||||
# Find the bounding box of the non-zero regions
|
||||
rows = np.any(binary_image, axis=1)
|
||||
cols = np.any(binary_image, axis=0)
|
||||
|
||||
if not rows.any() or not cols.any():
|
||||
# If the image is completely empty or noise-free, return the original
|
||||
return input_image
|
||||
|
||||
ymin, ymax = np.where(rows)[0][[0, -1]]
|
||||
xmin, xmax = np.where(cols)[0][[0, -1]]
|
||||
|
||||
# Add a small margin (optional, remove if not needed)
|
||||
margin = 10
|
||||
ymin = max(0, ymin - margin)
|
||||
ymax = min(binary_image.shape[0], ymax + margin)
|
||||
xmin = max(0, xmin - margin)
|
||||
xmax = min(binary_image.shape[1], xmax + margin)
|
||||
|
||||
# Crop the image using the calculated bounding box
|
||||
cropped_image = input_image.crop((xmin, ymin, xmax + 1, ymax + 1))
|
||||
|
||||
return cropped_image
|
||||
|
||||
|
||||
|
||||
tips = "- The document is about standard/regulatory for a automobile industry company to refer. So prioritize extracting content about standards/regulatory/compliance carefully"
|
||||
|
||||
# Define the messages for the chat
|
||||
SYS_MSG_Flow_Layout = f"""# Role
|
||||
You are specialized in extracting content from screenshots of document.
|
||||
|
||||
# Rules
|
||||
- You will receive a page screenshot from a multi-page document. Extract content into a structured markdown format.
|
||||
|
||||
- Identify if the page is Table of Contents(目录, 目次) or empty page(after ignoring watermarks)
|
||||
- If yes, just ignore the whole page, and output "[]" only
|
||||
- If no, you should follow below rules to extract content
|
||||
|
||||
- Recognize hierarchical section header, and use appropriate markdown symbols "#" to reflect its hierarchy level.
|
||||
- Detection:
|
||||
- Identify lines of section headers that begin with a hierarchical section numbering part, optionally followed by a text part. The section numbering part contains only numbers, alphabets, and dots, and uses a tiered (multi-level) numbering system. For example: "2.3.17 示例标题", "1 Sample Title", "6.1.2.5", "A.14.8.9 示例标题".
|
||||
- Each section header is just one line, and the section number is at the beginning of the line.
|
||||
- Header Hierarchy Level Mapping:
|
||||
- The section numbering part is a tiered (multi-level) numbering system. Section numbers at each hierarchy level are separated by a dot (.), so the count of separated section numbers reflects the section header's hierarchy level. For example, the header "4.13.2 Sample" should be considered an H3 level.
|
||||
- Use appropriate markdown symbols "#" to reflect section headers's hierarchy levels. **The number of "#" symbols should correspond to the depth of the section level.** For instance:
|
||||
- "1 section Title" should be output as "# 1 section Title"
|
||||
- "2.3.17 section Title" should be output as "### 2.3.17 section Title"
|
||||
- "A.14.8.9 section Title" should be output as "#### A.14.8.9 section Title"
|
||||
- **Table title or picture title should NOT be considered as a section header, even if it is at beginning of the page. Output them as format "[table/picture titles]", for example: "[表 1.2 示例]", "[图5 示例]")**
|
||||
- IMPORTANT: The screenshot is taken from one page of a multi-page document, note that it represents only a single page, not the entire document.**The beginning area of the page may not fall under a section header. Nevertheless, ensure that you still extract content from this area, even if it is not explicitly labeled under a section header.**
|
||||
|
||||
- Embedded Pictures/Graphs/Diagram:
|
||||
- If the embedded picture/graph/diagram is major content and can be understood clearly, describe it as a caption, using format: ``
|
||||
- Otherwise, just use a placeholder: ``
|
||||
|
||||
# Tips
|
||||
- Carefully recognize scientific symbols and formulas, and output them professionally and accurately.
|
||||
- If a table is not a blank template, you should extract using markdown table markup
|
||||
- Accurately recognize the content according to the screenshot, and do not speculate any content.
|
||||
- Ignore any diagonally arranged watermarks present in the document.
|
||||
- The page footer and header can be ignored.
|
||||
{tips}
|
||||
"""
|
||||
|
||||
SYS_MSG_Slides_Layout = f"""# Role
|
||||
You are specialized in extracting content from screenshots of a slides deck like PPT.
|
||||
|
||||
# Rules
|
||||
- You will receive a page screenshot from a multi-page deck. Extract content into a structured markdown format.
|
||||
|
||||
- Recognize title headers from the page and use appropriate markdown symbols "#" to reflect their hierarchy levels. Every page should have one H1 title header.
|
||||
|
||||
- Embedded Pictures/Graphs/Diagrams: If there are embedded pictures/figures, try your best to understand them and describe them in caption paragraphs.
|
||||
|
||||
# Tips
|
||||
- Carefully recognize scientific symbols and formulas, and output them professionally and accurately.
|
||||
- If a table is not a blank template, you should extract using markdown table markup
|
||||
- Accurately recognize the content according to the screenshot, and do not speculate any content.
|
||||
- Ignore any diagonally arranged watermarks present in the document. Identify if the page is empty after ignoring watermarks. If yes, just ignore this page, and output "[]" only
|
||||
{tips}
|
||||
"""
|
||||
|
||||
SYS_MSG_Cover = f"""# Role
|
||||
You are specialized in extracting content from screenshots of document.
|
||||
|
||||
# Rules
|
||||
- You will receive the cover page from a multi-pages document. Extract content into a structured JSON format.
|
||||
|
||||
- Recognize what type of Document Schema it is, there are the two below types of document layout schema:
|
||||
- flow: Like a page of Office Words document, mainly in flow document layout.
|
||||
- slides: Like a page of Office PowerPoint document, mainly in a presenting slide layout.
|
||||
- other: Does not look like either of the above document layout schema types
|
||||
- The cover page may contain the following information: main_title, sub_title, publisher, publised_date, document_code, document_category.
|
||||
- Detect the primary and secondary language of the document. Use language code as their values. The default primary language is `zh-Hans`. If there are titles in secondary language, they should also be included as well.
|
||||
- Whole page should be extracted as markdown string and stored in the `whole_page` field.
|
||||
- The output JSON schema:
|
||||
- document_schema
|
||||
- main_title
|
||||
- sub_title
|
||||
- publisher
|
||||
- publised_date
|
||||
- document_code
|
||||
- document_category
|
||||
- main_title_sec_language
|
||||
- sub_title_sec_language
|
||||
- primary_language
|
||||
- secondary_language
|
||||
- whole_page
|
||||
|
||||
# Tips
|
||||
- Accurately recognize the text content according to the screenshot, and do not speculate any content.
|
||||
- Ignore any diagonally arranged watermarks present in the document.
|
||||
- Don't use horizontal dividers ("---") or similar markdown syntax to separate the content.
|
||||
{tips}
|
||||
"""
|
||||
USER_MSG = """# task
|
||||
Recognize screenshot of this document cover page, return the result
|
||||
"""
|
||||
|
||||
def extract_from_image(image_path, vllm_endpoint, vllm_key, directory_path, source_rel_file_path, page_index, pre_document: Optional[Document] = None) -> Document:
|
||||
encoded_image = base64.b64encode(open(image_path, 'rb').read()).decode('ascii')
|
||||
file_ext = image_path.split(".")[-1]
|
||||
|
||||
system_msg = ""
|
||||
if page_index==0:
|
||||
system_msg = SYS_MSG_Cover
|
||||
else:
|
||||
if pre_document.document_schema == "flow":
|
||||
system_msg = SYS_MSG_Flow_Layout
|
||||
elif pre_document.document_schema == "slides":
|
||||
system_msg = SYS_MSG_Slides_Layout
|
||||
else:
|
||||
raise ValueError(f"schema = {pre_document.document_schema}, not supported")
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"api-key": vllm_key,
|
||||
}
|
||||
|
||||
payload = {
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": system_msg
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": USER_MSG
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/{file_ext};base64,{encoded_image}"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"temperature": 0
|
||||
}
|
||||
|
||||
response = None
|
||||
for i in range(RETRY_COUNT):
|
||||
try:
|
||||
client = get_ai_inference_client()
|
||||
response = client.post(vllm_endpoint, headers=headers, json=payload, timeout=180)
|
||||
response.raise_for_status() # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
|
||||
break
|
||||
except Exception as e:
|
||||
print(f"Error extract_from_image {image_path} with error={e}, retrying, current at {i + 1} retry, {RETRY_COUNT - (i + 1)} retries left")
|
||||
time.sleep(15)
|
||||
|
||||
if response is None:
    raise Exception(f"Error extract_from_image {image_path}: no response after {RETRY_COUNT} retries")

rslt = None
|
||||
if response and response.status_code != 200:
|
||||
if response.status_code == 400:
|
||||
try:
|
||||
rsltObj = response.json()
|
||||
if rsltObj["error"]["inner_error"]["code"] == "ResponsibleAIPolicyViolation":
|
||||
rslt = "[]"
|
||||
print(f"Ignored: {image_path}. Error extract_from_image with status_code={response.status_code}\n {response.text}")
|
||||
|
||||
except Exception:
|
||||
raise Exception(f"Error extract_from_image {image_path} with status_code={response.status_code}\n {response.text}")
|
||||
else:
|
||||
raise Exception(f"Error extract_from_image {image_path} with status_code={response.status_code}\n {response.text}")
|
||||
|
||||
if rslt is None and response:
|
||||
rslt = response.json()["choices"][0]["message"]["content"]
|
||||
# img_tag = image_content_to_tag(caption)
|
||||
# mapping = {img_tag: f"data:image/{file_ext};base64,{encoded_image}"}
|
||||
|
||||
# if rslt starts with ```markdown
|
||||
if rslt.startswith("```"):
|
||||
# remove the first line and the last line
|
||||
rslt = rslt.split("\n")[1:-1]
|
||||
rslt = "\n".join(rslt)
|
||||
|
||||
## add a page number at the first line of the result text
|
||||
# rslt = f"[Page {image_filename.replace('page_', '').replace('.png', '')}]\n\n{rslt}\n\n\n\n"
|
||||
|
||||
page_index_output = str(page_index + 1).zfill(6)
|
||||
output_folder = directory_path + "/.extracted/" + source_rel_file_path
|
||||
os.makedirs(f"{output_folder}", exist_ok=True)
|
||||
|
||||
document = None
|
||||
if page_index==0:
|
||||
with open(f"{output_folder}/{page_index_output}.json", "w") as file:
|
||||
file.write(rslt)
|
||||
|
||||
rsltObj = json.loads(rslt)
|
||||
document_schema = rsltObj.get("document_schema", "flow").lower()
|
||||
if document_schema == "other":
|
||||
document_schema = "flow"
|
||||
document = Document(
|
||||
document_schema = document_schema,
|
||||
main_title = rsltObj.get("main_title", "") or "",
|
||||
sub_title = rsltObj.get("sub_title", "") or "",
|
||||
publisher = rsltObj.get("publisher", "") or "",
|
||||
document_code = rsltObj.get("document_code", "") or "",
|
||||
document_category = rsltObj.get("document_category", "") or "",
|
||||
main_title_sec_language = rsltObj.get("main_title_sec_language", "") or "",
|
||||
sub_title_sec_language = rsltObj.get("sub_title_sec_language", "") or "",
|
||||
primary_language= rsltObj.get("primary_language", ""),
|
||||
secondary_language= rsltObj.get("secondary_language", ""),
|
||||
)
|
||||
if document.sub_title != "":
|
||||
document.title = f"{document.main_title}-{document.sub_title}"
|
||||
else:
|
||||
document.title = document.main_title
|
||||
document.doc_metadata = f"{document.main_title}, {document.sub_title}, {document.document_code}, {document.main_title_sec_language}, {document.sub_title_sec_language}"
|
||||
document.filepath = source_rel_file_path
|
||||
|
||||
document.content = rsltObj.get("whole_page", "")
|
||||
|
||||
else:
|
||||
with open(f"{output_folder}/{page_index_output}.md", "w") as file:
|
||||
file.write(rslt)
|
||||
|
||||
document = Document(
|
||||
document_schema = pre_document.document_schema,
|
||||
main_title = pre_document.main_title,
|
||||
sub_title = pre_document.sub_title,
|
||||
publisher = pre_document.publisher,
|
||||
document_code = pre_document.document_code,
|
||||
document_category = pre_document.document_category,
|
||||
main_title_sec_language = pre_document.main_title_sec_language,
|
||||
sub_title_sec_language = pre_document.sub_title_sec_language,
|
||||
primary_language= pre_document.primary_language,
|
||||
secondary_language= pre_document.secondary_language,
|
||||
title = pre_document.title,
|
||||
doc_metadata = pre_document.doc_metadata,
|
||||
filepath = pre_document.filepath,
|
||||
)
|
||||
|
||||
document.content = rslt
|
||||
|
||||
return document
|
||||
|
||||
|
||||
|
||||
def understand_with_langchain(image: bytes, mime_type: str, captioning_model_endpoint: str, captioning_model_key: str, model: str | None, azure_deployment: str | None = None, api_version: str | None = None, language: str | None = None, prompts: Optional[dict[str, Any]] = None):
|
||||
"""
|
||||
Use LangChain to automatically adapt to various model platforms for image understanding
|
||||
Supports OpenAI, Azure OpenAI, Tongyi Qianwen, Bailian and other platforms
|
||||
"""
|
||||
|
||||
# Select prompt words based on language and description type
|
||||
lang_key = "zh-Hans" if language == "zh-Hans" else "en"
|
||||
|
||||
if prompts is None or len(prompts) == 0:
|
||||
prompts = {
|
||||
"zh-Hans": { "system": "您是一个帮助用户寻找描述性字幕的字幕模型。", "user": "描述此图像就像您将其描述给看不见的人一样。" },
|
||||
"en": { "system": "You are a captioning model that helps uses find descriptive captions.", "user": "Describe this image as if you were describing it to someone who can't see it." }
|
||||
}
|
||||
|
||||
if lang_key in prompts.keys():
|
||||
prompt = prompts[lang_key]
|
||||
elif "en" in prompts.keys() :
|
||||
prompt = prompts["en"]
|
||||
else:
|
||||
prompt = prompts[next(iter(prompts))]
|
||||
|
||||
|
||||
|
||||
# Encoded images
|
||||
encoded_image = base64.b64encode(image).decode('utf-8')
|
||||
image_url = f"data:image/{mime_type};base64,{encoded_image}"
|
||||
|
||||
http_client = get_cloud_api_client()
|
||||
# Determine the model type from the endpoint and initialize the corresponding LangChain client
|
||||
llm:Any=None
|
||||
for i in range(RETRY_COUNT):
|
||||
try:
|
||||
if "openai.azure" in captioning_model_endpoint:
|
||||
llm = AzureChatOpenAI(azure_deployment=azure_deployment,api_key=captioning_model_key, azure_endpoint=captioning_model_endpoint,api_version=api_version, temperature=0, http_client=http_client)
|
||||
else:
|
||||
llm = ChatOpenAI(base_url=captioning_model_endpoint, api_key=captioning_model_key, model=model, temperature=0, http_client=http_client)
|
||||
|
||||
# Build the message
|
||||
messages = [
|
||||
SystemMessage(content=prompt["system"]),
|
||||
HumanMessage(content=[{"type": "text", "text": prompt["user"]}, {"type": "image_url", "image_url": {"url": image_url}} ])
|
||||
]
|
||||
|
||||
# Invoke the model
|
||||
response = llm.invoke(messages)
|
||||
caption = response.content
|
||||
return caption
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting caption with langchain (attempt {i+1}/{RETRY_COUNT}): {e}")
|
||||
if i < RETRY_COUNT - 1:
|
||||
time.sleep(5)
|
||||
else:
|
||||
# The last attempt failed
|
||||
raise Exception(f"Failed to get caption after {RETRY_COUNT} attempts: {e}")
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
|
||||
|
||||
def process_document_figures(di_result:DiResult|None=None,config:ApplicationConfig|None=None) -> DiResult:
|
||||
"""
|
||||
Perform figure fusion on the extracted document content.
|
||||
"""
|
||||
# Implement figure fusion logic here
|
||||
|
||||
if di_result is None:
|
||||
raise Exception("di_result cannot be None")
|
||||
|
||||
if config is None:
|
||||
raise ValueError("config is None")
|
||||
|
||||
description_gen_max_images: int = config.caption.description_gen_max_images
|
||||
vllm_endpoint:str = config.caption.model_endpoint
|
||||
vllm_key:str = config.caption.model_key
|
||||
captioning_model:str = config.caption.model
|
||||
api_version:str = config.caption.api_version
|
||||
azure_deployment:str = config.caption.azure_deployment
|
||||
include_di_content: bool = config.caption.include_di_content
|
||||
|
||||
figures = di_result.figures or []
|
||||
processed_figures:List[FigureFlat] = []
|
||||
content:str = di_result.di_content
|
||||
len_figures:int = len(figures)
|
||||
for figure in figures:
|
||||
figure_content:str= content[figure.offset:figure.offset + figure.length]
|
||||
if not figure_content.lstrip().startswith("<figure>"):
|
||||
continue
|
||||
image_bytes = base64.b64decode(figure.image)
|
||||
language = di_result.language
|
||||
|
||||
# Image content generation
|
||||
vision_content:str = ""
|
||||
if figure.understand_flag:
|
||||
vision_content = figure.content
|
||||
elif include_di_content:
|
||||
if len_figures < description_gen_max_images:
|
||||
vision_content = understand_with_langchain(image=image_bytes, mime_type="png", captioning_model_endpoint=vllm_endpoint, captioning_model_key=vllm_key, model=captioning_model,azure_deployment=azure_deployment,api_version=api_version, language=language, prompts=config.caption.prompts)
|
||||
figure.understand_flag = True
|
||||
else:
|
||||
vision_content = content[figure.offset:figure.offset + figure.length].strip().removeprefix("<figure>").removesuffix("</figure>").strip()
|
||||
|
||||
vision_content = ' '.join(line.strip() for line in vision_content.splitlines())
|
||||
vision_content = f"<figcaption>{figure.caption}</figcaption>" + vision_content
|
||||
|
||||
if not include_di_content and figure.caption and len(figure.caption)>0:
|
||||
vision_content = f"<figcaption>{figure.caption}</figcaption>"
|
||||
|
||||
figure.content = vision_content
|
||||
processed_figures.append(figure)
|
||||
return di_result
|
||||
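An illustrative driver for the extractor above; the paths, endpoint, and key are placeholders:

from vllm_extractor import vision_extract

docs = vision_extract(
    pdf_file_path="/data/docs/sample/spec.pdf",  # placeholder path
    file_format="pdf",
    directory_path="/data/docs",  # page images land under /data/docs/.images/...
    vllm_endpoint="https://vllm.example.internal/v1/chat/completions",  # placeholder endpoint
    vllm_key="<api-key>",  # placeholder key
)
print(f"Extracted {len(docs)} pages; cover schema: {docs[0].document_schema if docs else 'n/a'}")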