This commit is contained in:
2025-09-26 17:15:54 +08:00
commit db0e5965ec
211 changed files with 40437 additions and 0 deletions

View File

@@ -0,0 +1,103 @@
# Datasource: CAT standard/regulation documents, production landing container.
# Feeds three search indexes: chunk, document and standard-regulation.
# NOTE(review): nesting was reconstructed from key order because the flattened
# text does not parse as YAML. Confirm against the loader's schema, in
# particular that `merge_fields` belongs to the datasource and not to the
# last index schema.
# NOTE(review): data_path embeds a read/list SAS token valid until 2125.
# Prefer injecting it from a secret store and rotating the committed token.
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-prd?sp=rl&st=2025-08-02T08:25:56Z&se=2125-08-02T16:40:56Z&spr=https&sv=2024-11-04&sr=c&sig=lJui2%2BOs8V%2BdzCkjchQCR7ITWT28tJ0HAq8bIhkkM%2Bk%3D"
  datasource_name: "cat-standard-regulation-prd"
  data_dir: ""
  # Same path the Pod/CronJob manifests mount the Azure File share on.
  base_path: "/app/run_tmp"
  process_file_num: 0
  process_file_last_modify: "2025-06-24 00:00:00"
  chunk_size: 2048
  token_overlap: 256
  index_schemas:
    # Chunk-level index: one record per chunk, updated by filepath.
    - index_name: "index-catonline-chunk-v2-prd"
      data_type: ["metadata", "document", "chunk"]
      # field_type: "append"
      upload_batch_size: 50
      fields: ["doc_metadata", "full_metadata_vector", "filepath", "timestamp", "title",
        "publisher", "publish_date", "document_category", "document_code", "language_code",
        "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN",
        "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status",
        "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code",
        "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type",
        "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date",
        "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN",
        "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date",
        "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by",
        "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN",
        "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN", "x_Replaced_Standard"]
      full_metadata_vector_fields: ["full_headers", "doc_metadata"]  # TODO: check
      semantic_config_name: "default"
      vector_config_name: "vectorSearchProfile"
      update_by_field: "filepath"
      vector_fields:
        - field: "contentVector"
          append_fields: ["content"]
        - field: "full_metadata_vector"
          append_fields: ["full_headers", "doc_metadata"]
    # Document-level index: one record per file, keyed by filepath.
    - index_name: "index-catonline-document-v2-prd"
      data_type: ["document", "metadata"]
      # field_type: "full"
      key_fields: ["filepath"]
      upload_batch_size: 1
      fields: ["doc_metadata", "full_metadata_vector", "url", "metadata", "image_mapping",
        "document_schema", "main_title", "filepath", "timestamp", "title",
        "publisher", "publish_date", "document_category", "document_code", "language_code",
        "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN",
        "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status",
        "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code",
        "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type",
        "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date",
        "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN",
        "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date",
        "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by",
        "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN",
        "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN", "x_Replaced_Standard"]
      merge_content_fields: ["content"]
      full_metadata_vector_fields: ["doc_metadata"]
      semantic_config_name: "default"
      vector_config_name: "vectorSearchProfile"
      update_by_field: "filepath"
      vector_fields:
        - field: "full_metadata_vector"
          append_fields: ["doc_metadata"]
    # Standard/regulation index: metadata only, keyed by x_Standard_Regulation_Id.
    - index_name: "index-catonline-standard-regulation-v2-prd"
      data_type: ["metadata"]
      # field_type: "full"
      key_fields: ["x_Standard_Regulation_Id"]
      upload_batch_size: 1
      fields: ["doc_metadata", "full_metadata_vector", "filepath", "timestamp", "title",
        "publisher", "publish_date", "document_category", "document_code", "language_code",
        "x_Standard_Regulation_Id", "x_Attachment_Type", "x_Standard_Title_CN",
        "x_Standard_Title_EN", "x_Standard_Published_State", "x_Standard_Drafting_Status",
        "x_Standard_Range", "x_Standard_Kind", "x_Standard_No", "x_Standard_Code",
        "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type",
        "x_Standard_CCS", "x_Standard_ICS", "x_Standard_Published_Date",
        "x_Standard_Effective_Date", "x_Regulation_Status", "x_Regulation_Title_CN",
        "x_Regulation_Title_EN", "x_Regulation_Document_No", "x_Regulation_Issued_Date",
        "x_Classification", "x_Work_Group", "x_Reference_Standard", "x_Replaced_by",
        "x_Refer_To", "func_uuid", "update_time", "status", "x_Standard_Published_State_EN",
        "x_Standard_Drafting_Status_EN", "x_Regulation_Status_EN", "x_Replaced_Standard"]
      vector_config_name: "vectorSearchProfile"
      full_metadata_vector_fields: ["doc_metadata"]
      semantic_config_name: "default"
      update_by_field: "x_Standard_Regulation_Id"
      vector_fields:
        - field: "full_metadata_vector"
          append_fields: ["doc_metadata"]
  # Source fields listed under the `doc_metadata` key, which the index schemas
  # above reference. NOTE(review): presumably these are concatenated into one
  # field; confirm against the indexer.
  merge_fields:
    - key: "doc_metadata"
      fields: ["title", "publisher", "document_category", "document_code",
        "x_Attachment_Type", "x_Standard_Title_CN", "x_Standard_Title_EN", "x_Standard_Kind",
        "x_Standard_Technical_Committee", "x_Standard_Vehicle_Type", "x_Standard_Power_Type",
        "x_Standard_CCS", "x_Standard_ICS", "x_Regulation_Status", "x_Regulation_Title_CN",
        "x_Regulation_Title_EN", "x_Classification", "x_Work_Group", "status",
        "x_Standard_Published_State_EN", "x_Standard_Drafting_Status_EN",
        "x_Regulation_Status_EN", "x_Replaced_Standard"]
# Datasource: overseas CAT standard/regulation documents.
# Feeds three search indexes: chunk, document and standard-regulation.
# NOTE(review): nesting was reconstructed from key order because the flattened
# text does not parse as YAML. Confirm against the loader's schema, in
# particular that `merge_fields` belongs to the datasource and not to the
# last index schema.
# NOTE(review): data_path embeds a SAS token valid until 2099. Unlike the
# domestic datasource's token it carries no `spr=https` restriction. Prefer
# injecting it from a secret store and rotating the committed token.
# NOTE(review): unlike the domestic datasource, the `fields` lists here omit
# "doc_metadata" and "full_metadata_vector", although vector_fields and
# merge_fields still reference them. Confirm this is intentional.
- data_path: "https://sasales2caiprd.blob.core.chinacloudapi.cn/doc-landing-cat-abroad-prd?sp=rl&st=2025-09-08T05:32:13Z&se=2099-09-08T13:47:13Z&sv=2024-11-04&sr=c&sig=ebYoiKrSwCk12cRnQqov197LvuBv7m%2FxNoQv4VDMY5o%3D"
  datasource_name: "cat-standard-regulation-oversea"
  data_dir: ""
  # Same path the Pod/CronJob manifests mount the Azure File share on.
  base_path: "/app/run_tmp"
  process_file_num: 0
  process_file_last_modify: "2025-06-24 00:00:00"
  chunk_size: 2048
  token_overlap: 256
  index_schemas:
    # Chunk-level index: one record per chunk, updated by filepath.
    - index_name: "index-catonline-chunk-oversea"
      data_type: ["metadata", "document", "chunk"]
      upload_batch_size: 50
      fields: ["filepath", "timestamp", "file_Name", "file_Url", "file_Type", "entity_Attribute",
        "standard_Id", "standard_Code", "standard_Title_Cn", "standard_Title_En", "domain_Name",
        "standard_State_Name", "type_Name", "publish_Date", "draft_type_name",
        "qc_Abroad_Professional_Fields", "applicable_Models", "standard_Type_Name",
        "technical_Field_Name", "create_Time", "update_Time", "version_Id", "version_Name",
        "version_Parent_id", "version_Parent_Name", "technical_Series_No", "implementation_Date",
        "version_Publish_Date", "newFlag_State", "publish_Status_Text",
        "implementation_Status_Text", "new_Car_Implementation_Status_Text",
        "production_Car_Implementation_Status_Text", "production_Car_Implementation_Str"]
      full_metadata_vector_fields: ["full_headers", "doc_metadata"]
      semantic_config_name: "default"
      vector_config_name: "vectorSearchProfile"
      update_by_field: "filepath"
      vector_fields:
        - field: "contentVector"
          append_fields: ["content"]
        - field: "full_metadata_vector"
          append_fields: ["full_headers", "doc_metadata"]
    # Document-level index: one record per file, keyed by filepath.
    - index_name: "index-catonline-document-oversea"
      data_type: ["document", "metadata"]
      key_fields: ["filepath"]
      upload_batch_size: 1
      fields: ["filepath", "timestamp", "file_Name", "file_Url", "file_Type", "entity_Attribute",
        "standard_Id", "standard_Code", "standard_Title_Cn", "standard_Title_En", "domain_Name",
        "standard_State_Name", "type_Name", "publish_Date", "draft_type_name",
        "qc_Abroad_Professional_Fields", "applicable_Models", "standard_Type_Name",
        "technical_Field_Name", "create_Time", "update_Time", "version_Id", "version_Name",
        "version_Parent_id", "version_Parent_Name", "technical_Series_No", "implementation_Date",
        "version_Publish_Date", "newFlag_State", "publish_Status_Text",
        "implementation_Status_Text", "new_Car_Implementation_Status_Text",
        "production_Car_Implementation_Status_Text", "production_Car_Implementation_Str"]
      merge_content_fields: ["content"]
      full_metadata_vector_fields: ["doc_metadata"]
      semantic_config_name: "default"
      vector_config_name: "vectorSearchProfile"
      update_by_field: "filepath"
      vector_fields:
        - field: "full_metadata_vector"
          append_fields: ["doc_metadata"]
    # Standard/regulation index: metadata only, keyed by standard_Id.
    - index_name: "index-catonline-standard-regulation-oversea"
      data_type: ["metadata"]
      key_fields: ["standard_Id"]
      upload_batch_size: 1
      fields: ["filepath", "timestamp", "file_Name", "file_Url", "file_Type", "entity_Attribute",
        "standard_Id", "standard_Code", "standard_Title_Cn", "standard_Title_En", "domain_Name",
        "standard_State_Name", "type_Name", "publish_Date", "draft_type_name",
        "qc_Abroad_Professional_Fields", "applicable_Models", "standard_Type_Name",
        "technical_Field_Name", "create_Time", "update_Time", "version_Id", "version_Name",
        "version_Parent_id", "version_Parent_Name", "technical_Series_No", "implementation_Date",
        "version_Publish_Date", "newFlag_State", "publish_Status_Text",
        "implementation_Status_Text", "new_Car_Implementation_Status_Text",
        "production_Car_Implementation_Status_Text", "production_Car_Implementation_Str"]
      vector_config_name: "vectorSearchProfile"
      full_metadata_vector_fields: ["doc_metadata"]
      semantic_config_name: "default"
      update_by_field: "standard_Id"
      vector_fields:
        - field: "full_metadata_vector"
          append_fields: ["doc_metadata"]
  # Source fields listed under the `doc_metadata` key, which the index schemas
  # above reference. NOTE(review): presumably these are concatenated into one
  # field; confirm against the indexer.
  # The last entry previously ended in a stray space inside the quotes, so it
  # could not match the "production_Car_Implementation_Str" field declared above.
  merge_fields:
    - key: "doc_metadata"
      fields: ["file_Name", "entity_Attribute", "standard_Code", "standard_Title_Cn",
        "standard_Title_En", "domain_Name", "standard_State_Name", "type_Name",
        "draft_type_name", "qc_Abroad_Professional_Fields", "applicable_Models",
        "standard_Type_Name", "technical_Field_Name", "version_Name", "version_Parent_Name",
        "technical_Series_No", "newFlag_State", "publish_Status_Text",
        "implementation_Status_Text", "new_Car_Implementation_Status_Text",
        "production_Car_Implementation_Status_Text", "production_Car_Implementation_Str"]

View File

@@ -0,0 +1,50 @@
# Build the indexer image and push it to the production container registry.
# NOTE(review): the tag built here (2.0.1) differs from the tags pinned in the
# Pod manifest (2.0.2) and the CronJob manifest (2.0.4); bump them together.
# Without -p, docker prompts for the password, which keeps it out of the
# shell history and the process list.
docker login acrsales2caiprd.azurecr.cn -u username
docker build . -t document-ai-indexer:2.0.1
docker tag document-ai-indexer:2.0.1 acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1
docker push acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.1

# Log in to AKS (one-time setup, run manually when the kube context is missing).
# The registry and endpoints are *.cn, so the CLI must target Azure China.
# NOTE(review): the subscription and resource group below differ from those in
# the embedding-api-proxy deploy script for the same cluster name; confirm.
# az cloud set --name AzureChinaCloud   # Switch the CLI to the Azure China cloud
# az login                              # Log in to the Azure China account (browser or device code flow)
# az account set -s 079d8bd8-b4cc-4892-9307-aa6dedf890e9   # Select the subscription
# az aks get-credentials -g rg-aiflow-lab -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config
kubectl config use-context aks-sales2c-ai-prd
kubectl config current-context

# Create the Secret used by the Azure Files volume (one-time setup).
# kubectl create secret generic azure-files-cred \
#   --from-literal=azurestorageaccountname=saaisearchlab \
#   --from-literal=azurestorageaccountkey=xxxxxxxxxxxxxxxxxxxx \
#   -n knowledge-agent

# Deploy the ConfigMap (recreated from the local files on every deploy).
# --ignore-not-found keeps the first deployment from failing when the
# ConfigMap does not exist yet. Forward slashes work in both POSIX shells and
# Windows shells, and match the CronJob path used further down.
kubectl delete configmap document-ai-indexer-config -n knowledge-agent --ignore-not-found
kubectl create configmap document-ai-indexer-config -n knowledge-agent --from-file=./deploy/prd/env.yaml --from-file=./deploy/prd/config.yaml --from-file=prompt.yaml

# Deploy the Pod
# kubectl create namespace knowledge-agent
# kubectl delete pod document-ai-indexer -n knowledge-agent
kubectl apply -f document-ai-indexer_k8s.yml -n knowledge-agent

# Monitor the Pod
kubectl logs -f document-ai-indexer -n knowledge-agent

# Deploy the CronJob
kubectl apply -f ./deploy/prd/document-ai-indexer-cronjob.yml --namespace knowledge-agent

# Check CronJob status
kubectl get cronjobs -n knowledge-agent

# Check Job execution history
kubectl get jobs -n knowledge-agent

###########
# Manually trigger a job (for testing).
# --ignore-not-found lets the first manual run proceed when no earlier
# manual-test job exists.
kubectl delete job manual-test -n knowledge-agent --ignore-not-found
kubectl create job --from=cronjob/document-ai-indexer-cronjob manual-test -n knowledge-agent

# Check Job logs
kubectl logs -f job/manual-test -n knowledge-agent

View File

@@ -0,0 +1,69 @@
# CronJob that runs the document-ai-indexer container on a fixed schedule.
# Configuration comes from the document-ai-indexer-config ConfigMap and working
# data lives on an Azure File share mounted at /app/run_tmp.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: document-ai-indexer-cronjob
spec:
  # Scheduling configuration - execute every 10 minutes
  schedule: "*/10 * * * *"
  # Concurrency policy: disable concurrent execution. If the previous job is
  # still running, the new execution is skipped.
  concurrencyPolicy: Forbid
  # Successful jobs history limit: keep the last 10 successful job records.
  successfulJobsHistoryLimit: 10
  # Failed jobs history limit: keep the last 10 failed job records.
  failedJobsHistoryLimit: 10
  # Job template
  jobTemplate:
    spec:
      # No retries: together with restartPolicy Never, a failed run is left
      # failed and the next scheduled run starts from scratch.
      backoffLimit: 0
      template:
        metadata:
          labels:
            app: document-ai-indexer
            job-type: cronjob
        spec:
          restartPolicy: Never
          volumes:
            # 1. ConfigMap volume
            - name: config-volume
              configMap:
                name: document-ai-indexer-config
                items:
                  - key: env.yaml
                    path: env.yaml
                  - key: config.yaml
                    path: config.yaml
                  - key: prompt.yaml
                    path: prompt.yaml
            # 2. Azure File Share volume
            - name: data-volume
              azureFile:
                secretName: azure-files-cred  # Secret holding the storage account credentials
                shareName: fs-document-ai-indexer  # File share name
                readOnly: false  # The program writes to this share
          containers:
            - name: document-ai-indexer
              image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.4
              imagePullPolicy: Always
              # Mount the volumes into the container
              volumeMounts:
                # ConfigMap mounts: subPath places each file individually under
                # /app instead of replacing the whole directory.
                - name: config-volume
                  mountPath: /app/env.yaml
                  subPath: env.yaml
                - name: config-volume
                  mountPath: /app/config.yaml
                  subPath: config.yaml
                - name: config-volume
                  mountPath: /app/prompt.yaml
                  subPath: prompt.yaml
                # Azure File share mount
                - name: data-volume
                  mountPath: /app/run_tmp  # Program read/write directory

View File

@@ -0,0 +1,47 @@
# One-off Pod that runs the document-ai-indexer container once; restartPolicy
# Never means it is not restarted after it exits.
# Configuration comes from the document-ai-indexer-config ConfigMap and working
# data lives on an Azure File share mounted at /app/run_tmp.
apiVersion: v1
kind: Pod
metadata:
  name: document-ai-indexer
spec:
  restartPolicy: Never
  volumes:
    # 1. ConfigMap volume
    - name: config-volume
      configMap:
        name: document-ai-indexer-config
        items:
          - key: env.yaml
            path: env.yaml
          - key: config.yaml
            path: config.yaml
          - key: prompt.yaml
            path: prompt.yaml
    # 2. Azure File Share volume
    - name: data-volume
      azureFile:
        secretName: azure-files-cred  # Secret holding the storage account credentials
        shareName: fs-document-ai-indexer  # File share name
        readOnly: false  # The program writes to this share
  containers:
    - name: document-ai-indexer
      # NOTE(review): the CronJob manifest in this commit pins tag 2.0.4 and
      # the deploy script builds 2.0.1; confirm which tag this Pod should run.
      image: acrsales2caiprd.azurecr.cn/document-ai-indexer:2.0.2
      imagePullPolicy: Always
      # Mount the volumes into the container
      volumeMounts:
        # ConfigMap mounts: subPath places each file individually under /app
        # instead of replacing the whole directory.
        - name: config-volume
          mountPath: /app/env.yaml
          subPath: env.yaml
        - name: config-volume
          mountPath: /app/config.yaml
          subPath: config.yaml
        - name: config-volume
          mountPath: /app/prompt.yaml
          subPath: prompt.yaml
        # Azure File share mount
        - name: data-volume
          mountPath: /app/run_tmp  # Directory for program read/write

View File

@@ -0,0 +1,10 @@
# Deploy the embedding API proxy (Service + Ingress) to the production cluster.
#
# One-time AKS login, kept commented out; run manually when the kube context
# is missing:
# az cloud set -n AzureChinaCloud
# az login
# az account set -s 36646bff-fbd2-4767-b27b-2fe786b5b15c
# az aks get-credentials -g rg-sales2c-ai-service -n aks-sales2c-ai-prd --overwrite-existing --file ~/.kube/config

# Select the production cluster context and print it as a sanity check.
kubectl config use-context aks-sales2c-ai-prd
kubectl config current-context

# The namespace must already exist; create it once if needed:
# kubectl create namespace knowledge-agent

# Apply the proxy manifest into the knowledge-agent namespace.
kubectl apply --filename embedding-api-proxy_k8s.yml --namespace knowledge-agent

View File

@@ -0,0 +1,39 @@
# Service resource: maps the external domain name to an in-cluster Service.
apiVersion: v1
kind: Service
metadata:
  name: itpai-backend
spec:
  type: ExternalName
  externalName: itpai.infer.api.vgcserv.com.cn
  ports:
    - port: 443
      protocol: TCP
      targetPort: 443
---
# Ingress resource: proxies the /v1-openai path to the Service above.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: itpai-proxy
  annotations:
    # NOTE(review): this annotation is deprecated in favour of
    # spec.ingressClassName. Left as-is because switching requires a matching
    # IngressClass object in the cluster.
    kubernetes.io/ingress.class: nginx
    # Talk HTTPS to the upstream and send its hostname as SNI and Host header.
    nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
    nginx.ingress.kubernetes.io/proxy-ssl-server-name: "on"
    # NOTE(review): upstream certificate verification is disabled; confirm
    # this is acceptable for a production endpoint.
    nginx.ingress.kubernetes.io/proxy-ssl-verify: "off"
    nginx.ingress.kubernetes.io/upstream-vhost: "itpai.infer.api.vgcserv.com.cn"
    # Timeouts, in seconds.
    nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "30"
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "5"
spec:
  rules:
    - host: sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn
      http:
        paths:
          # No rewrite annotation is set, so the upstream receives the
          # /v1-openai prefix unchanged.
          - path: /v1-openai
            pathType: Prefix
            backend:
              service:
                name: itpai-backend
                port:
                  number: 443

View File

@@ -0,0 +1,36 @@
# Runtime settings for document-ai-indexer (production).
# NOTE(review): this file holds live credentials: the search admin key, the
# embedding API key, the Document Intelligence key, two SAS tokens and the
# database password. The deploy script ships it through a ConfigMap. Move
# these values to a Kubernetes Secret and rotate everything committed here.
config: config.yaml
njobs: 12
# Azure AI Search
search_service_name: https://search-sales2c-ai-prd.search.azure.cn
search_admin_key: ev6B0OtF66WkDmQKJBa4n1Haa8e8p8N3zdaEBnbWtoAzSeAMWSid
# Embedding model, reached through the /v1-openai ingress proxy.
# NOTE(review): the endpoint is plain HTTP, so the API key below is sent
# unencrypted; confirm whether HTTPS is available on this host.
embedding_model_endpoint: http://sales2c-ai.chinanorth3.cloudapp.chinacloudapi.cn/v1-openai/embeddings
embedding_model_key: gpustack_0e3d5b35adaf239b_99adacd6f540c7d81006365c8030b16c
VECTOR_DIMENSION: 4096
FLAG_AOAI: "V3"
FLAG_EMBEDDING_MODEL: qwen3-embedding-8b
# Document extraction (Document Intelligence)
extract_method: di+vision-llm
form_rec_resource: https://di-sales2c-ai-prd.cognitiveservices.azure.cn/
form_rec_key: G0vhH3twd5K3YYCgfnttf5V6XTMMU4PMdVvRHsgaTb8kZDoU8ZHjJQQJ99BDAEHpCsCfT1gyAAALACOGmOcn
di-Formulas: true
di-hiRes: true
# Semicolon-separated file extensions. "jpep" was a typo for "jpeg" and could
# never match a real file extension.
di_allow_features_ext: pdf;jpeg;jpg;png;bmp;tiff;heif
# Blob containers, addressed by container-level SAS URLs.
FIGURE_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/extracted-image-cat-prd?sp=racwdl&st=2025-08-04T06:34:42Z&se=2035-08-04T14:49:42Z&spr=https&sv=2024-11-04&sr=c&sig=t0DTjfht%2FNaPlXUtxhKr40NzZY5kWovgNxJUeAepvgA%3D
DI_BLOB_ACCOUNT_URL: https://sasales2caiprd.blob.core.chinacloudapi.cn/di-result-cat-prd?sp=racwdl&st=2025-08-04T06:34:11Z&se=2035-08-04T14:49:11Z&spr=https&sv=2024-11-04&sr=c&sig=26wxy5M9lcIO2o9zzr6jOtdw2gQTZnGmampHx5EyXbo%3D
# PostgreSQL connection string
DB_URI: postgresql://pgadmin:vwb54pSQDp8vYkusKms@pg-sales2c-ai-prd.postgres.database.chinacloudapi.cn/document-ai-indexer
# Image understanding
# NOTE(review): nesting reconstructed from key order. The keys indented below
# are assumed to belong to figure_caption; confirm against the settings loader.
figure_caption:
  include_di_content: false  # Whether figure content quotes the DI result
  description_gen_max_images: 0  # Maximum number of images to describe; 0 means no description
  model_endpoint: null
  model_key: null
  model: null  # Set to null for Azure OpenAI
  azure_deployment: null  # Azure OpenAI deployment name; leave empty for other platforms
  api_version: null  # Presumably the Azure OpenAI API version; leave empty for other platforms
# NOTE(review): assumed to be a top-level setting rather than part of
# figure_caption; confirm against the settings loader.
header_fix: true