init
This commit is contained in:
109
vw-document-ai-indexer/config.yaml.example
Normal file
109
vw-document-ai-indexer/config.yaml.example
Normal file
@@ -0,0 +1,109 @@
|
||||
# Main data configuration (array format: one "- data_path" entry per data source)
|
||||
- data_path: "https://your-blob-storage.blob.core.windows.net/container?sas-token"  # Placeholder: replace with your container URL and SAS token
|
||||
datasource_name: "CATOnline-cn" # Data source name
|
||||
data_dir: "" # Optional local data directory
|
||||
base_path: "/app/run_tmp" # Temporary processing directory
|
||||
|
||||
# File processing limits
|
||||
process_file_num: 0 # File-count limit; 0 = process all files
|
||||
process_file_last_modify: "2025-06-24 00:00:00" # Only process files modified after this timestamp (written as YYYY-MM-DD HH:MM:SS; NOTE(review): time zone is not stated here - confirm)
|
||||
|
||||
# Chunking configuration
|
||||
chunk_size: 2048 # Maximum tokens per chunk
|
||||
token_overlap: 128 # Overlap between consecutive chunks (presumably counted in tokens, per the key name - confirm)
|
||||
|
||||
# Index schemas configuration (one "- index_name" entry per target index)
|
||||
index_schemas:
|
||||
# Chunk-level index for search
|
||||
- index_name: "your-knowledge-chunk-index"
|
||||
data_type: ["metadata", "document", "chunk"]
|
||||
field_type: "append" # How to handle existing data (the other indexes in this file use "full", described there as replacing entire documents)
|
||||
upload_batch_size: 50 # Documents per batch upload
|
||||
|
||||
# Metadata fields to include in this index
|
||||
fields: [
|
||||
"filepath", "timestamp", "title", "publisher", "publish_date",
|
||||
"document_category", "document_code", "language_code",
|
||||
"x_Standard_Regulation_Id", "x_Attachment_Type",
|
||||
"x_Standard_Title_CN", "x_Standard_Title_EN",
|
||||
"x_Standard_Published_State", "x_Standard_Drafting_Status",
|
||||
"x_Standard_Range", "x_Standard_Kind", "x_Standard_No",
|
||||
"x_Standard_Code", "x_Standard_Technical_Committee",
|
||||
"x_Standard_Vehicle_Type", "x_Standard_Power_Type",
|
||||
"x_Standard_CCS", "x_Standard_ICS",
|
||||
"x_Standard_Published_Date", "x_Standard_Effective_Date",
|
||||
"x_Regulation_Status", "x_Regulation_Title_CN",
|
||||
"x_Regulation_Title_EN", "x_Regulation_Document_No",
|
||||
"x_Regulation_Issued_Date", "x_Classification",
|
||||
"x_Work_Group", "x_Reference_Standard",
|
||||
"x_Replaced_by", "x_Refer_To", "func_uuid",
|
||||
"update_time", "status"
|
||||
]
|
||||
|
||||
# Vector configuration: each entry names a vector field and the fields listed for it in append_fields
|
||||
vector_fields:
|
||||
- field: "contentVector"
|
||||
append_fields: ["content"] # Fields to vectorize for content
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["full_headers", "doc_metadata"] # Metadata vectorization
|
||||
|
||||
# Azure AI Search configuration
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath" # Field to use for updates
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]  # Same values as append_fields of "full_metadata_vector" above
|
||||
|
||||
# Document-level index
|
||||
- index_name: "your-knowledge-document-index"
|
||||
data_type: ["document", "metadata"]
|
||||
field_type: "full" # Replace entire documents
|
||||
key_fields: ["filepath"] # Primary key fields
|
||||
upload_batch_size: 1  # Documents per batch upload
|
||||
|
||||
fields: [
|
||||
# Abbreviated in this example: use the same field list as the chunk index
|
||||
"filepath", "timestamp", "title", "publisher"
|
||||
# ... (remaining fields as in the chunk index; add a comma after "publisher" when extending the list)
|
||||
]
|
||||
|
||||
merge_content_fields: ["content"] # Fields to merge from chunks
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
|
||||
semantic_config_name: "default"
|
||||
vector_config_name: "vectorSearchProfile"
|
||||
update_by_field: "filepath"  # Field to use for updates
|
||||
|
||||
# Regulation-specific index
|
||||
- index_name: "your-regulation-index"
|
||||
data_type: ["metadata"]
|
||||
field_type: "full"  # Same mode as the document-level index, where it is described as replacing entire documents
|
||||
key_fields: ["x_Standard_Regulation_Id"] # Regulation ID as key
|
||||
upload_batch_size: 50  # Documents per batch upload
|
||||
|
||||
fields: [
|
||||
# Regulation-specific fields (abbreviated in this example)
|
||||
"x_Standard_Regulation_Id", "x_Standard_Title_CN",
|
||||
"x_Standard_Title_EN", "x_Regulation_Status"
|
||||
# ... (add the remaining regulation metadata fields; put a comma after "x_Regulation_Status" when extending the list)
|
||||
]
|
||||
|
||||
vector_fields:
|
||||
- field: "full_metadata_vector"
|
||||
append_fields: ["doc_metadata"]
|
||||
|
||||
update_by_field: "x_Standard_Regulation_Id"  # Field to use for updates; same field as key_fields above
|
||||
|
||||
# Field merging configuration: each entry lists fields to combine under one key
|
||||
merge_fields:
|
||||
- key: "doc_metadata" # Combined metadata field; this name is what append_fields refers to in the vector_fields entries
|
||||
fields: [
|
||||
"title", "publisher", "document_category", "document_code",
|
||||
"x_Standard_Title_CN", "x_Standard_Title_EN",
|
||||
"x_Standard_Published_State", "x_Standard_Drafting_Status"
|
||||
# ... (abbreviated: list all metadata fields to combine; add a comma after the last entry above when extending)
|
||||
]
|
||||
|
||||
# Vector field configuration
|
||||
full_metadata_vector_fields: ["full_headers", "doc_metadata"]  # Same values as append_fields of "full_metadata_vector" in the chunk index
|
||||
Reference in New Issue
Block a user