# Main data configuration (array format).
#
# The document is a sequence; each item describes one data source together
# with the search indexes that are built from it.
#
# NOTE(review): this layout was reconstructed from a copy of the file whose
# line breaks and indentation had been lost. Key names and values are
# unchanged. `merge_fields` and the trailing `full_metadata_vector_fields`
# are placed at data-source level (siblings of `index_schemas`) -- confirm
# against the config loader.
- # Placeholder container URL. The query string carries a SAS token, so keep
  # real values out of version control.
  data_path: "https://your-blob-storage.blob.core.windows.net/container?sas-token"
  datasource_name: "CATOnline-cn"  # Data source name
  data_dir: ""  # Optional local data directory
  base_path: "/app/run_tmp"  # Temporary processing directory

  # File processing limits
  process_file_num: 0  # 0 = process all files
  # Only process files modified after this date. Kept quoted so that it
  # stays a string instead of being resolved to a timestamp.
  process_file_last_modify: "2025-06-24 00:00:00"

  # Chunking configuration
  chunk_size: 2048  # Maximum tokens per chunk
  token_overlap: 128  # Overlap between chunks

  # Index schemas configuration
  index_schemas:
    # Chunk-level index for search
    - index_name: "your-knowledge-chunk-index"
      data_type: ["metadata", "document", "chunk"]
      field_type: "append"  # How to handle existing data
      upload_batch_size: 50  # Documents per batch upload

      # Metadata fields to include
      fields:
        - "filepath"
        - "timestamp"
        - "title"
        - "publisher"
        - "publish_date"
        - "document_category"
        - "document_code"
        - "language_code"
        - "x_Standard_Regulation_Id"
        - "x_Attachment_Type"
        - "x_Standard_Title_CN"
        - "x_Standard_Title_EN"
        - "x_Standard_Published_State"
        - "x_Standard_Drafting_Status"
        - "x_Standard_Range"
        - "x_Standard_Kind"
        - "x_Standard_No"
        - "x_Standard_Code"
        - "x_Standard_Technical_Committee"
        - "x_Standard_Vehicle_Type"
        - "x_Standard_Power_Type"
        - "x_Standard_CCS"
        - "x_Standard_ICS"
        - "x_Standard_Published_Date"
        - "x_Standard_Effective_Date"
        - "x_Regulation_Status"
        - "x_Regulation_Title_CN"
        - "x_Regulation_Title_EN"
        - "x_Regulation_Document_No"
        - "x_Regulation_Issued_Date"
        - "x_Classification"
        - "x_Work_Group"
        - "x_Reference_Standard"
        - "x_Replaced_by"
        - "x_Refer_To"
        - "func_uuid"
        - "update_time"
        - "status"

      # Vector configuration
      vector_fields:
        - field: "contentVector"
          append_fields: ["content"]  # Fields to vectorize for content
        - field: "full_metadata_vector"
          # Metadata vectorization
          append_fields: ["full_headers", "doc_metadata"]

      # Azure AI Search configuration
      semantic_config_name: "default"
      vector_config_name: "vectorSearchProfile"
      update_by_field: "filepath"  # Field to use for updates
      full_metadata_vector_fields: ["full_headers", "doc_metadata"]

    # Document-level index
    - index_name: "your-knowledge-document-index"
      data_type: ["document", "metadata"]
      field_type: "full"  # Replace entire documents
      key_fields: ["filepath"]  # Primary key fields
      upload_batch_size: 1
      # Same field list as chunk index
      fields:
        - "filepath"
        - "timestamp"
        - "title"
        - "publisher"
        # ... (same as above)
      merge_content_fields: ["content"]  # Fields to merge from chunks
      vector_fields:
        - field: "full_metadata_vector"
          append_fields: ["doc_metadata"]
      semantic_config_name: "default"
      vector_config_name: "vectorSearchProfile"
      update_by_field: "filepath"

    # Regulation-specific index
    - index_name: "your-regulation-index"
      data_type: ["metadata"]
      field_type: "full"
      key_fields: ["x_Standard_Regulation_Id"]  # Regulation ID as key
      upload_batch_size: 50
      # Regulation-specific fields
      fields:
        - "x_Standard_Regulation_Id"
        - "x_Standard_Title_CN"
        - "x_Standard_Title_EN"
        - "x_Regulation_Status"
        # ... (regulation metadata fields)
      vector_fields:
        - field: "full_metadata_vector"
          append_fields: ["doc_metadata"]
      update_by_field: "x_Standard_Regulation_Id"

  # Field merging configuration
  merge_fields:
    - key: "doc_metadata"  # Combined metadata field
      fields:
        - "title"
        - "publisher"
        - "document_category"
        - "document_code"
        - "x_Standard_Title_CN"
        - "x_Standard_Title_EN"
        - "x_Standard_Published_State"
        - "x_Standard_Drafting_Status"
        # ... (all metadata fields to combine)

  # Vector field configuration
  full_metadata_vector_fields: ["full_headers", "doc_metadata"]