Fix SSE route dependency and align architecture docs

2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions
--- a/backend/app/aliyun_parser/upload_to_milvus.py
+++ b/backend/app/aliyun_parser/upload_to_milvus.py
@@ -1,9 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
-将 vector_chunks.json 向量化并上传到 Milvus 和 PostgreSQL
-使用中转站的 OpenAI 兼容 API
-"""
+"""Embed vector_chunks.json and upload it to Milvus and PostgreSQL via an OpenAI-compatible relay API."""

 import argparse
 import json
@@ -23,18 +20,18 @@ from pymilvus import (
 )
 from openai import OpenAI

-# ===================== 配置 =====================
-# 中转站配置
+# ===================== Configuration =====================
+# Relay (OpenAI-compatible endpoint) settings
 RELAY_BASE_URL = "http://6.86.80.4:30080/v1"
 RELAY_API_KEY = "sk-5HeY7gfSIlyZMacfuXOf5cphpymsNqufEu1ou4U3avbULcyY"
-EMBEDDING_MODEL = "text-embedding-v3"  # 中转站支持的 embedding 模型
+EMBEDDING_MODEL = "text-embedding-v3"  # Embedding model supported by the relay

-# Milvus 配置
+# Milvus settings
 MILVUS_HOST = "localhost"
 MILVUS_PORT = "19530"
 COLLECTION_NAME = "regulation_chunks"

-# PostgreSQL 配置
+# PostgreSQL settings
 PG_HOST = "6.86.80.10"
 PG_PORT = 5432
 PG_USER = "postgresql"
@@ -44,12 +41,12 @@ PG_DATABASE = "postgres"

 # ===================== Embedding =====================
 def get_openai_client(api_key: str, base_url: str) -> OpenAI:
-    """创建 OpenAI 客户端连接到中转站"""
+    """Create an OpenAI client pointed at the relay's base URL."""
    return OpenAI(api_key=api_key, base_url=base_url)


 def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
-    """批量获取文本向量"""
+    """Return the embedding vectors for texts, requested in batches of batch_size."""
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
@@ -69,12 +66,13 @@ def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10)

 # ===================== Milvus =====================
 def init_milvus(host: str, port: str):
+    """Connect to Milvus at host:port under the "default" connection alias."""
    connections.connect("default", host=host, port=port)
    print(f"已连接 Milvus: {host}:{port}")


 def create_collection(name: str, dim: int) -> Collection:
-    """创建或获取 collection"""
+    """Create the collection, dropping and rebuilding it if it already exists."""
    if utility.has_collection(name):
        print(f"Collection '{name}' 已存在，删除重建")
        utility.drop_collection(name)
@@ -90,14 +88,14 @@ def create_collection(name: str, dim: int) -> Collection:
        FieldSchema(name="page_end", dtype=DataType.INT64),
        FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
-        FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096),  # JSON 字符串
+        FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096),  # JSON-encoded string
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]

    schema = CollectionSchema(fields, description="法规文档检索 chunks")
    collection = Collection(name, schema)

-    # 创建向量索引（IVF_FLAT，适合中小规模）
+    # Build the vector index (IVF_FLAT, suited to small and medium datasets)
    index_params = {
        "metric_type": "COSINE",
        "index_type": "IVF_FLAT",
@@ -110,7 +108,7 @@ def create_collection(name: str, dim: int) -> Collection:


 def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
-    """插入 chunks 到 Milvus"""
+    """Insert the chunks and their embeddings into the Milvus collection."""
    data = [
        [c["chunk_id"] for c in chunks],
        [c["doc_id"] for c in chunks],
@@ -122,7 +120,7 @@ def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[L
        [c["page_end"] for c in chunks],
        [c["section_title"] for c in chunks],
        [c["text"] for c in chunks],
-        [json.dumps(c.get("source_ids", [])) for c in chunks],  # JSON 字符串
+        [json.dumps(c.get("source_ids", [])) for c in chunks],  # Serialized as a JSON string
        embeddings,
    ]

@@ -132,14 +130,14 @@ def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[L


 def load_collection(collection: Collection):
-    """加载 collection 到内存（搜索前必须）"""
+    """Load the collection into memory (required before searching)."""
    collection.load()
    print(f"Collection 已加载到内存")


 # ===================== PostgreSQL =====================
 def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
-    """获取 PostgreSQL 连接"""
+    """Open a PostgreSQL connection with psycopg2."""
    conn = psycopg2.connect(
        host=host,
        port=port,
@@ -152,18 +150,18 @@ def get_pg_connection(host: str, port: int, user: str, password: str, database:


 def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
-    """插入 chunks 和相关数据到 PostgreSQL"""
+    """Insert the chunks and their related document data into PostgreSQL."""
    cursor = conn.cursor()

    try:
-        # 1. 插入文档
+        # Upsert the document row
        cursor.execute("""
            INSERT INTO documents (doc_id, title, standard_number, upload_time)
            VALUES (%s, %s, %s, NOW())
            ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
        """, (doc_data["doc_id"], doc_data["doc_title"], doc_data.get("standard_number")))

-        # 2. 插入语义块
+        # Insert the semantic blocks
        semantic_blocks = doc_data.get("semantic_blocks", [])
        if semantic_blocks:
            block_rows = [
@@ -192,7 +190,7 @@ def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
            )
            print(f"已插入 {len(semantic_blocks)} 个语义块")

-        # 3. 插入向量块元数据
+        # Insert the vector-chunk metadata
        chunk_rows = [
            (
                doc_data["doc_id"],
@@ -230,9 +228,9 @@ def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
        cursor.close()


-# ===================== 主流程 =====================
+# ===================== Main flow =====================
 def load_data(file_path: Path) -> Dict:
-    """加载 vector_chunks.json，返回完整数据"""
+    """Load vector_chunks.json and return the full parsed data."""
    data = json.loads(file_path.read_text(encoding="utf-8"))
    return data

@@ -251,7 +249,8 @@ def upload_to_milvus_and_pg(
    pg_password: str,
    pg_database: str,
 ):
-    # 1. 加载完整数据
+    """Embed the chunks from chunks_file and upload them to Milvus and PostgreSQL."""
+    # Load the full data
    chunks_path = Path(chunks_file).expanduser().resolve()
    if not chunks_path.exists():
        raise FileNotFoundError(f"文件不存在: {chunks_path}")
@@ -262,29 +261,29 @@ def upload_to_milvus_and_pg(
        raise ValueError("vector_chunks 为空")
    print(f"加载 {len(chunks)} 个 chunks")

-    # 2. 初始化连接
+    # Initialize the embedding client and the Milvus / PostgreSQL connections
    client = get_openai_client(api_key, base_url)
    init_milvus(milvus_host, milvus_port)
    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)

-    # 3. 获取 embeddings
+    # Generate the embeddings
    texts = [c["embedding_text"] for c in chunks]
    embeddings = get_embeddings_batch(client, texts, batch_size)
    print(f"生成 {len(embeddings)} 个向量")

-    # 4. 获取 embedding 维度
+    # Determine the embedding dimension from the first vector
    embedding_dim = len(embeddings[0])
    print(f"Embedding 维度: {embedding_dim}")

-    # 5. 创建 collection 并插入 Milvus
+    # Create the collection and insert into Milvus
    collection = create_collection(collection_name, embedding_dim)
    insert_chunks(collection, chunks, embeddings)
    load_collection(collection)

-    # 6. 插入 PostgreSQL
+    # Insert into PostgreSQL
    insert_chunks_to_pg(pg_conn, chunks, data)

-    # 7. 关闭连接
+    # Close the PostgreSQL connection
    pg_conn.close()

    print("上传完成！")
@@ -292,6 +291,7 @@ def upload_to_milvus_and_pg(

 # ===================== CLI =====================
 def main():
+    """Run the module entrypoint."""
    parser = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
    parser.add_argument("chunks_file", help="vector_chunks.json 文件路径")
    parser.add_argument("--api-key", default=RELAY_API_KEY, help="中转站 API Key")