update

2026-04-23 14:50:24 +08:00
parent 448e078d99
commit af0eb007a0
5 changed files with 327 additions and 49 deletions
--- a/services/compliance-backend/app/worker.py
+++ b/services/compliance-backend/app/worker.py
@@ -46,8 +46,9 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
    from sqlalchemy import select
    from .core.deps import AsyncSessionLocal, get_milvus_collection
    from .models.db import File, Task
-    from .services.parse import parse_document, chunk_text
+    from .services.parse import parse_document
    from .services.embed import embed_texts
+    from .services.regulation_parser import extract_regulation_meta, legal_chunk

    async with AsyncSessionLocal() as db:
        # 查找文件记录
@@ -68,7 +69,7 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
                task.progress = 10
            await db.commit()

-            # Step 1：解析文档
+            # Step 1：解析文档（调用 mcp-server）
            file_content = Path(file_record.storage_path).read_bytes()
            parse_result = await parse_document(file_content, file_record.original_name)
            markdown = parse_result.get("markdown", "")
@@ -76,16 +77,27 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
            if not markdown.strip():
                raise ValueError("文档解析结果为空")

+            # Step 2：提取法规元数据（发布机关/文号/施行日期/法规类型）
+            reg_meta = extract_regulation_meta(markdown)
            file_record.status = "parsed"
+            file_record.metadata = {
+                "regulation_name":   reg_meta.regulation_name,
+                "issuing_authority": reg_meta.issuing_authority,
+                "doc_number":        reg_meta.doc_number,
+                "effective_date":    reg_meta.effective_date,
+                "regulation_type":   reg_meta.regulation_type,
+                "parser":            parse_result.get("parser", ""),
+                "page_count":        parse_result.get("page_count", 0),
+            }
            if task:
                task.progress = 40
            await db.commit()

-            # Step 2：分块
-            chunks = chunk_text(markdown, chunk_size=512, overlap=64)
-            logger.info(f"文件 {file_id} 分割为 {len(chunks)} 块")
+            # Step 3：法规专用分块（按章/条边界分割，保留条款号）
+            chunks = legal_chunk(markdown, reg_meta, chunk_size=512, overlap=64)
+            logger.info(f"文件 {file_id} 分割为 {len(chunks)} 块，法规：{reg_meta.regulation_name!r}")

-            # Step 3：向量化（分批处理）
+            # Step 4：向量化并写入 Milvus（分批处理）
            batch_size = 16
            col = get_milvus_collection("regulation_chunks")

@@ -93,16 +105,26 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
                batch = chunks[i:i + batch_size]
                texts = [c["content"] for c in batch]
                embed_result = await embed_texts(texts, batch_size=batch_size)
-                dense_vecs = embed_result["dense"]
+
+                dense_vecs  = embed_result["dense"]    # list[list[float]], 1024维
+                sparse_vecs = embed_result.get("sparse", [{}] * len(batch))  # list[dict[str,float]]

                entities = [
-                    [f"{file_id}_{c['idx']}" for c in batch],
-                    [file_id] * len(batch),
-                    [workspace_id] * len(batch),
-                    [c["idx"] for c in batch],
-                    [c["content"] for c in batch],
-                    dense_vecs,
-                    [{"filename": file_record.original_name, "page": c.get("page", 0)} for c in batch],
+                    [f"{file_id}_{c['idx']}" for c in batch],           # pk
+                    [file_id]     * len(batch),                           # file_id
+                    [workspace_id] * len(batch),                          # workspace_id
+                    [c["idx"]    for c in batch],                         # chunk_idx
+                    [c["content"] for c in batch],                        # content
+                    dense_vecs,                                           # dense_vec
+                    sparse_vecs,                                          # sparse_vec
+                    [c["clause_no"]       for c in batch],               # clause_no
+                    [c["article_no"]      for c in batch],               # article_no
+                    [c["regulation_name"] for c in batch],               # regulation_name
+                    [{                                                    # metadata
+                        "filename":   file_record.original_name,
+                        "page":       c.get("page", 0),
+                        "doc_number": reg_meta.doc_number,
+                    } for c in batch],
                ]
                col.insert(entities)

@@ -119,7 +141,7 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
                task.progress = 100
                task.completed_at = datetime.now(timezone.utc)
            await db.commit()
-            logger.info(f"文件 {file_id} 处理完成")
+            logger.info(f"文件 {file_id} 处理完成，共 {len(chunks)} 个向量块")

        except Exception as e:
            logger.error(f"文件 {file_id} 处理失败：{e}")