This commit is contained in:
2026-04-23 14:50:24 +08:00
parent 448e078d99
commit af0eb007a0
5 changed files with 327 additions and 49 deletions

View File

@@ -46,8 +46,9 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
from sqlalchemy import select
from .core.deps import AsyncSessionLocal, get_milvus_collection
from .models.db import File, Task
from .services.parse import parse_document, chunk_text
from .services.parse import parse_document
from .services.embed import embed_texts
from .services.regulation_parser import extract_regulation_meta, legal_chunk
async with AsyncSessionLocal() as db:
# 查找文件记录
@@ -68,7 +69,7 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
task.progress = 10
await db.commit()
# Step 1:解析文档
# Step 1:解析文档(调用 mcp-server)
file_content = Path(file_record.storage_path).read_bytes()
parse_result = await parse_document(file_content, file_record.original_name)
markdown = parse_result.get("markdown", "")
@@ -76,16 +77,27 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
if not markdown.strip():
raise ValueError("文档解析结果为空")
# Step 2:提取法规元数据(发布机关/文号/施行日期/法规类型)
reg_meta = extract_regulation_meta(markdown)
file_record.status = "parsed"
file_record.metadata = {
"regulation_name": reg_meta.regulation_name,
"issuing_authority": reg_meta.issuing_authority,
"doc_number": reg_meta.doc_number,
"effective_date": reg_meta.effective_date,
"regulation_type": reg_meta.regulation_type,
"parser": parse_result.get("parser", ""),
"page_count": parse_result.get("page_count", 0),
}
if task:
task.progress = 40
await db.commit()
# Step 2:分块
chunks = chunk_text(markdown, chunk_size=512, overlap=64)
logger.info(f"文件 {file_id} 分割为 {len(chunks)}")
# Step 3:法规专用分块(按章/条边界分割,保留条款号)
chunks = legal_chunk(markdown, reg_meta, chunk_size=512, overlap=64)
logger.info(f"文件 {file_id} 分割为 {len(chunks)},法规:{reg_meta.regulation_name!r}")
# Step 3:向量化(分批处理)
# Step 4:向量化并写入 Milvus(分批处理)
batch_size = 16
col = get_milvus_collection("regulation_chunks")
@@ -93,16 +105,26 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
batch = chunks[i:i + batch_size]
texts = [c["content"] for c in batch]
embed_result = await embed_texts(texts, batch_size=batch_size)
dense_vecs = embed_result["dense"]
dense_vecs = embed_result["dense"] # list[list[float]], 1024维
sparse_vecs = embed_result.get("sparse", [{}] * len(batch)) # list[dict[str,float]]
entities = [
[f"{file_id}_{c['idx']}" for c in batch],
[file_id] * len(batch),
[workspace_id] * len(batch),
[c["idx"] for c in batch],
[c["content"] for c in batch],
dense_vecs,
[{"filename": file_record.original_name, "page": c.get("page", 0)} for c in batch],
[f"{file_id}_{c['idx']}" for c in batch], # pk
[file_id] * len(batch), # file_id
[workspace_id] * len(batch), # workspace_id
[c["idx"] for c in batch], # chunk_idx
[c["content"] for c in batch], # content
dense_vecs, # dense_vec
sparse_vecs, # sparse_vec
[c["clause_no"] for c in batch], # clause_no
[c["article_no"] for c in batch], # article_no
[c["regulation_name"] for c in batch], # regulation_name
[{ # metadata
"filename": file_record.original_name,
"page": c.get("page", 0),
"doc_number": reg_meta.doc_number,
} for c in batch],
]
col.insert(entities)
@@ -119,7 +141,7 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
task.progress = 100
task.completed_at = datetime.now(timezone.utc)
await db.commit()
logger.info(f"文件 {file_id} 处理完成")
logger.info(f"文件 {file_id} 处理完成,共 {len(chunks)} 个向量块")
except Exception as e:
logger.error(f"文件 {file_id} 处理失败:{e}")