update
This commit is contained in:
@@ -46,8 +46,9 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
|
||||
from sqlalchemy import select
|
||||
from .core.deps import AsyncSessionLocal, get_milvus_collection
|
||||
from .models.db import File, Task
|
||||
from .services.parse import parse_document, chunk_text
|
||||
from .services.parse import parse_document
|
||||
from .services.embed import embed_texts
|
||||
from .services.regulation_parser import extract_regulation_meta, legal_chunk
|
||||
|
||||
async with AsyncSessionLocal() as db:
|
||||
# 查找文件记录
|
||||
@@ -68,7 +69,7 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
|
||||
task.progress = 10
|
||||
await db.commit()
|
||||
|
||||
# Step 1:解析文档
|
||||
# Step 1:解析文档(调用 mcp-server)
|
||||
file_content = Path(file_record.storage_path).read_bytes()
|
||||
parse_result = await parse_document(file_content, file_record.original_name)
|
||||
markdown = parse_result.get("markdown", "")
|
||||
@@ -76,16 +77,27 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
|
||||
if not markdown.strip():
|
||||
raise ValueError("文档解析结果为空")
|
||||
|
||||
# Step 2:提取法规元数据(发布机关/文号/施行日期/法规类型)
|
||||
reg_meta = extract_regulation_meta(markdown)
|
||||
file_record.status = "parsed"
|
||||
file_record.metadata = {
|
||||
"regulation_name": reg_meta.regulation_name,
|
||||
"issuing_authority": reg_meta.issuing_authority,
|
||||
"doc_number": reg_meta.doc_number,
|
||||
"effective_date": reg_meta.effective_date,
|
||||
"regulation_type": reg_meta.regulation_type,
|
||||
"parser": parse_result.get("parser", ""),
|
||||
"page_count": parse_result.get("page_count", 0),
|
||||
}
|
||||
if task:
|
||||
task.progress = 40
|
||||
await db.commit()
|
||||
|
||||
# Step 2:分块
|
||||
chunks = chunk_text(markdown, chunk_size=512, overlap=64)
|
||||
logger.info(f"文件 {file_id} 分割为 {len(chunks)} 块")
|
||||
# Step 3:法规专用分块(按章/条边界分割,保留条款号)
|
||||
chunks = legal_chunk(markdown, reg_meta, chunk_size=512, overlap=64)
|
||||
logger.info(f"文件 {file_id} 分割为 {len(chunks)} 块,法规:{reg_meta.regulation_name!r}")
|
||||
|
||||
# Step 3:向量化(分批处理)
|
||||
# Step 4:向量化并写入 Milvus(分批处理)
|
||||
batch_size = 16
|
||||
col = get_milvus_collection("regulation_chunks")
|
||||
|
||||
@@ -93,16 +105,26 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
|
||||
batch = chunks[i:i + batch_size]
|
||||
texts = [c["content"] for c in batch]
|
||||
embed_result = await embed_texts(texts, batch_size=batch_size)
|
||||
dense_vecs = embed_result["dense"]
|
||||
|
||||
dense_vecs = embed_result["dense"] # list[list[float]], 1024维
|
||||
sparse_vecs = embed_result.get("sparse", [{}] * len(batch)) # list[dict[str,float]]
|
||||
|
||||
entities = [
|
||||
[f"{file_id}_{c['idx']}" for c in batch],
|
||||
[file_id] * len(batch),
|
||||
[workspace_id] * len(batch),
|
||||
[c["idx"] for c in batch],
|
||||
[c["content"] for c in batch],
|
||||
dense_vecs,
|
||||
[{"filename": file_record.original_name, "page": c.get("page", 0)} for c in batch],
|
||||
[f"{file_id}_{c['idx']}" for c in batch], # pk
|
||||
[file_id] * len(batch), # file_id
|
||||
[workspace_id] * len(batch), # workspace_id
|
||||
[c["idx"] for c in batch], # chunk_idx
|
||||
[c["content"] for c in batch], # content
|
||||
dense_vecs, # dense_vec
|
||||
sparse_vecs, # sparse_vec
|
||||
[c["clause_no"] for c in batch], # clause_no
|
||||
[c["article_no"] for c in batch], # article_no
|
||||
[c["regulation_name"] for c in batch], # regulation_name
|
||||
[{ # metadata
|
||||
"filename": file_record.original_name,
|
||||
"page": c.get("page", 0),
|
||||
"doc_number": reg_meta.doc_number,
|
||||
} for c in batch],
|
||||
]
|
||||
col.insert(entities)
|
||||
|
||||
@@ -119,7 +141,7 @@ async def _process_file(file_id: str, task_id: str, workspace_id: str):
|
||||
task.progress = 100
|
||||
task.completed_at = datetime.now(timezone.utc)
|
||||
await db.commit()
|
||||
logger.info(f"文件 {file_id} 处理完成")
|
||||
logger.info(f"文件 {file_id} 处理完成,共 {len(chunks)} 个向量块")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"文件 {file_id} 处理失败:{e}")
|
||||
|
||||
Reference in New Issue
Block a user