Delete log file for May 14, 2026, to clean up unnecessary data and maintain log management.
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -57,3 +57,7 @@ ENV/
|
|||||||
# OS files
|
# OS files
|
||||||
.DS_Store
|
.DS_Store
|
||||||
Thumbs.db
|
Thumbs.db
|
||||||
|
|
||||||
|
|
||||||
|
# logs files
|
||||||
|
logs/
|
||||||
8
backend/app/aliyun_parser/.claude/settings.local.json
Normal file
8
backend/app/aliyun_parser/.claude/settings.local.json
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(python3 *)",
|
||||||
|
"Bash(PGPASSWORD=postgresql123456 psql *)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
475
backend/app/aliyun_parser/parse_pdf.py
Normal file
475
backend/app/aliyun_parser/parse_pdf.py
Normal file
@@ -0,0 +1,475 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
阿里云文档智能 API 解析 PDF,输出三层结构 chunks
|
||||||
|
- structure_nodes: 目录树结构
|
||||||
|
- semantic_blocks: 语义块(章节文本、表格、图片)
|
||||||
|
- vector_chunks: 检索块(带 overlap 切分)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
|
||||||
|
from alibabacloud_tea_openapi import models as open_api_models
|
||||||
|
from alibabacloud_docmind_api20220711 import models as docmind_models
|
||||||
|
from alibabacloud_tea_util import models as util_models
|
||||||
|
|
||||||
|
# ===================== Alibaba Cloud configuration =====================
# Credentials are read from the environment so that secrets never live in
# version control. Any key pair that was previously committed to the
# repository must be treated as leaked and rotated.
import os  # needed only by the configuration lookups below

ALIBABA_ACCESS_KEY_ID = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID", "")
ALIBABA_ACCESS_KEY_SECRET = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET", "")
ALIBABA_ENDPOINT = os.environ.get(
    "ALIBABA_DOCMIND_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com"
)

# ===================== Chunking parameters =====================
MAX_CHARS = 600  # default maximum characters per retrieval chunk
OVERLAP_CHARS = 80  # default characters shared by adjacent retrieval chunks

# ===================== Layout type constants =====================
TOC_TITLES = {"目次", "目录"}  # headings that mark a table of contents
TITLE_SUBTYPES = {"doc_title", "para_title"}
TEXT_SUBTYPES = {"para", "none"}
FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== 阿里云 API 客户端 =====================
|
||||||
|
def init_client() -> DocmindClient:
    """Build a DocMind client from the module-level Alibaba Cloud settings."""
    client_config = open_api_models.Config(
        access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
        access_key_id=ALIBABA_ACCESS_KEY_ID,
    )
    # The endpoint is assigned on the config object after construction.
    client_config.endpoint = ALIBABA_ENDPOINT
    docmind_client = DocmindClient(client_config)
    return docmind_client
|
||||||
|
|
||||||
|
|
||||||
|
def submit_job(client: DocmindClient, file_path: str) -> str:
    """Submit a document-parsing job for a local file and return its task id.

    Args:
        client: DocMind client used to submit the job.
        file_path: Path of the local file to upload.

    Returns:
        The id found at ``response.body.data.id`` of the submit response.
    """
    source = Path(file_path)
    runtime = util_models.RuntimeOptions()
    # Keep the file open only for the duration of the submit call; the
    # previous bare open() leaked the handle on success and on error alike.
    with source.open("rb") as stream:
        request = docmind_models.SubmitDocParserJobAdvanceRequest(
            file_url_object=stream,
            file_name=source.name,
            file_name_extension=source.suffix.lstrip("."),
            llm_enhancement=True,
            enhancement_mode="VLM",
        )
        response = client.submit_doc_parser_job_advance(request, runtime)
    return response.body.data.id
|
||||||
|
|
||||||
|
|
||||||
|
def query_status(client: DocmindClient, task_id: str) -> Dict:
    """Fetch the status of a parsing task.

    Returns the status payload converted with ``to_map()``, or ``None``
    when the response carries no data.
    """
    status_request = docmind_models.QueryDocParserStatusRequest(id=task_id)
    reply = client.query_doc_parser_status(status_request)
    if reply.body.data:
        return reply.body.data.to_map()
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_completion(client: DocmindClient, task_id: str, poll_interval: int = 5) -> bool:
    """Poll a parsing task until it reaches a terminal state.

    Args:
        client: DocMind client used for the status queries.
        task_id: Id of the task to watch.
        poll_interval: Seconds to sleep between two status queries.

    Returns:
        True when the task reports success; False when it reports failure
        or when the status query returns no data.
    """
    while True:
        status_data = query_status(client, task_id)
        if not status_data:
            return False
        # "Status" may be missing or null; normalise to a lowercase string
        # instead of crashing on None.lower().
        status = str(status_data.get("Status") or "").lower()
        if status == "success":
            return True
        # Accept both spellings: comparing against "failed" only would never
        # match a service that reports "Fail", and the loop would spin
        # forever on a failed job.
        if status in ("fail", "failed"):
            print(f"任务失败: {status_data}")
            return False
        print(f"任务状态: {status}, 等待中...")
        time.sleep(poll_interval)
|
||||||
|
|
||||||
|
|
||||||
|
def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
    """Fetch one window of parsing results for a task.

    ``layout_num`` and ``layout_step_size`` are forwarded unchanged to the
    result request. Returns ``response.body.data``, or ``None`` when it is
    empty.
    """
    result_request = docmind_models.GetDocParserResultRequest(
        id=task_id,
        layout_step_size=layout_step_size,
        layout_num=layout_num,
    )
    reply = client.get_doc_parser_result(result_request)
    if reply.body.data:
        return reply.body.data
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
    """Page through a task's results and return every layout in order.

    Fetching stops at the first empty response, the first response without
    layouts, or the first window shorter than ``layout_step_size``.
    """
    collected = []
    offset = 0
    has_more = True
    while has_more:
        window = get_result(client, task_id, offset, layout_step_size)
        batch = window.get("layouts", []) if window else None
        if batch:
            collected.extend(batch)
            offset += len(batch)
        # A short (or empty) window means the last page has been reached.
        has_more = bool(batch) and len(batch) >= layout_step_size
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== 文本处理 =====================
|
||||||
|
def normalize_text(text: str) -> str:
    """Collapse whitespace noise in extracted text.

    Carriage returns become newlines, runs of newlines collapse to one,
    runs of spaces/tabs collapse to a single space, and the result is
    stripped. ``None`` or empty input yields ``""``.
    """
    if not text:
        # Callers pass dict lookups straight in; a null value must not crash.
        return ""
    text = text.replace("\r", "\n")
    # Spell the invisible characters as escapes: a literal special space in
    # the source is indistinguishable from a plain space and easily lost,
    # which turns the replacement into a no-op. Both the no-break space and
    # the ideographic (full-width) space are mapped to a plain space.
    text = text.replace("\u00a0", " ").replace("\u3000", " ")
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_page(layout: Dict) -> int:
    """Return the layout's page number: ``pageNum``, else ``pageNumber``, else 0."""
    fallback = layout.get("pageNumber", 0)
    return layout.get("pageNum", fallback)
|
||||||
|
|
||||||
|
|
||||||
|
def get_text(layout: Dict) -> str:
    """Return the layout's normalised ``text``, falling back to ``markdownContent``."""
    primary = normalize_text(layout.get("text", ""))
    # The markdown field is consulted only when the plain text is empty.
    return primary or normalize_text(layout.get("markdownContent", ""))
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== 布局类型判断 =====================
|
||||||
|
def is_title(layout: Dict) -> bool:
    """Tell whether the layout's type is "title" or its subType is in TITLE_SUBTYPES."""
    if layout.get("type") == "title":
        return True
    return layout.get("subType") in TITLE_SUBTYPES
|
||||||
|
|
||||||
|
|
||||||
|
def is_text(layout: Dict) -> bool:
    """Tell whether the layout is body text.

    The type must be "text" and the subType (defaulting to "none" when
    absent) must be in TEXT_SUBTYPES.
    """
    if layout.get("type") != "text":
        return False
    return layout.get("subType", "none") in TEXT_SUBTYPES
|
||||||
|
|
||||||
|
|
||||||
|
def is_figure(layout: Dict) -> bool:
    """Tell whether the layout's type is in FIGURE_TYPES or its subType is in FIGURE_SUBTYPES."""
    if layout.get("type") in FIGURE_TYPES:
        return True
    return layout.get("subType") in FIGURE_SUBTYPES
|
||||||
|
|
||||||
|
|
||||||
|
def is_table(layout: Dict) -> bool:
    """Tell whether the layout's type is "table"."""
    layout_type = layout.get("type")
    return layout_type == "table"
|
||||||
|
|
||||||
|
|
||||||
|
def is_toc_layout(layout: Dict) -> bool:
    """Tell whether the layout belongs to a table of contents.

    True when the text is one of TOC_TITLES, or when the layout is on
    page 1 and its text matches the numbered-entry pattern below
    (section number, title, dot leader, trailing page number).
    """
    content = get_text(layout)
    if content in TOC_TITLES:
        return True
    entry_pattern = r"^\d+(\.\d+)*\s+.+[.。…]{2,}\s*\d+$"
    on_first_page = get_page(layout) == 1
    return bool(on_first_page and re.match(entry_pattern, content))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_table_text(layout: Dict) -> str:
    """Flatten a table layout into plain text, one line per non-empty cell.

    Within a cell, the normalised texts of its nested layouts are joined
    with single spaces; cells without any text are skipped.
    """
    lines = []
    for cell in layout.get("cells", []):
        fragments = [
            normalize_text(inner.get("text", ""))
            for inner in cell.get("layouts", [])
        ]
        fragments = [fragment for fragment in fragments if fragment]
        if fragments:
            lines.append(" ".join(fragments))
    return "\n".join(lines).strip()
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== 结构层:目录树 =====================
|
||||||
|
def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
    """Build the outline layer: one node per title layout.

    Titles with empty text and table-of-contents headings (TOC_TITLES)
    are left out.
    """
    outline = []
    for item in layouts:
        heading = get_text(item) if is_title(item) else ""
        if heading and heading not in TOC_TITLES:
            outline.append(
                {
                    "unique_id": item.get("uniqueId"),
                    "page": get_page(item),
                    "index": item.get("index", 0),
                    "level": item.get("level", 0),
                    "title": heading,
                    "type": item.get("type"),
                    "sub_type": item.get("subType"),
                }
            )
    return outline
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== 语义层:章节内容 =====================
|
||||||
|
def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
    """Push a title layout onto the section stack.

    Entries whose level is greater than or equal to the new title's level
    are popped first. The stack is modified in place and also returned.
    """
    level = layout.get("level", 0)
    entry = {
        "level": level,
        "title": get_text(layout),
        "page": get_page(layout),
        "unique_id": layout.get("uniqueId"),
    }
    # Unwind siblings and deeper sections before descending into the new one.
    while section_stack and section_stack[-1]["level"] >= level:
        del section_stack[-1]
    section_stack.append(entry)
    return section_stack
|
||||||
|
|
||||||
|
|
||||||
|
def section_path_titles(section_stack: List[Dict]) -> List[str]:
    """Return the titles of the section stack, outermost first."""
    titles = []
    for entry in section_stack:
        titles.append(entry["title"])
    return titles
|
||||||
|
|
||||||
|
|
||||||
|
def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
    """Merge pending text fragments into one "section_text" semantic block.

    Appends the merged block to ``semantic_blocks`` and returns the next
    free block id. When there is nothing to merge (no fragments, or only
    empty text) nothing is appended and ``block_id`` is returned unchanged.
    Section metadata is taken from the first fragment.
    """
    merged = "\n".join(entry["text"] for entry in blocks if entry["text"]).strip()
    if not merged:
        return block_id

    first = blocks[0]
    pages = [entry["page"] for entry in blocks]
    source_ids = [entry["unique_id"] for entry in blocks if entry.get("unique_id")]
    semantic_blocks.append(
        {
            "semantic_id": f"semantic-{block_id}",
            "block_type": "section_text",
            "page_start": min(pages),
            "page_end": max(pages),
            "section_path": first["section_path"],
            "section_level": first["section_level"],
            "section_title": first["section_title"],
            "source_ids": source_ids,
            "text": merged,
        }
    )
    return block_id + 1
|
||||||
|
|
||||||
|
|
||||||
|
def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
    """Build the semantic layer from the ordered layouts.

    Consecutive body-text layouts under the same heading are merged into
    one "section_text" block; every table and every figure with text
    becomes its own block. Title layouts only update the current section
    path. After a table-of-contents layout is seen, the remaining layouts
    on page 1 are skipped.
    """
    results = []
    stack = []
    pending = []
    next_id = 1
    in_toc = False

    for layout in layouts:
        text = get_text(layout)
        page = get_page(layout)

        if is_toc_layout(layout):
            in_toc = True
            continue
        if in_toc:
            if page == 1:
                continue
            # First layout beyond page 1 ends the table-of-contents skip.
            in_toc = False

        if is_title(layout):
            next_id = flush_text_block(pending, results, next_id)
            pending = []
            stack = update_section_path(stack, layout)
            continue

        path = section_path_titles(stack)
        title = path[-1] if path else "未分类"
        depth = len(path)

        # Tables and figures are emitted as standalone blocks; tables take
        # precedence when a layout matches both predicates.
        if is_table(layout):
            kind = "table"
        elif is_figure(layout):
            kind = "figure"
        else:
            kind = None

        if kind is not None:
            next_id = flush_text_block(pending, results, next_id)
            pending = []
            body = extract_table_text(layout) if kind == "table" else text
            if body:
                results.append(
                    {
                        "semantic_id": f"semantic-{next_id}",
                        "block_type": kind,
                        "page_start": page,
                        "page_end": page,
                        "section_path": path,
                        "section_level": depth,
                        "section_title": title,
                        "source_ids": [layout.get("uniqueId")],
                        "text": body,
                    }
                )
                next_id += 1
            continue

        if is_text(layout) and text:
            pending.append(
                {
                    "page": page,
                    "text": text,
                    "unique_id": layout.get("uniqueId"),
                    "section_path": path,
                    "section_level": depth,
                    "section_title": title,
                }
            )

    # Emit whatever text is still buffered after the last layout.
    flush_text_block(pending, results, next_id)
    return results
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== 检索层:向量 chunks =====================
|
||||||
|
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
    """Split text into windows of at most ``max_chars`` characters.

    Consecutive windows share ``overlap_chars`` characters. Text that
    already fits is returned as a single piece; empty text yields ``[]``.
    Each piece is stripped and empty pieces are dropped.

    Raises:
        ValueError: if the text must be split but ``max_chars`` is not
            positive, or ``overlap_chars`` is negative or not smaller than
            ``max_chars``.
    """
    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap_chars < max_chars:
        # overlap >= max_chars would never advance the window (infinite
        # loop); a negative overlap would silently skip characters.
        raise ValueError(
            f"overlap_chars must be in [0, max_chars), got {overlap_chars} "
            f"with max_chars={max_chars}"
        )

    step = max_chars - overlap_chars
    parts = []
    for start in range(0, len(text), step):
        piece = text[start:start + max_chars].strip()
        if piece:
            parts.append(piece)
        if start + max_chars >= len(text):
            # This window reached the end of the text.
            break
    return parts
|
||||||
|
|
||||||
|
|
||||||
|
def build_vector_chunks(
    semantic_blocks: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> List[Dict]:
    """Build the retrieval layer by slicing each semantic block.

    Every slice inherits its block's metadata, gets a document-wide
    1-based ``chunk_index`` and a per-block 1-based ``piece_index``, and
    carries an ``embedding_text`` made of a title/section header followed
    by the slice text.
    """
    chunks = []
    for block in semantic_blocks:
        slices = split_text_with_overlap(block["text"], max_chars, overlap_chars)
        for piece_no, piece in enumerate(slices, start=1):
            header_lines = [f"标准:{doc_title}"]
            if block["section_path"]:
                header_lines.append(f"章节:{' > '.join(block['section_path'])}")
            header = "\n".join(header_lines) + "\n\n"

            # Global numbering continues across blocks.
            number = len(chunks) + 1
            chunks.append(
                {
                    "doc_id": doc_id,
                    "doc_title": doc_title,
                    "chunk_id": f"chunk-{number}",
                    "chunk_index": number,
                    "semantic_id": block["semantic_id"],
                    "chunk_type": block["block_type"],
                    "piece_index": piece_no,
                    "page_start": block["page_start"],
                    "page_end": block["page_end"],
                    "section_path": block["section_path"],
                    "section_level": block["section_level"],
                    "section_title": block["section_title"],
                    "source_ids": block["source_ids"],
                    "text": piece,
                    "embedding_text": header + piece,
                }
            )
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== 主转换函数 =====================
|
||||||
|
def convert_layouts(
    layouts: List[Dict],
    doc_id: str,
    doc_title: str,
    max_chars: int,
    overlap_chars: int,
) -> Dict:
    """Convert raw layouts into the three-layer document structure.

    The result holds the document identity plus ``structure_nodes``,
    ``semantic_blocks`` and ``vector_chunks``.
    """
    result = {"doc_id": doc_id, "doc_title": doc_title}
    result["structure_nodes"] = build_structure_nodes(layouts)
    result["semantic_blocks"] = build_semantic_blocks(layouts)
    # The retrieval layer is derived from the semantic layer.
    result["vector_chunks"] = build_vector_chunks(
        result["semantic_blocks"],
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
    return result
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== CLI 入口 =====================
|
||||||
|
def main() -> None:
    """Command-line entry point: parse a PDF and write the three-layer JSON.

    Steps: submit the PDF, wait for the job, download all layouts,
    optionally dump the raw layouts, convert them and write the result.

    Raises:
        FileNotFoundError: if the given PDF path does not exist.
        SystemExit: with status 1 when the parsing job does not succeed,
            so that shell pipelines can detect the failure.
    """
    parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF,输出三层结构 chunks")
    parser.add_argument("pdf_path", help="PDF 文件路径")
    parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
    parser.add_argument("--layouts-out", dest="layouts_output", help="输出原始 layouts JSON")
    parser.add_argument("--doc-id", default="GB14747-2006", help="文档 ID")
    parser.add_argument("--doc-title", default="GB 14747—2006 儿童三轮车安全要求", help="文档标题")
    parser.add_argument("--max-chars", type=int, default=MAX_CHARS, help="单个检索 chunk 最大字符数")
    parser.add_argument("--overlap-chars", type=int, default=OVERLAP_CHARS, help="相邻检索 chunk 重叠字符数")
    parser.add_argument("--poll-interval", type=int, default=5, help="轮询间隔(秒)")
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path).expanduser().resolve()
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")

    # 1. Submit the parsing job.
    client = init_client()
    print(f"提交任务: {pdf_path}")
    task_id = submit_job(client, str(pdf_path))
    print(f"任务 ID: {task_id}")

    # 2. Wait for the job to finish.
    print("等待任务完成...")
    if not wait_for_completion(client, task_id, args.poll_interval):
        print("任务失败,退出")
        # Exit non-zero: a plain return reported success to the shell.
        raise SystemExit(1)

    # 3. Download all layouts.
    print("获取解析结果...")
    layouts = collect_all_results(client, task_id)
    print(f"获取到 {len(layouts)} 个布局块")

    # 4. Optionally dump the raw layouts.
    if args.layouts_output:
        layouts_path = Path(args.layouts_output).expanduser().resolve()
        # Create missing parent directories instead of failing on write.
        layouts_path.parent.mkdir(parents=True, exist_ok=True)
        layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"原始 layouts 已写入: {layouts_path}")

    # 5. Convert to the three-layer structure.
    print("转换为三层结构...")
    data = convert_layouts(
        layouts,
        doc_id=args.doc_id,
        doc_title=args.doc_title,
        max_chars=args.max_chars,
        overlap_chars=args.overlap_chars,
    )

    # 6. Write the result.
    output_path = Path(args.out).expanduser().resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"结构层节点数: {len(data['structure_nodes'])}")
    print(f"语义层块数: {len(data['semantic_blocks'])}")
    print(f"检索层块数: {len(data['vector_chunks'])}")
    print(f"输出文件: {output_path}")


if __name__ == "__main__":
    main()
|
||||||
122
backend/app/aliyun_parser/schema.sql
Normal file
122
backend/app/aliyun_parser/schema.sql
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
-- 法规文档向量检索系统数据库表结构
|
||||||
|
-- PostgreSQL
|
||||||
|
|
||||||
|
-- ==================== Documents table ====================
-- One row of metadata per source document.
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    doc_id VARCHAR(128) UNIQUE NOT NULL,    -- unique document identifier, e.g. "GB14747-2006"
    title VARCHAR(512) NOT NULL,            -- document title
    doc_type VARCHAR(32),                   -- document type: standard / regulation / specification
    standard_number VARCHAR(64),            -- standard number, e.g. "GB 14747-2006"
    publish_date DATE,                      -- publication date
    implement_date DATE,                    -- implementation (effective) date
    status VARCHAR(32),                     -- status: in force / abolished / revised
    source_url VARCHAR(512),                -- source URL
    file_path VARCHAR(512),                 -- local PDF file path
    file_size INT,                          -- file size in bytes
    upload_time TIMESTAMP DEFAULT NOW(),    -- upload time
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);

COMMENT ON TABLE documents IS '文档元数据表';
COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表';
COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006';
|
||||||
|
|
||||||
|
-- ==================== Section structure table ====================
-- Heading tree of a document; parent_id references another row of this table.
CREATE TABLE sections (
    id SERIAL PRIMARY KEY,
    doc_id VARCHAR(128) NOT NULL,
    unique_id VARCHAR(64) NOT NULL,         -- unique identifier returned by Alibaba Cloud
    level INT NOT NULL,                     -- depth level: 1, 2, 3...
    title VARCHAR(512) NOT NULL,            -- section title
    page INT,                               -- page the section is on
    index INT,                              -- order within the page
    parent_id INT,                          -- parent section id (tree structure)
    created_at TIMESTAMP DEFAULT NOW(),

    CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
    CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
    CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
);

COMMENT ON TABLE sections IS '章节结构表,用于目录导航';
COMMENT ON COLUMN sections.parent_id IS '父章节 ID,构建树形结构';
COMMENT ON COLUMN sections.level IS '层级深度,1 为最顶层';
|
||||||
|
|
||||||
|
-- ==================== Semantic blocks table ====================
-- Full, unsplit content blocks; identified per document by semantic_id.
CREATE TABLE semantic_blocks (
    id SERIAL PRIMARY KEY,
    doc_id VARCHAR(128) NOT NULL,
    semantic_id VARCHAR(64) NOT NULL,       -- semantic block identifier
    block_type VARCHAR(32) NOT NULL,        -- type: section_text/table/figure
    page_start INT NOT NULL,                -- first page
    page_end INT NOT NULL,                  -- last page
    section_id INT,                         -- owning section
    section_title VARCHAR(512),             -- section title (denormalised for convenient queries)
    section_level INT,                      -- section level
    source_ids JSONB,                       -- original layout ids (JSON array)
    text TEXT NOT NULL,                     -- full content (not split)
    created_at TIMESTAMP DEFAULT NOW(),

    CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
    CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
    CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
);

COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容';
COMMENT ON COLUMN semantic_blocks.block_type IS '类型:section_text(正文)、table(表格)、figure(图示)';
COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分';
|
||||||
|
|
||||||
|
-- ==================== Vector chunk metadata table ====================
-- Metadata of each retrieval chunk; the composite foreign key ties a chunk
-- to its semantic block within the same document.
CREATE TABLE vector_chunks (
    id SERIAL PRIMARY KEY,
    doc_id VARCHAR(128) NOT NULL,
    chunk_id VARCHAR(64) NOT NULL,          -- Milvus primary key
    semantic_id VARCHAR(64) NOT NULL,       -- associated semantic block
    chunk_index INT NOT NULL,               -- chunk sequence number (global)
    piece_index INT,                        -- sequence number within the same semantic block
    page_start INT,
    page_end INT,
    section_title VARCHAR(512),
    text VARCHAR(2048),                     -- chunk text (optional, shortened version for display)
    source_ids JSONB,                       -- original layout ids (JSON array)
    created_at TIMESTAMP DEFAULT NOW(),

    CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
    CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
        REFERENCES semantic_blocks(doc_id, semantic_id),
    CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
);

COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询';
COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接';
|
||||||
|
|
||||||
|
-- ==================== Indexes ====================
-- Secondary indexes on the lookup columns of each table.
CREATE INDEX idx_sections_doc_id ON sections(doc_id);
CREATE INDEX idx_sections_parent_id ON sections(parent_id);
CREATE INDEX idx_sections_level ON sections(level);

CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);

CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
|
||||||
|
|
||||||
|
-- ==================== Trigger: automatically refresh updated_at ====================
-- Sets updated_at to the current time on the row being written.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Runs the function before every row update on documents.
CREATE TRIGGER tr_documents_updated_at
    BEFORE UPDATE ON documents
    FOR EACH ROW EXECUTE FUNCTION update_updated_at();
|
||||||
327
backend/app/aliyun_parser/upload_to_milvus.py
Normal file
327
backend/app/aliyun_parser/upload_to_milvus.py
Normal file
@@ -0,0 +1,327 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
将 vector_chunks.json 向量化并上传到 Milvus 和 PostgreSQL
|
||||||
|
使用中转站的 OpenAI 兼容 API
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import execute_values
|
||||||
|
from pymilvus import (
|
||||||
|
connections,
|
||||||
|
Collection,
|
||||||
|
FieldSchema,
|
||||||
|
CollectionSchema,
|
||||||
|
DataType,
|
||||||
|
utility,
|
||||||
|
)
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# ===================== Configuration =====================
# Secrets (API key, database password) are read from the environment so
# they never live in version control. Any value that was previously
# committed to the repository must be treated as leaked and rotated.
import os  # needed only by the configuration lookups below

# Relay (OpenAI-compatible gateway) settings
RELAY_BASE_URL = os.environ.get("RELAY_BASE_URL", "http://6.86.80.4:30080/v1")
RELAY_API_KEY = os.environ.get("RELAY_API_KEY", "")
EMBEDDING_MODEL = "text-embedding-v3"  # embedding model served by the relay

# Milvus settings
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"
COLLECTION_NAME = "regulation_chunks"

# PostgreSQL settings (standard libpq variable names)
PG_HOST = os.environ.get("PGHOST", "6.86.80.10")
PG_PORT = int(os.environ.get("PGPORT", "5432"))
PG_USER = os.environ.get("PGUSER", "postgresql")
PG_PASSWORD = os.environ.get("PGPASSWORD", "")
PG_DATABASE = os.environ.get("PGDATABASE", "postgres")
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== Embedding =====================
|
||||||
|
def get_openai_client(api_key: str, base_url: str) -> OpenAI:
    """Create an OpenAI client that talks to the given base URL with the given key."""
    relay_client = OpenAI(base_url=base_url, api_key=api_key)
    return relay_client
|
||||||
|
|
||||||
|
|
||||||
|
def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
    """Embed texts in batches of ``batch_size`` and return the vectors.

    Vectors are collected in the order the responses list them; one
    progress line is printed per batch.
    """
    vectors = []
    offsets = range(0, len(texts), batch_size)
    for offset in offsets:
        batch_no = offset // batch_size + 1
        batch_total = (len(texts) - 1) // batch_size + 1
        print(f"Embedding batch {batch_no}/{batch_total}...")

        reply = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=texts[offset:offset + batch_size],
        )
        for item in reply.data:
            vectors.append(item.embedding)
    return vectors
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== Milvus =====================
|
||||||
|
def init_milvus(host: str, port: str):
    """Open the "default" Milvus connection to ``host:port`` and report it."""
    target = {"host": host, "port": port}
    connections.connect("default", **target)
    print(f"已连接 Milvus: {host}:{port}")
|
||||||
|
|
||||||
|
|
||||||
|
def create_collection(name: str, dim: int) -> Collection:
    """Create the chunk collection from scratch and index its vector field.

    An existing collection with the same name is dropped first, so all of
    its data is lost. ``dim`` is the dimension of the embedding vectors.
    """
    if utility.has_collection(name):
        print(f"Collection '{name}' 已存在,删除重建")
        utility.drop_collection(name)

    def varchar(field_name, max_length, **extra):
        return FieldSchema(name=field_name, dtype=DataType.VARCHAR, max_length=max_length, **extra)

    def int64(field_name):
        return FieldSchema(name=field_name, dtype=DataType.INT64)

    fields = [
        varchar("chunk_id", 64, is_primary=True),
        varchar("doc_id", 128),
        varchar("doc_title", 512),
        int64("chunk_index"),
        varchar("semantic_id", 64),
        varchar("chunk_type", 32),
        int64("page_start"),
        int64("page_end"),
        varchar("section_title", 512),
        varchar("text", 2048),
        varchar("source_ids", 4096),  # stored as a JSON string
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    collection = Collection(name, CollectionSchema(fields, description="法规文档检索 chunks"))

    # IVF_FLAT index with cosine metric on the embedding field.
    collection.create_index(
        "embedding",
        {
            "metric_type": "COSINE",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 128},
        },
    )
    print(f"Collection '{name}' 创建完成,索引已建立")
    return collection
|
||||||
|
|
||||||
|
|
||||||
|
def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
    """Insert chunks and their embeddings into the collection, then flush.

    Data is sent column-wise: one list per scalar field, followed by the
    JSON-encoded ``source_ids`` and finally the embeddings.
    """
    scalar_fields = (
        "chunk_id",
        "doc_id",
        "doc_title",
        "chunk_index",
        "semantic_id",
        "chunk_type",
        "page_start",
        "page_end",
        "section_title",
        "text",
    )
    columns = [[chunk[field] for chunk in chunks] for field in scalar_fields]
    # source_ids is stored as a JSON string; a missing key becomes "[]".
    columns.append([json.dumps(chunk.get("source_ids", [])) for chunk in chunks])
    columns.append(embeddings)

    collection.insert(columns)
    collection.flush()
    print(f"已插入 {len(chunks)} 个 chunks")
|
||||||
|
|
||||||
|
|
||||||
|
def load_collection(collection: Collection):
    """Call ``load()`` on the collection and report it."""
    collection.load()
    message = "Collection 已加载到内存"
    print(message)
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== PostgreSQL =====================
|
||||||
|
def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
    """Open a PostgreSQL connection with psycopg2 and return it."""
    settings = {
        "host": host,
        "port": port,
        "user": user,
        "password": password,
        "database": database,
    }
    connection = psycopg2.connect(**settings)
    print(f"已连接 PostgreSQL: {host}:{port}/{database}")
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
    """Upsert a document, its semantic blocks and its vector chunks into PostgreSQL.

    All statements run on one cursor inside a single transaction: the
    transaction is committed when every statement succeeds and rolled back
    when anything raises. The cursor is always closed; ``conn`` itself is
    left open for the caller to close.

    Args:
        conn: Open database connection providing ``cursor``, ``commit`` and
            ``rollback``.
        chunks: Vector chunk dicts written to ``vector_chunks``.
            ``piece_index``, ``section_title`` and ``source_ids`` are optional.
        doc_data: Document payload. ``doc_id`` and ``doc_title`` are required;
            ``standard_number`` and ``semantic_blocks`` are optional.

    Raises:
        Exception: Whatever the driver or a missing dict key raises; it is
            re-raised unchanged after the rollback.
    """
    cursor = conn.cursor()

    try:
        doc_id = doc_data["doc_id"]

        # 1. Upsert the document row.
        cursor.execute(
            """
            INSERT INTO documents (doc_id, title, standard_number, upload_time)
            VALUES (%s, %s, %s, NOW())
            ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
            """,
            (doc_id, doc_data["doc_title"], doc_data.get("standard_number")),
        )

        # 2. Upsert semantic blocks (skipped when the document has none).
        semantic_blocks = doc_data.get("semantic_blocks", [])
        if semantic_blocks:
            block_rows = [
                (
                    doc_id,
                    block["semantic_id"],
                    block["block_type"],
                    block["page_start"],
                    block["page_end"],
                    block.get("section_title"),
                    block.get("section_level"),
                    json.dumps(block.get("source_ids", [])),
                    block["text"],
                )
                for block in semantic_blocks
            ]
            execute_values(
                cursor,
                """
                INSERT INTO semantic_blocks
                (doc_id, semantic_id, block_type, page_start, page_end, section_title, section_level, source_ids, text)
                VALUES %s
                ON CONFLICT (doc_id, semantic_id) DO UPDATE SET text = EXCLUDED.text
                """,
                block_rows,
            )
            print(f"已插入 {len(semantic_blocks)} 个语义块")

        # 3. Upsert vector chunk metadata.
        # NOTE(review): on conflict only ``text`` is refreshed here and for
        # semantic blocks; other columns keep their old values -- confirm
        # this is intended for re-ingested documents.
        chunk_rows = [
            (
                doc_id,
                chunk["chunk_id"],
                chunk["semantic_id"],
                chunk["chunk_index"],
                chunk.get("piece_index"),
                chunk["page_start"],
                chunk["page_end"],
                chunk.get("section_title"),
                chunk["text"],
                json.dumps(chunk.get("source_ids", [])),
            )
            for chunk in chunks
        ]
        execute_values(
            cursor,
            """
            INSERT INTO vector_chunks
            (doc_id, chunk_id, semantic_id, chunk_index, piece_index, page_start, page_end, section_title, text, source_ids)
            VALUES %s
            ON CONFLICT (doc_id, chunk_id) DO UPDATE SET text = EXCLUDED.text
            """,
            chunk_rows,
        )
        print(f"已插入 {len(chunks)} 个向量块元数据")

        conn.commit()
        print("PostgreSQL 数据插入完成")

    except Exception:
        # Undo the partial transaction, then re-raise the active exception
        # with a bare ``raise`` so its traceback is propagated untouched.
        conn.rollback()
        raise
    finally:
        cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== 主流程 =====================
|
||||||
|
def load_data(file_path: Path) -> Dict:
    """Read a vector_chunks.json file and return its full parsed content.

    Args:
        file_path: Path of the UTF-8 encoded JSON file.

    Returns:
        The decoded JSON document.
    """
    with file_path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def upload_to_milvus_and_pg(
    chunks_file: str,
    api_key: str,
    base_url: str,
    milvus_host: str,
    milvus_port: str,
    collection_name: str,
    batch_size: int,
    pg_host: str,
    pg_port: int,
    pg_user: str,
    pg_password: str,
    pg_database: str,
):
    """Embed the chunks of a vector_chunks.json file and store them in Milvus and PostgreSQL.

    Steps: load the JSON file, open the embedding client, Milvus and
    PostgreSQL connections, embed every chunk's ``embedding_text``, create
    the Milvus collection sized to the embedding dimension, insert and load
    it, then write the document, semantic blocks and chunk metadata to
    PostgreSQL.

    Args:
        chunks_file: Path of the vector_chunks.json file (``~`` is expanded).
        api_key: API key passed to ``get_openai_client``.
        base_url: Base URL passed to ``get_openai_client``.
        milvus_host: Milvus host passed to ``init_milvus``.
        milvus_port: Milvus port passed to ``init_milvus``.
        collection_name: Name of the Milvus collection to create.
        batch_size: Number of texts per embedding request.
        pg_host: PostgreSQL host.
        pg_port: PostgreSQL port.
        pg_user: PostgreSQL user.
        pg_password: PostgreSQL password.
        pg_database: PostgreSQL database name.

    Raises:
        FileNotFoundError: If ``chunks_file`` does not exist.
        ValueError: If the file has no ``vector_chunks`` or the number of
            embeddings does not match the number of chunks.
    """
    # 1. Load the full data file.
    chunks_path = Path(chunks_file).expanduser().resolve()
    if not chunks_path.exists():
        raise FileNotFoundError(f"文件不存在: {chunks_path}")

    data = load_data(chunks_path)
    chunks = data.get("vector_chunks", [])
    if not chunks:
        raise ValueError("vector_chunks 为空")
    print(f"加载 {len(chunks)} 个 chunks")

    # 2. Initialise connections. They are opened before embedding so that a
    #    connectivity problem surfaces before any embedding requests are made.
    client = get_openai_client(api_key, base_url)
    init_milvus(milvus_host, milvus_port)
    pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)

    try:
        # 3. Compute embeddings.
        texts = [c["embedding_text"] for c in chunks]
        embeddings = get_embeddings_batch(client, texts, batch_size)
        print(f"生成 {len(embeddings)} 个向量")
        # Guard against a short result before indexing embeddings[0] and
        # before writing misaligned rows.
        if len(embeddings) != len(chunks):
            raise ValueError(
                f"embeddings 数量 ({len(embeddings)}) 与 chunks 数量 ({len(chunks)}) 不一致"
            )

        # 4. Derive the embedding dimension from the first vector.
        embedding_dim = len(embeddings[0])
        print(f"Embedding 维度: {embedding_dim}")

        # 5. Create the collection and insert into Milvus.
        collection = create_collection(collection_name, embedding_dim)
        insert_chunks(collection, chunks, embeddings)
        load_collection(collection)

        # 6. Insert into PostgreSQL.
        insert_chunks_to_pg(pg_conn, chunks, data)
    finally:
        # 7. Always release the PostgreSQL connection, including when a step
        #    above raised.
        pg_conn.close()

    print("上传完成!")
|
||||||
|
|
||||||
|
|
||||||
|
# ===================== CLI =====================
|
||||||
|
def main():
    """Parse command-line arguments and run the Milvus + PostgreSQL upload.

    The only positional argument is the vector_chunks.json path; every
    option falls back to a module-level default constant.
    """
    cli = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
    cli.add_argument("chunks_file", help="vector_chunks.json 文件路径")

    # (flag, add_argument keyword arguments) for every optional setting.
    option_specs = (
        ("--api-key", {"default": RELAY_API_KEY, "help": "中转站 API Key"}),
        ("--base-url", {"default": RELAY_BASE_URL, "help": "中转站 Base URL"}),
        ("--milvus-host", {"default": MILVUS_HOST, "help": "Milvus host"}),
        ("--milvus-port", {"default": MILVUS_PORT, "help": "Milvus port"}),
        ("--collection", {"default": COLLECTION_NAME, "help": "Milvus collection 名称"}),
        ("--batch-size", {"type": int, "default": 10, "help": "Embedding 批量大小(中转站限制最大10)"}),
        ("--pg-host", {"default": PG_HOST, "help": "PostgreSQL host"}),
        ("--pg-port", {"type": int, "default": PG_PORT, "help": "PostgreSQL port"}),
        ("--pg-user", {"default": PG_USER, "help": "PostgreSQL user"}),
        ("--pg-password", {"default": PG_PASSWORD, "help": "PostgreSQL password"}),
        ("--pg-database", {"default": PG_DATABASE, "help": "PostgreSQL database"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    upload_to_milvus_and_pg(
        chunks_file=opts.chunks_file,
        api_key=opts.api_key,
        base_url=opts.base_url,
        milvus_host=opts.milvus_host,
        milvus_port=opts.milvus_port,
        collection_name=opts.collection,
        batch_size=opts.batch_size,
        pg_host=opts.pg_host,
        pg_port=opts.pg_port,
        pg_user=opts.pg_user,
        pg_password=opts.pg_password,
        pg_database=opts.pg_database,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
5212
backend/app/aliyun_parser/vector_chunks.json
Normal file
5212
backend/app/aliyun_parser/vector_chunks.json
Normal file
File diff suppressed because it is too large
Load Diff
263
backend/app/aliyun_parser/嵌入和召回.md
Normal file
263
backend/app/aliyun_parser/嵌入和召回.md
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
# 文档解析与向量检索说明
|
||||||
|
|
||||||
|
## 相关文件
|
||||||
|
|
||||||
|
- `aliyun_doc_parser.py`:调用阿里云文档智能解析 PDF,生成原始 `layouts.json`
|
||||||
|
- `layouts_to_vector_chunks.py`:把 `layouts.json` 转成适合向量数据库入库的三层结构
|
||||||
|
- `layouts.json`:阿里云返回的原始布局结果
|
||||||
|
- `vector_chunks.json`:转换后的结构化输出
|
||||||
|
|
||||||
|
## 一、`layouts.json` 的结构
|
||||||
|
|
||||||
|
`layouts.json` 顶层是一个数组,每个元素代表一个布局块(layout)。常见字段如下:
|
||||||
|
|
||||||
|
- `type`:主类型,例如 `title`、`text`、`table`、`figure`
|
||||||
|
- `subType`:更细的语义类型,例如 `doc_title`、`para_title`、`para`、`picture`、`pic_title`、`pic_caption`
|
||||||
|
- `text`:当前布局块的纯文本
|
||||||
|
- `markdownContent`:带 markdown 标记的文本
|
||||||
|
- `pageNum`:页码
|
||||||
|
- `index`:页内顺序
|
||||||
|
- `level`:标题层级
|
||||||
|
- `uniqueId`:布局块唯一标识
|
||||||
|
- `blocks`:更细粒度的文本与样式信息
|
||||||
|
- `cells`:表格单元格,仅 `table` 类型存在
|
||||||
|
|
||||||
|
这个结构不是简单 OCR 文本流,而是已经带有版面理解和语义分类的结构化数据。
|
||||||
|
|
||||||
|
## 二、推荐的三层转换结构
|
||||||
|
|
||||||
|
### 1. 结构层 `structure_nodes`
|
||||||
|
|
||||||
|
结构层用于恢复文档标题树,不直接作为最终向量检索单元。
|
||||||
|
|
||||||
|
示例:
|
||||||
|
|
||||||
|
- `1 范围`
|
||||||
|
- `2 规范性引用文件`
|
||||||
|
- `3 术语和定义`
|
||||||
|
- `3.1 儿童三轮车`
|
||||||
|
- `3.2 轮距`
|
||||||
|
|
||||||
|
结构层主要用于给下游 chunk 绑定 `section_path`。
|
||||||
|
|
||||||
|
### 2. 语义层 `semantic_blocks`
|
||||||
|
|
||||||
|
语义层是按文档意义聚合后的内容块,主要分为三类:
|
||||||
|
|
||||||
|
- `section_text`:同一章节下连续正文聚合而成
|
||||||
|
- `table`:表格内容单独成块
|
||||||
|
- `figure`:图、图名、图注等单独成块
|
||||||
|
|
||||||
|
这一层比单个 layout 更适合做语义理解,也适合后续做上下文扩展。
|
||||||
|
|
||||||
|
### 3. 检索层 `vector_chunks`
|
||||||
|
|
||||||
|
检索层是最终写进向量数据库的 chunk。
|
||||||
|
|
||||||
|
处理方式:
|
||||||
|
|
||||||
|
- 对 `semantic_blocks` 中较短的块直接入库
|
||||||
|
- 对较长的块按 `max_chars` 再切分
|
||||||
|
- 相邻切片保留 `overlap_chars` 重叠
|
||||||
|
- 每个 chunk 都带完整 metadata,便于后续过滤、重排和邻域扩展
|
||||||
|
|
||||||
|
## 三、当前转换脚本做了什么
|
||||||
|
|
||||||
|
`layouts_to_vector_chunks.py` 当前已经实现:
|
||||||
|
|
||||||
|
1. 过滤目录页噪声(如 `目次`)
|
||||||
|
2. 根据标题层级维护章节路径
|
||||||
|
3. 将正文聚合成 `section_text`
|
||||||
|
4. 将表格单独转成 `table`
|
||||||
|
5. 将图相关内容单独转成 `figure`
|
||||||
|
6. 对长文本继续切分为最终 `vector_chunks`
|
||||||
|
7. 为每个检索 chunk 生成 `embedding_text`
|
||||||
|
|
||||||
|
## 四、为什么不要直接按 layout 入库
|
||||||
|
|
||||||
|
如果把 `layouts.json` 的每条 layout 直接做向量:
|
||||||
|
|
||||||
|
- 颗粒度太碎
|
||||||
|
- 标题和正文容易分离
|
||||||
|
- 表格会丢失结构上下文
|
||||||
|
- 图示信息无法完整表达
|
||||||
|
- 检索命中结果噪声较大
|
||||||
|
|
||||||
|
对于标准文档,最合适的单位通常不是“句子”,而是“条款语义块”。
|
||||||
|
|
||||||
|
## 五、建议的入库字段
|
||||||
|
|
||||||
|
建议向量数据库每条记录至少保存:
|
||||||
|
|
||||||
|
- `embedding_text`:用于生成向量
|
||||||
|
- `text`:原始 chunk 文本
|
||||||
|
- `chunk_id`
|
||||||
|
- `semantic_id`
|
||||||
|
- `chunk_type`:`section_text` / `table` / `figure`
|
||||||
|
- `section_path`
|
||||||
|
- `section_title`
|
||||||
|
- `section_level`
|
||||||
|
- `page_start`
|
||||||
|
- `page_end`
|
||||||
|
- `doc_id`
|
||||||
|
- `doc_title`
|
||||||
|
- `source_ids`
|
||||||
|
|
||||||
|
其中:
|
||||||
|
|
||||||
|
- 向量化字段:`embedding_text`
|
||||||
|
- 展示字段:`text`
|
||||||
|
- 检索增强字段:其余 metadata
|
||||||
|
|
||||||
|
## 六、推荐的检索方式
|
||||||
|
|
||||||
|
不要只做最简单的 top-k 向量搜索,建议采用:
|
||||||
|
|
||||||
|
**向量召回 + metadata 重排 + 邻域扩展**
|
||||||
|
|
||||||
|
### 1. 向量召回
|
||||||
|
|
||||||
|
使用 `vector_chunks[*].embedding_text` 做 embedding,并在向量数据库中检索 top 10 ~ 15 条。
|
||||||
|
|
||||||
|
查询时可以对用户问题做轻微改写,例如:
|
||||||
|
|
||||||
|
原问题:
|
||||||
|
|
||||||
|
`儿童三轮车的定义是什么?`
|
||||||
|
|
||||||
|
可改写为:
|
||||||
|
|
||||||
|
`请检索 GB 14747—2006 儿童三轮车安全要求 中关于“儿童三轮车定义”的条款、术语、表格或图示说明。`
|
||||||
|
|
||||||
|
这样更适合标准文档检索。
|
||||||
|
|
||||||
|
### 2. metadata 重排
|
||||||
|
|
||||||
|
向量召回后,根据 metadata 做轻量规则重排。
|
||||||
|
|
||||||
|
常见规则:
|
||||||
|
|
||||||
|
- `chunk_type == section_text`:对定义类、要求类问题优先级更高
|
||||||
|
- `section_path` 命中查询关键词:例如查询“定义”时,`术语和定义` 章节优先
|
||||||
|
- `chunk_type == table`:对“尺寸 / 参数 / 数值 / 对照 / 要求”类问题加权
|
||||||
|
- `chunk_type == figure`:对“图 / 结构 / 状态 / 示意”类问题加权
|
||||||
|
|
||||||
|
### 3. 邻域扩展
|
||||||
|
|
||||||
|
检索命中的是最终切片,但回答往往需要更完整上下文。
|
||||||
|
|
||||||
|
建议命中某个 `vector_chunk` 后:
|
||||||
|
|
||||||
|
1. 优先回捞同一个 `semantic_id` 下的所有 chunk
|
||||||
|
2. 如果还不够,再补充同 `section_path`、相邻页码或相邻 `chunk_index` 的内容
|
||||||
|
|
||||||
|
这样可以恢复完整条款,而不是只给模型一小段碎片。
|
||||||
|
|
||||||
|
## 七、不同问题的检索重点
|
||||||
|
|
||||||
|
### 1. 定义类问题
|
||||||
|
|
||||||
|
例如:
|
||||||
|
|
||||||
|
- `儿童三轮车的定义是什么?`
|
||||||
|
- `轮距是什么意思?`
|
||||||
|
|
||||||
|
优先检索:
|
||||||
|
|
||||||
|
- `section_text`
|
||||||
|
- `section_path` 中包含 `术语和定义` 的内容
|
||||||
|
|
||||||
|
### 2. 要求类问题
|
||||||
|
|
||||||
|
例如:
|
||||||
|
|
||||||
|
- `外露突出物有什么要求?`
|
||||||
|
- `辅助推杆有哪些安全要求?`
|
||||||
|
|
||||||
|
优先检索:
|
||||||
|
|
||||||
|
- `section_text`
|
||||||
|
- `table`
|
||||||
|
|
||||||
|
### 3. 数值 / 尺寸 / 对照类问题
|
||||||
|
|
||||||
|
例如:
|
||||||
|
|
||||||
|
- `鞍座到脚蹬距离要求是什么?`
|
||||||
|
- `哪些项目需要满足规定尺寸?`
|
||||||
|
|
||||||
|
优先检索:
|
||||||
|
|
||||||
|
- `table`
|
||||||
|
- `section_text`
|
||||||
|
|
||||||
|
### 4. 图示说明类问题
|
||||||
|
|
||||||
|
例如:
|
||||||
|
|
||||||
|
- `正常乘骑状态是什么意思?`
|
||||||
|
- `图1表示什么?`
|
||||||
|
|
||||||
|
优先检索:
|
||||||
|
|
||||||
|
- `figure`
|
||||||
|
- 同章节相邻 `section_text`
|
||||||
|
|
||||||
|
## 八、推荐的最终检索流程
|
||||||
|
|
||||||
|
建议采用以下固定流程:
|
||||||
|
|
||||||
|
1. 用 `vector_chunks.embedding_text` 做 embedding 检索
|
||||||
|
2. 取 top 10 ~ 15 条候选
|
||||||
|
3. 按 `chunk_type + section_path` 做规则重排
|
||||||
|
4. 以 `semantic_id` 为中心回捞完整语义块
|
||||||
|
5. 选 3 ~ 5 组上下文提供给大模型回答
|
||||||
|
|
||||||
|
## 九、给大模型的上下文组织方式
|
||||||
|
|
||||||
|
最终不要直接把原始 JSON 扔给模型,建议整理成如下格式:
|
||||||
|
|
||||||
|
```text
|
||||||
|
[命中片段 1]
|
||||||
|
章节:3 术语和定义 > 3.1 儿童三轮车
|
||||||
|
页码:1-2
|
||||||
|
类型:section_text
|
||||||
|
内容:
|
||||||
|
......
|
||||||
|
|
||||||
|
[命中片段 2]
|
||||||
|
章节:4 要求 > 4.3 外露突出物
|
||||||
|
页码:5
|
||||||
|
类型:section_text
|
||||||
|
内容:
|
||||||
|
......
|
||||||
|
|
||||||
|
[命中片段 3]
|
||||||
|
章节:5 试验方法
|
||||||
|
页码:8
|
||||||
|
类型:table
|
||||||
|
内容:
|
||||||
|
......
|
||||||
|
```
|
||||||
|
|
||||||
|
这种格式更利于模型稳定回答并引用出处。
|
||||||
|
|
||||||
|
## 十、转换命令
|
||||||
|
|
||||||
|
生成三层结构:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
|
||||||
|
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
|
||||||
|
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json
|
||||||
|
```
|
||||||
|
|
||||||
|
自定义切片大小:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 /home/huaci/dev/ai/SuperMew/tests/layouts_to_vector_chunks.py \
|
||||||
|
--layouts /home/huaci/dev/ai/SuperMew/tests/layouts.json \
|
||||||
|
--out /home/huaci/dev/ai/SuperMew/tests/vector_chunks.json \
|
||||||
|
--max-chars 500 \
|
||||||
|
--overlap-chars 80
|
||||||
|
```
|
||||||
93
dev.bat
93
dev.bat
@@ -92,7 +92,7 @@ echo.
|
|||||||
exit /b 0
|
exit /b 0
|
||||||
|
|
||||||
:setup
|
:setup
|
||||||
call :ensure_log_dir
|
if not exist "%LOG_DIR%" mkdir "%LOG_DIR%"
|
||||||
echo.
|
echo.
|
||||||
echo ========================================
|
echo ========================================
|
||||||
echo AI+合规智能中枢 - 环境初始化
|
echo AI+合规智能中枢 - 环境初始化
|
||||||
@@ -303,7 +303,7 @@ echo Unknown argument: %~1
|
|||||||
exit /b 1
|
exit /b 1
|
||||||
|
|
||||||
:status
|
:status
|
||||||
call :ensure_log_dir
|
if not exist "%LOG_DIR%" mkdir "%LOG_DIR%"
|
||||||
echo.
|
echo.
|
||||||
echo ========================================
|
echo ========================================
|
||||||
echo AI+合规智能中枢 - 服务状态
|
echo AI+合规智能中枢 - 服务状态
|
||||||
@@ -313,39 +313,47 @@ echo.
|
|||||||
echo API service:
|
echo API service:
|
||||||
set "API_PID="
|
set "API_PID="
|
||||||
set "API_RUNNING=0"
|
set "API_RUNNING=0"
|
||||||
|
set "API_LISTENER="
|
||||||
|
set "API_DISPLAY_PID="
|
||||||
if exist "%API_PID_FILE%" set /p API_PID=<"%API_PID_FILE%"
|
if exist "%API_PID_FILE%" set /p API_PID=<"%API_PID_FILE%"
|
||||||
if defined API_PID (
|
if defined API_PID (
|
||||||
call :pid_exists %API_PID%
|
call :pid_exists !API_PID!
|
||||||
if not errorlevel 1 (
|
if errorlevel 1 (
|
||||||
set "API_RUNNING=1"
|
|
||||||
echo Status: running
|
|
||||||
echo PID: %API_PID%
|
|
||||||
goto api_health
|
|
||||||
) else (
|
|
||||||
del /q "%API_PID_FILE%" >nul 2>nul
|
del /q "%API_PID_FILE%" >nul 2>nul
|
||||||
set "API_PID="
|
set "API_PID="
|
||||||
|
) else (
|
||||||
|
set "API_RUNNING=1"
|
||||||
|
set "API_DISPLAY_PID=!API_PID!"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if "%API_RUNNING%"=="1" goto api_running
|
||||||
call :get_listener_pid %API_PORT% API_LISTENER
|
call :get_listener_pid %API_PORT% API_LISTENER
|
||||||
if defined API_LISTENER (
|
if defined API_LISTENER goto api_listener
|
||||||
set "API_RUNNING=1"
|
|
||||||
echo Status: running (no PID file)
|
|
||||||
echo PID: %API_LISTENER%
|
|
||||||
) else (
|
|
||||||
echo Status: stopped
|
echo Status: stopped
|
||||||
goto api_done
|
goto api_done
|
||||||
|
|
||||||
|
:api_running
|
||||||
|
echo Status: running
|
||||||
|
if defined API_DISPLAY_PID echo PID: !API_DISPLAY_PID!
|
||||||
|
call :check_api_health
|
||||||
|
if errorlevel 1 (
|
||||||
|
echo Health: failed
|
||||||
|
) else (
|
||||||
|
echo Health: ok
|
||||||
|
)
|
||||||
|
goto api_done
|
||||||
|
|
||||||
|
:api_listener
|
||||||
|
echo Status: running (no PID file)
|
||||||
|
echo PID: !API_LISTENER!
|
||||||
|
call :check_api_health
|
||||||
|
if errorlevel 1 (
|
||||||
|
echo Health: failed
|
||||||
|
) else (
|
||||||
|
echo Health: ok
|
||||||
)
|
)
|
||||||
|
|
||||||
:api_health
|
|
||||||
if "%API_RUNNING%"=="1" (
|
|
||||||
call :check_api_health
|
|
||||||
if not errorlevel 1 (
|
|
||||||
echo Health: ok
|
|
||||||
) else (
|
|
||||||
echo Health: failed
|
|
||||||
)
|
|
||||||
)
|
|
||||||
:api_done
|
:api_done
|
||||||
echo URL: http://localhost:%API_PORT%
|
echo URL: http://localhost:%API_PORT%
|
||||||
echo Docs: http://localhost:%API_PORT%/docs
|
echo Docs: http://localhost:%API_PORT%/docs
|
||||||
@@ -353,26 +361,35 @@ echo.
|
|||||||
|
|
||||||
echo Frontend service:
|
echo Frontend service:
|
||||||
set "FRONTEND_PID="
|
set "FRONTEND_PID="
|
||||||
|
set "FRONTEND_RUNNING=0"
|
||||||
|
set "FRONTEND_LISTENER="
|
||||||
|
set "FRONTEND_DISPLAY_PID="
|
||||||
if exist "%FRONTEND_PID_FILE%" set /p FRONTEND_PID=<"%FRONTEND_PID_FILE%"
|
if exist "%FRONTEND_PID_FILE%" set /p FRONTEND_PID=<"%FRONTEND_PID_FILE%"
|
||||||
if defined FRONTEND_PID (
|
if defined FRONTEND_PID (
|
||||||
call :pid_exists %FRONTEND_PID%
|
call :pid_exists !FRONTEND_PID!
|
||||||
if not errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo Status: running
|
|
||||||
echo PID: %FRONTEND_PID%
|
|
||||||
goto frontend_done
|
|
||||||
) else (
|
|
||||||
del /q "%FRONTEND_PID_FILE%" >nul 2>nul
|
del /q "%FRONTEND_PID_FILE%" >nul 2>nul
|
||||||
set "FRONTEND_PID="
|
set "FRONTEND_PID="
|
||||||
|
) else (
|
||||||
|
set "FRONTEND_RUNNING=1"
|
||||||
|
set "FRONTEND_DISPLAY_PID=!FRONTEND_PID!"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if "%FRONTEND_RUNNING%"=="1" goto frontend_running
|
||||||
call :get_listener_pid %FRONTEND_PORT% FRONTEND_LISTENER
|
call :get_listener_pid %FRONTEND_PORT% FRONTEND_LISTENER
|
||||||
if defined FRONTEND_LISTENER (
|
if defined FRONTEND_LISTENER goto frontend_listener
|
||||||
echo Status: running (no PID file)
|
|
||||||
echo PID: %FRONTEND_LISTENER%
|
|
||||||
) else (
|
|
||||||
echo Status: stopped
|
echo Status: stopped
|
||||||
)
|
goto frontend_done
|
||||||
|
|
||||||
|
:frontend_running
|
||||||
|
echo Status: running
|
||||||
|
if defined FRONTEND_DISPLAY_PID echo PID: !FRONTEND_DISPLAY_PID!
|
||||||
|
goto frontend_done
|
||||||
|
|
||||||
|
:frontend_listener
|
||||||
|
echo Status: running (no PID file)
|
||||||
|
echo PID: !FRONTEND_LISTENER!
|
||||||
|
|
||||||
:frontend_done
|
:frontend_done
|
||||||
echo Mode: %FRONTEND_MODE%
|
echo Mode: %FRONTEND_MODE%
|
||||||
@@ -417,7 +434,7 @@ if /I "%~3"=="--follow" (
|
|||||||
exit /b %errorlevel%
|
exit /b %errorlevel%
|
||||||
|
|
||||||
:start_api_background
|
:start_api_background
|
||||||
call :ensure_log_dir
|
if not exist "%LOG_DIR%" mkdir "%LOG_DIR%"
|
||||||
if not exist "%VENV_PYTHON%" (
|
if not exist "%VENV_PYTHON%" (
|
||||||
echo Virtual environment not found. Run dev.bat setup first.
|
echo Virtual environment not found. Run dev.bat setup first.
|
||||||
exit /b 1
|
exit /b 1
|
||||||
@@ -471,7 +488,7 @@ exit /b %errorlevel%
|
|||||||
:start_frontend
|
:start_frontend
|
||||||
set "MODE=%~1"
|
set "MODE=%~1"
|
||||||
if "%MODE%"=="" set "MODE=%FRONTEND_MODE%"
|
if "%MODE%"=="" set "MODE=%FRONTEND_MODE%"
|
||||||
call :ensure_log_dir
|
if not exist "%LOG_DIR%" mkdir "%LOG_DIR%"
|
||||||
|
|
||||||
where npm >nul 2>nul || (
|
where npm >nul 2>nul || (
|
||||||
echo npm was not found. Install Node.js 20+ first.
|
echo npm was not found. Install Node.js 20+ first.
|
||||||
@@ -569,10 +586,6 @@ if defined FRONTEND_PORT_PID (
|
|||||||
echo Frontend is not running.
|
echo Frontend is not running.
|
||||||
exit /b 0
|
exit /b 0
|
||||||
|
|
||||||
:ensure_log_dir
|
|
||||||
if not exist "%LOG_DIR%" mkdir "%LOG_DIR%"
|
|
||||||
exit /b 0
|
|
||||||
|
|
||||||
:validate_mode
|
:validate_mode
|
||||||
if /I "%~1"=="dev" exit /b 0
|
if /I "%~1"=="dev" exit /b 0
|
||||||
if /I "%~1"=="static" exit /b 0
|
if /I "%~1"=="static" exit /b 0
|
||||||
|
|||||||
717
docs/architecture/backend-project-architecture.md
Normal file
717
docs/architecture/backend-project-architecture.md
Normal file
@@ -0,0 +1,717 @@
|
|||||||
|
# Backend Project Architecture
|
||||||
|
|
||||||
|
## 1. Purpose
|
||||||
|
|
||||||
|
本文档定义当前 backend 的目标态架构,用于在保持单服务部署的前提下,将系统整理为职责清晰、边界稳定、可替换实现的模块化结构。本文档的重点不是描述理想化分层,而是基于当前真实代码形态,明确后续重构时必须遵守的模块职责、依赖方向、内部稳定接口和替换边界。
|
||||||
|
|
||||||
|
本文档与 `docs/rfc/backend-api-parsing-embedding-migration-requirements.md` 的关系如下:
|
||||||
|
|
||||||
|
- RFC 负责冻结本轮迁移需求、范围、风险和约束。
|
||||||
|
- 本文档负责冻结目标模块边界、依赖规则和实现组织方式。
|
||||||
|
- 后续任何代码重构、能力替换或底座升级,都应同时满足 RFC 与本文档。
|
||||||
|
|
||||||
|
## 2. Current-State Problems
|
||||||
|
|
||||||
|
基于当前代码,后端已经具备以下能力:
|
||||||
|
|
||||||
|
- 文档上传、下载、列表
|
||||||
|
- 文档解析与切片
|
||||||
|
- 向量化与 Milvus 入库
|
||||||
|
- 检索
|
||||||
|
- 基于 RAG 的 Agent 问答 workflow
|
||||||
|
|
||||||
|
但这些能力当前主要是“可运行”,还不是“结构清晰、便于替换、便于演进”的状态。核心问题如下。
|
||||||
|
|
||||||
|
### 2.1 `DocumentProcessor` 责任过载
|
||||||
|
|
||||||
|
`backend/app/services/document_processor.py` 当前同时承担:
|
||||||
|
|
||||||
|
- 文档解析
|
||||||
|
- 摘要生成
|
||||||
|
- 分块
|
||||||
|
- 向量化
|
||||||
|
- Milvus 入库
|
||||||
|
- 检索入口
|
||||||
|
|
||||||
|
这使上传处理链路、检索链路与基础设施初始化逻辑耦合在一个大类中。流程编排与具体实现没有边界,后续无论替换 parser、embedding、vector store 还是增加文档状态管理,都会直接影响同一个类。
|
||||||
|
|
||||||
|
### 2.2 检索逻辑缺少稳定边界
|
||||||
|
|
||||||
|
`backend/app/services/rag/retriever.py` 当前同时管理:
|
||||||
|
|
||||||
|
- embedder 初始化
|
||||||
|
- Milvus 连接与 collection lifecycle
|
||||||
|
- 检索执行
|
||||||
|
- 结果映射
|
||||||
|
|
||||||
|
这意味着“检索能力”不是一个稳定的业务能力接口,而是一个直接依赖具体 embedding 和 Milvus 实现的复合服务。后续如果从 `BGE-M3 + hybrid search` 切到 `1536 dense-only` 或替换向量索引实现,会直接影响检索服务本身。
|
||||||
|
|
||||||
|
### 2.3 `QAAgent` 责任过载
|
||||||
|
|
||||||
|
`backend/app/services/agent/qa_agent.py` 当前同时承担:
|
||||||
|
|
||||||
|
- 检索调用
|
||||||
|
- 上下文构建
|
||||||
|
- Prompt 选择
|
||||||
|
- LLM 调用
|
||||||
|
- SSE 流式问答流程
|
||||||
|
- 会话 workflow 编排
|
||||||
|
|
||||||
|
这导致 Agent workflow 与检索底座、LLM provider、上下文构造逻辑紧耦合。后续切换 LLM provider、替换 session store、复用 retrieval 能力时,影响面会扩散到整个 Agent 实现。
|
||||||
|
|
||||||
|
### 2.4 API 层直接编排具体服务
|
||||||
|
|
||||||
|
当前 API 路由主要在:
|
||||||
|
|
||||||
|
- `backend/app/api/routes/documents.py`
|
||||||
|
- `backend/app/api/routes/knowledge.py`
|
||||||
|
- `backend/app/api/routes/agent.py`
|
||||||
|
|
||||||
|
这些路由直接实例化具体服务类,例如 `DocumentProcessor`、`QAAgent`、`MinIOClient`。这意味着:
|
||||||
|
|
||||||
|
- API 层不仅处理 transport concerns,也在做业务编排
|
||||||
|
- 路由层知道过多内部实现细节
|
||||||
|
- 后续如果内部模块调整,路由层也要跟着改
|
||||||
|
|
||||||
|
### 2.5 文档元数据与对象存储组织方式耦合
|
||||||
|
|
||||||
|
当前文档列表与下载逻辑高度依赖 MinIO 对象命名方式和对象遍历结果。对象存储目前承担了部分“业务真相”的角色,但对象存储只适合作为文件二进制载体,不适合作为完整文档元数据和状态管理的唯一来源。
|
||||||
|
|
||||||
|
### 2.6 `knowledge` 与 `agent` 共享检索底座的边界不清晰
|
||||||
|
|
||||||
|
当前 `/knowledge/*` 与 `/agent/*` 都依赖检索能力,但共享方式不够清晰:
|
||||||
|
|
||||||
|
- `knowledge` 通过 `DocumentProcessor.search()` 访问检索
|
||||||
|
- `agent` 通过 `Retriever` 访问检索
|
||||||
|
|
||||||
|
这会导致同一检索能力未来演进成两条链路,难以统一检索策略、元数据模型和可替换边界。
|
||||||
|
|
||||||
|
## 3. Architecture Goals
|
||||||
|
|
||||||
|
本项目后端的目标态架构必须满足以下目标。
|
||||||
|
|
||||||
|
### 3.1 单服务部署
|
||||||
|
|
||||||
|
系统继续保持单服务部署,不拆分为多个微服务。架构治理发生在单服务内部,通过清晰模块边界实现高内聚低耦合,而不是通过进程级拆分回避设计问题。
|
||||||
|
|
||||||
|
### 3.2 高内聚、低耦合优先级最高
|
||||||
|
|
||||||
|
后续模块设计以“一个模块只承载一类稳定职责”为原则。跨能力流程统一在编排层组织,不允许继续把 parser、embedding、storage、retrieval、LLM workflow 堆进同一个服务类。
|
||||||
|
|
||||||
|
### 3.3 外部 API 尽量保持兼容
|
||||||
|
|
||||||
|
现有前端与外部调用方依赖的主接口保持不变优先,包括但不限于:
|
||||||
|
|
||||||
|
- `/api/v1/documents/*`
|
||||||
|
- `/api/v1/knowledge/*`
|
||||||
|
- `/api/v1/agent/*`
|
||||||
|
|
||||||
|
内部可以重组,但外部接口不应因为内部重构而被迫大改。
|
||||||
|
|
||||||
|
### 3.4 关键能力必须可替换
|
||||||
|
|
||||||
|
以下能力必须通过稳定端口隔离实现细节:
|
||||||
|
|
||||||
|
- 文档解析
|
||||||
|
- 分块构建
|
||||||
|
- 向量化
|
||||||
|
- 向量索引
|
||||||
|
- 检索
|
||||||
|
- LLM 回答生成
|
||||||
|
- 会话存储
|
||||||
|
- 原始文件存储
|
||||||
|
|
||||||
|
后续替换方案时,只允许替换实现,不允许穿透影响其他模块。
|
||||||
|
|
||||||
|
### 3.5 `knowledge` 与 `agent` 共用同一检索底座
|
||||||
|
|
||||||
|
检索必须被视为独立的业务能力,由统一的 retrieval application service 对外暴露。`knowledge` 与 `agent` 必须复用同一个 retrieval 底座,避免两套召回策略、两套元数据模型、两套 adapter。
|
||||||
|
|
||||||
|
### 3.6 依赖必须单向流动
|
||||||
|
|
||||||
|
系统必须形成稳定的单向依赖关系:
|
||||||
|
|
||||||
|
- `api -> application -> domain`
|
||||||
|
- `application -> infrastructure`:`application` 只通过 `domain` 中定义的端口使用基础设施能力,具体实现在装配阶段绑定
|
||||||
|
- `infrastructure -> external systems`
|
||||||
|
|
||||||
|
不允许出现基础设施实现反向驱动业务编排,也不允许 domain 依赖 Web 或第三方 SDK。
|
||||||
|
|
||||||
|
## 4. Target Module Layout
|
||||||
|
|
||||||
|
目标目录结构如下:
|
||||||
|
|
||||||
|
```text
|
||||||
|
backend/app/
|
||||||
|
api/
|
||||||
|
application/
|
||||||
|
documents/
|
||||||
|
knowledge/
|
||||||
|
agent/
|
||||||
|
domain/
|
||||||
|
documents/
|
||||||
|
retrieval/
|
||||||
|
conversation/
|
||||||
|
infrastructure/
|
||||||
|
storage/
|
||||||
|
vectorstore/
|
||||||
|
parser/
|
||||||
|
embedding/
|
||||||
|
llm/
|
||||||
|
session/
|
||||||
|
shared/
|
||||||
|
```
|
||||||
|
|
||||||
|
该结构是本项目 backend 的目标态模块布局。后续实现可以渐进迁移,但职责边界不能偏离。
|
||||||
|
|
||||||
|
### 4.1 `api`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- HTTP 路由注册
|
||||||
|
- 请求参数校验
|
||||||
|
- 响应模型映射
|
||||||
|
- 异常转换
|
||||||
|
- SSE 事件格式输出
|
||||||
|
|
||||||
|
非职责:
|
||||||
|
|
||||||
|
- 不直接组织完整业务流程
|
||||||
|
- 不直接访问 MinIO、Milvus、Parser SDK、LLM SDK
|
||||||
|
- 不直接 new 具体基础设施客户端
|
||||||
|
|
||||||
|
### 4.2 `application`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 用例编排
|
||||||
|
- 跨领域能力协作
|
||||||
|
- 业务流程统一入口
|
||||||
|
- workflow 级别的状态推进
|
||||||
|
|
||||||
|
非职责:
|
||||||
|
|
||||||
|
- 不直接依赖第三方 SDK
|
||||||
|
- 不承担具体存储、向量库、解析器实现细节
|
||||||
|
|
||||||
|
### 4.3 `domain`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 核心业务对象
|
||||||
|
- 领域术语
|
||||||
|
- 稳定端口接口
|
||||||
|
- 统一元数据模型
|
||||||
|
- 检索结果模型
|
||||||
|
- 会话消息模型
|
||||||
|
|
||||||
|
非职责:
|
||||||
|
|
||||||
|
- 不依赖 FastAPI
|
||||||
|
- 不依赖 MinIO、Milvus、LLM SDK
|
||||||
|
- 不依赖路由请求响应模型
|
||||||
|
|
||||||
|
### 4.4 `infrastructure`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 外部系统适配器实现
|
||||||
|
- 第三方 SDK 封装
|
||||||
|
- provider-specific 配置适配
|
||||||
|
- 数据格式转换
|
||||||
|
|
||||||
|
包含但不限于:
|
||||||
|
|
||||||
|
- MinIO binary store
|
||||||
|
- Milvus vector index
|
||||||
|
- Aliyun / local parser adapter
|
||||||
|
- OpenAI-compatible embedding adapter
|
||||||
|
- DeepSeek / Qwen LLM adapter
|
||||||
|
- in-memory / Redis session store
|
||||||
|
|
||||||
|
### 4.5 `shared`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 配置
|
||||||
|
- 日志
|
||||||
|
- 通用异常
|
||||||
|
- 通用工具
|
||||||
|
- 与具体基础设施无关的公共组件
|
||||||
|
|
||||||
|
非职责:
|
||||||
|
|
||||||
|
- 不承载业务编排
|
||||||
|
- 不变成新的 `services` 大杂烩目录
|
||||||
|
|
||||||
|
## 5. Module Responsibilities
|
||||||
|
|
||||||
|
### 5.1 `api`
|
||||||
|
|
||||||
|
`api` 是 transport 层,只关心请求进来和响应出去的表达方式。它应该把请求转换为 application service 的输入,把 application service 的结果转换为 HTTP 响应。
|
||||||
|
|
||||||
|
`api` 不应该知道:
|
||||||
|
|
||||||
|
- MinIO bucket 怎么组织
|
||||||
|
- Milvus collection 怎么建
|
||||||
|
- parser 是本地还是阿里云
|
||||||
|
- embedding 是本地模型还是 API
|
||||||
|
- session 是内存还是 Redis
|
||||||
|
|
||||||
|
### 5.2 `application`
|
||||||
|
|
||||||
|
`application` 是业务编排层,是系统内唯一允许跨模块组织完整流程的层。它应该定义稳定的用例服务,而不是把流程散落在路由或基础设施实现中。
|
||||||
|
|
||||||
|
本项目至少固定以下 4 类 application service:
|
||||||
|
|
||||||
|
- `DocumentCommandService`
|
||||||
|
- `DocumentQueryService`
|
||||||
|
- `KnowledgeRetrievalService`
|
||||||
|
- `AgentConversationService`
|
||||||
|
|
||||||
|
### 5.3 `domain`
|
||||||
|
|
||||||
|
`domain` 层定义系统内部真正稳定的概念,例如:
|
||||||
|
|
||||||
|
- `Document`
|
||||||
|
- `DocumentStatus`
|
||||||
|
- `ParsedDocument`
|
||||||
|
- `Chunk`
|
||||||
|
- `RetrievalQuery`
|
||||||
|
- `RetrievedChunk`
|
||||||
|
- `ConversationSession`
|
||||||
|
- `ConversationMessage`
|
||||||
|
- `AnswerSource`
|
||||||
|
|
||||||
|
这些对象必须脱离具体技术实现,成为 parser、embedding、vector index、agent workflow 之间的公共契约。
|
||||||
|
|
||||||
|
### 5.4 `infrastructure`
|
||||||
|
|
||||||
|
`infrastructure` 只负责“怎么接某个外部系统”,不负责“业务上应该先做什么后做什么”。例如:
|
||||||
|
|
||||||
|
- MinIO adapter 负责上传和下载文件
|
||||||
|
- Milvus adapter 负责 upsert/search/delete
|
||||||
|
- Qwen / DeepSeek adapter 负责生成回答
|
||||||
|
- Aliyun parser adapter 负责把解析结果映射成统一 `ParsedDocument`
|
||||||
|
|
||||||
|
### 5.5 `shared`
|
||||||
|
|
||||||
|
`shared` 只放横切能力。任何和文档 ingest、检索、问答编排直接相关的业务逻辑,都不应该放进 `shared`。
|
||||||
|
|
||||||
|
## 6. Stable Internal Ports
|
||||||
|
|
||||||
|
以下端口是系统内部稳定契约。后续方案替换时,只能替换实现,不允许改动上层 application service 的调用方式,也不允许影响 sibling 模块。
|
||||||
|
|
||||||
|
### 6.1 `DocumentRepository`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 管理文档元数据
|
||||||
|
- 管理文档状态
|
||||||
|
- 管理统计字段,例如 chunk 数、索引状态、摘要状态
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 列表和状态查询应以 `DocumentRepository` 为主,而不是直接遍历对象存储。
|
||||||
|
|
||||||
|
### 6.2 `DocumentBinaryStore`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 保存原始文件
|
||||||
|
- 下载原始文件
|
||||||
|
- 删除原始文件
|
||||||
|
- 处理对象存储相关细节
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 替换 MinIO 或对象存储方案时,只替换该实现。
|
||||||
|
|
||||||
|
### 6.3 `DocumentParser`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 输入原始文件
|
||||||
|
- 输出统一结构化解析结果
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 本地 PDF/MinerU 或阿里云解析只能作为实现差异,不能外溢到业务流程层。
|
||||||
|
|
||||||
|
### 6.4 `ChunkBuilder`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 输入统一解析结果
|
||||||
|
- 输出统一 chunk 模型
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- chunk 规则变化只能影响该端口实现,不应影响 retrieval、agent 或 API。
|
||||||
|
|
||||||
|
### 6.5 `EmbeddingProvider`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 输入文本列表
|
||||||
|
- 输出 embedding 向量结果
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 从本地模型切到 OpenAI-compatible embedding,只替换该实现。
|
||||||
|
|
||||||
|
### 6.6 `VectorIndex`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- upsert chunks
|
||||||
|
- delete by document
|
||||||
|
- search by query vector
|
||||||
|
- 管理索引内部 schema
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- Milvus schema 或向量库替换,只能影响该层。
|
||||||
|
|
||||||
|
### 6.7 `Retriever`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 基于 query、filter、top_k 返回统一检索结果
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- `Retriever` 是业务侧的检索端口,不应再直接持有 embedder、Milvus lifecycle 和 provider-specific 逻辑。
|
||||||
|
|
||||||
|
### 6.8 `AnswerGenerator`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 基于 query 与 context 生成最终回答
|
||||||
|
- 屏蔽具体 LLM provider 差异
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- DeepSeek、Qwen 或其他模型切换时,只替换该实现。
|
||||||
|
|
||||||
|
### 6.9 `ConversationStore`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 创建和读取 session
|
||||||
|
- 持久化消息历史
|
||||||
|
- 管理会话生命周期
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 从内存实现切到 Redis 或数据库实现时,只替换该实现。
|
||||||
|
|
||||||
|
## 7. Application Services
|
||||||
|
|
||||||
|
### 7.1 `DocumentCommandService`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 接收文档上传命令
|
||||||
|
- 生成 `doc_id`
|
||||||
|
- 保存原始文件
|
||||||
|
- 触发解析、分块、向量化、入库
|
||||||
|
- 更新文档状态和统计信息
|
||||||
|
- 返回最终处理结果
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 当前 `DocumentProcessor` 的“流程编排”职责在目标态应迁移到这里。
|
||||||
|
- parser、chunker、embedder、vector index 的具体实现不应继续塞进一个大类里统一管理。
|
||||||
|
|
||||||
|
### 7.2 `DocumentQueryService`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 文档列表
|
||||||
|
- 文档下载
|
||||||
|
- 文档状态查询
|
||||||
|
- 文档管理视图查询
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 列表和状态查询应基于 `DocumentRepository`
|
||||||
|
- 下载应通过 `DocumentBinaryStore`
|
||||||
|
- 不再依赖 MinIO 对象结构作为业务视图主来源
|
||||||
|
|
||||||
|
### 7.3 `KnowledgeRetrievalService`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 对外提供统一检索能力
|
||||||
|
- 管理 retrieval query 到 retrieval result 的业务转换
|
||||||
|
- 被 `/knowledge/*` 和 Agent workflow 共用
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 当前 `knowledge` 与 `agent` 必须统一依赖这一层,不允许各自再维护一套检索流程。
|
||||||
|
|
||||||
|
### 7.4 `AgentConversationService`
|
||||||
|
|
||||||
|
职责:
|
||||||
|
|
||||||
|
- 统一管理问答 workflow
|
||||||
|
- 读取或创建会话
|
||||||
|
- 调用 `KnowledgeRetrievalService`
|
||||||
|
- 构建问答上下文
|
||||||
|
- 调用 `AnswerGenerator`
|
||||||
|
- 保存回答和引用来源
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- 当前 `QAAgent` 的 workflow 编排职责在目标态应迁移到这里;`QAAgent` 本身在编排逻辑被吸收后最多只保留 façade 角色。
|
||||||
|
- SSE 与普通问答必须共用这一层,不允许复制业务编排逻辑。
|
||||||
|
|
||||||
|
## 8. Core Workflows
|
||||||
|
|
||||||
|
### 8.1 文档上传入库链路
|
||||||
|
|
||||||
|
目标流程如下:
|
||||||
|
|
||||||
|
1. `api/documents` 接收上传请求并完成输入校验。
|
||||||
|
2. `DocumentCommandService` 生成 `doc_id`,初始化文档记录和状态。
|
||||||
|
3. `DocumentBinaryStore` 保存原始文件。
|
||||||
|
4. `DocumentParser` 对原始文件执行解析,输出统一结构化结果。
|
||||||
|
5. `ChunkBuilder` 将解析结果转换为统一 chunk 集合。
|
||||||
|
6. `EmbeddingProvider` 为 chunks 生成向量。
|
||||||
|
7. `VectorIndex` 将 chunks 与 vectors 写入索引。
|
||||||
|
8. `DocumentRepository` 更新文档状态、chunk 数量、索引状态、元数据。
|
||||||
|
9. API 返回处理结果。
|
||||||
|
|
||||||
|
约束:
|
||||||
|
|
||||||
|
- 上传处理链路的主编排必须只存在于 `DocumentCommandService`
|
||||||
|
- 不允许再由 route 或基础设施类直接组织全流程
|
||||||
|
|
||||||
|
### 8.2 文档查询链路
|
||||||
|
|
||||||
|
目标流程如下:
|
||||||
|
|
||||||
|
1. `api/documents` 调用 `DocumentQueryService`
|
||||||
|
2. 文档列表与状态查询通过 `DocumentRepository`
|
||||||
|
3. 文档下载通过 `DocumentBinaryStore`
|
||||||
|
4. 对象存储命名规则只作为实现细节,不作为最终业务真相
|
||||||
|
|
||||||
|
约束:
|
||||||
|
|
||||||
|
- 文档“存在、状态、统计信息”必须有稳定元数据模型
|
||||||
|
- 不允许继续通过对象存储遍历结果拼出全部业务语义
|
||||||
|
|
||||||
|
### 8.3 Agent 问答链路
|
||||||
|
|
||||||
|
目标流程如下:
|
||||||
|
|
||||||
|
1. `api/agent` 接收问答请求
|
||||||
|
2. `AgentConversationService` 读取或创建 session
|
||||||
|
3. `KnowledgeRetrievalService` 统一执行检索
|
||||||
|
4. `AnswerGenerator` 基于 query 和 retrieval context 生成回答
|
||||||
|
5. `ConversationStore` 保存消息历史和引用来源
|
||||||
|
6. API 将结果以普通 JSON 或 SSE 格式输出
|
||||||
|
|
||||||
|
约束:
|
||||||
|
|
||||||
|
- 普通问答和 SSE 问答只允许输出形式不同
|
||||||
|
- 业务编排链必须完全复用
|
||||||
|
- 检索能力必须来自同一 `KnowledgeRetrievalService`
|
||||||
|
|
||||||
|
## 9. Dependency Rules
|
||||||
|
|
||||||
|
系统内部依赖方向固定如下:
|
||||||
|
|
||||||
|
```text
|
||||||
|
api -> application -> domain
|
||||||
|
application -> infrastructure (through ports)
|
||||||
|
infrastructure -> external systems
|
||||||
|
```
|
||||||
|
|
||||||
|
具体规则如下:
|
||||||
|
|
||||||
|
- `api` 可以依赖 `application` 和 API 自己的 request/response models
|
||||||
|
- `application` 可以依赖 `domain`,并且只能通过 `domain` 定义的端口使用 infrastructure 实现,不直接依赖具体实现类
|
||||||
|
- `domain` 不能依赖 `api` 或 `infrastructure`
|
||||||
|
- `infrastructure` 可以依赖 `domain` 定义的端口和数据模型,但不能反向驱动 application 逻辑
|
||||||
|
|
||||||
|
## 10. Migration Mapping From Current Code
|
||||||
|
|
||||||
|
当前关键代码到目标模块的映射如下。
|
||||||
|
|
||||||
|
### 10.1 文档处理
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/services/document_processor.py`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- 其流程编排职责迁移到 `application/documents/DocumentCommandService`
|
||||||
|
- 解析、分块、向量、入库分别通过端口接入
|
||||||
|
- 检索入口从该类中剥离,不再由 ingest orchestration 承担 search 职责
|
||||||
|
|
||||||
|
### 10.2 检索
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/services/rag/retriever.py`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- `domain/retrieval` 中定义 `Retriever` 端口和统一检索结果模型
|
||||||
|
- `infrastructure/vectorstore` 中承载具体检索实现
|
||||||
|
- `application/knowledge/KnowledgeRetrievalService` 作为统一检索用例入口
|
||||||
|
|
||||||
|
### 10.3 Agent Workflow
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/services/agent/qa_agent.py`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- workflow 编排职责迁移到 `application/agent/AgentConversationService`
|
||||||
|
- 具体 LLM 调用走 `AnswerGenerator`
|
||||||
|
- 具体 session 读写走 `ConversationStore`
|
||||||
|
- 检索统一走 `KnowledgeRetrievalService`
|
||||||
|
|
||||||
|
### 10.4 存储
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/services/storage/minio_client.py`
|
||||||
|
- `backend/app/services/storage/milvus_client.py`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- MinIO 迁移到 `infrastructure/storage`
|
||||||
|
- Milvus 迁移到 `infrastructure/vectorstore`
|
||||||
|
|
||||||
|
### 10.5 解析
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/services/parser/*`
|
||||||
|
- `backend/app/services/parser/mineru_parser.py`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- 全部迁移到 `infrastructure/parser`
|
||||||
|
- 对外只暴露统一 `DocumentParser` 端口实现
|
||||||
|
|
||||||
|
### 10.6 向量化
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/services/embedding/*`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- 迁移到 `infrastructure/embedding`
|
||||||
|
- 对外只暴露统一 `EmbeddingProvider`
|
||||||
|
|
||||||
|
### 10.7 LLM
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/services/llm/*`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- 迁移到 `infrastructure/llm`
|
||||||
|
- 由 `AnswerGenerator` 屏蔽 provider 差异
|
||||||
|
|
||||||
|
### 10.8 会话
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/services/agent/session_manager.py`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- 迁移到 `infrastructure/session`
|
||||||
|
- 对外通过 `ConversationStore` 暴露
|
||||||
|
|
||||||
|
### 10.9 API 模型与内部模型
|
||||||
|
|
||||||
|
当前:
|
||||||
|
|
||||||
|
- `backend/app/api/models/*`
|
||||||
|
- `backend/app/schemas/*`
|
||||||
|
|
||||||
|
目标:
|
||||||
|
|
||||||
|
- 对外 request/response model 保留在 `api`
|
||||||
|
- 内部 DTO / VO / domain object 收敛到 `application` 或 `domain`
|
||||||
|
- 不允许 API model 直接渗透到 domain
|
||||||
|
|
||||||
|
## 11. Technology Replacement Boundaries
|
||||||
|
|
||||||
|
### 11.1 本地解析 / MinerU -> 阿里云文档解析
|
||||||
|
|
||||||
|
替换原则:
|
||||||
|
|
||||||
|
- 只替换 `DocumentParser` adapter
|
||||||
|
- `DocumentCommandService` 不应感知解析提供商差异
|
||||||
|
- `ChunkBuilder` 只接收统一解析结果模型
|
||||||
|
|
||||||
|
### 11.2 BGE-M3 -> OpenAI-compatible embedding
|
||||||
|
|
||||||
|
替换原则:
|
||||||
|
|
||||||
|
- 只替换 `EmbeddingProvider`
|
||||||
|
- `KnowledgeRetrievalService` 与 `DocumentCommandService` 不应感知 embedding 来源变化
|
||||||
|
|
||||||
|
### 11.3 Milvus `1024 + sparse` -> `1536 dense-only`
|
||||||
|
|
||||||
|
替换原则:
|
||||||
|
|
||||||
|
- 只替换 `VectorIndex` 实现
|
||||||
|
- collection schema、index 参数、dense-only search 属于 index 内部实现细节
|
||||||
|
- 上层 retrieval 和 agent workflow 不应因为 schema 变化而改业务接口
|
||||||
|
|
||||||
|
### 11.4 DeepSeek / Qwen 切换
|
||||||
|
|
||||||
|
替换原则:
|
||||||
|
|
||||||
|
- 只替换 `AnswerGenerator` 背后的 provider adapter
|
||||||
|
- 上层 conversation workflow 不应直接依赖具体模型 SDK
|
||||||
|
|
||||||
|
### 11.5 内存 session -> Redis / DB session
|
||||||
|
|
||||||
|
替换原则:
|
||||||
|
|
||||||
|
- 只替换 `ConversationStore`
|
||||||
|
- API 和 application service 不应感知 session 持久化细节
|
||||||
|
|
||||||
|
## 12. Guardrails
|
||||||
|
|
||||||
|
后续所有 backend 重构和新增功能必须遵守以下规则:
|
||||||
|
|
||||||
|
- 禁止 `api/routes` 直接实例化 parser、embedder、Milvus、MinIO、LLM client
|
||||||
|
- 禁止 `application` 层直接 import 第三方 SDK
|
||||||
|
- 禁止 `domain` 层依赖 FastAPI、Pydantic route model、MinIO SDK、Milvus SDK、LLM SDK
|
||||||
|
- 禁止 SSE 和普通问答各自维护独立 workflow
|
||||||
|
- 禁止把对象存储命名规则作为唯一业务元数据来源
|
||||||
|
- 禁止新建第二个“大一统流程类”替代 `DocumentProcessor`
|
||||||
|
- 禁止 `knowledge` 和 `agent` 各自维护独立检索实现
|
||||||
|
- 禁止 parser、embedding、vector index、llm provider 的替换穿透到 API 层
|
||||||
|
|
||||||
|
## 13. Architecture Review Checklist
|
||||||
|
|
||||||
|
后续评审和重构验收时,至少核对以下问题:
|
||||||
|
|
||||||
|
1. 上传、下载、列表、解析、切片、向量、入库、检索、Agent Workflow 是否都映射到了明确模块。
|
||||||
|
2. 系统是否仍保持单服务,而不是被动演化成伪微服务结构。
|
||||||
|
3. 是否存在唯一、清晰的目标目录结构。
|
||||||
|
4. 是否定义了稳定端口列表。
|
||||||
|
5. 是否定义了文档上传入库、文档查询、Agent 问答三条核心 workflow。
|
||||||
|
6. 是否定义了单向依赖方向。
|
||||||
|
7. 是否明确列出了架构禁令。
|
||||||
|
8. 是否定义了当前关键代码到目标模块的映射。
|
||||||
|
9. 是否明确定义了 parser、embedding、vector index、LLM、session store 的替换边界。
|
||||||
|
10. 是否明确 `knowledge` 与 `agent` 共用同一 retrieval 底座。
|
||||||
|
11. 是否明确 API 层只负责 transport concerns,不再直接承担业务编排。
|
||||||
|
12. 是否保证后续替换技术方案时,上层 application service 与外部 API 契约不被迫变化。
|
||||||
170
docs/rfc/backend-api-parsing-embedding-migration-requirements.md
Normal file
170
docs/rfc/backend-api-parsing-embedding-migration-requirements.md
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
# BGE-M3 下线与阿里云/API 解析迁移需求说明
|
||||||
|
|
||||||
|
## 1. 当前状态
|
||||||
|
|
||||||
|
当前后端文档上传与处理主链路已经存在,且真实入口与核心依赖如下:
|
||||||
|
|
||||||
|
- 现有真实上传入口是 `backend/app/api/routes/documents.py` 的 `/api/v1/documents/upload`
|
||||||
|
- 当前主链路依赖 `backend/app/services/document_processor.py`
|
||||||
|
- 当前解析链路是本地 PDF/DOCX/MinerU
|
||||||
|
- 当前嵌入链路依赖 `backend/app/services/embedding/bge_m3_embedder.py`
|
||||||
|
- 当前检索链路依赖 `backend/app/services/storage/milvus_client.py` 和 `backend/app/services/rag/retriever.py`
|
||||||
|
|
||||||
|
本文件用于冻结本轮迁移需求、影响面和约束条件,作为后续 backend architecture 梳理、实施拆解和验收对齐的输入基线。
|
||||||
|
|
||||||
|
## 2. 背景与动机
|
||||||
|
|
||||||
|
当前系统的文档处理能力建立在本地解析与本地向量模型基础之上,但该路径已经不再满足后续演进要求。为支持统一的解析质量、降低本地模型依赖、并为后续后端架构调整预留空间,本期需要先冻结迁移需求。
|
||||||
|
|
||||||
|
本期背景和动机明确如下:
|
||||||
|
|
||||||
|
- 不再使用本地 `models--BAAI--bge-m3`
|
||||||
|
- 解析和 embedding 主链路准备切换到 API 方式
|
||||||
|
- 后续还会整体调整 backend 架构,因此本文件只冻结需求,不提前固化最终模块设计
|
||||||
|
|
||||||
|
## 3. 目标需求
|
||||||
|
|
||||||
|
本期目标是完成文档解析、分块、向量化和检索底座的迁移需求定义,明确后续架构和实施阶段必须满足的结果边界。
|
||||||
|
|
||||||
|
已确认的目标需求如下:
|
||||||
|
|
||||||
|
- 文档解析统一改为阿里云文档智能能力
|
||||||
|
- 当前阿里云接入基础来自 `backend/app/aliyun_parser/parse_pdf.py`
|
||||||
|
- 解析结果以 `structure_nodes`、`semantic_blocks`、`vector_chunks` 三层结构为基础
|
||||||
|
- 分块以阿里云 `vector_chunks` 为准,不再走当前本地 `RegulationChunker`
|
||||||
|
- embedding 改为 OpenAI 兼容 API 调用,模型使用 `text-embedding-v3`
|
||||||
|
- 检索能力本期降级为 `dense-only`
|
||||||
|
- Milvus 继续保留,但 schema 需要围绕 `1536` 维 dense 向量重建
|
||||||
|
|
||||||
|
以上内容属于本期已经确认的迁移方向,不再作为待讨论事项。
|
||||||
|
|
||||||
|
## 4. 范围
|
||||||
|
|
||||||
|
本期需求范围覆盖以下内容:
|
||||||
|
|
||||||
|
- 上传处理链路
|
||||||
|
- 阿里云解析适配
|
||||||
|
- embedding API 适配
|
||||||
|
- Milvus 入库与检索
|
||||||
|
- RAG/Agent 检索依赖的元数据适配
|
||||||
|
- 配置、依赖、README 和部署说明同步清理
|
||||||
|
|
||||||
|
本期范围的核心目标是让现有上传后处理主链路可以在新的 API 化解析和 embedding 方式下继续工作,并保持主要外部接口不变。
|
||||||
|
|
||||||
|
## 5. 非目标
|
||||||
|
|
||||||
|
以下事项不属于本期需求目标,不应在本文件内被提前设计或默认纳入实施:
|
||||||
|
|
||||||
|
- 本文件不定义最终 backend 分层、目录结构和 service boundary
|
||||||
|
- 本期不引入异步任务系统
|
||||||
|
- 本期不把 PostgreSQL 三层结构表接入主链路
|
||||||
|
- 本期不处理前端大规模交互改版
|
||||||
|
|
||||||
|
如果后续实施阶段需要触及上述内容,应另行在架构方案或单独 RFC 中说明,而不是在本需求说明中默认展开。
|
||||||
|
|
||||||
|
## 6. 影响面清单
|
||||||
|
|
||||||
|
本期迁移将影响现有后端多个子系统。以下清单用于冻结影响面,方便后续做架构设计、任务拆分和回归验证。
|
||||||
|
|
||||||
|
### 6.1 入口与流程
|
||||||
|
|
||||||
|
受影响的入口与主流程文件包括:
|
||||||
|
|
||||||
|
- `backend/app/api/routes/documents.py`
|
||||||
|
- `backend/app/services/document_processor.py`
|
||||||
|
|
||||||
|
该部分需要在上传接口保持不变的前提下,对解析、分块、向量化和入库主流程进行迁移。
|
||||||
|
|
||||||
|
### 6.2 解析能力
|
||||||
|
|
||||||
|
受影响的解析能力范围包括:
|
||||||
|
|
||||||
|
- 当前本地 parser 目录
|
||||||
|
- `backend/app/aliyun_parser`
|
||||||
|
|
||||||
|
迁移后阿里云文档智能能力将成为主解析来源,本地 PDF/DOCX/MinerU 解析链路需要重新界定保留、下线或回退策略,但具体模块组织方式不在本文件内定义。
|
||||||
|
|
||||||
|
### 6.3 向量能力
|
||||||
|
|
||||||
|
受影响的向量能力范围包括:
|
||||||
|
|
||||||
|
- `backend/app/services/embedding/bge_m3_embedder.py`
|
||||||
|
- embedding 配置
|
||||||
|
- embedding 相关依赖包
|
||||||
|
|
||||||
|
该部分需要移除对本地 BGE-M3 模型的运行时依赖,并改为 OpenAI 兼容 API 方式调用 `text-embedding-v3`。
|
||||||
|
|
||||||
|
### 6.4 存储检索
|
||||||
|
|
||||||
|
受影响的存储与检索能力包括:
|
||||||
|
|
||||||
|
- `backend/app/services/storage/milvus_client.py`
|
||||||
|
- `backend/app/services/rag/retriever.py`
|
||||||
|
- `backend/app/api/routes/knowledge.py`
|
||||||
|
- `backend/app/services/agent/qa_agent.py`
|
||||||
|
|
||||||
|
该部分需要围绕 `1536` 维 dense 向量重建 Milvus schema,并确保知识检索与 Agent 检索链路继续可用。
|
||||||
|
|
||||||
|
### 6.5 配置与状态
|
||||||
|
|
||||||
|
受影响的配置与状态相关文件包括:
|
||||||
|
|
||||||
|
- `backend/app/config/settings.py`
|
||||||
|
- `backend/app/core/config.py`
|
||||||
|
- `backend/app/api/routes/status.py`
|
||||||
|
- `backend/app/services/mock_data.py`
|
||||||
|
|
||||||
|
该部分需要清理与旧本地模型和旧处理链路耦合的配置项、状态展示和 mock 数据假设。
|
||||||
|
|
||||||
|
### 6.6 文档与部署
|
||||||
|
|
||||||
|
受影响的文档与部署项包括:
|
||||||
|
|
||||||
|
- `README.md`
|
||||||
|
- `QUICK_DEPLOY.md`
|
||||||
|
- `.env.example`
|
||||||
|
- `requirements` 相关文件
|
||||||
|
- `pyproject.toml`
|
||||||
|
|
||||||
|
该部分需要同步反映新的 API 化解析与 embedding 依赖,去除或更新本地模型准备、运行说明和环境配置描述。
|
||||||
|
|
||||||
|
## 7. 风险与约束
|
||||||
|
|
||||||
|
以下风险和约束在本期已经明确,需要在后续架构和实施阶段优先处理:
|
||||||
|
|
||||||
|
- 旧 Milvus collection 与新 `1536` 维 schema 不兼容,需要新 collection 和重建索引
|
||||||
|
- `backend/app/aliyun_parser` 现有脚本含硬编码密钥,后续必须全部移到环境变量
|
||||||
|
- RAG 下游当前对 `clause_number` 有依赖,迁移后需要优先适配 `section_title` 和 Aliyun chunk metadata
|
||||||
|
- 如果阿里云返回字段与当前样例不同,需要在架构阶段补充 adapter 层
|
||||||
|
|
||||||
|
上述条目属于实施约束和迁移风险,不代表当前已经确定最终解决方案,只代表这些问题必须被显式处理。
|
||||||
|
|
||||||
|
## 8. 待架构阶段决策
|
||||||
|
|
||||||
|
以下事项属于后续 backend architecture 阶段需要单独拍板的决策项,不属于本文件已确认的需求结论:
|
||||||
|
|
||||||
|
- 阿里云能力封装为内部模块还是独立 adapter package
|
||||||
|
- 同步阻塞上传还是改为异步 job
|
||||||
|
- `DocumentProcessor` 是否拆为 ingest orchestrator
|
||||||
|
- 检索元数据模型是否统一重命名
|
||||||
|
- status/config 是否改为真实运行态而不是 mock
|
||||||
|
|
||||||
|
后续如输出架构方案,应围绕这些待决策项给出明确取舍和原因,但不应回退本文件已经确认的迁移目标。
|
||||||
|
|
||||||
|
## 9. 验收基线
|
||||||
|
|
||||||
|
本期需求的验收基线固定如下:
|
||||||
|
|
||||||
|
- 上传接口外部契约保持不变
|
||||||
|
- PDF/DOC/DOCX 上传后能完成解析、向量化、入库
|
||||||
|
- 新索引可支持 `/knowledge/retrieval` 和 `/agent/ask`
|
||||||
|
- 系统中不再依赖本地 `bge-m3` 模型文件
|
||||||
|
- 所有敏感凭据从代码移出
|
||||||
|
|
||||||
|
以上验收基线用于后续架构方案评审和实施完成后的回归核对。
|
||||||
|
|
||||||
|
## 10. 说明
|
||||||
|
|
||||||
|
本文件是需求说明,不是最终技术设计文档。文中只冻结目标、范围、影响面、风险和约束,不定义最终 backend 分层、类图、目录结构、模块边界或详细实现步骤。
|
||||||
|
|
||||||
|
后续待新的 backend architecture 整理完成后,应基于本文件再补充对应的架构方案文档,或直接拆解为实施计划。
|
||||||
@@ -8,8 +8,8 @@ export default defineConfig({
|
|||||||
host: '0.0.0.0',
|
host: '0.0.0.0',
|
||||||
port: 5173,
|
port: 5173,
|
||||||
proxy: {
|
proxy: {
|
||||||
'/api': {
|
'^/api/.*': {
|
||||||
target: 'http://localhost:8000',
|
target: 'http://6.86.80.8:8000',
|
||||||
changeOrigin: true,
|
changeOrigin: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|||||||
1487
logs/api.log
1487
logs/api.log
File diff suppressed because it is too large
Load Diff
@@ -1,119 +0,0 @@
|
|||||||
2026-05-14 16:41:52 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 16:41:52 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 16:41:52 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 16:41:54 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 16:41:54 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 16:41:54 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 16:41:55 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 16:41:55 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 16:41:55 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 16:42:22 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 16:42:22 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 16:42:28 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 16:42:28 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 16:42:28 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 16:42:28 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 16:42:28 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 16:42:28 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 16:42:29 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 16:42:29 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 16:42:29 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 16:42:31 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 16:42:31 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 16:42:37 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 16:42:37 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 16:42:37 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 16:42:37 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 16:42:37 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 16:42:37 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 16:42:38 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 16:42:38 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 16:42:38 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 16:43:28 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 16:43:28 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 16:43:34 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 16:43:34 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 16:43:34 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 16:43:34 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 16:43:34 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 16:43:34 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 16:43:34 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 16:43:34 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 16:43:34 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 16:43:35 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 16:43:35 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 16:46:25 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 16:46:25 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 16:46:25 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 16:46:26 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 16:46:26 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 16:46:26 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 16:46:27 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 16:46:27 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 16:46:27 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 16:46:40 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 16:46:40 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 16:47:08 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 16:47:08 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 16:47:08 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 16:47:08 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 16:47:08 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 16:47:08 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 16:47:08 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 16:47:08 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 16:47:08 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 16:57:16 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 16:57:16 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 16:57:36 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 16:57:36 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 16:57:36 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 16:57:36 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 16:57:36 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 16:57:36 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 16:57:36 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 16:57:36 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 16:57:36 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 16:57:36 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 16:57:36 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 16:57:36 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 16:57:37 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 16:57:37 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 16:57:37 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 16:57:37 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 16:57:37 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 16:57:37 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 17:14:37 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 17:14:37 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 17:14:37 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 17:14:37 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 17:14:53 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 17:14:53 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 17:14:53 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 17:14:54 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 17:14:54 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 17:14:54 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 17:14:54 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 17:14:54 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 17:14:54 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 17:16:10 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 17:16:10 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 17:16:21 | INFO | src.api.main:lifespan:27 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 17:16:21 | INFO | src.api.main:lifespan:28 - 调试模式: False
|
|
||||||
2026-05-14 17:16:21 | INFO | src.api.main:lifespan:31 - 预加载LLM客户端...
|
|
||||||
2026-05-14 17:16:22 | INFO | src.services.llm.qwen_client:_init_client:59 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 17:16:22 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 17:16:22 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 17:16:22 | INFO | src.services.llm.deepseek_client:_init_client:50 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 17:16:22 | INFO | src.services.llm.llm_factory:create:113 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 17:16:22 | SUCCESS | src.services.llm.llm_factory:preload_clients:201 - 预加载LLM客户端成功: deepseek
|
|
||||||
2026-05-14 17:17:07 | INFO | src.api.main:lifespan:37 - 应用关闭,执行清理...
|
|
||||||
2026-05-14 17:17:07 | INFO | src.services.llm.llm_factory:cleanup:226 - 所有LLM客户端已清理
|
|
||||||
2026-05-14 17:19:47 | INFO | app.api.main:lifespan:22 - 启动 AI+合规智能中枢 v0.1.0
|
|
||||||
2026-05-14 17:19:47 | INFO | app.api.main:lifespan:23 - 调试模式: False
|
|
||||||
2026-05-14 17:19:47 | INFO | app.api.main:lifespan:24 - 预加载LLM客户端...
|
|
||||||
2026-05-14 17:19:48 | INFO | app.services.llm.qwen_client:_init_client:58 - Qwen客户端初始化完成: http://6.86.80.4:30080/v1 - qwen3.5-flash
|
|
||||||
2026-05-14 17:19:48 | INFO | app.services.llm.llm_factory:create:112 - LLM客户端创建成功并缓存: qwen - qwen3.5-flash
|
|
||||||
2026-05-14 17:19:48 | SUCCESS | app.services.llm.llm_factory:preload_clients:200 - 预加载LLM客户端成功: qwen
|
|
||||||
2026-05-14 17:19:49 | INFO | app.services.llm.deepseek_client:_init_client:49 - DeepSeek客户端初始化完成: http://6.86.80.4:30080/v1 - deepseek-v4-flash
|
|
||||||
2026-05-14 17:19:49 | INFO | app.services.llm.llm_factory:create:112 - LLM客户端创建成功并缓存: deepseek - deepseek-v4-flash
|
|
||||||
2026-05-14 17:19:49 | SUCCESS | app.services.llm.llm_factory:preload_clients:200 - 预加载LLM客户端成功: deepseek
|
|
||||||
Reference in New Issue
Block a user