Fix SSE route dependency and align architecture docs
This commit is contained in:
@@ -1,14 +1,10 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
阿里云文档智能 API 解析 PDF,输出三层结构 chunks
|
||||
- structure_nodes: 目录树结构
|
||||
- semantic_blocks: 语义块(章节文本、表格、图片)
|
||||
- vector_chunks: 检索块(带 overlap 切分)
|
||||
"""
|
||||
"""Handle Aliyun parsing support for parse pdf."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
@@ -19,16 +15,16 @@ from alibabacloud_tea_openapi import models as open_api_models
|
||||
from alibabacloud_docmind_api20220711 import models as docmind_models
|
||||
from alibabacloud_tea_util import models as util_models
|
||||
|
||||
# ===================== Aliyun configuration =====================
# Credentials are read from the environment only; no key material may be
# committed to source. init_client() rejects empty credentials, so a missing
# variable fails fast instead of sending an unauthenticated request.
ALIBABA_ACCESS_KEY_ID = os.getenv("ALIBABA_ACCESS_KEY_ID", "")
ALIBABA_ACCESS_KEY_SECRET = os.getenv("ALIBABA_ACCESS_KEY_SECRET", "")
# The endpoint is not secret, so the previous value stays as the default.
ALIBABA_ENDPOINT = os.getenv("ALIBABA_ENDPOINT", "docmind-api.cn-hangzhou.aliyuncs.com")
|
||||
|
||||
# ===================== 切分参数 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
MAX_CHARS = 600
|
||||
OVERLAP_CHARS = 80
|
||||
|
||||
# ===================== 布局类型常量 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
TOC_TITLES = {"目次", "目录"}
|
||||
TITLE_SUBTYPES = {"doc_title", "para_title"}
|
||||
TEXT_SUBTYPES = {"para", "none"}
|
||||
@@ -36,8 +32,11 @@ FIGURE_TYPES = {"figure", "figure_name", "figure_note"}
|
||||
FIGURE_SUBTYPES = {"picture", "pic_title", "pic_caption"}
|
||||
|
||||
|
||||
# ===================== 阿里云 API 客户端 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def init_client() -> DocmindClient:
|
||||
"""Handle init client."""
|
||||
if not ALIBABA_ACCESS_KEY_ID or not ALIBABA_ACCESS_KEY_SECRET:
|
||||
raise ValueError("缺少阿里云文档解析凭据,请设置 ALIBABA_ACCESS_KEY_ID 和 ALIBABA_ACCESS_KEY_SECRET")
|
||||
config = open_api_models.Config(
|
||||
access_key_id=ALIBABA_ACCESS_KEY_ID,
|
||||
access_key_secret=ALIBABA_ACCESS_KEY_SECRET,
|
||||
@@ -47,7 +46,7 @@ def init_client() -> DocmindClient:
|
||||
|
||||
|
||||
def submit_job(client: DocmindClient, file_path: str) -> str:
|
||||
"""提交文档解析任务"""
|
||||
"""Submit job."""
|
||||
file_name = Path(file_path).name
|
||||
request = docmind_models.SubmitDocParserJobAdvanceRequest(
|
||||
file_url_object=open(file_path, "rb"),
|
||||
@@ -62,14 +61,14 @@ def submit_job(client: DocmindClient, file_path: str) -> str:
|
||||
|
||||
|
||||
def query_status(client: DocmindClient, task_id: str) -> Dict:
    """Query the status of a submitted document-parsing task.

    Args:
        client: Docmind client used to send the status request.
        task_id: Identifier of the parsing task to look up.

    Returns:
        The response's ``body.data`` converted with ``to_map()``. When the
        response carries no data the function returns ``None`` (despite the
        ``Dict`` annotation), so callers must check for a falsy result.
    """
    request = docmind_models.QueryDocParserStatusRequest(id=task_id)
    response = client.query_doc_parser_status(request)
    data = response.body.data
    return data.to_map() if data else None
|
||||
|
||||
|
||||
def wait_for_completion(client: DocmindClient, task_id: str, poll_interval: int = 5) -> bool:
|
||||
"""等待任务完成"""
|
||||
"""Wait for for completion."""
|
||||
while True:
|
||||
status_data = query_status(client, task_id)
|
||||
if not status_data:
|
||||
@@ -85,7 +84,7 @@ def wait_for_completion(client: DocmindClient, task_id: str, poll_interval: int
|
||||
|
||||
|
||||
def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_step_size: int = 50) -> Dict:
|
||||
"""获取解析结果"""
|
||||
"""Return result."""
|
||||
request = docmind_models.GetDocParserResultRequest(
|
||||
id=task_id,
|
||||
layout_step_size=layout_step_size,
|
||||
@@ -96,7 +95,7 @@ def get_result(client: DocmindClient, task_id: str, layout_num: int = 0, layout_
|
||||
|
||||
|
||||
def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: int = 50) -> List[Dict]:
|
||||
"""收集所有解析结果"""
|
||||
"""Collect all results."""
|
||||
all_layouts = []
|
||||
layout_num = 0
|
||||
while True:
|
||||
@@ -113,8 +112,9 @@ def collect_all_results(client: DocmindClient, task_id: str, layout_step_size: i
|
||||
return all_layouts
|
||||
|
||||
|
||||
# ===================== 文本处理 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def normalize_text(text: str) -> str:
|
||||
"""Normalize text."""
|
||||
text = text.replace("\r", "\n")
|
||||
text = text.replace(" ", " ")
|
||||
text = re.sub(r"\n+", "\n", text)
|
||||
@@ -123,34 +123,41 @@ def normalize_text(text: str) -> str:
|
||||
|
||||
|
||||
def get_page(layout: Dict) -> int:
    """Return the page number recorded on a layout entry.

    Reads ``pageNum`` first, falls back to ``pageNumber``, and returns ``0``
    when neither key is present.
    """
    return layout.get("pageNum", layout.get("pageNumber", 0))
|
||||
|
||||
|
||||
def get_text(layout: Dict) -> str:
    """Return the normalized text of a layout entry.

    The ``text`` field is preferred; when it normalizes to an empty string
    the ``markdownContent`` field is normalized and returned instead.
    """
    text = normalize_text(layout.get("text", ""))
    if text:
        return text
    return normalize_text(layout.get("markdownContent", ""))
|
||||
|
||||
|
||||
# ===================== 布局类型判断 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def is_title(layout: Dict) -> bool:
    """Return True when the layout is a title.

    A layout counts as a title when its ``type`` is ``"title"`` or its
    ``subType`` is one of ``TITLE_SUBTYPES``.
    """
    return layout.get("type") == "title" or layout.get("subType") in TITLE_SUBTYPES
|
||||
|
||||
|
||||
def is_text(layout: Dict) -> bool:
    """Return True when the layout is body text.

    Requires ``type == "text"`` and a ``subType`` in ``TEXT_SUBTYPES``; a
    missing ``subType`` is treated as ``"none"``.
    """
    return layout.get("type") == "text" and layout.get("subType", "none") in TEXT_SUBTYPES
|
||||
|
||||
|
||||
def is_figure(layout: Dict) -> bool:
    """Return True when the layout belongs to a figure.

    Matches when ``type`` is in ``FIGURE_TYPES`` or ``subType`` is in
    ``FIGURE_SUBTYPES``.
    """
    return layout.get("type") in FIGURE_TYPES or layout.get("subType") in FIGURE_SUBTYPES
|
||||
|
||||
|
||||
def is_table(layout: Dict) -> bool:
    """Return True when the layout's ``type`` is ``"table"``."""
    return layout.get("type") == "table"
|
||||
|
||||
|
||||
def is_toc_layout(layout: Dict) -> bool:
|
||||
"""Return whether toc layout."""
|
||||
text = get_text(layout)
|
||||
if text in TOC_TITLES:
|
||||
return True
|
||||
@@ -160,6 +167,7 @@ def is_toc_layout(layout: Dict) -> bool:
|
||||
|
||||
|
||||
def extract_table_text(layout: Dict) -> str:
|
||||
"""Extract table text."""
|
||||
rows = []
|
||||
for cell in layout.get("cells", []):
|
||||
texts = []
|
||||
@@ -172,8 +180,9 @@ def extract_table_text(layout: Dict) -> str:
|
||||
return "\n".join(rows).strip()
|
||||
|
||||
|
||||
# ===================== 结构层:目录树 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
|
||||
"""Build structure nodes."""
|
||||
nodes = []
|
||||
for layout in layouts:
|
||||
if not is_title(layout):
|
||||
@@ -195,8 +204,9 @@ def build_structure_nodes(layouts: List[Dict]) -> List[Dict]:
|
||||
return nodes
|
||||
|
||||
|
||||
# ===================== 语义层:章节内容 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
|
||||
"""Update section path."""
|
||||
level = layout.get("level", 0)
|
||||
title = get_text(layout)
|
||||
while section_stack and section_stack[-1]["level"] >= level:
|
||||
@@ -213,10 +223,12 @@ def update_section_path(section_stack: List[Dict], layout: Dict) -> List[Dict]:
|
||||
|
||||
|
||||
def section_path_titles(section_stack: List[Dict]) -> List[str]:
    """Return the ``title`` of every entry in the section stack, in order."""
    return [item["title"] for item in section_stack]
|
||||
|
||||
|
||||
def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id: int) -> int:
|
||||
"""Handle flush text block."""
|
||||
if not blocks:
|
||||
return block_id
|
||||
|
||||
@@ -242,6 +254,7 @@ def flush_text_block(blocks: List[Dict], semantic_blocks: List[Dict], block_id:
|
||||
|
||||
|
||||
def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
|
||||
"""Build semantic blocks."""
|
||||
semantic_blocks = []
|
||||
section_stack = []
|
||||
pending_text_blocks = []
|
||||
@@ -327,8 +340,9 @@ def build_semantic_blocks(layouts: List[Dict]) -> List[Dict]:
|
||||
return semantic_blocks
|
||||
|
||||
|
||||
# ===================== 检索层:向量 chunks =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def split_text_with_overlap(text: str, max_chars: int, overlap_chars: int) -> List[str]:
|
||||
"""Handle split text with overlap."""
|
||||
text = text.strip()
|
||||
if len(text) <= max_chars:
|
||||
return [text] if text else []
|
||||
@@ -351,6 +365,7 @@ def build_vector_chunks(
|
||||
max_chars: int,
|
||||
overlap_chars: int,
|
||||
) -> List[Dict]:
|
||||
"""Build vector chunks."""
|
||||
vector_chunks = []
|
||||
chunk_index = 1
|
||||
|
||||
@@ -385,7 +400,31 @@ def build_vector_chunks(
|
||||
return vector_chunks
|
||||
|
||||
|
||||
# ===================== 主转换函数 =====================
|
||||
def parse_pdf_to_structured_chunks(
    pdf_path: str,
    *,
    doc_id: str,
    doc_title: str,
    max_chars: int = MAX_CHARS,
    overlap_chars: int = OVERLAP_CHARS,
    poll_interval: int = 5,
) -> Dict:
    """Parse a PDF through the Aliyun service and convert the layouts to chunks.

    Runs the whole pipeline: build a client, submit the file, wait for the
    job, collect all layouts, then hand them to ``convert_layouts``.

    Args:
        pdf_path: Path of the PDF file to submit.
        doc_id: Document identifier forwarded to ``convert_layouts``.
        doc_title: Document title forwarded to ``convert_layouts``.
        max_chars: Chunk size limit forwarded to ``convert_layouts``.
        overlap_chars: Chunk overlap forwarded to ``convert_layouts``.
        poll_interval: Polling interval passed to ``wait_for_completion``.

    Returns:
        The dictionary produced by ``convert_layouts``.

    Raises:
        ValueError: From ``init_client`` when the Aliyun credentials are unset.
        RuntimeError: When ``wait_for_completion`` reports a failed task.
    """
    client = init_client()
    task_id = submit_job(client, pdf_path)

    if not wait_for_completion(client, task_id, poll_interval):
        raise RuntimeError("阿里云文档解析任务失败")

    layouts = collect_all_results(client, task_id)
    return convert_layouts(
        layouts,
        doc_id=doc_id,
        doc_title=doc_title,
        max_chars=max_chars,
        overlap_chars=overlap_chars,
    )
|
||||
|
||||
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def convert_layouts(
|
||||
layouts: List[Dict],
|
||||
doc_id: str,
|
||||
@@ -393,6 +432,7 @@ def convert_layouts(
|
||||
max_chars: int,
|
||||
overlap_chars: int,
|
||||
) -> Dict:
|
||||
"""Handle convert layouts."""
|
||||
structure_nodes = build_structure_nodes(layouts)
|
||||
semantic_blocks = build_semantic_blocks(layouts)
|
||||
vector_chunks = build_vector_chunks(
|
||||
@@ -411,8 +451,9 @@ def convert_layouts(
|
||||
}
|
||||
|
||||
|
||||
# ===================== CLI 入口 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def main() -> None:
|
||||
"""Run the module entrypoint."""
|
||||
parser = argparse.ArgumentParser(description="阿里云文档智能解析 PDF,输出三层结构 chunks")
|
||||
parser.add_argument("pdf_path", help="PDF 文件路径")
|
||||
parser.add_argument("--out", default="vector_chunks.json", help="输出 JSON 文件路径")
|
||||
@@ -428,30 +469,30 @@ def main() -> None:
|
||||
if not pdf_path.exists():
|
||||
raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")
|
||||
|
||||
# 1. 提交阿里云任务
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
client = init_client()
|
||||
print(f"提交任务: {pdf_path}")
|
||||
task_id = submit_job(client, str(pdf_path))
|
||||
print(f"任务 ID: {task_id}")
|
||||
|
||||
# 2. 等待完成
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
print("等待任务完成...")
|
||||
if not wait_for_completion(client, task_id, args.poll_interval):
|
||||
print("任务失败,退出")
|
||||
return
|
||||
|
||||
# 3. 获取 layouts
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
print("获取解析结果...")
|
||||
layouts = collect_all_results(client, task_id)
|
||||
print(f"获取到 {len(layouts)} 个布局块")
|
||||
|
||||
# 4. 输出原始 layouts(可选)
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
if args.layouts_output:
|
||||
layouts_path = Path(args.layouts_output).expanduser().resolve()
|
||||
layouts_path.write_text(json.dumps(layouts, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
print(f"原始 layouts 已写入: {layouts_path}")
|
||||
|
||||
# 5. 转换为三层结构
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
print("转换为三层结构...")
|
||||
data = convert_layouts(
|
||||
layouts,
|
||||
@@ -461,7 +502,7 @@ def main() -> None:
|
||||
overlap_chars=args.overlap_chars,
|
||||
)
|
||||
|
||||
# 6. 输出结果
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
output_path = Path(args.out).expanduser().resolve()
|
||||
output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
@@ -472,4 +513,4 @@ def main() -> None:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
将 vector_chunks.json 向量化并上传到 Milvus 和 PostgreSQL
|
||||
使用中转站的 OpenAI 兼容 API
|
||||
"""
|
||||
"""Handle Aliyun parsing support for upload to milvus."""
|
||||
|
||||
import argparse
import json
import os
|
||||
@@ -23,18 +20,18 @@ from pymilvus import (
|
||||
)
|
||||
from openai import OpenAI
|
||||
|
||||
# ===================== Configuration =====================
# OpenAI-compatible relay settings. The API key is read from the environment
# so no secret lives in source control; supply it through RELAY_API_KEY or the
# --api-key CLI option. The base URL keeps its previous value as the default.
RELAY_BASE_URL = os.getenv("RELAY_BASE_URL", "http://6.86.80.4:30080/v1")
RELAY_API_KEY = os.getenv("RELAY_API_KEY", "")
EMBEDDING_MODEL = "text-embedding-v3"  # embedding model served by the relay
|
||||
|
||||
# Milvus 配置
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
MILVUS_HOST = "localhost"
|
||||
MILVUS_PORT = "19530"
|
||||
COLLECTION_NAME = "regulation_chunks"
|
||||
|
||||
# PostgreSQL 配置
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
PG_HOST = "6.86.80.10"
|
||||
PG_PORT = 5432
|
||||
PG_USER = "postgresql"
|
||||
@@ -44,12 +41,12 @@ PG_DATABASE = "postgres"
|
||||
|
||||
# ===================== Embedding =====================
|
||||
def get_openai_client(api_key: str, base_url: str) -> OpenAI:
    """Create an OpenAI client that talks to the relay endpoint.

    Args:
        api_key: API key passed to the client.
        base_url: Base URL of the OpenAI-compatible relay.

    Returns:
        An ``OpenAI`` client configured with the given key and base URL.
    """
    return OpenAI(api_key=api_key, base_url=base_url)
|
||||
|
||||
|
||||
def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10) -> List[List[float]]:
|
||||
"""批量获取文本向量"""
|
||||
"""Return embeddings batch."""
|
||||
all_embeddings = []
|
||||
|
||||
for i in range(0, len(texts), batch_size):
|
||||
@@ -69,12 +66,13 @@ def get_embeddings_batch(client: OpenAI, texts: List[str], batch_size: int = 10)
|
||||
|
||||
# ===================== Milvus =====================
|
||||
def init_milvus(host: str, port: str) -> None:
    """Open the ``"default"`` Milvus connection and report it on stdout.

    Args:
        host: Milvus server host.
        port: Milvus server port.
    """
    connections.connect("default", host=host, port=port)
    print(f"已连接 Milvus: {host}:{port}")
|
||||
|
||||
|
||||
def create_collection(name: str, dim: int) -> Collection:
|
||||
"""创建或获取 collection"""
|
||||
"""Create collection."""
|
||||
if utility.has_collection(name):
|
||||
print(f"Collection '{name}' 已存在,删除重建")
|
||||
utility.drop_collection(name)
|
||||
@@ -90,14 +88,14 @@ def create_collection(name: str, dim: int) -> Collection:
|
||||
FieldSchema(name="page_end", dtype=DataType.INT64),
|
||||
FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
|
||||
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
|
||||
FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096), # JSON 字符串
|
||||
FieldSchema(name="source_ids", dtype=DataType.VARCHAR, max_length=4096), # Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
|
||||
]
|
||||
|
||||
schema = CollectionSchema(fields, description="法规文档检索 chunks")
|
||||
collection = Collection(name, schema)
|
||||
|
||||
# 创建向量索引(IVF_FLAT,适合中小规模)
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
index_params = {
|
||||
"metric_type": "COSINE",
|
||||
"index_type": "IVF_FLAT",
|
||||
@@ -110,7 +108,7 @@ def create_collection(name: str, dim: int) -> Collection:
|
||||
|
||||
|
||||
def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[List[float]]):
|
||||
"""插入 chunks 到 Milvus"""
|
||||
"""Handle insert chunks."""
|
||||
data = [
|
||||
[c["chunk_id"] for c in chunks],
|
||||
[c["doc_id"] for c in chunks],
|
||||
@@ -122,7 +120,7 @@ def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[L
|
||||
[c["page_end"] for c in chunks],
|
||||
[c["section_title"] for c in chunks],
|
||||
[c["text"] for c in chunks],
|
||||
[json.dumps(c.get("source_ids", [])) for c in chunks], # JSON 字符串
|
||||
[json.dumps(c.get("source_ids", [])) for c in chunks], # Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
embeddings,
|
||||
]
|
||||
|
||||
@@ -132,14 +130,14 @@ def insert_chunks(collection: Collection, chunks: List[Dict], embeddings: List[L
|
||||
|
||||
|
||||
def load_collection(collection: Collection) -> None:
    """Load the collection into memory and report it on stdout.

    Translated from the original docstring: loading is required before the
    collection can be searched.
    """
    collection.load()
    # The message has no placeholders, so the f-string prefix was dropped;
    # the printed text is unchanged.
    print("Collection 已加载到内存")
|
||||
|
||||
|
||||
# ===================== PostgreSQL =====================
|
||||
def get_pg_connection(host: str, port: int, user: str, password: str, database: str):
|
||||
"""获取 PostgreSQL 连接"""
|
||||
"""Return pg connection."""
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
@@ -152,18 +150,18 @@ def get_pg_connection(host: str, port: int, user: str, password: str, database:
|
||||
|
||||
|
||||
def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
|
||||
"""插入 chunks 和相关数据到 PostgreSQL"""
|
||||
"""Handle insert chunks to pg."""
|
||||
cursor = conn.cursor()
|
||||
|
||||
try:
|
||||
# 1. 插入文档
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
cursor.execute("""
|
||||
INSERT INTO documents (doc_id, title, standard_number, upload_time)
|
||||
VALUES (%s, %s, %s, NOW())
|
||||
ON CONFLICT (doc_id) DO UPDATE SET title = EXCLUDED.title, updated_at = NOW()
|
||||
""", (doc_data["doc_id"], doc_data["doc_title"], doc_data.get("standard_number")))
|
||||
|
||||
# 2. 插入语义块
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
semantic_blocks = doc_data.get("semantic_blocks", [])
|
||||
if semantic_blocks:
|
||||
block_rows = [
|
||||
@@ -192,7 +190,7 @@ def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
|
||||
)
|
||||
print(f"已插入 {len(semantic_blocks)} 个语义块")
|
||||
|
||||
# 3. 插入向量块元数据
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
chunk_rows = [
|
||||
(
|
||||
doc_data["doc_id"],
|
||||
@@ -230,9 +228,9 @@ def insert_chunks_to_pg(conn, chunks: List[Dict], doc_data: Dict):
|
||||
cursor.close()
|
||||
|
||||
|
||||
# ===================== 主流程 =====================
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
def load_data(file_path: Path) -> Dict:
    """Load the chunks JSON file and return its full parsed content.

    Args:
        file_path: Path of the JSON file, read as UTF-8.

    Returns:
        The parsed JSON document.
    """
    return json.loads(file_path.read_text(encoding="utf-8"))
|
||||
|
||||
@@ -251,7 +249,8 @@ def upload_to_milvus_and_pg(
|
||||
pg_password: str,
|
||||
pg_database: str,
|
||||
):
|
||||
# 1. 加载完整数据
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
"""Handle upload to milvus and pg."""
|
||||
chunks_path = Path(chunks_file).expanduser().resolve()
|
||||
if not chunks_path.exists():
|
||||
raise FileNotFoundError(f"文件不存在: {chunks_path}")
|
||||
@@ -262,29 +261,29 @@ def upload_to_milvus_and_pg(
|
||||
raise ValueError("vector_chunks 为空")
|
||||
print(f"加载 {len(chunks)} 个 chunks")
|
||||
|
||||
# 2. 初始化连接
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
client = get_openai_client(api_key, base_url)
|
||||
init_milvus(milvus_host, milvus_port)
|
||||
pg_conn = get_pg_connection(pg_host, pg_port, pg_user, pg_password, pg_database)
|
||||
|
||||
# 3. 获取 embeddings
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
texts = [c["embedding_text"] for c in chunks]
|
||||
embeddings = get_embeddings_batch(client, texts, batch_size)
|
||||
print(f"生成 {len(embeddings)} 个向量")
|
||||
|
||||
# 4. 获取 embedding 维度
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
embedding_dim = len(embeddings[0])
|
||||
print(f"Embedding 维度: {embedding_dim}")
|
||||
|
||||
# 5. 创建 collection 并插入 Milvus
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
collection = create_collection(collection_name, embedding_dim)
|
||||
insert_chunks(collection, chunks, embeddings)
|
||||
load_collection(collection)
|
||||
|
||||
# 6. 插入 PostgreSQL
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
insert_chunks_to_pg(pg_conn, chunks, data)
|
||||
|
||||
# 7. 关闭连接
|
||||
# Keep parser integration steps explicit so external workflow behavior stays traceable.
|
||||
pg_conn.close()
|
||||
|
||||
print("上传完成!")
|
||||
@@ -292,6 +291,7 @@ def upload_to_milvus_and_pg(
|
||||
|
||||
# ===================== CLI =====================
|
||||
def main():
|
||||
"""Run the module entrypoint."""
|
||||
parser = argparse.ArgumentParser(description="将 vector_chunks 向量化并上传到 Milvus 和 PostgreSQL")
|
||||
parser.add_argument("chunks_file", help="vector_chunks.json 文件路径")
|
||||
parser.add_argument("--api-key", default=RELAY_API_KEY, help="中转站 API Key")
|
||||
|
||||
Reference in New Issue
Block a user