Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -0,0 +1,5 @@
"""Initialize the app.infrastructure package."""
# Keep package boundaries explicit so backend imports stay predictable.
__all__ = []

View File

@@ -0,0 +1,5 @@
"""Initialize the app.infrastructure.embedding package."""
# Keep package boundaries explicit so backend imports stay predictable.
__all__ = []

View File

@@ -0,0 +1,59 @@
"""Implement infrastructure support for openai compatible embedding provider."""
from __future__ import annotations
import os
import httpx
from app.config.settings import settings
from app.domain.retrieval import EmbeddingProvider
# Keep adapter behavior explicit so integration details remain easy to audit.
class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
"""Provide the Open A I Compatible Embedding Provider provider."""
def __init__(self) -> None:
"""Initialize the Open A I Compatible Embedding Provider instance."""
self.base_url = settings.embedding_base_url.rstrip("/")
self.api_key = (
settings.embedding_api_key
or os.getenv("OPENAI_API_KEY", "")
or os.getenv("QWEN_API_KEY", "")
or os.getenv("DEEPSEEK_API_KEY", "")
)
self.model = settings.embedding_model
self.timeout = settings.embedding_timeout_seconds
self.dimension = settings.embedding_dim
def _request(self, texts: list[str]) -> list[list[float]]:
"""Handle request for this module for the Open A I Compatible Embedding Provider instance."""
if not self.api_key:
raise ValueError("缺少 EMBEDDING_API_KEY / OPENAI_API_KEY")
response = httpx.post(
f"{self.base_url}/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
},
json={"model": self.model, "input": texts},
timeout=self.timeout,
)
response.raise_for_status()
data = response.json()
vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
if any(len(vector) != self.dimension for vector in vectors):
raise ValueError(f"embedding 维度不匹配,期望 {self.dimension}")
return vectors
def embed_texts(self, texts: list[str]) -> list[list[float]]:
"""Embed texts for the Open A I Compatible Embedding Provider instance."""
if not texts:
return []
return self._request(texts)
def embed_query(self, text: str) -> list[float]:
"""Embed query for the Open A I Compatible Embedding Provider instance."""
vectors = self._request([text])
return vectors[0]

View File

@@ -0,0 +1,144 @@
"""Implement infrastructure support for openai compatible answer generator."""
from __future__ import annotations
import time
from typing import Generator
from app.config.settings import settings
from app.domain.conversation import AnswerGenerator, AnswerResult, AnswerSource
from app.domain.retrieval import RetrievedChunk
from app.services.llm.llm_factory import get_llm_client
# Keep adapter behavior explicit so integration details remain easy to audit.
PROMPT_TEMPLATES = {
"default": "你是法规知识问答助手。请仅依据提供的上下文回答;如果上下文不足,明确说明。",
"compliance_qa": "你是法规合规问答助手。优先引用给定法规原文,回答要准确、克制,并注明依据来源。",
}
class OpenAICompatibleAnswerGenerator(AnswerGenerator):
"""Represent the Open A I Compatible Answer Generator type."""
def _build_messages(
self,
*,
query: str,
retrieved_chunks: list[RetrievedChunk],
history: list[dict[str, str]] | None,
prompt_template: str | None,
) -> tuple[list[dict[str, str]], int]:
"""Handle build messages for this module for the Open A I Compatible Answer Generator instance."""
system_prompt = PROMPT_TEMPLATES.get(prompt_template or "compliance_qa", PROMPT_TEMPLATES["default"])
context_blocks = []
context_tokens = 0
for idx, chunk in enumerate(retrieved_chunks, start=1):
block = (
f"[{idx}] 文档: {chunk.doc_name}\n"
f"章节: {chunk.section_title or '未标注'}\n"
f"页码: {chunk.page_number}\n"
f"内容: {chunk.content}"
)
context_tokens += len(block)
context_blocks.append(block)
context = "\n\n".join(context_blocks)[: settings.rag_max_context_tokens * 4]
messages = [{"role": "system", "content": system_prompt}]
for item in history or []:
messages.append({"role": item["role"], "content": item["content"]})
messages.append(
{
"role": "user",
"content": f"问题:{query}\n\n参考上下文:\n{context}\n\n请在回答后给出简要引用编号。",
}
)
return messages, min(context_tokens, settings.rag_max_context_tokens)
def _sources(self, chunks: list[RetrievedChunk]) -> list[AnswerSource]:
"""Handle sources for this module for the Open A I Compatible Answer Generator instance."""
return [
AnswerSource(
doc_id=chunk.doc_id,
doc_name=chunk.doc_name,
chunk_id=chunk.chunk_id,
section_title=chunk.section_title,
page_number=chunk.page_number,
score=chunk.score,
content=chunk.content,
metadata=chunk.metadata,
)
for chunk in chunks
]
def generate(
self,
*,
query: str,
retrieved_chunks: list[RetrievedChunk],
history: list[dict[str, str]] | None = None,
provider: str | None = None,
model: str | None = None,
prompt_template: str | None = None,
) -> AnswerResult:
"""Handle generate for the Open A I Compatible Answer Generator instance."""
start = time.time()
messages, context_tokens = self._build_messages(
query=query,
retrieved_chunks=retrieved_chunks,
history=history,
prompt_template=prompt_template,
)
client = get_llm_client(provider=provider or settings.llm_provider, model=model or settings.llm_model)
response = client.chat(messages)
latency_ms = int((time.time() - start) * 1000)
return AnswerResult(
answer=response.content if response.is_success else "",
sources=self._sources(retrieved_chunks),
model=response.model or (model or settings.llm_model),
latency_ms=latency_ms,
retrieved_count=len(retrieved_chunks),
context_tokens=context_tokens,
truncated=False,
error=response.error,
)
def stream_generate(
self,
*,
query: str,
retrieved_chunks: list[RetrievedChunk],
history: list[dict[str, str]] | None = None,
provider: str | None = None,
model: str | None = None,
prompt_template: str | None = None,
) -> Generator[dict, None, AnswerResult]:
"""Stream generate for the Open A I Compatible Answer Generator instance."""
start = time.time()
messages, context_tokens = self._build_messages(
query=query,
retrieved_chunks=retrieved_chunks,
history=history,
prompt_template=prompt_template,
)
sources = [source.__dict__ for source in self._sources(retrieved_chunks)]
yield {"event": "sources", "data": sources}
client = get_llm_client(provider=provider or settings.llm_provider, model=model or settings.llm_model)
answer_parts: list[str] = []
if hasattr(client, "stream_chat"):
for chunk in client.stream_chat(messages):
answer_parts.append(chunk)
yield {"event": "content", "data": chunk}
else:
response = client.chat(messages)
answer_parts.append(response.content)
yield {"event": "content", "data": response.content}
full_answer = "".join(answer_parts)
yield {
"event": "done",
"data": {
"latency_ms": int((time.time() - start) * 1000),
"retrieved_count": len(retrieved_chunks),
"context_tokens": context_tokens,
"model": model or settings.llm_model,
},
}

View File

@@ -0,0 +1,5 @@
"""Initialize the app.infrastructure.parser package."""
# Keep package boundaries explicit so backend imports stay predictable.
__all__ = []

View File

@@ -0,0 +1,55 @@
"""Implement infrastructure support for aliyun document parser."""
from __future__ import annotations
from app.aliyun_parser.parse_pdf import (
MAX_CHARS,
OVERLAP_CHARS,
build_semantic_blocks,
build_structure_nodes,
build_vector_chunks,
collect_all_results,
init_client,
submit_job,
wait_for_completion,
)
from app.domain.documents import DocumentParser, ParsedDocument
# Keep adapter behavior explicit so integration details remain easy to audit.
class AliyunDocumentParser(DocumentParser):
"""Provide the Aliyun Document Parser parser."""
parser_name = "aliyun_docmind"
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
"""Handle parse for the Aliyun Document Parser instance."""
client = init_client()
task_id = submit_job(client, file_path)
if not wait_for_completion(client, task_id):
raise RuntimeError("阿里云文档解析任务失败")
layouts = collect_all_results(client, task_id)
structure_nodes = build_structure_nodes(layouts)
semantic_blocks = build_semantic_blocks(layouts)
vector_chunks = build_vector_chunks(
semantic_blocks,
doc_id=doc_id,
doc_title=doc_name,
max_chars=MAX_CHARS,
overlap_chars=OVERLAP_CHARS,
)
raw_text = "\n\n".join(
block.get("text", "")
for block in semantic_blocks
if block.get("text")
)
return ParsedDocument(
doc_id=doc_id,
doc_name=doc_name,
structure_nodes=structure_nodes,
semantic_blocks=semantic_blocks,
vector_chunks=vector_chunks,
parser_name=self.parser_name,
raw_text=raw_text,
metadata={"task_id": task_id, "layout_count": len(layouts)},
)

View File

@@ -0,0 +1,66 @@
"""Local chunk builder adapter for the migrated backend architecture."""
from __future__ import annotations
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
from app.services.embedding.text_chunker import RegulationChunker
class LocalRegulationChunkBuilder(ChunkBuilder):
"""Adapt the existing markdown chunker to the new chunk builder port."""
def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
self.chunker = RegulationChunker(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
def build(
self,
*,
parsed_document: ParsedDocument,
regulation_type: str,
version: str,
) -> list[Chunk]:
markdown_text = parsed_document.raw_text.strip()
if not markdown_text:
return []
legacy_chunks = self.chunker.chunk_document(
markdown_text,
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
regulation_type=regulation_type,
version=version,
)
chunks: list[Chunk] = []
for item in legacy_chunks:
metadata = {
"section_number": item.metadata.section_number,
"section_title": item.metadata.section_title,
"clause_number": item.metadata.clause_number,
"start_position": item.metadata.start_position,
"end_position": item.metadata.end_position,
"token_count": item.token_count,
"source": "local_chunker",
}
section_path = [value for value in [item.metadata.section_number, item.metadata.section_title] if value]
chunks.append(
Chunk(
chunk_id=item.metadata.chunk_id,
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
content=item.content,
embedding_text=item.content,
section_title=item.metadata.section_title or item.metadata.section_number,
section_path=section_path,
page_number=item.metadata.page_number,
regulation_type=regulation_type,
version=version,
semantic_id=item.metadata.clause_number,
block_type="local_markdown_chunk",
metadata=metadata,
)
)
return chunks

View File

@@ -0,0 +1,38 @@
"""Local parser adapter for the migrated backend architecture."""
from __future__ import annotations
from pathlib import Path
from app.domain.documents import DocumentParser, ParsedDocument
from app.services.parser.docx_parser import parse_docx_to_markdown
from app.services.parser.pdf_parser import parse_pdf_to_markdown
class LocalDocumentParser(DocumentParser):
"""Adapt the existing local PDF/DOCX parsers to the new parser port."""
parser_name = "local_markdown_parser"
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
suffix = Path(file_path).suffix.lower()
if suffix == ".pdf":
markdown_text = parse_pdf_to_markdown(file_path)
elif suffix in {".docx", ".doc"}:
markdown_text = parse_docx_to_markdown(file_path)
else:
raise ValueError(f"不支持的文件类型: {suffix}")
if not markdown_text.strip():
raise ValueError("本地解析完成但未提取到有效文本")
return ParsedDocument(
doc_id=doc_id,
doc_name=doc_name,
structure_nodes=[],
semantic_blocks=[],
vector_chunks=[],
parser_name=self.parser_name,
raw_text=markdown_text,
metadata={"source": "local_parser", "file_suffix": suffix},
)

View File

@@ -0,0 +1,48 @@
"""Implement infrastructure support for vector chunk builder."""
from __future__ import annotations
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
# Keep adapter behavior explicit so integration details remain easy to audit.
class AliyunVectorChunkBuilder(ChunkBuilder):
"""Provide the Aliyun Vector Chunk Builder builder."""
def build(
self,
*,
parsed_document: ParsedDocument,
regulation_type: str,
version: str,
) -> list[Chunk]:
"""Handle build for the Aliyun Vector Chunk Builder instance."""
chunks: list[Chunk] = []
for index, item in enumerate(parsed_document.vector_chunks):
content = item.get("content") or item.get("text") or ""
embedding_text = item.get("embedding_text") or content
if not embedding_text.strip():
continue
section_path = item.get("section_path") or []
section_title = item.get("section_title") or (section_path[-1] if section_path else "")
page_number = item.get("page_start") or item.get("page") or 0
chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
metadata = {k: v for k, v in item.items() if k not in {"content", "embedding_text"}}
chunks.append(
Chunk(
chunk_id=str(chunk_id),
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
content=content,
embedding_text=embedding_text,
section_title=section_title,
section_path=section_path,
page_number=int(page_number or 0),
regulation_type=regulation_type,
version=version,
semantic_id=item.get("semantic_id", ""),
block_type=item.get("block_type", ""),
metadata=metadata,
)
)
return chunks

View File

@@ -0,0 +1,5 @@
"""Initialize the app.infrastructure.session package."""
# Keep package boundaries explicit so backend imports stay predictable.
__all__ = []

View File

@@ -0,0 +1,95 @@
"""Implement infrastructure support for in memory conversation store."""
from __future__ import annotations
import time
import uuid
from app.domain.conversation import ConversationMessage, ConversationSession, ConversationStore
# Keep adapter behavior explicit so integration details remain easy to audit.
class InMemoryConversationStore(ConversationStore):
"""Provide the In Memory Conversation Store store implementation."""
def __init__(self, *, max_sessions: int = 100, timeout_minutes: int = 30) -> None:
"""Initialize the In Memory Conversation Store instance."""
self.max_sessions = max_sessions
self.timeout_seconds = timeout_minutes * 60
self.sessions: dict[str, ConversationSession] = {}
def _now(self) -> int:
"""Handle now for this module for the In Memory Conversation Store instance."""
return int(time.time())
def _cleanup_expired(self) -> None:
"""Handle cleanup expired for this module for the In Memory Conversation Store instance."""
now = self._now()
expired = [
session_id
for session_id, session in self.sessions.items()
if (now - session.updated_at) > self.timeout_seconds
]
for session_id in expired:
self.sessions.pop(session_id, None)
def create_session(self, metadata: dict | None = None) -> ConversationSession:
"""Create session for the In Memory Conversation Store instance."""
self._cleanup_expired()
if len(self.sessions) >= self.max_sessions:
oldest = min(self.sessions.values(), key=lambda item: item.updated_at)
self.sessions.pop(oldest.session_id, None)
session_id = str(uuid.uuid4())[:8]
session = ConversationSession(
session_id=session_id,
created_at=self._now(),
updated_at=self._now(),
metadata=metadata or {},
)
self.sessions[session_id] = session
return session
def get_session(self, session_id: str) -> ConversationSession | None:
"""Return session for the In Memory Conversation Store instance."""
self._cleanup_expired()
return self.sessions.get(session_id)
def save_message(
self,
session_id: str,
*,
role: str,
content: str,
sources: list[dict] | None = None,
) -> ConversationSession | None:
"""Save message for the In Memory Conversation Store instance."""
session = self.get_session(session_id)
if not session:
return None
session.messages.append(
ConversationMessage(
role=role,
content=content,
timestamp=self._now(),
sources=sources or [],
)
)
session.updated_at = self._now()
return session
def delete_session(self, session_id: str) -> bool:
"""Delete session for the In Memory Conversation Store instance."""
return self.sessions.pop(session_id, None) is not None
def list_sessions(self) -> list[dict]:
"""List sessions for the In Memory Conversation Store instance."""
self._cleanup_expired()
return [
{
"session_id": session.session_id,
"message_count": len(session.messages),
"created_at": session.created_at,
"updated_at": session.updated_at,
}
for session in self.sessions.values()
]

View File

@@ -0,0 +1,5 @@
"""Initialize the app.infrastructure.storage package."""
# Keep package boundaries explicit so backend imports stay predictable.
__all__ = []

View File

@@ -0,0 +1,109 @@
"""Implement infrastructure support for json document repository."""
from __future__ import annotations
import json
from datetime import UTC, datetime
from pathlib import Path
from app.domain.documents import Document, DocumentRepository, DocumentStatus
# Keep adapter behavior explicit so integration details remain easy to audit.
class JsonDocumentRepository(DocumentRepository):
"""Provide the Json Document Repository repository implementation."""
def __init__(self, file_path: str) -> None:
"""Initialize the Json Document Repository instance."""
self.file_path = Path(file_path)
self.file_path.parent.mkdir(parents=True, exist_ok=True)
if not self.file_path.exists():
self.file_path.write_text("{}", encoding="utf-8")
def _load(self) -> dict[str, dict]:
"""Handle load for this module for the Json Document Repository instance."""
return json.loads(self.file_path.read_text(encoding="utf-8") or "{}")
def _save(self, payload: dict[str, dict]) -> None:
"""Handle save for this module for the Json Document Repository instance."""
self.file_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
def _serialize(self, document: Document) -> dict:
"""Handle serialize for this module for the Json Document Repository instance."""
payload = document.__dict__.copy()
payload["status"] = document.status.value
payload["created_at"] = document.created_at.isoformat()
payload["updated_at"] = document.updated_at.isoformat()
return payload
def _deserialize(self, payload: dict) -> Document:
"""Handle deserialize for this module for the Json Document Repository instance."""
return Document(
**{
**payload,
"status": DocumentStatus(payload["status"]),
"created_at": datetime.fromisoformat(payload["created_at"]),
"updated_at": datetime.fromisoformat(payload["updated_at"]),
}
)
def create(self, document: Document) -> Document:
"""Handle create for the Json Document Repository instance."""
payload = self._load()
payload[document.doc_id] = self._serialize(document)
self._save(payload)
return document
def update(self, document: Document) -> Document:
"""Handle update for the Json Document Repository instance."""
document.updated_at = datetime.now(UTC)
payload = self._load()
payload[document.doc_id] = self._serialize(document)
self._save(payload)
return document
def get(self, doc_id: str) -> Document | None:
"""Handle get for the Json Document Repository instance."""
payload = self._load()
item = payload.get(doc_id)
return self._deserialize(item) if item else None
def list(self, limit: int | None = None) -> list[Document]:
"""Handle list for the Json Document Repository instance."""
payload = self._load()
documents = [self._deserialize(item) for item in payload.values()]
documents.sort(key=lambda item: item.updated_at, reverse=True)
return documents[:limit] if limit is not None else documents
def update_status(
self,
doc_id: str,
status: DocumentStatus,
*,
error_message: str = "",
chunk_count: int | None = None,
summary: str | None = None,
summary_latency_ms: int | None = None,
parser_name: str | None = None,
index_name: str | None = None,
metadata: dict | None = None,
) -> Document | None:
"""Update status for the Json Document Repository instance."""
document = self.get(doc_id)
if not document:
return None
document.status = status
document.error_message = error_message
if chunk_count is not None:
document.chunk_count = chunk_count
if summary is not None:
document.summary = summary
if summary_latency_ms is not None:
document.summary_latency_ms = summary_latency_ms
if parser_name is not None:
document.parser_name = parser_name
if index_name is not None:
document.index_name = index_name
if metadata:
document.metadata.update(metadata)
return self.update(document)

View File

@@ -0,0 +1,47 @@
"""Implement infrastructure support for minio binary store."""
from __future__ import annotations
from app.domain.documents import DocumentBinaryStore
from app.services.storage.minio_client import MinIOClient
# Keep adapter behavior explicit so integration details remain easy to audit.
class MinioDocumentBinaryStore(DocumentBinaryStore):
"""Provide the Minio Document Binary Store store implementation."""
def __init__(self) -> None:
"""Initialize the Minio Document Binary Store instance."""
self.client = MinIOClient()
self.client.connect()
self.client.ensure_bucket()
def save(
self,
*,
object_name: str,
data: bytes,
content_type: str,
metadata: dict[str, str] | None = None,
) -> None:
"""Handle save for the Minio Document Binary Store instance."""
success = self.client.upload_bytes(
data=data,
object_name=object_name,
content_type=content_type,
metadata=metadata,
)
if not success:
raise RuntimeError("MinIO 保存失败")
def read(self, object_name: str) -> bytes:
"""Handle read for the Minio Document Binary Store instance."""
data = self.client.get_object_data(object_name)
if data is None:
raise FileNotFoundError(f"对象不存在: {object_name}")
return data
def delete(self, object_name: str) -> None:
"""Handle delete for the Minio Document Binary Store instance."""
if not self.client.delete_object(object_name):
raise FileNotFoundError(f"对象删除失败: {object_name}")

View File

@@ -0,0 +1,5 @@
"""Initialize the app.infrastructure.vectorstore package."""
# Keep package boundaries explicit so backend imports stay predictable.
__all__ = []

View File

@@ -0,0 +1,24 @@
"""Implement infrastructure support for dense retriever."""
from __future__ import annotations
from app.domain.retrieval import EmbeddingProvider, RetrievalQuery, Retriever, RetrievedChunk, VectorIndex
# Keep adapter behavior explicit so integration details remain easy to audit.
class DenseRetriever(Retriever):
"""Provide the Dense Retriever retriever."""
def __init__(self, *, embedding_provider: EmbeddingProvider, vector_index: VectorIndex) -> None:
"""Initialize the Dense Retriever instance."""
self.embedding_provider = embedding_provider
self.vector_index = vector_index
def retrieve(self, query: RetrievalQuery) -> list[RetrievedChunk]:
"""Handle retrieve for the Dense Retriever instance."""
query_vector = self.embedding_provider.embed_query(query.query)
return self.vector_index.search(query_vector, query.top_k, query.filters)
def search(self, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
"""Handle search for the Dense Retriever instance."""
return self.retrieve(RetrievalQuery(query=query, top_k=top_k, filters=filters))

View File

@@ -0,0 +1,154 @@
"""Implement infrastructure support for milvus vector index."""
from __future__ import annotations
import json
import time
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
from app.config.settings import settings
from app.domain.documents import Chunk
from app.domain.retrieval import RetrievedChunk, VectorIndex
# Keep adapter behavior explicit so integration details remain easy to audit.
class MilvusVectorIndex(VectorIndex):
"""Provide the Milvus Vector Index index implementation."""
def __init__(self) -> None:
"""Initialize the Milvus Vector Index instance."""
self.collection_name = settings.milvus_collection
self.db_name = settings.milvus_db_name
connections.connect(
alias="default",
host=settings.milvus_host,
port=settings.milvus_port,
db_name=self.db_name,
)
self.collection = self._ensure_collection()
def _ensure_collection(self) -> Collection:
"""Handle ensure collection for this module for the Milvus Vector Index instance."""
if utility.has_collection(self.collection_name):
collection = Collection(self.collection_name)
collection.load()
return collection
schema = CollectionSchema(
fields=[
FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=128, is_primary=True, auto_id=False),
FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim),
FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
FieldSchema(name="page_number", dtype=DataType.INT64),
FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=128),
FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=64),
FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=128),
FieldSchema(name="block_type", dtype=DataType.VARCHAR, max_length=64),
FieldSchema(name="metadata_json", dtype=DataType.VARCHAR, max_length=65535),
FieldSchema(name="created_at", dtype=DataType.INT64),
],
description="Dense-only regulations index",
enable_dynamic_field=False,
)
collection = Collection(name=self.collection_name, schema=schema)
collection.create_index(
field_name="embedding",
index_params={
"metric_type": "COSINE",
"index_type": settings.milvus_index_type,
"params": {"nlist": settings.milvus_nlist},
},
)
collection.load()
return collection
def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
"""Handle upsert for the Milvus Vector Index instance."""
if len(chunks) != len(vectors):
raise ValueError("chunks 与 vectors 数量不一致")
data = []
now = int(time.time())
for chunk, vector in zip(chunks, vectors):
data.append(
{
"id": chunk.chunk_id,
"doc_id": chunk.doc_id,
"doc_name": chunk.doc_name,
"content": chunk.content[:65535],
"embedding": vector,
"section_title": chunk.section_title[:512],
"section_path": json.dumps(chunk.section_path, ensure_ascii=False)[:4096],
"page_number": chunk.page_number,
"regulation_type": chunk.regulation_type[:128],
"version": chunk.version[:64],
"semantic_id": chunk.semantic_id[:128],
"block_type": chunk.block_type[:64],
"metadata_json": json.dumps(chunk.metadata, ensure_ascii=False)[:65535],
"created_at": now,
}
)
self.collection.insert(data)
self.collection.flush()
return len(data)
def delete_by_document(self, doc_id: str) -> int:
"""Delete by document for the Milvus Vector Index instance."""
result = self.collection.delete(f'doc_id == "{doc_id}"')
return len(result.primary_keys)
def search(self, query_vector: list[float], top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
"""Handle search for the Milvus Vector Index instance."""
results = self.collection.search(
data=[query_vector],
anns_field="embedding",
param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
limit=top_k,
filter=filters,
output_fields=[
"doc_id",
"doc_name",
"content",
"section_title",
"page_number",
"regulation_type",
"version",
"semantic_id",
"block_type",
"metadata_json",
],
)
payload: list[RetrievedChunk] = []
for hits in results:
for hit in hits:
metadata = {}
raw_metadata = hit.entity.get("metadata_json", "")
if raw_metadata:
try:
metadata = json.loads(raw_metadata)
except json.JSONDecodeError:
metadata = {"raw_metadata": raw_metadata}
payload.append(
RetrievedChunk(
chunk_id=str(hit.id),
doc_id=hit.entity.get("doc_id", ""),
doc_name=hit.entity.get("doc_name", ""),
content=hit.entity.get("content", ""),
score=float(hit.score),
section_title=hit.entity.get("section_title", ""),
page_number=int(hit.entity.get("page_number", 0) or 0),
metadata=metadata,
)
)
return payload
def health(self) -> dict:
"""Handle health for the Milvus Vector Index instance."""
return {
"connected": True,
"collection_name": self.collection_name,
"num_entities": self.collection.num_entities if self.collection else 0,
}