Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -0,0 +1,5 @@
"""Initialize the app.infrastructure.vectorstore package."""
# The package re-exports nothing: with an empty __all__, a star import of this
# package binds no names, so callers must import from the concrete submodules.
__all__ = []

View File

@@ -0,0 +1,24 @@
"""Implement infrastructure support for dense retriever."""
from __future__ import annotations
from app.domain.retrieval import EmbeddingProvider, RetrievalQuery, Retriever, RetrievedChunk, VectorIndex
# Keep adapter behavior explicit so integration details remain easy to audit.
class DenseRetriever(Retriever):
    """Retriever that embeds the query text and searches a vector index.

    The class holds no logic of its own beyond wiring: the embedding provider
    turns the query string into a vector, and the vector index performs the
    nearest-neighbour lookup.
    """

    def __init__(self, *, embedding_provider: EmbeddingProvider, vector_index: VectorIndex) -> None:
        """Store the collaborators used by every retrieval call.

        Args:
            embedding_provider: Object whose ``embed_query`` converts query
                text into a vector.
            vector_index: Object whose ``search`` returns the chunks closest
                to a query vector.
        """
        self.vector_index = vector_index
        self.embedding_provider = embedding_provider

    def retrieve(self, query: RetrievalQuery) -> list[RetrievedChunk]:
        """Embed ``query.query`` and return the matching chunks.

        Args:
            query: Retrieval request carrying the text, ``top_k`` and
                ``filters`` that are forwarded to the vector index.

        Returns:
            Whatever the vector index's ``search`` returns for the request.
        """
        embedded = self.embedding_provider.embed_query(query.query)
        index = self.vector_index
        return index.search(embedded, query.top_k, query.filters)

    def search(self, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
        """Convenience wrapper that builds a ``RetrievalQuery`` and retrieves.

        Args:
            query: Raw query text.
            top_k: Maximum number of chunks requested.
            filters: Optional filter expression passed through unchanged.

        Returns:
            The result of :meth:`retrieve` for the assembled request.
        """
        request = RetrievalQuery(query=query, top_k=top_k, filters=filters)
        return self.retrieve(request)

View File

@@ -0,0 +1,154 @@
"""Implement infrastructure support for milvus vector index."""
from __future__ import annotations
import json
import time
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
from app.config.settings import settings
from app.domain.documents import Chunk
from app.domain.retrieval import RetrievedChunk, VectorIndex
# Keep adapter behavior explicit so integration details remain easy to audit.
class MilvusVectorIndex(VectorIndex):
    """Vector index adapter backed by a single Milvus collection.

    The connection is opened in ``__init__`` under the ``"default"`` alias
    using values from ``settings``; the collection is created (with a COSINE
    index on ``embedding``) when it does not exist yet, and loaded either way.
    """

    def __init__(self) -> None:
        """Connect to Milvus and make sure the target collection is loaded."""
        self.collection_name = settings.milvus_collection
        self.db_name = settings.milvus_db_name
        connections.connect(
            alias="default",
            host=settings.milvus_host,
            port=settings.milvus_port,
            db_name=self.db_name,
        )
        self.collection = self._ensure_collection()

    @staticmethod
    def _truncate_utf8(text: str, max_bytes: int) -> str:
        """Return ``text`` cut so its UTF-8 encoding fits in ``max_bytes``.

        Milvus enforces VARCHAR ``max_length`` in bytes, so slicing by
        character count is not enough for multi-byte text. A character that
        would be split at the boundary is dropped entirely.
        """
        encoded = text.encode("utf-8")
        if len(encoded) <= max_bytes:
            return text
        return encoded[:max_bytes].decode("utf-8", errors="ignore")

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Return ``value`` as a double-quoted Milvus expression literal.

        Backslashes and double quotes are escaped so the value cannot
        terminate the literal early and change the meaning of the expression.
        """
        escaped = value.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    def _ensure_collection(self) -> Collection:
        """Return the loaded collection, creating it and its index if absent.

        Returns:
            The ``Collection`` named ``self.collection_name``, already loaded.
        """
        if utility.has_collection(self.collection_name):
            collection = Collection(self.collection_name)
            collection.load()
            return collection
        schema = CollectionSchema(
            fields=[
                FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=128, is_primary=True, auto_id=False),
                FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
                FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
                FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim),
                FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
                FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
                FieldSchema(name="page_number", dtype=DataType.INT64),
                FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=128),
                FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=64),
                FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=128),
                FieldSchema(name="block_type", dtype=DataType.VARCHAR, max_length=64),
                FieldSchema(name="metadata_json", dtype=DataType.VARCHAR, max_length=65535),
                FieldSchema(name="created_at", dtype=DataType.INT64),
            ],
            description="Dense-only regulations index",
            enable_dynamic_field=False,
        )
        collection = Collection(name=self.collection_name, schema=schema)
        collection.create_index(
            field_name="embedding",
            index_params={
                "metric_type": "COSINE",
                "index_type": settings.milvus_index_type,
                "params": {"nlist": settings.milvus_nlist},
            },
        )
        collection.load()
        return collection

    def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
        """Write chunks and their vectors to the collection, then flush.

        Args:
            chunks: Chunks to store; ``chunks[i]`` is paired with ``vectors[i]``.
            vectors: One embedding per chunk.

        Returns:
            Number of rows handed to Milvus (0 for empty input).

        Raises:
            ValueError: If ``chunks`` and ``vectors`` differ in length.
        """
        if len(chunks) != len(vectors):
            raise ValueError("chunks 与 vectors 数量不一致")
        if not chunks:
            # Nothing to write; skip the round trip instead of inserting an empty batch.
            return 0
        fit = self._truncate_utf8
        now = int(time.time())
        # String limits below mirror the max_length values declared in
        # _ensure_collection. "id" and "doc_id" are deliberately not truncated:
        # shortening an identifier could make two rows collide or break
        # delete_by_document. Truncated JSON is no longer valid JSON; search()
        # falls back to returning it under "raw_metadata".
        data = [
            {
                "id": chunk.chunk_id,
                "doc_id": chunk.doc_id,
                "doc_name": fit(chunk.doc_name, 256),
                "content": fit(chunk.content, 65535),
                "embedding": vector,
                "section_title": fit(chunk.section_title, 512),
                "section_path": fit(json.dumps(chunk.section_path, ensure_ascii=False), 4096),
                # Same coercion as the read path in search(): a missing page becomes 0.
                "page_number": int(chunk.page_number or 0),
                "regulation_type": fit(chunk.regulation_type, 128),
                "version": fit(chunk.version, 64),
                "semantic_id": fit(chunk.semantic_id, 128),
                "block_type": fit(chunk.block_type, 64),
                "metadata_json": fit(json.dumps(chunk.metadata, ensure_ascii=False), 65535),
                "created_at": now,
            }
            for chunk, vector in zip(chunks, vectors)
        ]
        # NOTE(review): despite the method name this calls insert(), not
        # upsert(); confirm callers remove a document's old chunks first
        # (e.g. via delete_by_document) before re-indexing it.
        self.collection.insert(data)
        self.collection.flush()
        return len(data)

    def delete_by_document(self, doc_id: str) -> int:
        """Delete every row whose ``doc_id`` equals the given value.

        Args:
            doc_id: Document identifier to match; it is escaped before being
                embedded in the delete expression.

        Returns:
            ``len(result.primary_keys)`` of the Milvus mutation result.
        """
        result = self.collection.delete(f"doc_id == {self._quote_literal(doc_id)}")
        return len(result.primary_keys)

    def search(self, query_vector: list[float], top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
        """Run a COSINE similarity search and map hits to ``RetrievedChunk``.

        Args:
            query_vector: Embedding of the query.
            top_k: Maximum number of hits requested from Milvus.
            filters: Optional Milvus boolean expression restricting the search.

        Returns:
            One ``RetrievedChunk`` per hit, in the order Milvus returned them.
        """
        results = self.collection.search(
            data=[query_vector],
            anns_field="embedding",
            param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
            limit=top_k,
            # Collection.search names its filter argument "expr"; an unknown
            # "filter" keyword is swallowed by **kwargs and never applied.
            expr=filters,
            output_fields=[
                "doc_id",
                "doc_name",
                "content",
                "section_title",
                "page_number",
                "regulation_type",
                "version",
                "semantic_id",
                "block_type",
                "metadata_json",
            ],
        )
        payload: list[RetrievedChunk] = []
        for hits in results:
            for hit in hits:
                entity = hit.entity
                metadata = {}
                raw_metadata = entity.get("metadata_json", "")
                if raw_metadata:
                    try:
                        metadata = json.loads(raw_metadata)
                    except json.JSONDecodeError:
                        # Stored JSON may have been truncated on write; keep
                        # the raw text rather than losing it.
                        metadata = {"raw_metadata": raw_metadata}
                payload.append(
                    RetrievedChunk(
                        chunk_id=str(hit.id),
                        doc_id=entity.get("doc_id", ""),
                        doc_name=entity.get("doc_name", ""),
                        content=entity.get("content", ""),
                        score=float(hit.score),
                        section_title=entity.get("section_title", ""),
                        page_number=int(entity.get("page_number", 0) or 0),
                        metadata=metadata,
                    )
                )
        return payload

    def health(self) -> dict:
        """Return a small status dict for this index.

        Returns:
            A dict with ``connected``, ``collection_name`` and ``num_entities``.
            NOTE(review): ``connected`` is hard-coded to ``True`` and is not
            probed; reading ``num_entities`` is the only call that touches the
            collection here.
        """
        return {
            "connected": True,
            "collection_name": self.collection_name,
            "num_entities": self.collection.num_entities if self.collection else 0,
        }