Fix SSE route dependency and align architecture docs

2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions
--- a/backend/app/infrastructure/vectorstore/init.py
+++ b/backend/app/infrastructure/vectorstore/init.py
@@ -0,0 +1,5 @@
+"""Package marker for ``app.infrastructure.vectorstore``."""
+# Keep package boundaries explicit so backend imports stay predictable.
+
+# Nothing is re-exported: a star-import of this package binds no names.
+__all__ = []
--- a/backend/app/infrastructure/vectorstore/dense_retriever.py
+++ b/backend/app/infrastructure/vectorstore/dense_retriever.py
@@ -0,0 +1,24 @@
+"""Implement infrastructure support for dense retriever."""
+
+from __future__ import annotations
+
+from app.domain.retrieval import EmbeddingProvider, RetrievalQuery, Retriever, RetrievedChunk, VectorIndex
+# Keep adapter behavior explicit so integration details remain easy to audit.
+
+
+
+class DenseRetriever(Retriever):
+    """Retriever that embeds the query text and looks it up in a vector index."""
+    def __init__(self, *, embedding_provider: EmbeddingProvider, vector_index: VectorIndex) -> None:
+        """Keep the embedding provider and vector index that every lookup uses."""
+        self.embedding_provider, self.vector_index = embedding_provider, vector_index
+
+    def retrieve(self, query: RetrievalQuery) -> list[RetrievedChunk]:
+        """Embed ``query.query`` and search the index with ``query.top_k`` and ``query.filters``."""
+        embedded = self.embedding_provider.embed_query(query.query)
+        return self.vector_index.search(embedded, query.top_k, query.filters)
+
+    def search(self, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
+        """Wrap the plain arguments in a ``RetrievalQuery`` and delegate to ``retrieve``."""
+        request = RetrievalQuery(query=query, top_k=top_k, filters=filters)
+        return self.retrieve(request)
--- a/backend/app/infrastructure/vectorstore/milvus_vector_index.py
+++ b/backend/app/infrastructure/vectorstore/milvus_vector_index.py
@@ -0,0 +1,154 @@
+"""Implement infrastructure support for milvus vector index."""
+
+from __future__ import annotations
+
+import json
+import time
+
+from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
+
+from app.config.settings import settings
+from app.domain.documents import Chunk
+from app.domain.retrieval import RetrievedChunk, VectorIndex
+# Keep adapter behavior explicit so integration details remain easy to audit.
+
+
+
+class MilvusVectorIndex(VectorIndex):
+    """Dense vector index stored in a single Milvus collection."""
+    def __init__(self) -> None:
+        """Connect to Milvus under the ``default`` alias, then load the collection."""
+        self.collection_name = settings.milvus_collection
+        self.db_name = settings.milvus_db_name
+        connections.connect(alias="default", host=settings.milvus_host, port=settings.milvus_port, db_name=self.db_name)
+        self.collection = self._ensure_collection()
+
+    @staticmethod
+    def _clip(text: str, max_bytes: int) -> str:
+        """Cut ``text`` to ``max_bytes`` UTF-8 bytes, since Milvus VARCHAR limits count bytes."""
+        raw = text.encode("utf-8")
+        # errors="ignore" drops a multi-byte character that the cut split in half.
+        return text if len(raw) <= max_bytes else raw[:max_bytes].decode("utf-8", errors="ignore")
+
+    def _ensure_collection(self) -> Collection:
+        """Return the loaded collection, creating its schema and index if it is missing."""
+        if utility.has_collection(self.collection_name):
+            collection = Collection(self.collection_name)
+            collection.load()
+            return collection
+        fields = [
+            FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=128, is_primary=True, auto_id=False),
+            FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
+            FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim),
+            FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
+            FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
+            FieldSchema(name="page_number", dtype=DataType.INT64),
+            FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=128),
+            FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=128),
+            FieldSchema(name="block_type", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="metadata_json", dtype=DataType.VARCHAR, max_length=65535),
+            FieldSchema(name="created_at", dtype=DataType.INT64),
+        ]
+        schema = CollectionSchema(fields=fields, description="Dense-only regulations index", enable_dynamic_field=False)
+        collection = Collection(name=self.collection_name, schema=schema)
+        index_params = {
+            "metric_type": "COSINE",
+            "index_type": settings.milvus_index_type,
+            "params": {"nlist": settings.milvus_nlist},
+        }
+        collection.create_index(field_name="embedding", index_params=index_params)
+        collection.load()
+        return collection
+
+    def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
+        """Write ``chunks`` with ``vectors``, replacing rows that share an id.
+
+        Returns the row count written; raises ``ValueError`` on a length mismatch.
+        """
+        if len(chunks) != len(vectors):
+            raise ValueError("chunks 与 vectors 数量不一致")
+        if not chunks:
+            return 0  # nothing to write, so skip the Milvus round trip
+        clip, now = self._clip, int(time.time())
+        data = [
+            {
+                "id": chunk.chunk_id,
+                "doc_id": chunk.doc_id,
+                "doc_name": clip(chunk.doc_name, 256),
+                "content": clip(chunk.content, 65535),
+                "embedding": vector,
+                "section_title": clip(chunk.section_title, 512),
+                "section_path": clip(json.dumps(chunk.section_path, ensure_ascii=False), 4096),
+                "page_number": chunk.page_number,
+                "regulation_type": clip(chunk.regulation_type, 128),
+                "version": clip(chunk.version, 64),
+                "semantic_id": clip(chunk.semantic_id, 128),
+                "block_type": clip(chunk.block_type, 64),
+                "metadata_json": clip(json.dumps(chunk.metadata, ensure_ascii=False), 65535),
+                "created_at": now,
+            }
+            for chunk, vector in zip(chunks, vectors)
+        ]
+        # Milvus insert() keeps duplicate primary keys, so re-ingesting a document
+        # would store each chunk twice; upsert() replaces the existing row instead.
+        self.collection.upsert(data)
+        self.collection.flush()
+        return len(data)
+
+    def delete_by_document(self, doc_id: str) -> int:
+        """Delete every row of ``doc_id`` and return the count Milvus reports."""
+        # Escape backslashes and quotes so the id cannot break out of the literal.
+        literal = doc_id.replace("\\", "\\\\").replace('"', '\\"')
+        result = self.collection.delete(f'doc_id == "{literal}"')
+        # delete_count is the counter for deletes; primary_keys may come back empty.
+        return int(result.delete_count)
+
+    def search(self, query_vector: list[float], top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
+        """Return the ``top_k`` nearest chunks, optionally restricted by the ``filters`` expression."""
+        results = self.collection.search(
+            data=[query_vector],
+            anns_field="embedding",
+            param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
+            limit=top_k,
+            expr=filters,  # the ORM keyword is ``expr``; ``filter`` is not a Collection.search parameter
+            output_fields=[
+                "doc_id", "doc_name", "content", "section_title", "page_number",
+                "regulation_type", "version", "semantic_id", "block_type", "metadata_json",
+            ],
+        )
+        payload: list[RetrievedChunk] = []
+        for hits in results:
+            for hit in hits:
+                raw_metadata = hit.entity.get("metadata_json", "")
+                metadata = {}
+                if raw_metadata:
+                    try:
+                        metadata = json.loads(raw_metadata)
+                    except json.JSONDecodeError:
+                        # Metadata clipped at write time no longer parses; keep the raw text.
+                        metadata = {"raw_metadata": raw_metadata}
+                chunk = RetrievedChunk(
+                    chunk_id=str(hit.id),
+                    doc_id=hit.entity.get("doc_id", ""),
+                    doc_name=hit.entity.get("doc_name", ""),
+                    content=hit.entity.get("content", ""),
+                    score=float(hit.score),
+                    section_title=hit.entity.get("section_title", ""),
+                    page_number=int(hit.entity.get("page_number", 0) or 0),
+                    metadata=metadata,
+                )
+                payload.append(chunk)
+        return payload
+
+    def health(self) -> dict:
+        """Report the collection name, entity count and whether Milvus answered the probe."""
+        status = {"connected": True, "collection_name": self.collection_name, "num_entities": 0}
+        try:
+            if self.collection:
+                status["num_entities"] = self.collection.num_entities
+        except Exception as exc:  # probe boundary: report the failure instead of raising
+            status.update(connected=False, error=str(exc))
+        return status