Fix SSE route dependency and align architecture docs
This commit is contained in:
5
backend/app/infrastructure/__init__.py
Normal file
5
backend/app/infrastructure/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Initialize the app.infrastructure package."""
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
|
||||
__all__ = []
|
||||
5
backend/app/infrastructure/embedding/__init__.py
Normal file
5
backend/app/infrastructure/embedding/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Initialize the app.infrastructure.embedding package."""
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
|
||||
__all__ = []
|
||||
@@ -0,0 +1,59 @@
|
||||
"""Implement infrastructure support for openai compatible embedding provider."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config.settings import settings
|
||||
from app.domain.retrieval import EmbeddingProvider
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class OpenAICompatibleEmbeddingProvider(EmbeddingProvider):
|
||||
"""Provide the Open A I Compatible Embedding Provider provider."""
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the Open A I Compatible Embedding Provider instance."""
|
||||
self.base_url = settings.embedding_base_url.rstrip("/")
|
||||
self.api_key = (
|
||||
settings.embedding_api_key
|
||||
or os.getenv("OPENAI_API_KEY", "")
|
||||
or os.getenv("QWEN_API_KEY", "")
|
||||
or os.getenv("DEEPSEEK_API_KEY", "")
|
||||
)
|
||||
self.model = settings.embedding_model
|
||||
self.timeout = settings.embedding_timeout_seconds
|
||||
self.dimension = settings.embedding_dim
|
||||
|
||||
def _request(self, texts: list[str]) -> list[list[float]]:
|
||||
"""Handle request for this module for the Open A I Compatible Embedding Provider instance."""
|
||||
if not self.api_key:
|
||||
raise ValueError("缺少 EMBEDDING_API_KEY / OPENAI_API_KEY")
|
||||
response = httpx.post(
|
||||
f"{self.base_url}/embeddings",
|
||||
headers={
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
json={"model": self.model, "input": texts},
|
||||
timeout=self.timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
vectors = [item["embedding"] for item in sorted(data.get("data", []), key=lambda item: item["index"])]
|
||||
if any(len(vector) != self.dimension for vector in vectors):
|
||||
raise ValueError(f"embedding 维度不匹配,期望 {self.dimension}")
|
||||
return vectors
|
||||
|
||||
def embed_texts(self, texts: list[str]) -> list[list[float]]:
|
||||
"""Embed texts for the Open A I Compatible Embedding Provider instance."""
|
||||
if not texts:
|
||||
return []
|
||||
return self._request(texts)
|
||||
|
||||
def embed_query(self, text: str) -> list[float]:
|
||||
"""Embed query for the Open A I Compatible Embedding Provider instance."""
|
||||
vectors = self._request([text])
|
||||
return vectors[0]
|
||||
@@ -0,0 +1,144 @@
|
||||
"""Implement infrastructure support for openai compatible answer generator."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from typing import Generator
|
||||
|
||||
from app.config.settings import settings
|
||||
from app.domain.conversation import AnswerGenerator, AnswerResult, AnswerSource
|
||||
from app.domain.retrieval import RetrievedChunk
|
||||
from app.services.llm.llm_factory import get_llm_client
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
PROMPT_TEMPLATES = {
|
||||
"default": "你是法规知识问答助手。请仅依据提供的上下文回答;如果上下文不足,明确说明。",
|
||||
"compliance_qa": "你是法规合规问答助手。优先引用给定法规原文,回答要准确、克制,并注明依据来源。",
|
||||
}
|
||||
|
||||
|
||||
class OpenAICompatibleAnswerGenerator(AnswerGenerator):
|
||||
"""Represent the Open A I Compatible Answer Generator type."""
|
||||
def _build_messages(
|
||||
self,
|
||||
*,
|
||||
query: str,
|
||||
retrieved_chunks: list[RetrievedChunk],
|
||||
history: list[dict[str, str]] | None,
|
||||
prompt_template: str | None,
|
||||
) -> tuple[list[dict[str, str]], int]:
|
||||
"""Handle build messages for this module for the Open A I Compatible Answer Generator instance."""
|
||||
system_prompt = PROMPT_TEMPLATES.get(prompt_template or "compliance_qa", PROMPT_TEMPLATES["default"])
|
||||
context_blocks = []
|
||||
context_tokens = 0
|
||||
for idx, chunk in enumerate(retrieved_chunks, start=1):
|
||||
block = (
|
||||
f"[{idx}] 文档: {chunk.doc_name}\n"
|
||||
f"章节: {chunk.section_title or '未标注'}\n"
|
||||
f"页码: {chunk.page_number}\n"
|
||||
f"内容: {chunk.content}"
|
||||
)
|
||||
context_tokens += len(block)
|
||||
context_blocks.append(block)
|
||||
context = "\n\n".join(context_blocks)[: settings.rag_max_context_tokens * 4]
|
||||
messages = [{"role": "system", "content": system_prompt}]
|
||||
for item in history or []:
|
||||
messages.append({"role": item["role"], "content": item["content"]})
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"问题:{query}\n\n参考上下文:\n{context}\n\n请在回答后给出简要引用编号。",
|
||||
}
|
||||
)
|
||||
return messages, min(context_tokens, settings.rag_max_context_tokens)
|
||||
|
||||
def _sources(self, chunks: list[RetrievedChunk]) -> list[AnswerSource]:
|
||||
"""Handle sources for this module for the Open A I Compatible Answer Generator instance."""
|
||||
return [
|
||||
AnswerSource(
|
||||
doc_id=chunk.doc_id,
|
||||
doc_name=chunk.doc_name,
|
||||
chunk_id=chunk.chunk_id,
|
||||
section_title=chunk.section_title,
|
||||
page_number=chunk.page_number,
|
||||
score=chunk.score,
|
||||
content=chunk.content,
|
||||
metadata=chunk.metadata,
|
||||
)
|
||||
for chunk in chunks
|
||||
]
|
||||
|
||||
def generate(
|
||||
self,
|
||||
*,
|
||||
query: str,
|
||||
retrieved_chunks: list[RetrievedChunk],
|
||||
history: list[dict[str, str]] | None = None,
|
||||
provider: str | None = None,
|
||||
model: str | None = None,
|
||||
prompt_template: str | None = None,
|
||||
) -> AnswerResult:
|
||||
"""Handle generate for the Open A I Compatible Answer Generator instance."""
|
||||
start = time.time()
|
||||
messages, context_tokens = self._build_messages(
|
||||
query=query,
|
||||
retrieved_chunks=retrieved_chunks,
|
||||
history=history,
|
||||
prompt_template=prompt_template,
|
||||
)
|
||||
client = get_llm_client(provider=provider or settings.llm_provider, model=model or settings.llm_model)
|
||||
response = client.chat(messages)
|
||||
latency_ms = int((time.time() - start) * 1000)
|
||||
return AnswerResult(
|
||||
answer=response.content if response.is_success else "",
|
||||
sources=self._sources(retrieved_chunks),
|
||||
model=response.model or (model or settings.llm_model),
|
||||
latency_ms=latency_ms,
|
||||
retrieved_count=len(retrieved_chunks),
|
||||
context_tokens=context_tokens,
|
||||
truncated=False,
|
||||
error=response.error,
|
||||
)
|
||||
|
||||
def stream_generate(
|
||||
self,
|
||||
*,
|
||||
query: str,
|
||||
retrieved_chunks: list[RetrievedChunk],
|
||||
history: list[dict[str, str]] | None = None,
|
||||
provider: str | None = None,
|
||||
model: str | None = None,
|
||||
prompt_template: str | None = None,
|
||||
) -> Generator[dict, None, AnswerResult]:
|
||||
"""Stream generate for the Open A I Compatible Answer Generator instance."""
|
||||
start = time.time()
|
||||
messages, context_tokens = self._build_messages(
|
||||
query=query,
|
||||
retrieved_chunks=retrieved_chunks,
|
||||
history=history,
|
||||
prompt_template=prompt_template,
|
||||
)
|
||||
sources = [source.__dict__ for source in self._sources(retrieved_chunks)]
|
||||
yield {"event": "sources", "data": sources}
|
||||
client = get_llm_client(provider=provider or settings.llm_provider, model=model or settings.llm_model)
|
||||
answer_parts: list[str] = []
|
||||
if hasattr(client, "stream_chat"):
|
||||
for chunk in client.stream_chat(messages):
|
||||
answer_parts.append(chunk)
|
||||
yield {"event": "content", "data": chunk}
|
||||
else:
|
||||
response = client.chat(messages)
|
||||
answer_parts.append(response.content)
|
||||
yield {"event": "content", "data": response.content}
|
||||
full_answer = "".join(answer_parts)
|
||||
yield {
|
||||
"event": "done",
|
||||
"data": {
|
||||
"latency_ms": int((time.time() - start) * 1000),
|
||||
"retrieved_count": len(retrieved_chunks),
|
||||
"context_tokens": context_tokens,
|
||||
"model": model or settings.llm_model,
|
||||
},
|
||||
}
|
||||
5
backend/app/infrastructure/parser/__init__.py
Normal file
5
backend/app/infrastructure/parser/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Initialize the app.infrastructure.parser package."""
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
|
||||
__all__ = []
|
||||
55
backend/app/infrastructure/parser/aliyun_document_parser.py
Normal file
55
backend/app/infrastructure/parser/aliyun_document_parser.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Implement infrastructure support for aliyun document parser."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.aliyun_parser.parse_pdf import (
|
||||
MAX_CHARS,
|
||||
OVERLAP_CHARS,
|
||||
build_semantic_blocks,
|
||||
build_structure_nodes,
|
||||
build_vector_chunks,
|
||||
collect_all_results,
|
||||
init_client,
|
||||
submit_job,
|
||||
wait_for_completion,
|
||||
)
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class AliyunDocumentParser(DocumentParser):
|
||||
"""Provide the Aliyun Document Parser parser."""
|
||||
parser_name = "aliyun_docmind"
|
||||
|
||||
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
|
||||
"""Handle parse for the Aliyun Document Parser instance."""
|
||||
client = init_client()
|
||||
task_id = submit_job(client, file_path)
|
||||
if not wait_for_completion(client, task_id):
|
||||
raise RuntimeError("阿里云文档解析任务失败")
|
||||
layouts = collect_all_results(client, task_id)
|
||||
structure_nodes = build_structure_nodes(layouts)
|
||||
semantic_blocks = build_semantic_blocks(layouts)
|
||||
vector_chunks = build_vector_chunks(
|
||||
semantic_blocks,
|
||||
doc_id=doc_id,
|
||||
doc_title=doc_name,
|
||||
max_chars=MAX_CHARS,
|
||||
overlap_chars=OVERLAP_CHARS,
|
||||
)
|
||||
raw_text = "\n\n".join(
|
||||
block.get("text", "")
|
||||
for block in semantic_blocks
|
||||
if block.get("text")
|
||||
)
|
||||
return ParsedDocument(
|
||||
doc_id=doc_id,
|
||||
doc_name=doc_name,
|
||||
structure_nodes=structure_nodes,
|
||||
semantic_blocks=semantic_blocks,
|
||||
vector_chunks=vector_chunks,
|
||||
parser_name=self.parser_name,
|
||||
raw_text=raw_text,
|
||||
metadata={"task_id": task_id, "layout_count": len(layouts)},
|
||||
)
|
||||
66
backend/app/infrastructure/parser/local_chunk_builder.py
Normal file
66
backend/app/infrastructure/parser/local_chunk_builder.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Local chunk builder adapter for the migrated backend architecture."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
|
||||
from app.services.embedding.text_chunker import RegulationChunker
|
||||
|
||||
|
||||
class LocalRegulationChunkBuilder(ChunkBuilder):
|
||||
"""Adapt the existing markdown chunker to the new chunk builder port."""
|
||||
|
||||
def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
|
||||
self.chunker = RegulationChunker(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
)
|
||||
|
||||
def build(
|
||||
self,
|
||||
*,
|
||||
parsed_document: ParsedDocument,
|
||||
regulation_type: str,
|
||||
version: str,
|
||||
) -> list[Chunk]:
|
||||
markdown_text = parsed_document.raw_text.strip()
|
||||
if not markdown_text:
|
||||
return []
|
||||
|
||||
legacy_chunks = self.chunker.chunk_document(
|
||||
markdown_text,
|
||||
doc_id=parsed_document.doc_id,
|
||||
doc_name=parsed_document.doc_name,
|
||||
regulation_type=regulation_type,
|
||||
version=version,
|
||||
)
|
||||
|
||||
chunks: list[Chunk] = []
|
||||
for item in legacy_chunks:
|
||||
metadata = {
|
||||
"section_number": item.metadata.section_number,
|
||||
"section_title": item.metadata.section_title,
|
||||
"clause_number": item.metadata.clause_number,
|
||||
"start_position": item.metadata.start_position,
|
||||
"end_position": item.metadata.end_position,
|
||||
"token_count": item.token_count,
|
||||
"source": "local_chunker",
|
||||
}
|
||||
section_path = [value for value in [item.metadata.section_number, item.metadata.section_title] if value]
|
||||
chunks.append(
|
||||
Chunk(
|
||||
chunk_id=item.metadata.chunk_id,
|
||||
doc_id=parsed_document.doc_id,
|
||||
doc_name=parsed_document.doc_name,
|
||||
content=item.content,
|
||||
embedding_text=item.content,
|
||||
section_title=item.metadata.section_title or item.metadata.section_number,
|
||||
section_path=section_path,
|
||||
page_number=item.metadata.page_number,
|
||||
regulation_type=regulation_type,
|
||||
version=version,
|
||||
semantic_id=item.metadata.clause_number,
|
||||
block_type="local_markdown_chunk",
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
return chunks
|
||||
38
backend/app/infrastructure/parser/local_document_parser.py
Normal file
38
backend/app/infrastructure/parser/local_document_parser.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""Local parser adapter for the migrated backend architecture."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
from app.services.parser.docx_parser import parse_docx_to_markdown
|
||||
from app.services.parser.pdf_parser import parse_pdf_to_markdown
|
||||
|
||||
|
||||
class LocalDocumentParser(DocumentParser):
|
||||
"""Adapt the existing local PDF/DOCX parsers to the new parser port."""
|
||||
|
||||
parser_name = "local_markdown_parser"
|
||||
|
||||
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
|
||||
suffix = Path(file_path).suffix.lower()
|
||||
if suffix == ".pdf":
|
||||
markdown_text = parse_pdf_to_markdown(file_path)
|
||||
elif suffix in {".docx", ".doc"}:
|
||||
markdown_text = parse_docx_to_markdown(file_path)
|
||||
else:
|
||||
raise ValueError(f"不支持的文件类型: {suffix}")
|
||||
|
||||
if not markdown_text.strip():
|
||||
raise ValueError("本地解析完成但未提取到有效文本")
|
||||
|
||||
return ParsedDocument(
|
||||
doc_id=doc_id,
|
||||
doc_name=doc_name,
|
||||
structure_nodes=[],
|
||||
semantic_blocks=[],
|
||||
vector_chunks=[],
|
||||
parser_name=self.parser_name,
|
||||
raw_text=markdown_text,
|
||||
metadata={"source": "local_parser", "file_suffix": suffix},
|
||||
)
|
||||
48
backend/app/infrastructure/parser/vector_chunk_builder.py
Normal file
48
backend/app/infrastructure/parser/vector_chunk_builder.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""Implement infrastructure support for vector chunk builder."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class AliyunVectorChunkBuilder(ChunkBuilder):
|
||||
"""Provide the Aliyun Vector Chunk Builder builder."""
|
||||
def build(
|
||||
self,
|
||||
*,
|
||||
parsed_document: ParsedDocument,
|
||||
regulation_type: str,
|
||||
version: str,
|
||||
) -> list[Chunk]:
|
||||
"""Handle build for the Aliyun Vector Chunk Builder instance."""
|
||||
chunks: list[Chunk] = []
|
||||
for index, item in enumerate(parsed_document.vector_chunks):
|
||||
content = item.get("content") or item.get("text") or ""
|
||||
embedding_text = item.get("embedding_text") or content
|
||||
if not embedding_text.strip():
|
||||
continue
|
||||
section_path = item.get("section_path") or []
|
||||
section_title = item.get("section_title") or (section_path[-1] if section_path else "")
|
||||
page_number = item.get("page_start") or item.get("page") or 0
|
||||
chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
|
||||
metadata = {k: v for k, v in item.items() if k not in {"content", "embedding_text"}}
|
||||
chunks.append(
|
||||
Chunk(
|
||||
chunk_id=str(chunk_id),
|
||||
doc_id=parsed_document.doc_id,
|
||||
doc_name=parsed_document.doc_name,
|
||||
content=content,
|
||||
embedding_text=embedding_text,
|
||||
section_title=section_title,
|
||||
section_path=section_path,
|
||||
page_number=int(page_number or 0),
|
||||
regulation_type=regulation_type,
|
||||
version=version,
|
||||
semantic_id=item.get("semantic_id", ""),
|
||||
block_type=item.get("block_type", ""),
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
return chunks
|
||||
5
backend/app/infrastructure/session/__init__.py
Normal file
5
backend/app/infrastructure/session/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Initialize the app.infrastructure.session package."""
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
|
||||
__all__ = []
|
||||
@@ -0,0 +1,95 @@
|
||||
"""Implement infrastructure support for in memory conversation store."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from app.domain.conversation import ConversationMessage, ConversationSession, ConversationStore
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class InMemoryConversationStore(ConversationStore):
|
||||
"""Provide the In Memory Conversation Store store implementation."""
|
||||
def __init__(self, *, max_sessions: int = 100, timeout_minutes: int = 30) -> None:
|
||||
"""Initialize the In Memory Conversation Store instance."""
|
||||
self.max_sessions = max_sessions
|
||||
self.timeout_seconds = timeout_minutes * 60
|
||||
self.sessions: dict[str, ConversationSession] = {}
|
||||
|
||||
def _now(self) -> int:
|
||||
"""Handle now for this module for the In Memory Conversation Store instance."""
|
||||
return int(time.time())
|
||||
|
||||
def _cleanup_expired(self) -> None:
|
||||
"""Handle cleanup expired for this module for the In Memory Conversation Store instance."""
|
||||
now = self._now()
|
||||
expired = [
|
||||
session_id
|
||||
for session_id, session in self.sessions.items()
|
||||
if (now - session.updated_at) > self.timeout_seconds
|
||||
]
|
||||
for session_id in expired:
|
||||
self.sessions.pop(session_id, None)
|
||||
|
||||
def create_session(self, metadata: dict | None = None) -> ConversationSession:
|
||||
"""Create session for the In Memory Conversation Store instance."""
|
||||
self._cleanup_expired()
|
||||
if len(self.sessions) >= self.max_sessions:
|
||||
oldest = min(self.sessions.values(), key=lambda item: item.updated_at)
|
||||
self.sessions.pop(oldest.session_id, None)
|
||||
session_id = str(uuid.uuid4())[:8]
|
||||
session = ConversationSession(
|
||||
session_id=session_id,
|
||||
created_at=self._now(),
|
||||
updated_at=self._now(),
|
||||
metadata=metadata or {},
|
||||
)
|
||||
self.sessions[session_id] = session
|
||||
return session
|
||||
|
||||
def get_session(self, session_id: str) -> ConversationSession | None:
|
||||
"""Return session for the In Memory Conversation Store instance."""
|
||||
self._cleanup_expired()
|
||||
return self.sessions.get(session_id)
|
||||
|
||||
def save_message(
|
||||
self,
|
||||
session_id: str,
|
||||
*,
|
||||
role: str,
|
||||
content: str,
|
||||
sources: list[dict] | None = None,
|
||||
) -> ConversationSession | None:
|
||||
"""Save message for the In Memory Conversation Store instance."""
|
||||
session = self.get_session(session_id)
|
||||
if not session:
|
||||
return None
|
||||
session.messages.append(
|
||||
ConversationMessage(
|
||||
role=role,
|
||||
content=content,
|
||||
timestamp=self._now(),
|
||||
sources=sources or [],
|
||||
)
|
||||
)
|
||||
session.updated_at = self._now()
|
||||
return session
|
||||
|
||||
def delete_session(self, session_id: str) -> bool:
|
||||
"""Delete session for the In Memory Conversation Store instance."""
|
||||
return self.sessions.pop(session_id, None) is not None
|
||||
|
||||
def list_sessions(self) -> list[dict]:
|
||||
"""List sessions for the In Memory Conversation Store instance."""
|
||||
self._cleanup_expired()
|
||||
return [
|
||||
{
|
||||
"session_id": session.session_id,
|
||||
"message_count": len(session.messages),
|
||||
"created_at": session.created_at,
|
||||
"updated_at": session.updated_at,
|
||||
}
|
||||
for session in self.sessions.values()
|
||||
]
|
||||
5
backend/app/infrastructure/storage/__init__.py
Normal file
5
backend/app/infrastructure/storage/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Initialize the app.infrastructure.storage package."""
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
|
||||
__all__ = []
|
||||
109
backend/app/infrastructure/storage/json_document_repository.py
Normal file
109
backend/app/infrastructure/storage/json_document_repository.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""Implement infrastructure support for json document repository."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
from app.domain.documents import Document, DocumentRepository, DocumentStatus
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class JsonDocumentRepository(DocumentRepository):
|
||||
"""Provide the Json Document Repository repository implementation."""
|
||||
def __init__(self, file_path: str) -> None:
|
||||
"""Initialize the Json Document Repository instance."""
|
||||
self.file_path = Path(file_path)
|
||||
self.file_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
if not self.file_path.exists():
|
||||
self.file_path.write_text("{}", encoding="utf-8")
|
||||
|
||||
def _load(self) -> dict[str, dict]:
|
||||
"""Handle load for this module for the Json Document Repository instance."""
|
||||
return json.loads(self.file_path.read_text(encoding="utf-8") or "{}")
|
||||
|
||||
def _save(self, payload: dict[str, dict]) -> None:
|
||||
"""Handle save for this module for the Json Document Repository instance."""
|
||||
self.file_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
def _serialize(self, document: Document) -> dict:
|
||||
"""Handle serialize for this module for the Json Document Repository instance."""
|
||||
payload = document.__dict__.copy()
|
||||
payload["status"] = document.status.value
|
||||
payload["created_at"] = document.created_at.isoformat()
|
||||
payload["updated_at"] = document.updated_at.isoformat()
|
||||
return payload
|
||||
|
||||
def _deserialize(self, payload: dict) -> Document:
|
||||
"""Handle deserialize for this module for the Json Document Repository instance."""
|
||||
return Document(
|
||||
**{
|
||||
**payload,
|
||||
"status": DocumentStatus(payload["status"]),
|
||||
"created_at": datetime.fromisoformat(payload["created_at"]),
|
||||
"updated_at": datetime.fromisoformat(payload["updated_at"]),
|
||||
}
|
||||
)
|
||||
|
||||
def create(self, document: Document) -> Document:
|
||||
"""Handle create for the Json Document Repository instance."""
|
||||
payload = self._load()
|
||||
payload[document.doc_id] = self._serialize(document)
|
||||
self._save(payload)
|
||||
return document
|
||||
|
||||
def update(self, document: Document) -> Document:
|
||||
"""Handle update for the Json Document Repository instance."""
|
||||
document.updated_at = datetime.now(UTC)
|
||||
payload = self._load()
|
||||
payload[document.doc_id] = self._serialize(document)
|
||||
self._save(payload)
|
||||
return document
|
||||
|
||||
def get(self, doc_id: str) -> Document | None:
|
||||
"""Handle get for the Json Document Repository instance."""
|
||||
payload = self._load()
|
||||
item = payload.get(doc_id)
|
||||
return self._deserialize(item) if item else None
|
||||
|
||||
def list(self, limit: int | None = None) -> list[Document]:
|
||||
"""Handle list for the Json Document Repository instance."""
|
||||
payload = self._load()
|
||||
documents = [self._deserialize(item) for item in payload.values()]
|
||||
documents.sort(key=lambda item: item.updated_at, reverse=True)
|
||||
return documents[:limit] if limit is not None else documents
|
||||
|
||||
def update_status(
|
||||
self,
|
||||
doc_id: str,
|
||||
status: DocumentStatus,
|
||||
*,
|
||||
error_message: str = "",
|
||||
chunk_count: int | None = None,
|
||||
summary: str | None = None,
|
||||
summary_latency_ms: int | None = None,
|
||||
parser_name: str | None = None,
|
||||
index_name: str | None = None,
|
||||
metadata: dict | None = None,
|
||||
) -> Document | None:
|
||||
"""Update status for the Json Document Repository instance."""
|
||||
document = self.get(doc_id)
|
||||
if not document:
|
||||
return None
|
||||
document.status = status
|
||||
document.error_message = error_message
|
||||
if chunk_count is not None:
|
||||
document.chunk_count = chunk_count
|
||||
if summary is not None:
|
||||
document.summary = summary
|
||||
if summary_latency_ms is not None:
|
||||
document.summary_latency_ms = summary_latency_ms
|
||||
if parser_name is not None:
|
||||
document.parser_name = parser_name
|
||||
if index_name is not None:
|
||||
document.index_name = index_name
|
||||
if metadata:
|
||||
document.metadata.update(metadata)
|
||||
return self.update(document)
|
||||
47
backend/app/infrastructure/storage/minio_binary_store.py
Normal file
47
backend/app/infrastructure/storage/minio_binary_store.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Implement infrastructure support for minio binary store."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.domain.documents import DocumentBinaryStore
|
||||
from app.services.storage.minio_client import MinIOClient
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class MinioDocumentBinaryStore(DocumentBinaryStore):
|
||||
"""Provide the Minio Document Binary Store store implementation."""
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the Minio Document Binary Store instance."""
|
||||
self.client = MinIOClient()
|
||||
self.client.connect()
|
||||
self.client.ensure_bucket()
|
||||
|
||||
def save(
|
||||
self,
|
||||
*,
|
||||
object_name: str,
|
||||
data: bytes,
|
||||
content_type: str,
|
||||
metadata: dict[str, str] | None = None,
|
||||
) -> None:
|
||||
"""Handle save for the Minio Document Binary Store instance."""
|
||||
success = self.client.upload_bytes(
|
||||
data=data,
|
||||
object_name=object_name,
|
||||
content_type=content_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
if not success:
|
||||
raise RuntimeError("MinIO 保存失败")
|
||||
|
||||
def read(self, object_name: str) -> bytes:
|
||||
"""Handle read for the Minio Document Binary Store instance."""
|
||||
data = self.client.get_object_data(object_name)
|
||||
if data is None:
|
||||
raise FileNotFoundError(f"对象不存在: {object_name}")
|
||||
return data
|
||||
|
||||
def delete(self, object_name: str) -> None:
|
||||
"""Handle delete for the Minio Document Binary Store instance."""
|
||||
if not self.client.delete_object(object_name):
|
||||
raise FileNotFoundError(f"对象删除失败: {object_name}")
|
||||
5
backend/app/infrastructure/vectorstore/__init__.py
Normal file
5
backend/app/infrastructure/vectorstore/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Initialize the app.infrastructure.vectorstore package."""
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
|
||||
__all__ = []
|
||||
24
backend/app/infrastructure/vectorstore/dense_retriever.py
Normal file
24
backend/app/infrastructure/vectorstore/dense_retriever.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""Implement infrastructure support for dense retriever."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.domain.retrieval import EmbeddingProvider, RetrievalQuery, Retriever, RetrievedChunk, VectorIndex
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class DenseRetriever(Retriever):
|
||||
"""Provide the Dense Retriever retriever."""
|
||||
def __init__(self, *, embedding_provider: EmbeddingProvider, vector_index: VectorIndex) -> None:
|
||||
"""Initialize the Dense Retriever instance."""
|
||||
self.embedding_provider = embedding_provider
|
||||
self.vector_index = vector_index
|
||||
|
||||
def retrieve(self, query: RetrievalQuery) -> list[RetrievedChunk]:
|
||||
"""Handle retrieve for the Dense Retriever instance."""
|
||||
query_vector = self.embedding_provider.embed_query(query.query)
|
||||
return self.vector_index.search(query_vector, query.top_k, query.filters)
|
||||
|
||||
def search(self, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
|
||||
"""Handle search for the Dense Retriever instance."""
|
||||
return self.retrieve(RetrievalQuery(query=query, top_k=top_k, filters=filters))
|
||||
154
backend/app/infrastructure/vectorstore/milvus_vector_index.py
Normal file
154
backend/app/infrastructure/vectorstore/milvus_vector_index.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""Implement infrastructure support for milvus vector index."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
|
||||
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
|
||||
|
||||
from app.config.settings import settings
|
||||
from app.domain.documents import Chunk
|
||||
from app.domain.retrieval import RetrievedChunk, VectorIndex
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class MilvusVectorIndex(VectorIndex):
|
||||
"""Provide the Milvus Vector Index index implementation."""
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the Milvus Vector Index instance."""
|
||||
self.collection_name = settings.milvus_collection
|
||||
self.db_name = settings.milvus_db_name
|
||||
connections.connect(
|
||||
alias="default",
|
||||
host=settings.milvus_host,
|
||||
port=settings.milvus_port,
|
||||
db_name=self.db_name,
|
||||
)
|
||||
self.collection = self._ensure_collection()
|
||||
|
||||
def _ensure_collection(self) -> Collection:
|
||||
"""Handle ensure collection for this module for the Milvus Vector Index instance."""
|
||||
if utility.has_collection(self.collection_name):
|
||||
collection = Collection(self.collection_name)
|
||||
collection.load()
|
||||
return collection
|
||||
schema = CollectionSchema(
|
||||
fields=[
|
||||
FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=128, is_primary=True, auto_id=False),
|
||||
FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
|
||||
FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
|
||||
FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535),
|
||||
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=settings.embedding_dim),
|
||||
FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
|
||||
FieldSchema(name="section_path", dtype=DataType.VARCHAR, max_length=4096),
|
||||
FieldSchema(name="page_number", dtype=DataType.INT64),
|
||||
FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=128),
|
||||
FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=64),
|
||||
FieldSchema(name="semantic_id", dtype=DataType.VARCHAR, max_length=128),
|
||||
FieldSchema(name="block_type", dtype=DataType.VARCHAR, max_length=64),
|
||||
FieldSchema(name="metadata_json", dtype=DataType.VARCHAR, max_length=65535),
|
||||
FieldSchema(name="created_at", dtype=DataType.INT64),
|
||||
],
|
||||
description="Dense-only regulations index",
|
||||
enable_dynamic_field=False,
|
||||
)
|
||||
collection = Collection(name=self.collection_name, schema=schema)
|
||||
collection.create_index(
|
||||
field_name="embedding",
|
||||
index_params={
|
||||
"metric_type": "COSINE",
|
||||
"index_type": settings.milvus_index_type,
|
||||
"params": {"nlist": settings.milvus_nlist},
|
||||
},
|
||||
)
|
||||
collection.load()
|
||||
return collection
|
||||
|
||||
def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
|
||||
"""Handle upsert for the Milvus Vector Index instance."""
|
||||
if len(chunks) != len(vectors):
|
||||
raise ValueError("chunks 与 vectors 数量不一致")
|
||||
data = []
|
||||
now = int(time.time())
|
||||
for chunk, vector in zip(chunks, vectors):
|
||||
data.append(
|
||||
{
|
||||
"id": chunk.chunk_id,
|
||||
"doc_id": chunk.doc_id,
|
||||
"doc_name": chunk.doc_name,
|
||||
"content": chunk.content[:65535],
|
||||
"embedding": vector,
|
||||
"section_title": chunk.section_title[:512],
|
||||
"section_path": json.dumps(chunk.section_path, ensure_ascii=False)[:4096],
|
||||
"page_number": chunk.page_number,
|
||||
"regulation_type": chunk.regulation_type[:128],
|
||||
"version": chunk.version[:64],
|
||||
"semantic_id": chunk.semantic_id[:128],
|
||||
"block_type": chunk.block_type[:64],
|
||||
"metadata_json": json.dumps(chunk.metadata, ensure_ascii=False)[:65535],
|
||||
"created_at": now,
|
||||
}
|
||||
)
|
||||
self.collection.insert(data)
|
||||
self.collection.flush()
|
||||
return len(data)
|
||||
|
||||
def delete_by_document(self, doc_id: str) -> int:
|
||||
"""Delete by document for the Milvus Vector Index instance."""
|
||||
result = self.collection.delete(f'doc_id == "{doc_id}"')
|
||||
return len(result.primary_keys)
|
||||
|
||||
def search(self, query_vector: list[float], top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
|
||||
"""Handle search for the Milvus Vector Index instance."""
|
||||
results = self.collection.search(
|
||||
data=[query_vector],
|
||||
anns_field="embedding",
|
||||
param={"metric_type": "COSINE", "params": {"nprobe": settings.milvus_nprobe}},
|
||||
limit=top_k,
|
||||
filter=filters,
|
||||
output_fields=[
|
||||
"doc_id",
|
||||
"doc_name",
|
||||
"content",
|
||||
"section_title",
|
||||
"page_number",
|
||||
"regulation_type",
|
||||
"version",
|
||||
"semantic_id",
|
||||
"block_type",
|
||||
"metadata_json",
|
||||
],
|
||||
)
|
||||
payload: list[RetrievedChunk] = []
|
||||
for hits in results:
|
||||
for hit in hits:
|
||||
metadata = {}
|
||||
raw_metadata = hit.entity.get("metadata_json", "")
|
||||
if raw_metadata:
|
||||
try:
|
||||
metadata = json.loads(raw_metadata)
|
||||
except json.JSONDecodeError:
|
||||
metadata = {"raw_metadata": raw_metadata}
|
||||
payload.append(
|
||||
RetrievedChunk(
|
||||
chunk_id=str(hit.id),
|
||||
doc_id=hit.entity.get("doc_id", ""),
|
||||
doc_name=hit.entity.get("doc_name", ""),
|
||||
content=hit.entity.get("content", ""),
|
||||
score=float(hit.score),
|
||||
section_title=hit.entity.get("section_title", ""),
|
||||
page_number=int(hit.entity.get("page_number", 0) or 0),
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
return payload
|
||||
|
||||
def health(self) -> dict:
|
||||
"""Handle health for the Milvus Vector Index instance."""
|
||||
return {
|
||||
"connected": True,
|
||||
"collection_name": self.collection_name,
|
||||
"num_entities": self.collection.num_entities if self.collection else 0,
|
||||
}
|
||||
Reference in New Issue
Block a user