2026-05-18 16:32:42 +08:00
|
|
|
|
"""Provide service-layer logic for document summarizer."""
|
2026-05-14 15:07:34 +08:00
|
|
|
|
|
|
|
|
|
|
from typing import Dict, Optional
|
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
from loguru import logger
|
|
|
|
|
|
|
2026-05-18 16:32:42 +08:00
|
|
|
|
from app.services.llm.base_client import BaseLLMClient
|
|
|
|
|
|
from app.services.llm.llm_factory import get_llm_client
|
2026-05-14 15:07:34 +08:00
|
|
|
|
from app.services.rag.prompt_templates import get_prompt_template
|
|
|
|
|
|
from app.config.settings import settings
|
2026-05-18 16:32:42 +08:00
|
|
|
|
# Keep provider-specific behavior explicit so debugging stays straightforward.
|
|
|
|
|
|
|
2026-05-14 15:07:34 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class DocumentSummary:
    """Outcome of summarizing a single document.

    A failed run is reported through ``error`` instead of an exception, so
    callers should check ``is_success`` before using the other fields.
    """

    # Name of the document the summary belongs to.
    doc_name: str
    # Summary text; empty string when the run failed.
    summary: str
    # Applicability/scope statement extracted from the summary, or "".
    applicable_scope: str
    # Clause lines picked out of the summary text.
    key_clauses: list
    # Key terms (element type not fixed by this class).
    key_terms: list
    # Compliance points (element type not fixed by this class).
    compliance_points: list
    # Identifier of the model that produced (or was asked for) the summary.
    model: str
    # Wall-clock duration of the run in milliseconds.
    latency_ms: int
    # Error description; None means the run succeeded.
    error: Optional[str] = None

    @property
    def is_success(self) -> bool:
        """Return True when no error has been recorded on this summary."""
        return self.error is None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentSummarizer:
    """Generate structured summaries of documents through an LLM client.

    The LLM client is created lazily on the first call to ``summarize`` and
    then reused. Failures are never raised to the caller: they are reported
    as a ``DocumentSummary`` whose ``error`` field is set.
    """

    # Only this many leading characters of a document are put into the prompt.
    MAX_CONTENT_CHARS = 8000
    # Sampling temperature sent with every chat request.
    TEMPERATURE = 0.3
    # Substrings that mark a line as describing the applicable scope.
    _SCOPE_MARKERS = ("适用范围", "适用对象")
    # Prefixes that mark a line as a key clause.
    _CLAUSE_PREFIXES = ("- 【条款", "【条款")
    # ASCII colon and full-width colon (U+FF1A). Written as an escape so the
    # full-width form cannot be silently normalized to ASCII again.
    _COLONS = ":\uff1a"

    def __init__(
        self,
        provider: Optional[str] = None,
        model: Optional[str] = None,
        max_tokens: Optional[int] = None,
    ):
        """Store the LLM configuration, falling back to application settings.

        Args:
            provider: LLM provider name; falsy values use ``settings.llm_provider``.
            model: Model name; falsy values use ``settings.llm_model``.
            max_tokens: Default completion budget; falsy values use
                ``settings.rag_summary_max_tokens``.
        """
        self.provider = provider or settings.llm_provider
        self.model = model or settings.llm_model
        self.max_tokens = max_tokens or settings.rag_summary_max_tokens

        # Created on demand by _init_llm so construction stays cheap.
        self.llm: Optional[BaseLLMClient] = None

        logger.info(f"摘要生成器初始化: provider={self.provider}, model={self.model}")

    def _init_llm(self):
        """Create the LLM client on first use; later calls are no-ops."""
        if self.llm is None:
            self.llm = get_llm_client(
                provider=self.provider,
                model=self.model,
            )

    def _failed_summary(self, doc_name: str, latency_ms: int, error) -> "DocumentSummary":
        """Build the empty ``DocumentSummary`` used to report a failed run."""
        return DocumentSummary(
            doc_name=doc_name,
            summary="",
            applicable_scope="",
            key_clauses=[],
            key_terms=[],
            compliance_points=[],
            model=self.model,
            latency_ms=latency_ms,
            error=error,
        )

    def summarize(
        self,
        doc_name: str,
        content: str,
        regulation_type: str = "",
        max_tokens: Optional[int] = None,
    ) -> "DocumentSummary":
        """Summarize one document and return the parsed result.

        Args:
            doc_name: Document name, inserted into the prompt and the result.
            content: Document text; only the first ``MAX_CONTENT_CHARS``
                characters are sent to the model.
            regulation_type: Accepted for interface compatibility; not used
                by this method.
            max_tokens: Per-call completion budget; falsy values fall back to
                the instance default.

        Returns:
            A ``DocumentSummary``. On an unsuccessful LLM response or on any
            exception, the summary fields are empty, ``error`` is set and
            ``latency_ms`` holds the time spent before the failure.
        """
        import time

        # perf_counter is monotonic, so the latency cannot go negative or
        # jump when the system clock is adjusted.
        started = time.perf_counter()

        logger.info(f"生成文档摘要: {doc_name}")

        try:
            self._init_llm()

            template = get_prompt_template("document_summary")

            user_content = template.user_template.format(
                doc_name=doc_name,
                content=content[: self.MAX_CONTENT_CHARS],
            )

            response = self.llm.chat(
                messages=[
                    {"role": "system", "content": template.system_prompt},
                    {"role": "user", "content": user_content},
                ],
                max_tokens=max_tokens or self.max_tokens,
                temperature=self.TEMPERATURE,
            )

            latency_ms = int((time.perf_counter() - started) * 1000)

            if not response.is_success:
                return self._failed_summary(doc_name, latency_ms, response.error)

            summary_data = self._parse_summary(response.content)

            logger.success(f"摘要生成完成: {doc_name}, {latency_ms}ms")

            return DocumentSummary(
                doc_name=doc_name,
                summary=summary_data.get("summary", response.content),
                applicable_scope=summary_data.get("applicable_scope", ""),
                key_clauses=summary_data.get("key_clauses", []),
                key_terms=summary_data.get("key_terms", []),
                compliance_points=summary_data.get("compliance_points", []),
                model=response.model,
                latency_ms=latency_ms,
            )

        except Exception as e:
            # Deliberate catch-all boundary: callers receive an error result
            # instead of an exception.
            logger.error(f"摘要生成失败: {e}")
            elapsed_ms = int((time.perf_counter() - started) * 1000)
            return self._failed_summary(doc_name, elapsed_ms, str(e))

    @classmethod
    def _text_after_colon(cls, line: str) -> str:
        """Return the text following the first ASCII or full-width colon.

        Lines without any colon are returned whole (stripped).
        """
        for index, char in enumerate(line):
            if char in cls._COLONS:
                return line[index + 1:].strip()
        return line.strip()

    def _parse_summary(self, content: str) -> Dict:
        """Extract structured fields from the raw summary text.

        Args:
            content: Raw text returned by the model.

        Returns:
            A dict with keys ``summary`` (the unmodified ``content``),
            ``applicable_scope``, ``key_clauses``, ``key_terms`` and
            ``compliance_points``.
        """
        result = {
            "summary": content,
            "applicable_scope": "",
            "key_clauses": [],
            "key_terms": [],
            "compliance_points": [],
        }

        for raw_line in content.split("\n"):
            line = raw_line.strip()

            # Scope line: keep what follows the label. If several lines match,
            # the last one wins.
            if any(marker in line for marker in self._SCOPE_MARKERS):
                result["applicable_scope"] = self._text_after_colon(line)

            # Clause lines are kept verbatim, including any leading "- ".
            if line.startswith(self._CLAUSE_PREFIXES):
                result["key_clauses"].append(line)

        # NOTE: key_terms and compliance_points are not extracted here; both
        # stay empty lists.
        return result

    def batch_summarize(
        self,
        documents: list
    ) -> list:
        """Summarize several documents one after another.

        Args:
            documents: Mappings that each provide ``"doc_name"`` and
                ``"content"`` keys.

        Returns:
            One ``DocumentSummary`` per input document, in input order.
        """
        return [
            self.summarize(doc["doc_name"], doc["content"])
            for doc in documents
        ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def summarize_document(
    doc_name: str,
    content: str,
    **kwargs
) -> DocumentSummary:
    """Summarize a single document with a freshly built summarizer.

    Args:
        doc_name: Name of the document.
        content: Text of the document.
        **kwargs: Passed unchanged to the ``DocumentSummarizer`` constructor
            (``provider``, ``model``, ``max_tokens``).

    Returns:
        The ``DocumentSummary`` produced by ``DocumentSummarizer.summarize``.
    """
    # A new summarizer is built on every call, so nothing is shared or
    # reused between calls.
    return DocumentSummarizer(**kwargs).summarize(doc_name, content)
|