This commit is contained in:
2026-05-14 15:07:34 +08:00
parent c2a398930d
commit 10d04c4083
179 changed files with 24073 additions and 1243 deletions

View File

@@ -0,0 +1,231 @@
# src/services/llm/document_summarizer.py
"""文档摘要生成服务 - LLM生成法规文档摘要"""
from typing import Dict, Optional
from dataclasses import dataclass
from loguru import logger
from app.services.llm import get_llm_client, BaseLLMClient
from app.services.rag.prompt_templates import get_prompt_template
from app.config.settings import settings
@dataclass
class DocumentSummary:
    """Outcome of summarising a single document.

    A result counts as successful exactly when ``error`` is ``None``;
    any other value (including an empty string) marks it as failed.
    """

    # Name of the document the summary belongs to.
    doc_name: str
    # Summary text.
    summary: str
    # Applicable-scope text.
    applicable_scope: str
    # Key clauses.
    key_clauses: list
    # Key terms.
    key_terms: list
    # Compliance points.
    compliance_points: list
    # Name of the model associated with this result.
    model: str
    # Latency in milliseconds.
    latency_ms: int
    # Error description; ``None`` when nothing went wrong.
    error: Optional[str] = None

    @property
    def is_success(self) -> bool:
        """Return ``True`` when no error has been recorded."""
        has_error = self.error is not None
        return not has_error
class DocumentSummarizer:
    """
    Generate summaries of regulation documents with an LLM.

    Features:
    - Produces a summary of a regulation document's key points
    - Extracts the applicable scope from the LLM answer
    - Collects the key-clause lines from the LLM answer

    Key terms and compliance points are part of the result structure but
    are not extracted from the answer yet; they are always empty lists.

    Example:
        summarizer = DocumentSummarizer()
        result = summarizer.summarize("GB 7258-2017", markdown_content)
        print(result.summary)
    """

    # Only this many leading characters of a document are sent to the LLM,
    # to avoid exceeding the token limit. Subclasses may override it.
    MAX_CONTENT_CHARS = 8000

    # Separators accepted between a label such as "适用范围" and its value:
    # the full-width colon (U+FF1A, written as an escape so it survives
    # encoding-unsafe tooling) and the ASCII colon.
    _LABEL_SEPARATORS = ("\uff1a", ":")

    def __init__(
        self,
        provider: Optional[str] = None,
        model: Optional[str] = None,
        max_tokens: Optional[int] = None,
    ):
        """
        Initialise the summarizer.

        Args:
            provider: LLM provider; falls back to ``settings.llm_provider``.
            model: LLM model name; falls back to ``settings.llm_model``.
            max_tokens: Maximum number of output tokens; falls back to
                ``settings.rag_summary_max_tokens``.
        """
        self.provider = provider or settings.llm_provider
        self.model = model or settings.llm_model
        self.max_tokens = max_tokens or settings.rag_summary_max_tokens
        # The LLM client is created lazily on first use (see _init_llm).
        self.llm: Optional[BaseLLMClient] = None
        logger.info(f"摘要生成器初始化: provider={self.provider}, model={self.model}")

    def _init_llm(self):
        """Create the LLM client on first use; later calls are no-ops."""
        if self.llm is None:
            self.llm = get_llm_client(
                provider=self.provider,
                model=self.model,
            )

    def _failure(self, doc_name: str, latency_ms: int, error) -> "DocumentSummary":
        """Build an empty DocumentSummary that carries ``error``."""
        return DocumentSummary(
            doc_name=doc_name,
            summary="",
            applicable_scope="",
            key_clauses=[],
            key_terms=[],
            compliance_points=[],
            model=self.model,
            latency_ms=latency_ms,
            error=error,
        )

    def summarize(
        self,
        doc_name: str,
        content: str,
        regulation_type: str = "",
        max_tokens: Optional[int] = None,
    ) -> "DocumentSummary":
        """
        Generate the summary of one document.

        Never raises for ordinary failures: any ``Exception`` is logged and
        reported through the ``error`` field of the returned result.

        Args:
            doc_name: Document name.
            content: Document content (Markdown); only the first
                ``MAX_CONTENT_CHARS`` characters are sent to the LLM.
            regulation_type: Regulation type. Accepted for API
                compatibility; this method does not use it.
            max_tokens: Maximum number of output tokens for this call;
                defaults to the value configured on the instance.

        Returns:
            DocumentSummary: The summary, or an empty result with ``error``
            set. ``latency_ms`` is the elapsed time in both cases.
        """
        import time

        started = time.perf_counter()  # monotonic, suited to durations

        def elapsed_ms() -> int:
            return int((time.perf_counter() - started) * 1000)

        logger.info(f"生成文档摘要: {doc_name}")
        try:
            self._init_llm()
            template = get_prompt_template("document_summary")
            user_content = template.user_template.format(
                doc_name=doc_name,
                content=content[: self.MAX_CONTENT_CHARS],
            )
            response = self.llm.chat(
                messages=[
                    {"role": "system", "content": template.system_prompt},
                    {"role": "user", "content": user_content},
                ],
                max_tokens=max_tokens or self.max_tokens,
                temperature=0.3,  # low temperature keeps the summary accurate
            )
            latency_ms = elapsed_ms()
            if not response.is_success:
                return self._failure(doc_name, latency_ms, response.error)

            summary_data = self._parse_summary(response.content)
            logger.success(f"摘要生成完成: {doc_name}, {latency_ms}ms")
            return DocumentSummary(
                doc_name=doc_name,
                summary=summary_data["summary"],
                applicable_scope=summary_data["applicable_scope"],
                key_clauses=summary_data["key_clauses"],
                key_terms=summary_data["key_terms"],
                compliance_points=summary_data["compliance_points"],
                model=response.model,
                latency_ms=latency_ms,
            )
        except Exception as e:
            logger.error(f"摘要生成失败: {e}")
            # Report the real elapsed time so failed calls stay measurable.
            return self._failure(doc_name, elapsed_ms(), str(e))

    @classmethod
    def _text_after_label(cls, line: str) -> str:
        """Return the text after the first colon in ``line``.

        Both the full-width and the ASCII colon count. Splitting at the
        first one keeps values that contain further colons intact. A line
        without any colon is returned whole (stripped).
        """
        positions = [
            pos
            for pos in (line.find(sep) for sep in cls._LABEL_SEPARATORS)
            if pos != -1
        ]
        if not positions:
            return line.strip()
        return line[min(positions) + 1:].strip()

    def _parse_summary(self, content: str) -> Dict:
        """
        Extract structured information from the LLM answer.

        The answer is scanned line by line:
        - a line containing "适用范围" or "适用对象" sets
          ``applicable_scope`` to the text after its first colon (a later
          matching line overwrites an earlier one);
        - a line starting with "- 【条款" or "【条款" is appended to
          ``key_clauses``.

        ``key_terms`` and ``compliance_points`` are not extracted yet and
        stay empty.

        Args:
            content: Raw LLM answer; ``None`` or empty is treated as "".

        Returns:
            Dict with the keys ``summary`` (the full answer),
            ``applicable_scope``, ``key_clauses``, ``key_terms`` and
            ``compliance_points``.
        """
        text = content or ""
        result = {
            "summary": text,
            "applicable_scope": "",
            "key_clauses": [],
            "key_terms": [],
            "compliance_points": [],
        }
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if "适用范围" in line or "适用对象" in line:
                result["applicable_scope"] = self._text_after_label(line)
            if line.startswith(("- 【条款", "【条款")):
                result["key_clauses"].append(line)
        return result

    def batch_summarize(
        self,
        documents: list,
    ) -> list:
        """
        Summarise several documents one after another.

        Args:
            documents: Documents as
                ``[{"doc_name": str, "content": str}, ...]``.

        Returns:
            list: One summary result per document, in input order.

        Raises:
            KeyError: If a document lacks "doc_name" or "content".
        """
        return [
            self.summarize(doc["doc_name"], doc["content"])
            for doc in documents
        ]
def summarize_document(
    doc_name: str,
    content: str,
    **kwargs
) -> DocumentSummary:
    """Convenience wrapper: summarise one document with a throwaway summarizer.

    Args:
        doc_name: Document name.
        content: Document content.
        **kwargs: Forwarded unchanged to the ``DocumentSummarizer``
            constructor.

    Returns:
        DocumentSummary: Result of ``DocumentSummarizer.summarize``.
    """
    return DocumentSummarizer(**kwargs).summarize(doc_name, content)