Files
AIRegulation-DocAnalysis/backend/app/services/llm/document_summarizer.py
2026-05-14 15:07:34 +08:00

232 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# src/services/llm/document_summarizer.py
"""文档摘要生成服务 - LLM生成法规文档摘要"""
from typing import Dict, Optional
from dataclasses import dataclass
from loguru import logger
from app.services.llm import get_llm_client, BaseLLMClient
from app.services.rag.prompt_templates import get_prompt_template
from app.config.settings import settings
@dataclass
class DocumentSummary:
"""文档摘要结果"""
doc_name: str
summary: str
applicable_scope: str
key_clauses: list
key_terms: list
compliance_points: list
model: str
latency_ms: int
error: Optional[str] = None
@property
def is_success(self) -> bool:
return self.error is None
class DocumentSummarizer:
"""
文档摘要生成器
功能:
- 生成法规文档的核心要点摘要
- 提取适用范围
- 突出关键条款
- 列出合规要点
使用示例:
summarizer = DocumentSummarizer()
result = summarizer.summarize("GB 7258-2017", markdown_content)
print(result.summary)
"""
def __init__(
self,
provider: str = None,
model: str = None,
max_tokens: int = None
):
"""
初始化摘要生成器
Args:
provider: LLM提供商
model: LLM模型名称
max_tokens: 最大输出token数
"""
self.provider = provider or settings.llm_provider
self.model = model or settings.llm_model
self.max_tokens = max_tokens or settings.rag_summary_max_tokens
# LLM客户端延迟加载
self.llm: Optional[BaseLLMClient] = None
logger.info(f"摘要生成器初始化: provider={self.provider}, model={self.model}")
def _init_llm(self):
"""延迟初始化LLM"""
if self.llm is None:
self.llm = get_llm_client(
provider=self.provider,
model=self.model
)
def summarize(
self,
doc_name: str,
content: str,
regulation_type: str = "",
max_tokens: Optional[int] = None
) -> DocumentSummary:
"""
生成文档摘要
Args:
doc_name: 文档名称
content: 文档内容Markdown格式
regulation_type: 法规类型
max_tokens: 最大输出token数
Returns:
DocumentSummary: 摘要结果
"""
import time
start_time = time.time()
logger.info(f"生成文档摘要: {doc_name}")
try:
self._init_llm()
# 使用摘要模板
template = get_prompt_template("document_summary")
# 构建用户消息
user_content = template.user_template.format(
doc_name=doc_name,
content=content[:8000] # 截取前8000字符避免超出token限制
)
# 调用LLM
response = self.llm.chat(
messages=[
{"role": "system", "content": template.system_prompt},
{"role": "user", "content": user_content}
],
max_tokens=max_tokens or self.max_tokens,
temperature=0.3 # 低温度保证摘要准确性
)
latency_ms = int((time.time() - start_time) * 1000)
if not response.is_success:
return DocumentSummary(
doc_name=doc_name,
summary="",
applicable_scope="",
key_clauses=[],
key_terms=[],
compliance_points=[],
model=self.model,
latency_ms=latency_ms,
error=response.error
)
# 解析摘要结构
summary_data = self._parse_summary(response.content)
logger.success(f"摘要生成完成: {doc_name}, {latency_ms}ms")
return DocumentSummary(
doc_name=doc_name,
summary=summary_data.get("summary", response.content),
applicable_scope=summary_data.get("applicable_scope", ""),
key_clauses=summary_data.get("key_clauses", []),
key_terms=summary_data.get("key_terms", []),
compliance_points=summary_data.get("compliance_points", []),
model=response.model,
latency_ms=latency_ms
)
except Exception as e:
logger.error(f"摘要生成失败: {e}")
return DocumentSummary(
doc_name=doc_name,
summary="",
applicable_scope="",
key_clauses=[],
key_terms=[],
compliance_points=[],
model=self.model,
latency_ms=0,
error=str(e)
)
def _parse_summary(self, content: str) -> Dict:
"""解析摘要内容(提取结构化信息)"""
result = {
"summary": content,
"applicable_scope": "",
"key_clauses": [],
"key_terms": [],
"compliance_points": []
}
# 简单解析(提取关键信息)
lines = content.split("\n")
for line in lines:
line = line.strip()
# 提取适用范围
if "适用范围" in line or "适用对象" in line:
result["applicable_scope"] = line.split("")[-1].strip() if "" in line else line.split(":")[-1].strip()
# 提取关键条款
if line.startswith("- 【条款") or line.startswith("【条款"):
result["key_clauses"].append(line)
# 提取关键术语
if "关键术语" in line or "术语定义" in line:
# 继续读取后续几行
pass
# 提取合规要点
if "合规要点" in line or "必须满足" in line:
pass
return result
def batch_summarize(
self,
documents: list
) -> list:
"""
批量生成摘要
Args:
documents: 文档列表 [{"doc_name": str, "content": str}, ...]
Returns:
list: 摘要结果列表
"""
results = []
for doc in documents:
result = self.summarize(doc["doc_name"], doc["content"])
results.append(result)
return results
def summarize_document(
doc_name: str,
content: str,
**kwargs
) -> DocumentSummary:
"""便捷函数:生成文档摘要"""
summarizer = DocumentSummarizer(**kwargs)
return summarizer.summarize(doc_name, content)