"""Document summary service: generate summaries of regulation documents with an LLM."""

import re
import time
from typing import Dict, List, Optional
from dataclasses import dataclass

from loguru import logger

from app.services.llm import get_llm_client, BaseLLMClient
from app.services.rag.prompt_templates import get_prompt_template
from app.config.settings import settings

# Default number of leading characters of a document that is sent to the LLM.
DEFAULT_MAX_CONTENT_CHARS = 8000

# Low temperature keeps the generated summary close to the source text.
_SUMMARY_TEMPERATURE = 0.3

# Prefixes that mark a line of the LLM answer as a key clause.
_CLAUSE_PREFIXES = ("- 【条款", "【条款")

# Result key -> substrings that mark a line as belonging to that section.
# Order matters: the first matching entry wins.
_SECTION_MARKERS = (
    ("applicable_scope", ("适用范围", "适用对象")),
    ("key_terms", ("关键术语", "术语定义")),
    ("compliance_points", ("合规要点", "必须满足")),
)
_SCOPE_MARKERS = _SECTION_MARKERS[0][1]

# ASCII or full-width colon; Chinese text routinely uses the full-width form.
_COLON_RE = re.compile(r"[::]")

# "- x", "* x", "• x", "· x", "1. x", "1) x", "1、x"  ->  group(1) == "x"
_LIST_ITEM_RE = re.compile(r"^(?:[-*•·]\s+|\d+(?:[.)]\s+|、\s*))(.+)$")


def _value_after_colon(line: str) -> Optional[str]:
    """Return the text after the first colon in *line*, or None if there is no colon.

    Both the ASCII and the full-width colon are recognised. Surrounding
    whitespace and Markdown emphasis asterisks are removed from the value.
    """
    match = _COLON_RE.search(line)
    if match is None:
        return None
    return line[match.end():].strip().strip("*").strip()


def _list_item_text(line: str) -> Optional[str]:
    """Return the text of a bullet/numbered list item, or None if *line* is not one."""
    match = _LIST_ITEM_RE.match(line)
    if match is None:
        return None
    return match.group(1).strip()


def _section_for(line: str) -> Optional[str]:
    """Return the result key whose marker occurs in *line*, or None."""
    for key, markers in _SECTION_MARKERS:
        if any(marker in line for marker in markers):
            return key
    return None


@dataclass
class DocumentSummary:
    """Result of summarising one document."""

    # Name of the summarised document, as passed by the caller.
    doc_name: str
    # Full summary text (the raw LLM answer); empty on failure.
    summary: str
    # Applicable scope extracted from the answer; empty if none was found.
    applicable_scope: str
    # Answer lines recognised as key clauses.
    key_clauses: List[str]
    # Items listed under a key-terms heading of the answer.
    key_terms: List[str]
    # Items listed under a compliance heading of the answer.
    compliance_points: List[str]
    # Model name: reported by the LLM response on success, configured name on failure.
    model: str
    # Time spent on the request in milliseconds.
    latency_ms: int
    # Error description; None means the summary was generated successfully.
    error: Optional[str] = None

    @property
    def is_success(self) -> bool:
        """Return True when no error was recorded."""
        return self.error is None


class DocumentSummarizer:
    """
    Generate structured summaries of regulation documents with an LLM.

    Features:
    - produce a summary of the document's core points
    - extract the applicable scope
    - highlight key clauses
    - list key terms and compliance points

    Example:
        summarizer = DocumentSummarizer()
        result = summarizer.summarize("GB 7258-2017", markdown_content)
        print(result.summary)
    """

    def __init__(
        self,
        provider: Optional[str] = None,
        model: Optional[str] = None,
        max_tokens: Optional[int] = None,
        max_content_chars: Optional[int] = DEFAULT_MAX_CONTENT_CHARS,
    ):
        """
        Initialise the summarizer.

        Args:
            provider: LLM provider; falls back to ``settings.llm_provider``.
            model: LLM model name; falls back to ``settings.llm_model``.
            max_tokens: Maximum number of output tokens; falls back to
                ``settings.rag_summary_max_tokens``.
            max_content_chars: Number of leading characters of the document
                included in the prompt. ``None`` disables truncation.

        Raises:
            ValueError: If ``max_content_chars`` is negative.
        """
        if max_content_chars is not None and max_content_chars < 0:
            raise ValueError("max_content_chars must be non-negative or None")

        self.provider = provider or settings.llm_provider
        self.model = model or settings.llm_model
        self.max_tokens = max_tokens or settings.rag_summary_max_tokens
        self.max_content_chars = max_content_chars

        # The LLM client is created lazily on first use.
        self.llm: Optional[BaseLLMClient] = None

        logger.info(f"摘要生成器初始化: provider={self.provider}, model={self.model}")

    def _init_llm(self):
        """Create the LLM client on first use."""
        if self.llm is None:
            self.llm = get_llm_client(
                provider=self.provider,
                model=self.model
            )

    def _failure(self, doc_name: str, error: Optional[str], latency_ms: int) -> DocumentSummary:
        """Build an empty ``DocumentSummary`` that carries *error*."""
        return DocumentSummary(
            doc_name=doc_name,
            summary="",
            applicable_scope="",
            key_clauses=[],
            key_terms=[],
            compliance_points=[],
            model=self.model,
            latency_ms=latency_ms,
            error=error,
        )

    def summarize(
        self,
        doc_name: str,
        content: str,
        regulation_type: str = "",
        max_tokens: Optional[int] = None
    ) -> DocumentSummary:
        """
        Generate the summary of one document.

        Failures are never raised: an LLM error or any exception is reported
        through ``DocumentSummary.error``.

        Args:
            doc_name: Document name.
            content: Document content (Markdown). Only the first
                ``max_content_chars`` characters are sent to the LLM.
            regulation_type: Regulation type. Accepted for interface
                compatibility; it is not used when building the prompt.
            max_tokens: Maximum number of output tokens for this call;
                defaults to the value configured on the instance.

        Returns:
            DocumentSummary: The summary, or an error result.
        """
        started = time.perf_counter()

        def elapsed_ms() -> int:
            return int((time.perf_counter() - started) * 1000)

        logger.info(f"生成文档摘要: {doc_name}")

        try:
            self._init_llm()

            template = get_prompt_template("document_summary")

            # Truncate the document so the prompt stays within the token limit.
            user_content = template.user_template.format(
                doc_name=doc_name,
                content=content[:self.max_content_chars]
            )

            response = self.llm.chat(
                messages=[
                    {"role": "system", "content": template.system_prompt},
                    {"role": "user", "content": user_content}
                ],
                max_tokens=max_tokens or self.max_tokens,
                temperature=_SUMMARY_TEMPERATURE
            )

            # Latency covers the LLM round trip, not the local parsing below.
            latency_ms = elapsed_ms()

            if not response.is_success:
                return self._failure(doc_name, response.error, latency_ms)

            summary_data = self._parse_summary(response.content)

            logger.success(f"摘要生成完成: {doc_name}, {latency_ms}ms")

            return DocumentSummary(
                doc_name=doc_name,
                summary=summary_data.get("summary", response.content),
                applicable_scope=summary_data.get("applicable_scope", ""),
                key_clauses=summary_data.get("key_clauses", []),
                key_terms=summary_data.get("key_terms", []),
                compliance_points=summary_data.get("compliance_points", []),
                model=response.model,
                latency_ms=latency_ms
            )

        except Exception as e:
            # Service boundary: log with traceback and report the failure,
            # including the time actually spent before it occurred.
            logger.exception(f"摘要生成失败: {e}")
            return self._failure(doc_name, str(e), elapsed_ms())

    def _parse_summary(self, content: str) -> Dict:
        """
        Extract structured fields from the LLM answer with line heuristics.

        Rules, applied to each non-blank stripped line:
        - A line starting with a clause prefix is appended to ``key_clauses``.
        - A line mentioning a scope marker sets ``applicable_scope`` to the
          text after its first colon (ASCII or full-width), or to the whole
          line when it has no colon. A heading-only scope line takes its value
          from the next content line. The first non-empty value is kept.
        - A line mentioning a key-terms or compliance marker opens that
          section; text after its colon, and the list items that follow, are
          collected until a line that is not a list item.

        Args:
            content: Raw LLM answer.

        Returns:
            Dict with keys ``summary``, ``applicable_scope``, ``key_clauses``,
            ``key_terms`` and ``compliance_points``.
        """
        result = {
            "summary": content,
            "applicable_scope": "",
            "key_clauses": [],
            "key_terms": [],
            "compliance_points": []
        }
        if not content:
            return result

        # Result key of the section whose body lines are being collected.
        open_section: Optional[str] = None

        for raw_line in content.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith(_CLAUSE_PREFIXES):
                result["key_clauses"].append(line)

            section = _section_for(line)
            item_text = _list_item_text(line)

            if section == "applicable_scope":
                value = _value_after_colon(line)
                if value is None:
                    bare = (item_text if item_text is not None else line).strip("#* \t")
                    # A bare heading carries no value; a sentence is kept whole.
                    value = "" if bare in _SCOPE_MARKERS else line
                if value:
                    if not result["applicable_scope"]:
                        result["applicable_scope"] = value
                    open_section = None
                else:
                    open_section = "applicable_scope"
                continue

            if section is not None:
                if section == open_section and item_text is not None:
                    # A list item inside the section it mentions is content,
                    # not a new heading.
                    result[section].append(item_text)
                    continue
                open_section = section
                value = _value_after_colon(line)
                if value:
                    result[section].append(value)
                continue

            if open_section == "applicable_scope":
                if not result["applicable_scope"]:
                    result["applicable_scope"] = item_text if item_text is not None else line
                open_section = None
            elif open_section is not None and item_text is not None:
                result[open_section].append(item_text)
            else:
                # Any other line ends the current section.
                open_section = None

        return result

    def batch_summarize(
        self,
        documents: list
    ) -> list:
        """
        Generate summaries for several documents, one after another.

        Args:
            documents: Documents as
                ``[{"doc_name": str, "content": str}, ...]``; an optional
                ``"regulation_type"`` entry is forwarded to ``summarize``.

        Returns:
            list: One ``DocumentSummary`` per input entry, in input order.
            An entry without the required keys yields an error result instead
            of aborting the batch.
        """
        results = []
        for doc in documents:
            try:
                doc_name = doc["doc_name"]
                content = doc["content"]
            except (KeyError, TypeError) as e:
                logger.error(f"文档缺少必要字段: {e}")
                name = doc.get("doc_name", "") if isinstance(doc, dict) else ""
                results.append(self._failure(name, f"文档缺少必要字段: {e}", 0))
                continue
            results.append(
                self.summarize(doc_name, content, doc.get("regulation_type", ""))
            )
        return results


def summarize_document(
    doc_name: str,
    content: str,
    **kwargs
) -> DocumentSummary:
    """Convenience wrapper: build a ``DocumentSummarizer`` from *kwargs* and summarise one document."""
    summarizer = DocumentSummarizer(**kwargs)
    return summarizer.summarize(doc_name, content)