Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -1,51 +1,46 @@
"""智能分块器 - 章节级+条款级双粒度切割"""
"""Provide service-layer logic for text chunker."""
import re
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, field
from loguru import logger
# Data containers for chunk content and its metadata.
@dataclass
class ChunkMetadata:
    """Metadata attached to a text chunk.

    Every field has a default, so an instance can be created with no
    arguments and populated field by field.
    """

    doc_id: str = ""
    doc_name: str = ""
    chunk_id: str = ""
    section_number: str = ""  # Chapter/section number, e.g. "第一章"
    section_title: str = ""  # Chapter/section title
    clause_number: str = ""  # Clause number, e.g. "第一条"
    page_number: int = 0
    start_position: int = 0  # Start position in the original text
    end_position: int = 0  # End position in the original text
    regulation_type: str = ""  # Regulation type
    version: str = ""
@dataclass
class TextChunk:
    """A chunk of text together with its metadata."""

    content: str
    metadata: ChunkMetadata
    token_count: int = 0  # Estimated number of tokens
class RegulationChunker:
"""
法规文档智能分块器
"""Represent the Regulation Chunker type."""
实现章节级/条款级双粒度切割适配国标GB文档结构
- 国标文档通常有明确的层级结构:章 > 节 > 条
- 每个条款应作为一个独立的语义单元
- 保留条款完整性,避免跨条款截断
"""
# 法规标题模式
# Regulation heading patterns (chapter / section / clause).
CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s+[^\n]+')
SECTION_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+节\s+[^\n]+')
CLAUSE_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+条\s')
# 条款子项模式
# Clause sub-item patterns.
SUB_ITEM_PATTERN = re.compile(r'^[\(][一二三四五六七八九十]+[\)]\s')
NUMBER_ITEM_PATTERN = re.compile(r'^[\d]+[\.、]\s')
@@ -56,15 +51,7 @@ class RegulationChunker:
max_chunk_size: int = 2048,
min_chunk_size: int = 100
):
"""
初始化分块器
Args:
chunk_size: 默认分块大小(字符数)
chunk_overlap: 分块重叠大小
max_chunk_size: 最大分块大小(防止单个条款过长)
min_chunk_size: 最小分块大小(防止碎片化)
"""
"""Initialize the Regulation Chunker instance."""
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.max_chunk_size = max_chunk_size
@@ -78,30 +65,18 @@ class RegulationChunker:
regulation_type: str = "",
version: str = ""
) -> List[TextChunk]:
"""
对法规文档进行智能分块
Args:
markdown_text: Markdown格式的文档内容
doc_id: 文档ID
doc_name: 文档名称
regulation_type: 法规类型
version: 文档版本
Returns:
List[TextChunk]: 分块列表
"""
"""Handle chunk document for the Regulation Chunker instance."""
logger.info(f"开始分块文档: {doc_name}")
# 1. 按章节分割(一级分块)
# 1. Split by chapter/section (first-level chunks).
sections = self._split_by_sections(markdown_text)
# 2. 在每个章节内按条款分割(二级分块)
# 2. Within each section, split by clause (second-level chunks).
chunks = []
global_position = 0
for section_num, section_title, section_content, section_start in sections:
# 在章节内按条款分割
# Split by clause within the section.
clause_chunks = self._split_by_clauses(
section_content,
section_num,
@@ -110,7 +85,7 @@ class RegulationChunker:
)
for chunk_content, clause_num, clause_title, start_pos, end_pos in clause_chunks:
# 处理过长的条款(进一步细分)
# Handle overly long clauses by subdividing them further.
if len(chunk_content) > self.max_chunk_size:
sub_chunks = self._split_long_clause(
chunk_content,
@@ -150,12 +125,7 @@ class RegulationChunker:
return chunks
def _split_by_sections(self, markdown_text: str) -> List[Tuple[str, str, str, int]]:
"""
按章节分割文档
Returns:
List of (section_number, section_title, section_content, start_position)
"""
"""Handle split by sections for this module for the Regulation Chunker instance."""
sections = []
lines = markdown_text.split('\n')
@@ -165,12 +135,12 @@ class RegulationChunker:
current_section_start = 0
for i, line in enumerate(lines):
# 检测章节标题
# Detect a chapter/section heading.
chapter_match = self.CHAPTER_PATTERN.match(line.strip())
section_match = self.SECTION_PATTERN.match(line.strip())
if chapter_match or section_match:
# 保存上一个章节
# Save the previous section.
if current_section_content:
content = '\n'.join(current_section_content)
sections.append((
@@ -180,7 +150,7 @@ class RegulationChunker:
current_section_start
))
# 开始新章节
# Start a new section.
current_section_start = sum(len(l) + 1 for l in lines[:i])
current_section_content = []
@@ -193,7 +163,7 @@ class RegulationChunker:
current_section_content.append(line)
# 保存最后一个章节
# Save the final section.
if current_section_content:
content = '\n'.join(current_section_content)
sections.append((
@@ -203,7 +173,7 @@ class RegulationChunker:
current_section_start
))
# 如果没有检测到章节,将整个文档作为一个大章节
# If no sections were detected, treat the whole document as one large section.
if not sections:
sections.append((
"",
@@ -221,12 +191,7 @@ class RegulationChunker:
section_title: str,
section_start: int
) -> List[Tuple[str, str, str, int, int]]:
"""
在章节内按条款分割
Returns:
List of (content, clause_number, clause_title, start_position, end_position)
"""
"""Handle split by clauses for this module for the Regulation Chunker instance."""
clauses = []
lines = section_content.split('\n')
@@ -236,11 +201,11 @@ class RegulationChunker:
current_clause_start = section_start
for i, line in enumerate(lines):
# 检测条款标题
# Detect a clause heading.
clause_match = self.CLAUSE_PATTERN.match(line.strip())
if clause_match:
# 保存上一个条款
# Save the previous clause.
if current_clause_content:
content = '\n'.join(current_clause_content)
end_pos = current_clause_start + len(content)
@@ -252,7 +217,7 @@ class RegulationChunker:
end_pos
))
# 开始新条款
# Start a new clause.
current_clause_start = section_start + sum(len(l) + 1 for l in lines[:i])
current_clause_content = []
current_clause_num = self._extract_clause_number(line.strip())
@@ -260,7 +225,7 @@ class RegulationChunker:
current_clause_content.append(line)
# 保存最后一个条款
# Save the final clause.
if current_clause_content:
content = '\n'.join(current_clause_content)
end_pos = current_clause_start + len(content)
@@ -272,7 +237,7 @@ class RegulationChunker:
end_pos
))
# 如果没有检测到条款,将整个章节作为一个条款
# If no clauses were detected, treat the whole section as one clause.
if not clauses:
clauses.append((
section_content,
@@ -290,15 +255,11 @@ class RegulationChunker:
clause_num: str,
clause_title: str
) -> List[Tuple[str, int, int]]:
"""
分割过长的条款内容
按条款子项或段落分割,保持语义完整性
"""
"""Handle split long clause for this module for the Regulation Chunker instance."""
sub_chunks = []
lines = content.split('\n')
# 检测是否有子项结构
# Check whether the clause has a sub-item structure.
has_sub_items = any(
self.SUB_ITEM_PATTERN.match(line.strip()) or
self.NUMBER_ITEM_PATTERN.match(line.strip())
@@ -306,7 +267,7 @@ class RegulationChunker:
)
if has_sub_items:
# 按子项分割
# Split by sub-item.
current_sub_content = []
current_sub_start = 0
@@ -326,14 +287,14 @@ class RegulationChunker:
current_sub_content.append(line)
# 保存最后一个子项
# Save the final sub-item.
if current_sub_content:
sub_content = '\n'.join(current_sub_content)
sub_end = current_sub_start + len(sub_content)
sub_chunks.append((sub_content, current_sub_start, sub_end))
else:
# 按段落分割(滑动窗口)
# Split by paragraph (sliding window).
paragraphs = []
current_para = []
@@ -348,7 +309,7 @@ class RegulationChunker:
if current_para:
paragraphs.append('\n'.join(current_para))
# 合并段落直到达到chunk_size
# Merge paragraphs until chunk_size is reached.
current_chunk = []
current_length = 0
chunk_start = 0
@@ -365,7 +326,7 @@ class RegulationChunker:
current_chunk.append(para)
current_length += len(para)
# 保存最后一个chunk
# Save the final chunk.
if current_chunk:
chunk_content = '\n'.join(current_chunk)
chunk_end = chunk_start + len(chunk_content)
@@ -374,13 +335,13 @@ class RegulationChunker:
return sub_chunks
def _extract_title(self, header_line: str) -> str:
"""从标题行提取标题内容"""
# 移除"第X章"、"第X节"前缀
"""Handle extract title for this module for the Regulation Chunker instance."""
# Keep service responsibilities explicit so downstream behavior stays predictable.
title = re.sub(r'^第[一二三四五六七八九十百]+[章节]\s+', '', header_line)
return title.strip()
def _extract_clause_number(self, clause_line: str) -> str:
"""从条款行提取条款编号"""
"""Handle extract clause number for this module for the Regulation Chunker instance."""
match = self.CLAUSE_PATTERN.match(clause_line)
if match:
return match.group(0).strip()
@@ -399,14 +360,14 @@ class RegulationChunker:
regulation_type: str,
version: str
) -> TextChunk:
"""创建文本分块"""
# 清理内容
"""Handle create chunk for this module for the Regulation Chunker instance."""
# Clean up the content.
content = content.strip()
# 计算估算token数中文约1.5字符/token
token_count = int(len(content) * 0.7) # 简化估算
# Estimate the token count (Chinese text is roughly 1.5 characters per token).
token_count = int(len(content) * 0.7) # Keep service responsibilities explicit so downstream behavior stays predictable.
# 生成chunk_id
# Generate the chunk_id.
chunk_id = f"{doc_id}_{section_num}_{clause_num}_{start_pos}"
metadata = ChunkMetadata(
@@ -437,7 +398,7 @@ def chunk_regulation_document(
version: str = "",
chunk_size: int = 512
) -> List[TextChunk]:
"""便捷函数:对法规文档进行分块"""
"""Handle chunk regulation document."""
chunker = RegulationChunker(chunk_size=chunk_size)
return chunker.chunk_document(
markdown_text,