Files

410 lines
16 KiB
Python
Raw Permalink Normal View History

"""Provide service-layer logic for text chunker."""
2026-05-14 15:07:34 +08:00
import re
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, field
from loguru import logger
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
@dataclass
class ChunkMetadata:
"""Represent the Chunk Metadata type."""
2026-05-14 15:07:34 +08:00
doc_id: str = ""
doc_name: str = ""
chunk_id: str = ""
section_number: str = "" # Keep service responsibilities explicit so downstream behavior stays predictable.
section_title: str = "" # Keep service responsibilities explicit so downstream behavior stays predictable.
clause_number: str = "" # Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
page_number: int = 0
start_position: int = 0 # Keep service responsibilities explicit so downstream behavior stays predictable.
end_position: int = 0 # Keep service responsibilities explicit so downstream behavior stays predictable.
regulation_type: str = "" # Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
version: str = ""
@dataclass
class TextChunk:
"""Represent the Text Chunk type."""
2026-05-14 15:07:34 +08:00
content: str
metadata: ChunkMetadata
token_count: int = 0 # Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
class RegulationChunker:
"""Represent the Regulation Chunker type."""
2026-05-14 15:07:34 +08:00
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s+[^\n]+')
SECTION_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+节\s+[^\n]+')
CLAUSE_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+条\s')
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
SUB_ITEM_PATTERN = re.compile(r'^[\(][一二三四五六七八九十]+[\)]\s')
NUMBER_ITEM_PATTERN = re.compile(r'^[\d]+[\.、]\s')
def __init__(
self,
chunk_size: int = 512,
chunk_overlap: int = 50,
max_chunk_size: int = 2048,
min_chunk_size: int = 100
):
"""Initialize the Regulation Chunker instance."""
2026-05-14 15:07:34 +08:00
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.max_chunk_size = max_chunk_size
self.min_chunk_size = min_chunk_size
def chunk_document(
self,
markdown_text: str,
doc_id: str = "",
doc_name: str = "",
regulation_type: str = "",
version: str = ""
) -> List[TextChunk]:
"""Handle chunk document for the Regulation Chunker instance."""
2026-05-14 15:07:34 +08:00
logger.info(f"开始分块文档: {doc_name}")
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
sections = self._split_by_sections(markdown_text)
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
chunks = []
global_position = 0
for section_num, section_title, section_content, section_start in sections:
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
clause_chunks = self._split_by_clauses(
section_content,
section_num,
section_title,
section_start + global_position
)
for chunk_content, clause_num, clause_title, start_pos, end_pos in clause_chunks:
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if len(chunk_content) > self.max_chunk_size:
sub_chunks = self._split_long_clause(
chunk_content,
clause_num,
clause_title
)
for sub_content, sub_start, sub_end in sub_chunks:
chunk = self._create_chunk(
sub_content,
doc_id,
doc_name,
section_num,
section_title,
clause_num,
sub_start + start_pos,
sub_end + start_pos,
regulation_type,
version
)
chunks.append(chunk)
else:
chunk = self._create_chunk(
chunk_content,
doc_id,
doc_name,
section_num,
section_title,
clause_num,
start_pos,
end_pos,
regulation_type,
version
)
chunks.append(chunk)
logger.success(f"分块完成,共{len(chunks)}个chunk")
return chunks
def _split_by_sections(self, markdown_text: str) -> List[Tuple[str, str, str, int]]:
"""Handle split by sections for this module for the Regulation Chunker instance."""
2026-05-14 15:07:34 +08:00
sections = []
lines = markdown_text.split('\n')
current_section_num = ""
current_section_title = ""
current_section_content = []
current_section_start = 0
for i, line in enumerate(lines):
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
chapter_match = self.CHAPTER_PATTERN.match(line.strip())
section_match = self.SECTION_PATTERN.match(line.strip())
if chapter_match or section_match:
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if current_section_content:
content = '\n'.join(current_section_content)
sections.append((
current_section_num,
current_section_title,
content,
current_section_start
))
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
current_section_start = sum(len(l) + 1 for l in lines[:i])
current_section_content = []
if chapter_match:
current_section_num = line.strip()
current_section_title = self._extract_title(line.strip())
else:
current_section_num = line.strip()
current_section_title = self._extract_title(line.strip())
current_section_content.append(line)
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if current_section_content:
content = '\n'.join(current_section_content)
sections.append((
current_section_num,
current_section_title,
content,
current_section_start
))
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if not sections:
sections.append((
"",
"全文",
markdown_text,
0
))
return sections
def _split_by_clauses(
self,
section_content: str,
section_num: str,
section_title: str,
section_start: int
) -> List[Tuple[str, str, str, int, int]]:
"""Handle split by clauses for this module for the Regulation Chunker instance."""
2026-05-14 15:07:34 +08:00
clauses = []
lines = section_content.split('\n')
current_clause_num = ""
current_clause_title = ""
current_clause_content = []
current_clause_start = section_start
for i, line in enumerate(lines):
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
clause_match = self.CLAUSE_PATTERN.match(line.strip())
if clause_match:
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if current_clause_content:
content = '\n'.join(current_clause_content)
end_pos = current_clause_start + len(content)
clauses.append((
content,
current_clause_num,
current_clause_title,
current_clause_start,
end_pos
))
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
current_clause_start = section_start + sum(len(l) + 1 for l in lines[:i])
current_clause_content = []
current_clause_num = self._extract_clause_number(line.strip())
current_clause_title = line.strip()
current_clause_content.append(line)
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if current_clause_content:
content = '\n'.join(current_clause_content)
end_pos = current_clause_start + len(content)
clauses.append((
content,
current_clause_num,
current_clause_title,
current_clause_start,
end_pos
))
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if not clauses:
clauses.append((
section_content,
"",
section_title,
section_start,
section_start + len(section_content)
))
return clauses
def _split_long_clause(
self,
content: str,
clause_num: str,
clause_title: str
) -> List[Tuple[str, int, int]]:
"""Handle split long clause for this module for the Regulation Chunker instance."""
2026-05-14 15:07:34 +08:00
sub_chunks = []
lines = content.split('\n')
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
has_sub_items = any(
self.SUB_ITEM_PATTERN.match(line.strip()) or
self.NUMBER_ITEM_PATTERN.match(line.strip())
for line in lines
)
if has_sub_items:
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
current_sub_content = []
current_sub_start = 0
for i, line in enumerate(lines):
is_sub_item = (
self.SUB_ITEM_PATTERN.match(line.strip()) or
self.NUMBER_ITEM_PATTERN.match(line.strip())
)
if is_sub_item and current_sub_content:
sub_content = '\n'.join(current_sub_content)
sub_end = current_sub_start + len(sub_content)
if len(sub_content) >= self.min_chunk_size:
sub_chunks.append((sub_content, current_sub_start, sub_end))
current_sub_content = []
current_sub_start = sum(len(l) + 1 for l in lines[:i])
current_sub_content.append(line)
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if current_sub_content:
sub_content = '\n'.join(current_sub_content)
sub_end = current_sub_start + len(sub_content)
sub_chunks.append((sub_content, current_sub_start, sub_end))
else:
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
paragraphs = []
current_para = []
for line in lines:
if line.strip():
current_para.append(line)
else:
if current_para:
paragraphs.append('\n'.join(current_para))
current_para = []
if current_para:
paragraphs.append('\n'.join(current_para))
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
current_chunk = []
current_length = 0
chunk_start = 0
for para in paragraphs:
if current_length + len(para) > self.chunk_size and current_chunk:
chunk_content = '\n'.join(current_chunk)
chunk_end = chunk_start + len(chunk_content)
sub_chunks.append((chunk_content, chunk_start, chunk_end))
current_chunk = []
current_length = 0
chunk_start = chunk_end
current_chunk.append(para)
current_length += len(para)
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
if current_chunk:
chunk_content = '\n'.join(current_chunk)
chunk_end = chunk_start + len(chunk_content)
sub_chunks.append((chunk_content, chunk_start, chunk_end))
return sub_chunks
def _extract_title(self, header_line: str) -> str:
"""Handle extract title for this module for the Regulation Chunker instance."""
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
title = re.sub(r'^第[一二三四五六七八九十百]+[章节]\s+', '', header_line)
return title.strip()
def _extract_clause_number(self, clause_line: str) -> str:
"""Handle extract clause number for this module for the Regulation Chunker instance."""
2026-05-14 15:07:34 +08:00
match = self.CLAUSE_PATTERN.match(clause_line)
if match:
return match.group(0).strip()
return ""
def _create_chunk(
self,
content: str,
doc_id: str,
doc_name: str,
section_num: str,
section_title: str,
clause_num: str,
start_pos: int,
end_pos: int,
regulation_type: str,
version: str
) -> TextChunk:
"""Handle create chunk for this module for the Regulation Chunker instance."""
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
content = content.strip()
# Keep service responsibilities explicit so downstream behavior stays predictable.
token_count = int(len(content) * 0.7) # Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
# Keep service responsibilities explicit so downstream behavior stays predictable.
2026-05-14 15:07:34 +08:00
chunk_id = f"{doc_id}_{section_num}_{clause_num}_{start_pos}"
metadata = ChunkMetadata(
doc_id=doc_id,
doc_name=doc_name,
chunk_id=chunk_id,
section_number=section_num,
section_title=section_title,
clause_number=clause_num,
start_position=start_pos,
end_position=end_pos,
regulation_type=regulation_type,
version=version
)
return TextChunk(
content=content,
metadata=metadata,
token_count=token_count
)
def chunk_regulation_document(
markdown_text: str,
doc_id: str = "",
doc_name: str = "",
regulation_type: str = "",
version: str = "",
chunk_size: int = 512
) -> List[TextChunk]:
"""Handle chunk regulation document."""
2026-05-14 15:07:34 +08:00
chunker = RegulationChunker(chunk_size=chunk_size)
return chunker.chunk_document(
markdown_text,
doc_id,
doc_name,
regulation_type,
version
)