# AIRegulation-DocAnalysis/backend/app/services/embedding/text_chunker.py

"""Structure-aware text chunking for Chinese regulation documents."""

import re
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, field
from loguru import logger
# Keep service responsibilities explicit so downstream behavior stays predictable.


@dataclass
class ChunkMetadata:
    """Describe where a text chunk came from and where it sits in its source.

    Every field has an empty default (``""`` or ``0``), so an instance can be
    created without arguments and filled in selectively.
    """
    # Identity of the source document and of the chunk itself.
    doc_id: str = ""
    doc_name: str = ""
    chunk_id: str = ""
    # Heading context the chunk was found under.
    section_number: str = ""
    section_title: str = ""
    clause_number: str = ""
    # Not populated by the chunker in this module; stays at its default.
    page_number: int = 0
    # Character offsets of the chunk within the source text.
    start_position: int = 0
    end_position: int = 0
    # Classification and revision labels passed through from the caller.
    regulation_type: str = ""
    version: str = ""


@dataclass
class TextChunk:
    """Pair a piece of chunk text with its metadata and a token estimate."""
    # The chunk text itself.
    content: str
    # Provenance and position information for ``content``.
    metadata: ChunkMetadata
    # Token count for ``content``; defaults to 0 when the caller supplies none.
    token_count: int = 0


class RegulationChunker:
    """Split Chinese regulation text into structure-aligned chunks.

    The text is cut first at chapter/section headings (第X章 / 第X节) and then
    at clause headings (第X条). A clause longer than ``max_chunk_size``
    characters is divided further: at enumerated sub-items when it has any,
    at blank-line separated paragraphs otherwise. All sizes and positions are
    measured in characters.
    """

    # Heading patterns; each is matched against a whitespace-stripped line.
    # The numeral class includes 零 and 千 so that headings such as
    # 第一百零一条 are recognised instead of being merged into the previous clause.
    CHAPTER_PATTERN = re.compile(r'^第[零一二三四五六七八九十百千]+章\s+[^\n]+')
    SECTION_PATTERN = re.compile(r'^第[零一二三四五六七八九十百千]+节\s+[^\n]+')
    CLAUSE_PATTERN = re.compile(r'^第[零一二三四五六七八九十百千]+条\s')

    # Enumerated sub-items inside a clause: "(一) ..." / "（一） ..." and
    # "1. ..." / "1、 ...". Both require whitespace after the marker.
    SUB_ITEM_PATTERN = re.compile(r'^[\(（][一二三四五六七八九十]+[\)）]\s')
    NUMBER_ITEM_PATTERN = re.compile(r'^[\d]+[\.、]\s')

    # Numbering prefix of a chapter/section heading; removed to get the title.
    _HEADING_PREFIX_PATTERN = re.compile(r'^第[零一二三四五六七八九十百千]+[章节]\s+')

    def __init__(
        self,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        max_chunk_size: int = 2048,
        min_chunk_size: int = 100
    ):
        """Store the chunking limits (all in characters).

        Args:
            chunk_size: Target size when paragraphs of a long clause are
                grouped together.
            chunk_overlap: Characters shared by consecutive windows when a
                single oversized unit has to be cut by length.
            max_chunk_size: A clause longer than this is split further, and
                no length-based window exceeds it.
            min_chunk_size: Sub-items are merged until a piece reaches at
                least this size.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size

    def chunk_document(
        self,
        markdown_text: str,
        doc_id: str = "",
        doc_name: str = "",
        regulation_type: str = "",
        version: str = ""
    ) -> "List[TextChunk]":
        """Split a whole document into chunks with metadata.

        Args:
            markdown_text: Full text of the document.
            doc_id: Identifier copied into every chunk's metadata and id.
            doc_name: Display name copied into the metadata and logged.
            regulation_type: Label copied into the metadata unchanged.
            version: Label copied into the metadata unchanged.

        Returns:
            Chunks in document order. Pieces that contain only whitespace are
            skipped, so an empty document yields an empty list.
        """
        logger.info(f"开始分块文档: {doc_name}")

        chunks = []
        for section_num, section_title, section_content, section_start in (
            self._split_by_sections(markdown_text)
        ):
            clauses = self._split_by_clauses(
                section_content,
                section_num,
                section_title,
                section_start
            )

            for clause_content, clause_num, clause_title, start_pos, end_pos in clauses:
                if len(clause_content) > self.max_chunk_size:
                    # Sub-piece offsets are relative to the clause; shift them
                    # so they are relative to the document.
                    pieces = [
                        (text, start_pos + rel_start, start_pos + rel_end)
                        for text, rel_start, rel_end in self._split_long_clause(
                            clause_content,
                            clause_num,
                            clause_title
                        )
                    ]
                else:
                    pieces = [(clause_content, start_pos, end_pos)]

                for text, piece_start, piece_end in pieces:
                    # A blank piece carries nothing worth indexing.
                    if not text.strip():
                        continue
                    chunks.append(self._create_chunk(
                        text,
                        doc_id,
                        doc_name,
                        section_num,
                        section_title,
                        clause_num,
                        piece_start,
                        piece_end,
                        regulation_type,
                        version
                    ))

        logger.success(f"分块完成，共{len(chunks)}个chunk")
        return chunks

    def _split_by_sections(self, markdown_text: str) -> List[Tuple[str, str, str, int]]:
        """Cut the text at chapter and section headings.

        Returns:
            ``(heading_line, title, content, start_offset)`` tuples in order.
            ``content`` starts with the heading line itself. Text before the
            first heading is returned with an empty heading and title. When
            the text has no heading at all, a single section titled "全文"
            covering the whole text is returned.
        """
        sections = []
        heading = ""
        title = ""
        buffered = []
        start = 0
        offset = 0  # running character offset of the current line
        found_heading = False

        for line in markdown_text.split('\n'):
            stripped = line.strip()
            if self.CHAPTER_PATTERN.match(stripped) or self.SECTION_PATTERN.match(stripped):
                found_heading = True
                if buffered:
                    sections.append((heading, title, '\n'.join(buffered), start))
                heading = stripped
                title = self._extract_title(stripped)
                buffered = []
                start = offset
            buffered.append(line)
            offset += len(line) + 1  # +1 for the newline removed by split

        if not found_heading:
            return [("", "全文", markdown_text, 0)]

        sections.append((heading, title, '\n'.join(buffered), start))
        return sections

    def _split_by_clauses(
        self,
        section_content: str,
        section_num: str,
        section_title: str,
        section_start: int
    ) -> List[Tuple[str, str, str, int, int]]:
        """Cut one section at clause headings (第X条).

        Args:
            section_content: Text of the section.
            section_num: Heading line of the section (not used here).
            section_title: Title reported when the section has no clause.
            section_start: Document offset of ``section_content``.

        Returns:
            ``(content, clause_number, clause_line, start, end)`` tuples with
            document-relative offsets. Text before the first clause heading
            is returned with an empty clause number. When no clause heading
            is found, the whole section is returned as one entry whose third
            element is ``section_title``.
        """
        clauses = []
        number = ""
        title = ""
        buffered = []
        start = section_start
        offset = section_start  # running document offset of the current line
        found_clause = False

        for line in section_content.split('\n'):
            stripped = line.strip()
            if self.CLAUSE_PATTERN.match(stripped):
                found_clause = True
                if buffered:
                    text = '\n'.join(buffered)
                    clauses.append((text, number, title, start, start + len(text)))
                number = self._extract_clause_number(stripped)
                title = stripped
                buffered = []
                start = offset
            buffered.append(line)
            offset += len(line) + 1

        if not found_clause:
            return [(
                section_content,
                "",
                section_title,
                section_start,
                section_start + len(section_content)
            )]

        text = '\n'.join(buffered)
        clauses.append((text, number, title, start, start + len(text)))
        return clauses

    def _split_long_clause(
        self,
        content: str,
        clause_num: str,
        clause_title: str
    ) -> List[Tuple[str, int, int]]:
        """Divide an oversized clause into smaller pieces.

        Enumerated sub-items are used as cut points when the clause has any;
        otherwise blank-line separated paragraphs are grouped up to
        ``chunk_size``. ``clause_num`` and ``clause_title`` are accepted for
        interface compatibility and are not used.

        Returns:
            ``(text, start, end)`` tuples whose offsets are relative to
            ``content``.
        """
        lines = content.split('\n')
        item_flags = [
            bool(
                self.SUB_ITEM_PATTERN.match(stripped) or
                self.NUMBER_ITEM_PATTERN.match(stripped)
            )
            for stripped in (line.strip() for line in lines)
        ]

        if any(item_flags):
            return self._split_at_sub_items(lines, item_flags)
        return self._split_at_paragraphs(lines)

    def _split_at_sub_items(
        self,
        lines: List[str],
        item_flags: List[bool]
    ) -> List[Tuple[str, int, int]]:
        """Cut at sub-item markers, merging pieces shorter than ``min_chunk_size``."""
        pieces = []
        pending = []
        pending_start = 0
        offset = 0

        for line, starts_item in zip(lines, item_flags):
            if starts_item and pending:
                text = '\n'.join(pending)
                # Flush only once the piece is large enough. A short piece is
                # kept and merged with the following item so no text is lost.
                if len(text) >= self.min_chunk_size:
                    pieces.extend(self._window_split(text, pending_start))
                    pending = []
                    pending_start = offset
            pending.append(line)
            offset += len(line) + 1

        if pending:
            pieces.extend(self._window_split('\n'.join(pending), pending_start))
        return pieces

    def _split_at_paragraphs(self, lines: List[str]) -> List[Tuple[str, int, int]]:
        """Group blank-line separated paragraphs into pieces of about ``chunk_size``.

        Paragraphs in one piece are joined with a single newline, so the
        reported ``start``/``end`` span the original region (blank lines
        included) and may be wider than the piece text.
        """
        # Collect each paragraph together with its true offsets.
        paragraphs = []
        pending = []
        pending_start = 0
        offset = 0
        for line in lines:
            if line.strip():
                if not pending:
                    pending_start = offset
                pending.append(line)
            elif pending:
                text = '\n'.join(pending)
                paragraphs.append((text, pending_start, pending_start + len(text)))
                pending = []
            offset += len(line) + 1
        if pending:
            text = '\n'.join(pending)
            paragraphs.append((text, pending_start, pending_start + len(text)))

        pieces = []
        group = []
        group_length = 0
        for text, start, end in paragraphs:
            oversized = 0 < self.max_chunk_size < len(text)
            if group and (oversized or group_length + len(text) > self.chunk_size):
                pieces.append(self._merge_group(group))
                group = []
                group_length = 0
            if oversized:
                # A single paragraph beyond the hard limit is cut by length.
                pieces.extend(self._window_split(text, start))
                continue
            group.append((text, start, end))
            group_length += len(text)

        if group:
            pieces.append(self._merge_group(group))
        return pieces

    @staticmethod
    def _merge_group(group: List[Tuple[str, int, int]]) -> Tuple[str, int, int]:
        """Join consecutive paragraphs into one piece spanning first start to last end."""
        return (
            '\n'.join(text for text, _, _ in group),
            group[0][1],
            group[-1][2]
        )

    def _window_split(self, text: str, base_offset: int) -> List[Tuple[str, int, int]]:
        """Cut ``text`` into overlapping windows no longer than ``max_chunk_size``.

        Text within the limit (or any text when the limit is not positive) is
        returned as a single piece. Consecutive windows share
        ``chunk_overlap`` characters; the overlap is clamped so the window
        always advances.
        """
        size = self.max_chunk_size
        if size <= 0 or len(text) <= size:
            return [(text, base_offset, base_offset + len(text))]

        overlap = min(max(0, self.chunk_overlap), size - 1)
        step = size - overlap
        windows = []
        start = 0
        while True:
            end = min(start + size, len(text))
            windows.append((text[start:end], base_offset + start, base_offset + end))
            if end == len(text):
                break
            start += step
        return windows

    def _extract_title(self, header_line: str) -> str:
        """Return a chapter/section heading without its 第X章 / 第X节 prefix."""
        return self._HEADING_PREFIX_PATTERN.sub('', header_line).strip()

    def _extract_clause_number(self, clause_line: str) -> str:
        """Return the leading 第X条 marker of a clause line, or "" if absent."""
        match = self.CLAUSE_PATTERN.match(clause_line)
        if match:
            return match.group(0).strip()
        return ""

    def _create_chunk(
        self,
        content: str,
        doc_id: str,
        doc_name: str,
        section_num: str,
        section_title: str,
        clause_num: str,
        start_pos: int,
        end_pos: int,
        regulation_type: str,
        version: str
    ) -> "TextChunk":
        """Build a ``TextChunk`` with its metadata.

        The content is stripped of surrounding whitespace, the token count is
        estimated as 0.7 tokens per character, and the chunk id is composed
        from the document id, section heading, clause number and start offset.
        """
        content = content.strip()

        # Rough estimate: 0.7 tokens per character.
        token_count = int(len(content) * 0.7)

        chunk_id = f"{doc_id}_{section_num}_{clause_num}_{start_pos}"

        metadata = ChunkMetadata(
            doc_id=doc_id,
            doc_name=doc_name,
            chunk_id=chunk_id,
            section_number=section_num,
            section_title=section_title,
            clause_number=clause_num,
            start_position=start_pos,
            end_position=end_pos,
            regulation_type=regulation_type,
            version=version
        )

        return TextChunk(
            content=content,
            metadata=metadata,
            token_count=token_count
        )


def chunk_regulation_document(
    markdown_text: str,
    doc_id: str = "",
    doc_name: str = "",
    regulation_type: str = "",
    version: str = "",
    chunk_size: int = 512
) -> List[TextChunk]:
    """Chunk a document with a one-off ``RegulationChunker``.

    A chunker is created with the given ``chunk_size`` (its other limits keep
    their defaults) and the remaining arguments are forwarded unchanged to
    ``RegulationChunker.chunk_document``, whose result is returned.
    """
    return RegulationChunker(chunk_size=chunk_size).chunk_document(
        markdown_text=markdown_text,
        doc_id=doc_id,
        doc_name=doc_name,
        regulation_type=regulation_type,
        version=version,
    )