# catonline_ai/vw-document-ai-indexer/third_level_service.py

"""
Third-level heading recommendation algorithm - reports the number of '#' characters most frequently used by third-level headings
"""

from collections import Counter
from typing import Dict, Any, List
import re

def get_third_level_hash_counts_simple(content: str) -> List[int]:
    """Collect the '#' counts of every third-level numbered Markdown heading.

    A heading is a line (after stripping surrounding whitespace) made of one
    to six '#' characters, whitespace, and a title. Its title is considered
    third-level when it starts with either:

    * three dot-separated numbers, optionally with whitespace around the
      dots: "1.2.3", "1 . 2 . 3", "1. 2. 3"
    * a capital letter followed by three dot-separated numbers: "A.1.2.3"

    Deeper numbering ("1.2.3.4", "1 . 2 . 3 . 4", "A.1.2.3.4") and a trailing
    dot ("1.2.3.") are not treated as third-level. Blank lines and lines
    between ``` fence markers are ignored.

    Args:
        content: Markdown text to scan.

    Returns:
        The number of '#' characters of each matching heading, in document
        order. Empty when no third-level heading is found.
    """
    # Compile the loop-invariant patterns once, before scanning the lines.
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')

    # The last number must not be followed by another digit or a dot, nor by
    # a spaced-out ". <digit>" continuation. Without the second lookahead,
    # "1 . 2 . 3 . 4" would be mistaken for a third-level heading because the
    # character right after "3" is a space.
    numbering_end = r'(?![\d.])(?!\s*\.\s*\d)'
    numeric_re = re.compile(r'^\d+\s*\.\s*\d+\s*\.\s*\d+' + numbering_end)
    lettered_re = re.compile(r'^[A-Z]\.\d+\.\d+\.\d+' + numbering_end)

    hash_counts = []
    in_code_block = False

    for line in content.split('\n'):
        line = line.strip()

        if not line:
            continue

        # A fence marker toggles the code-block state and is never a heading.
        if line.startswith('```'):
            in_code_block = not in_code_block
            continue

        # '#' lines inside a code block are code/comments, not headings.
        if in_code_block:
            continue

        match = heading_re.match(line)
        if not match:
            continue

        title_text = match.group(2).strip()

        # Numeric format x.x.x, or letter+number format A.x.x.x.
        if numeric_re.match(title_text) or lettered_re.match(title_text):
            hash_counts.append(len(match.group(1)))

    return hash_counts

def get_recommended_hash_count_simple(content: str) -> Dict[str, Any]:
    """Recommend the '#' count to use for third-level headings.

    The recommendation is the '#' count that occurs most often among the
    third-level headings found by ``get_third_level_hash_counts_simple``.

    Args:
        content: Markdown text to analyse.

    Returns:
        A dict with the keys:

        * ``recommendation`` - the recommended '#' count (5 when no
          third-level heading was detected).
        * ``reason`` - human-readable explanation of the choice.
        * ``statistics`` - mapping of '#' count to number of occurrences
          (empty when nothing was detected).
        * ``total_count`` - number of third-level headings detected.
    """
    detected_levels = get_third_level_hash_counts_simple(content)
    total = len(detected_levels)

    # Nothing to vote on: fall back to the default value.
    if total == 0:
        return {
            'recommendation': 5,
            'reason': 'No third-level headings detected, using default value',
            'statistics': {},
            'total_count': 0,
        }

    # Tally how often each '#' count is used and take the top entry.
    tally = Counter(detected_levels)
    [(best_level, hits)] = tally.most_common(1)
    share = hits / total * 100

    return {
        'recommendation': best_level,
        'reason': f'Most frequently used: {hits}/{total} times ({share:.1f}%)',
        'statistics': dict(tally),
        'total_count': total,
    }