Files
catonline_ai/vw-document-ai-indexer/third_level_service.py

79 lines
2.5 KiB
Python
Raw Permalink Normal View History

2025-09-26 17:15:54 +08:00
"""
Third-level heading recommendation - report the number of '#' characters most frequently used by third-level (x.x.x) headings
"""
from collections import Counter
from typing import Dict, Any, List
import re
# A Markdown ATX heading: 1-6 '#' characters, whitespace, then the title text.
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')

# Numeric third-level number "x.x.x", tolerating whitespace around the dots
# ("1 . 2 . 3", "1. 2. 3"). The lookaheads reject a deeper level:
#   * a digit or dot directly after the third number ("1.2.3.4", "1.2.3."), and
#   * a fourth numeric component separated by whitespace ("1 . 2 . 3 . 4"),
#     which the whitespace tolerance above would otherwise let through.
_NUMERIC_THIRD_LEVEL_RE = re.compile(
    r'^\d+\s*\.\s*\d+\s*\.\s*\d+(?![\d.])(?!\s*\.\s*\d)'
)

# Letter-prefixed number "A.x.x.x" (matched without whitespace tolerance);
# it is deliberately counted as a third-level heading.
_LETTER_THIRD_LEVEL_RE = re.compile(r'^[A-Z]\.\d+\.\d+\.\d+(?![\d.])')


def get_third_level_hash_counts_simple(content: str) -> List[int]:
    """Collect the '#' count of every third-level numbered Markdown heading.

    A heading line is ``#`` repeated 1-6 times, whitespace, then a title.
    The title counts as third level when it starts with either a numeric
    ``x.x.x`` number (whitespace around the dots is tolerated) or a
    letter-prefixed ``A.x.x.x`` number. Deeper numbers such as ``1.2.3.4``
    or ``1 . 2 . 3 . 4`` are not counted. Lines inside fenced code blocks
    delimited by lines starting with three backticks are ignored.

    Args:
        content: Markdown text to scan.

    Returns:
        The number of '#' characters of each matching heading, in document
        order. Empty when no third-level heading is found.
    """
    hash_counts: List[int] = []
    in_code_block = False

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # A fence line toggles code-block state and is never a heading itself.
        if line.startswith('```'):
            in_code_block = not in_code_block
            continue
        if in_code_block:
            continue

        heading = _HEADING_RE.match(line)
        if heading is None:
            continue

        title_text = heading.group(2).strip()
        if (_NUMERIC_THIRD_LEVEL_RE.match(title_text)
                or _LETTER_THIRD_LEVEL_RE.match(title_text)):
            hash_counts.append(len(heading.group(1)))

    return hash_counts
def get_recommended_hash_count_simple(content: str) -> Dict[str, Any]:
    """Recommend the '#' count to use for third-level headings.

    Gathers the '#' counts of the third-level headings found in ``content``
    (via ``get_third_level_hash_counts_simple``) and picks the count that
    occurs most often.

    Args:
        content: Markdown text to analyse.

    Returns:
        A dict with the keys:
          * ``recommendation``: the most frequent '#' count, or 5 when no
            third-level heading was detected.
          * ``reason``: human-readable explanation of the choice.
          * ``statistics``: mapping of '#' count -> occurrences (empty when
            nothing was detected).
          * ``total_count``: number of third-level headings considered.
    """
    levels = get_third_level_hash_counts_simple(content)
    observed = len(levels)

    # Nothing to vote on: fall back to the fixed default.
    if observed == 0:
        return {
            'recommendation': 5,  # Default value
            'reason': 'No third-level headings detected, using default value',
            'statistics': {},
            'total_count': 0
        }

    # Tally how often each '#' count is used and take the top entry.
    tally = Counter(levels)
    winner, hits = tally.most_common(1)[0]
    share = hits / observed * 100

    return {
        'recommendation': winner,
        'reason': f'Most frequently used: {hits}/{observed} times ({share:.1f}%)',
        'statistics': dict(tally),
        'total_count': observed
    }