init
This commit is contained in:
78
vw-document-ai-indexer/third_level_service.py
Normal file
78
vw-document-ai-indexer/third_level_service.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""
|
||||
Third-level heading recommendation - report the number of '#' characters most frequently used for third-level headings
|
||||
"""
|
||||
|
||||
from collections import Counter
|
||||
from typing import Dict, Any, List
|
||||
import re
|
||||
|
||||
def get_third_level_hash_counts_simple(content: str) -> List[int]:
    """Collect the '#' counts of Markdown headings numbered as third level.

    Each line is stripped and scanned in order. Lines inside fenced code
    blocks are ignored: any stripped line starting with three backticks
    toggles the "inside a fence" state and is itself skipped. A heading is a
    line made of 1-6 '#' characters, whitespace, and a non-empty title.

    A title counts as third level when it starts with one of:

    * three dot-separated numbers, with optional whitespace around the dots:
      "1.2.3", "1 . 2 . 3", "1. 2. 3";
    * an uppercase ASCII letter followed by three dot-separated numbers:
      "A.1.2.3".

    A trailing dot after the number ("1.2.3. Title") is accepted. A title
    whose number continues with a further ".<digit>" component (optionally
    space-padded, e.g. "1.2.3.4" or "1 . 2 . 3 . 4") is a deeper level and is
    not counted.

    Args:
        content: Markdown text to scan.

    Returns:
        The number of '#' characters of every matching heading, in document
        order. Empty when no third-level heading is found.
    """
    # Shared tail: the last number must be complete (no backtracking into a
    # shorter number) and must not be followed by another ".<digit>" part.
    not_deeper = r'(?!\d)(?!\s*\.\s*\d)'
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    numeric_re = re.compile(r'^\d+\s*\.\s*\d+\s*\.\s*\d+' + not_deeper)
    lettered_re = re.compile(r'^[A-Z]\.\d+\.\d+\.\d+' + not_deeper)

    hash_counts = []
    in_code_block = False

    for line in content.split('\n'):
        line = line.strip()
        if not line:
            continue

        # A fence line opens or closes a code block and is never a heading.
        if line.startswith('```'):
            in_code_block = not in_code_block
            continue
        if in_code_block:
            continue

        match = heading_re.match(line)
        if not match:
            continue

        title_text = match.group(2).strip()
        if numeric_re.match(title_text) or lettered_re.match(title_text):
            hash_counts.append(len(match.group(1)))

    return hash_counts
|
||||
|
||||
def get_recommended_hash_count_simple(content: str) -> Dict[str, Any]:
    """Recommend how many '#' characters a third-level heading should use.

    The recommendation is the value that occurs most often in the list
    returned by get_third_level_hash_counts_simple(content).

    Args:
        content: Markdown text to analyse.

    Returns:
        A dict with the keys:
          - 'recommendation': the most frequent '#' count, or 5 when the
            list of counts is empty;
          - 'reason': a human-readable explanation of the choice;
          - 'statistics': mapping of '#' count -> number of occurrences
            (empty when nothing was detected);
          - 'total_count': how many counts were collected.
    """
    levels = get_third_level_hash_counts_simple(content)
    total = len(levels)

    # Nothing detected: fall back to the default value.
    if total == 0:
        return {
            'recommendation': 5,
            'reason': 'No third-level headings detected, using default value',
            'statistics': {},
            'total_count': 0
        }

    tally = Counter(levels)
    # most_common(1) yields exactly one (value, occurrences) pair here.
    (winner, hits), = tally.most_common(1)
    share = hits / total * 100

    return {
        'recommendation': winner,
        'reason': f'Most frequently used: {hits}/{total} times ({share:.1f}%)',
        'statistics': dict(tally),
        'total_count': total
    }
|
||||
Reference in New Issue
Block a user