""" Fixed the problem of mismatch between the upper and lower titles in MD documents. Solve the problem that the # number of the lower title is raised to the same as the upper title, or is higher than the upper title. """ import re from typing import Any, List, Dict, Optional class HeaderInfo: """Title information""" def __init__(self, line_number: int, original_line: str, hash_count: int, level: int, number_pattern: str, title_text: str): self.line_number = line_number self.original_line = original_line self.hash_count = hash_count self.level = level self.number_pattern = number_pattern self.title_text = title_text self.correct_hash_count = hash_count # Will be updated by Fixer class HierarchyFixer: """Special fixer for title hierarchy # number mismatch issues""" def __init__(self): # Number pattern matching - supports both formats with and without trailing dots self.number_patterns = [ r'^(\d+)\.?$', # 1 or 1. r'^(\d+)\.(\d+)\.?$', # 1.1 or 1.1. r'^(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1 or 1.1.1. r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1 or 1.1.1.1. r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1.1 or 1.1.1.1.1. r'^(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.?$', # 1.1.1.1.1.1 or 1.1.1.1.1.1. ] # Letter+number pattern matching - supports both "A.x.x.x" and "C. x.x.x" formats self.letter_number_patterns = [ # Single letter: A, B, C (followed by space or end) (r'^([A-Z])(?:\s|$)', 1), # Letter + space + numbers: "C. 1", "A. 2" (r'^([A-Z])\.\s+(\d+)(?:\s|$)', 2), (r'^([A-Z])\.\s+(\d+)\.(\d+)(?:\s|$)', 3), # C. 1.1, A. 2.3 (r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)(?:\s|$)', 4), # C. 1.1.1, A. 2.3.4 (r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$)', 5), # C. 1.1.1.1, A. 2.3.4.5 (r'^([A-Z])\.\s+(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$)', 6), # C. 1.1.1.1.1, A. 2.3.4.5.6 # Compact format (no space): A.1, A.1.2, A.1.2.3 etc. (r'^([A-Z])\.(\d+)(?:\s|$|[^\d\.])', 2), # A.1, A.2 (r'^([A-Z])\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 3), # A.1.2, A.1.3 (r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 4), # A.1.2.3 (r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 5), # A.1.2.3.4 (r'^([A-Z])\.(\d+)\.(\d+)\.(\d+)\.(\d+)\.(\d+)(?:\s|$|[^\d\.])', 6), # A.1.2.3.4.5 ] def detect_headers(self, content: str) -> List[HeaderInfo]: """Detect all headers and determine their logical levels""" lines = content.split('\n') headers: List[HeaderInfo] = [] for line_num, line in enumerate(lines): if line.strip().startswith('#'): header_info = self._parse_header_line(line_num, line) if header_info: headers.append(header_info) return headers def _parse_header_line(self, line_num: int, line: str) -> Optional[HeaderInfo]: """Analyze the title line""" line = line.strip() # Count the number of # characters hash_count = 0 for char in line: if char == '#': hash_count += 1 else: break if hash_count == 0: return None # Extract title content title_content = line[hash_count:].strip() # Try to match number pattern level = 1 number_pattern = "" # Check for letter+number patterns first (A.1.2.3 format) for pattern, expected_level in self.letter_number_patterns: match = re.match(pattern, title_content) if match: level = expected_level # Extract the complete matched numbering pattern matched_text = match.group(0) # For space-separated patterns like "C. 1.1", we need to extract the full pattern if '. ' in matched_text: # This is a space-separated pattern like "C. 
1.1" # The match already contains the complete pattern we want number_pattern = matched_text.rstrip() # Remove trailing space if any else: # This is a compact pattern like "A.1.2.3" number_pattern = matched_text return HeaderInfo( line_number=line_num, original_line=line, hash_count=hash_count, level=level, number_pattern=number_pattern, title_text=title_content ) # If no letter+number pattern, try traditional number patterns if title_content: # First, try to identify and extract the complete numbering part # Look for patterns like "1.2.3", "1 . 2 . 3", "1. 2. 3", etc. words = title_content.split() numbering_words = [] # Collect words that could be part of the numbering (digits, dots, spaces) for word in words: if re.match(r'^[\d\.]+$', word) or word == '.': numbering_words.append(word) else: break # Stop at first non-numbering word if numbering_words: # Join and normalize the numbering part numbering_text = ' '.join(numbering_words) # Normalize: "1 . 2 . 3" -> "1.2.3", "1. 2. 3" -> "1.2.3" normalized = re.sub(r'\s*\.\s*', '.', numbering_text) normalized = re.sub(r'\.+$', '', normalized) # Remove trailing dots normalized = normalized.strip() # Try to match the normalized pattern for i, pattern in enumerate(self.number_patterns, 1): match = re.match(pattern, normalized) if match: level = i number_pattern = normalized break else: # If no numbering pattern found in separate words, try the first word directly first_word = words[0] if words else "" for i, pattern in enumerate(self.number_patterns, 1): match = re.match(pattern, first_word) if match: level = i number_pattern = match.group(0).rstrip('.') break # If no number pattern is found, infer level from # count if not number_pattern: level = hash_count return HeaderInfo( line_number=line_num, original_line=line, hash_count=hash_count, level=level, number_pattern=number_pattern, title_text=title_content ) def find_hierarchy_problems(self, headers: List[HeaderInfo]) -> List[Dict]: """Find problems with mismatched # counts using adaptive analysis""" problems = [] # 首先分析文档的自适应层级映射 level_hash_mapping = self._analyze_document_hash_pattern(headers) # 1. Check for level-hash mismatch based on adaptive mapping for header in headers: if header.number_pattern: # Only check numbered headers expected_hash_count = level_hash_mapping.get(header.level, header.level) if header.hash_count != expected_hash_count: problems.append({ 'type': 'level_hash_mismatch', 'line': header.line_number + 1, 'level': header.level, 'current_hash': header.hash_count, 'expected_hash': expected_hash_count, 'title': header.title_text[:50], 'pattern': header.number_pattern, 'problem': f"Level {header.level} header '{header.number_pattern}' uses {header.hash_count} #, but document pattern suggests {expected_hash_count} #" }) # 2. 
        # 2. Check for parent-child hierarchy violations
        for i in range(len(headers) - 1):
            current = headers[i]
            next_header = headers[i + 1]

            # Only consider headings with a clear numbering pattern
            if current.number_pattern and next_header.number_pattern:
                # A child heading's '#' count must be greater than its parent's
                if next_header.level > current.level:  # Child heading
                    expected_parent_hash = level_hash_mapping.get(current.level, current.level)
                    expected_child_hash = level_hash_mapping.get(next_header.level, next_header.level)

                    if next_header.hash_count <= current.hash_count:
                        problems.append({
                            'type': 'hierarchy_violation',
                            'parent_line': current.line_number + 1,
                            'parent_level': current.level,
                            'parent_hash': current.hash_count,
                            'parent_title': current.title_text[:50],
                            'child_line': next_header.line_number + 1,
                            'child_level': next_header.level,
                            'child_hash': next_header.hash_count,
                            'child_title': next_header.title_text[:50],
                            'problem': f"Child header ({next_header.level} level) # count "
                                       f"({next_header.hash_count}) should be greater than parent "
                                       f"header ({current.level} level, {current.hash_count} #). "
                                       f"Expected pattern: parent {expected_parent_hash}#, "
                                       f"child {expected_child_hash}#"
                        })

        # 3. Check for significant inconsistency within the same level (less strict)
        same_level_problems = self._find_same_level_inconsistency(headers)
        problems.extend(same_level_problems)

        return problems

    def _find_same_level_inconsistency(self, headers: List[HeaderInfo]) -> List[Dict]:
        """Find headings whose '#' count differs from their siblings at the same level."""
        problems = []

        # Group numbered headings by level
        level_groups = {}
        for header in headers:
            if header.number_pattern:  # Only numbered headings
                if header.level not in level_groups:
                    level_groups[header.level] = []
                level_groups[header.level].append(header)

        # Check the consistency of '#' counts within each level
        for level, group_headers in level_groups.items():
            if len(group_headers) < 2:
                continue  # Only one heading, nothing to compare

            # Count how often each '#' count is used within this level
            hash_count_stats = {}
            for header in group_headers:
                hash_count = header.hash_count
                if hash_count not in hash_count_stats:
                    hash_count_stats[hash_count] = []
                hash_count_stats[hash_count].append(header)

            # Different '#' counts appear at the same level
            if len(hash_count_stats) > 1:
                # Use the most common '#' count as the standard
                most_common_hash_count = max(hash_count_stats.keys(),
                                             key=lambda x: len(hash_count_stats[x]))

                # Report headings that deviate from the standard
                for hash_count, headers_with_this_count in hash_count_stats.items():
                    if hash_count != most_common_hash_count:
                        for header in headers_with_this_count:
                            problems.append({
                                'type': 'same_level_inconsistency',
                                'line': header.line_number + 1,
                                'level': header.level,
                                'current_hash': header.hash_count,
                                'expected_hash': most_common_hash_count,
                                'title': header.title_text[:50],
                                'pattern': header.number_pattern,
                                'problem': f"{header.level} level header uses {header.hash_count} #, "
                                           f"but the majority of siblings use {most_common_hash_count} #"
                            })

        return problems

    def fix_hierarchy(self, content: str) -> Dict[str, Any]:
        """Fix hierarchy issues in the given Markdown content."""
        headers = self.detect_headers(content)

        if not headers:
            return {
                'fixed_content': content,
                'problems_found': [],
                'fixes_applied': 0,
                'message': 'No headers detected'
            }

        # Check for problems
        problems = self.find_hierarchy_problems(headers)

        if not problems:
            return {
                'fixed_content': content,
                'problems_found': [],
                'fixes_applied': 0,
                'message': 'No hierarchy issues found'
            }

        # Apply fixes
        lines = content.split('\n')
        fixes_applied = 0

        # To ensure child headings use more '#' than their parents,
        # recalculate the '#' count for every heading
        fixed_headers = self._calculate_correct_hash_counts(headers)

        for header in fixed_headers:
            if header.hash_count != header.correct_hash_count:
                old_line = lines[header.line_number]
                new_hash = '#' * header.correct_hash_count
                # Replace the leading run of '#', preserving any leading whitespace
                new_line = re.sub(r'^(\s*)#+', lambda m: m.group(1) + new_hash, old_line)
                lines[header.line_number] = new_line
                fixes_applied += 1

        fixed_content = '\n'.join(lines)

        return {
            'fixed_content': fixed_content,
            'original_content': content,
            'problems_found': problems,
            'fixes_applied': fixes_applied,
            'fixed_headers': [(h.line_number + 1, h.hash_count, h.correct_hash_count, h.title_text[:30])
                              for h in fixed_headers if h.hash_count != h.correct_hash_count]
        }

    def _calculate_correct_hash_counts(self, headers: List[HeaderInfo]) -> List[HeaderInfo]:
        """Calculate the correct number of '#'s based on adaptive analysis of the document."""
        if not headers:
            return []

        # 1. Analyze how many '#'s each level uses in this document (adaptive analysis)
        level_hash_mapping = self._analyze_document_hash_pattern(headers)

        # Create copies carrying the corrected '#' counts
        fixed_headers: List[HeaderInfo] = []
        for header in headers:
            # Copy the original information
            fixed_header = HeaderInfo(
                line_number=header.line_number,
                original_line=header.original_line,
                hash_count=header.hash_count,
                level=header.level,
                number_pattern=header.number_pattern,
                title_text=header.title_text
            )

            if fixed_header.number_pattern:
                # For numbered headings, use the adaptive mapping
                if fixed_header.level in level_hash_mapping:
                    fixed_header.correct_hash_count = level_hash_mapping[fixed_header.level]
                else:
                    # Fallback: extrapolate from the existing pattern
                    fixed_header.correct_hash_count = self._extrapolate_hash_count(
                        fixed_header.level, level_hash_mapping)
            else:
                # For non-numbered headings, keep the original '#' count
                fixed_header.correct_hash_count = fixed_header.hash_count

            fixed_headers.append(fixed_header)

        return fixed_headers

    def _analyze_document_hash_pattern(self, headers: List[HeaderInfo]) -> Dict[int, int]:
        """Analyze the document's '#' usage to build an adaptive level-to-'#' mapping."""
        # Count how often each '#' count is used at each level
        level_hash_stats = {}
        for header in headers:
            if header.number_pattern:  # Only numbered headings are considered
                level = header.level
                hash_count = header.hash_count
                if level not in level_hash_stats:
                    level_hash_stats[level] = {}
                if hash_count not in level_hash_stats[level]:
                    level_hash_stats[level][hash_count] = 0
                level_hash_stats[level][hash_count] += 1

        # Pick the most commonly used '#' count for each level
        level_hash_mapping = {}
        for level, hash_stats in level_hash_stats.items():
            most_common_hash = max(hash_stats.keys(), key=lambda x: hash_stats[x])
            level_hash_mapping[level] = most_common_hash

        # Adjust the mapping so that deeper levels always use more '#'s
        level_hash_mapping = self._ensure_monotonic_mapping(level_hash_mapping)

        return level_hash_mapping

    def _ensure_monotonic_mapping(self, level_hash_mapping: Dict[int, int]) -> Dict[int, int]:
        """Ensure the level mapping is monotonically increasing (deeper level = more '#'s)."""
        if not level_hash_mapping:
            return level_hash_mapping

        # Sort by level
        sorted_levels = sorted(level_hash_mapping.keys())
        adjusted_mapping = {}

        # Each level must use at least one more '#' than the previous level
        for i, level in enumerate(sorted_levels):
            current_hash = level_hash_mapping[level]

            if i == 0:
                # First level: use as-is
                adjusted_mapping[level] = current_hash
            else:
                # At least one more '#' than the previous level
                prev_level = sorted_levels[i - 1]
                min_required_hash = adjusted_mapping[prev_level] + 1
                adjusted_mapping[level] = max(current_hash, min_required_hash)

        return adjusted_mapping

    def _extrapolate_hash_count(self, level: int, level_hash_mapping: Dict[int, int]) -> int:
        """Estimate the '#' count for a level that does not appear in the mapping."""
        if not level_hash_mapping:
            return level  # Fall back to a simple 1:1 mapping

        sorted_levels = sorted(level_hash_mapping.keys())

        if level < sorted_levels[0]:
            # Below the smallest known level: extrapolate downwards
            diff = sorted_levels[0] - level
            return max(1, level_hash_mapping[sorted_levels[0]] - diff)
        elif level > sorted_levels[-1]:
            # Above the largest known level: extrapolate upwards
            diff = level - sorted_levels[-1]
            return level_hash_mapping[sorted_levels[-1]] + diff
        else:
            # Between known levels: interpolate
            for i in range(len(sorted_levels) - 1):
                if sorted_levels[i] < level < sorted_levels[i + 1]:
                    lower_level = sorted_levels[i]
                    upper_level = sorted_levels[i + 1]
                    lower_hash = level_hash_mapping[lower_level]
                    upper_hash = level_hash_mapping[upper_level]
                    # Simple linear interpolation
                    ratio = (level - lower_level) / (upper_level - lower_level)
                    return int(lower_hash + ratio * (upper_hash - lower_hash))

        return level  # Fallback

    def _fix_same_level_inconsistency(self, headers: List[HeaderInfo]) -> None:
        """Fix inconsistent '#' counts within the same level."""
        # Group numbered headings by level
        level_groups = {}
        for header in headers:
            if header.number_pattern:  # Only process headings with a numbering pattern
                if header.level not in level_groups:
                    level_groups[header.level] = []
                level_groups[header.level].append(header)

        # Fix inconsistent '#' counts within each level
        for level, group_headers in level_groups.items():
            if len(group_headers) < 2:
                continue  # Only one heading, nothing to fix

            # Count how often each '#' count is used within this level
            hash_count_stats = {}
            for header in group_headers:
                hash_count = header.correct_hash_count
                if hash_count not in hash_count_stats:
                    hash_count_stats[hash_count] = []
                hash_count_stats[hash_count].append(header)

            # Different '#' counts exist at the same level
            if len(hash_count_stats) > 1:
                # Use the most common '#' count as the standard
                most_common_hash_count = max(hash_count_stats.keys(),
                                             key=lambda x: len(hash_count_stats[x]))

                # Normalize all headings at this level to the most common '#' count
                for header in group_headers:
                    header.correct_hash_count = most_common_hash_count
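

# Minimal usage sketch: the Markdown sample below is invented for
# illustration only; it contains a "1.1" child heading that wrongly reuses
# its parent's single '#'. fix_hierarchy() returns a dict with
# 'fixed_content', 'problems_found', and 'fixes_applied'.
if __name__ == "__main__":
    sample = "\n".join([
        "# 1 Introduction",
        "# 1.1 Background",  # child heading reuses the parent's '#' count
        "## 1.2 Scope",
        "# 2 Methods",
    ])

    fixer = HierarchyFixer()
    result = fixer.fix_hierarchy(sample)

    print(f"Fixes applied: {result['fixes_applied']}")
    for problem in result['problems_found']:
        print(f"- {problem['type']}: {problem['problem']}")
    print(result['fixed_content'])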