Files
code_scan/scanner/diff_parser.py

221 lines
7.3 KiB
Python
Raw Permalink Normal View History

2026-03-11 21:16:47 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Diff 解析器 - 将扫描问题与代码片段关联
"""
import re
import logging
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
@dataclass
class CodeChunk:
    """Changed regions of a single file, as extracted from a unified diff.

    ``old_content`` and ``new_content`` hold the pre-change and post-change
    text of every hunk of the file, concatenated in diff order with one
    newline-terminated entry per diff line (context lines appear in both).
    """
    file_path: str
    old_content: str = ""
    new_content: str = ""
    # Start lines (1-based, taken from the ``@@`` header) of the FIRST hunk,
    # i.e. the file line number of the first entry in old/new_content.
    old_start: int = 0
    new_start: int = 0
    # One dict per hunk, in diff order:
    #   {'old_start': int, 'new_start': int,
    #    'old_lines': List[str], 'new_lines': List[str]}
    # Entry k of 'new_lines' is line ``new_start + k`` of the new file.
    hunks: List[Dict] = field(default_factory=list)


class DiffParser:
    """Parse git-style unified diff text into one ``CodeChunk`` per file.

    Parsing happens eagerly in the constructor; results are available in
    ``self.files``, keyed by the path that follows ``a/`` in the
    ``diff --git`` header.
    NOTE(review): for renamed files that key is the OLD path - confirm this
    is what callers expect.
    """

    _FILE_HEADER = re.compile(r'diff --git a/(.+) b/(.+)')
    _HUNK_HEADER = re.compile(r'@@ -(\d+),?\d* \+(\d+),?\d* @@')

    def __init__(self, diff_text: str):
        self.diff_text = diff_text
        self.files: Dict[str, CodeChunk] = {}
        self._parse()

    def _parse(self):
        """Populate ``self.files`` from ``self.diff_text``.

        Lines are only treated as content once a ``@@`` hunk header has been
        seen for the current file, so the ``---``/``+++`` file headers are
        skipped without also discarding changed lines whose own text begins
        with ``--`` or ``++``.
        """
        if not self.diff_text:
            return

        current_chunk = None
        current_hunk = None
        for line in self.diff_text.split('\n'):
            file_match = self._FILE_HEADER.match(line)
            if file_match:
                file_path = file_match.group(1)
                current_chunk = CodeChunk(file_path=file_path)
                self.files[file_path] = current_chunk
                current_hunk = None  # headers follow; no content until '@@'
                continue

            if current_chunk is None:
                continue

            hunk_match = self._HUNK_HEADER.match(line)
            if hunk_match:
                old_start = int(hunk_match.group(1))
                new_start = int(hunk_match.group(2))
                if not current_chunk.hunks:
                    # Keep the chunk-level start aligned with the first line
                    # of the concatenated content instead of the last hunk.
                    current_chunk.old_start = old_start
                    current_chunk.new_start = new_start
                current_hunk = {
                    'old_start': old_start,
                    'new_start': new_start,
                    'old_lines': [],
                    'new_lines': [],
                }
                current_chunk.hunks.append(current_hunk)
                continue

            if current_hunk is None or not line:
                continue

            marker, text = line[0], line[1:]
            if marker == '+':
                current_hunk['new_lines'].append(text)
            elif marker == '-':
                current_hunk['old_lines'].append(text)
            elif marker == ' ':
                current_hunk['old_lines'].append(text)
                current_hunk['new_lines'].append(text)
            # Anything else (e.g. '\ No newline at end of file') is ignored.

        # Build the flat content strings once instead of '+=' per line.
        for chunk in self.files.values():
            chunk.old_content = ''.join(
                text + '\n' for hunk in chunk.hunks for text in hunk['old_lines']
            )
            chunk.new_content = ''.join(
                text + '\n' for hunk in chunk.hunks for text in hunk['new_lines']
            )

    def get_file_content(self, file_path: str) -> Optional[CodeChunk]:
        """Return the chunk stored under exactly ``file_path``, or ``None``."""
        return self.files.get(file_path)

    @staticmethod
    def _new_line_segments(chunk: CodeChunk) -> List[Any]:
        """Return ``(first_line_number, lines)`` pairs for the new-file text."""
        if chunk.hunks:
            return [(hunk['new_start'], hunk['new_lines']) for hunk in chunk.hunks]
        # Chunk without hunk data (e.g. built by hand): treat its content as
        # one contiguous run starting at new_start.
        lines = chunk.new_content.split('\n')
        if lines[-1] == '':
            lines.pop()  # artefact of the trailing newline, not a real line
        return [(chunk.new_start or 1, lines)]

    def get_line_context(self, file_path: str, line_number: int, context_lines: int = 3) -> Optional[Dict[str, Any]]:
        """Return the diff lines surrounding ``line_number`` of the new file.

        Args:
            file_path: Key into ``self.files`` (exact match).
            line_number: 1-based line number in the post-change file.
            context_lines: How many lines to include on each side.

        Returns:
            ``{'file', 'line', 'context'}`` where ``context`` is a list of
            ``{'line_number', 'code', 'is_issue_line'}`` dicts taken from the
            hunk containing the line, or ``None`` when the file is unknown,
            the line number is not numeric, or the line is not covered by
            any hunk of the diff.
        """
        chunk = self.files.get(file_path)
        if not chunk:
            return None
        try:
            target = int(line_number)
        except (TypeError, ValueError):
            return None

        for first_line, lines in self._new_line_segments(chunk):
            offset = target - first_line
            if not 0 <= offset < len(lines):
                continue
            low = max(0, offset - context_lines)
            high = min(len(lines), offset + context_lines + 1)
            context = [
                {
                    'line_number': first_line + index,
                    'code': lines[index],
                    'is_issue_line': index == offset,
                }
                for index in range(low, high)
            ]
            return {
                'file': file_path,
                'line': line_number,
                'context': context,
            }
        return None
def merge_issues_with_code(scan_results: Dict[str, Any], diff: str) -> Dict[str, Any]:
    """Attach diff code snippets to every scanner issue.

    Args:
        scan_results: Mapping of scanner name to that scanner's result dict
            (``issues``, ``file_count``, ``total_issues``). The keys
            ``summary`` and ``total_issues`` are copied through, and ``ai``
            is converted separately into the common issue format.
        diff: Unified diff text; when empty, issues are passed through
            without code context.

    Returns:
        ``{'scanners': [...], 'summary': ..., 'total_issues': ...}`` plus an
        ``'ai'`` entry when ``scan_results`` contains one. Entries whose
        value is not a dict are skipped.
    """
    parser = DiffParser(diff) if diff else None

    enriched_results = {
        'scanners': [],
        'summary': scan_results.get('summary', {}),
        'total_issues': scan_results.get('total_issues', 0),
    }

    reserved_keys = ('summary', 'total_issues', 'ai')
    for scanner_name, scanner_data in scan_results.items():
        if scanner_name in reserved_keys or not isinstance(scanner_data, dict):
            continue

        # A scanner may report ``issues: None``; treat that as "no issues".
        raw_issues = scanner_data.get('issues') or []
        if parser is not None:
            issues = [enrich_issue_with_code(issue, parser) for issue in raw_issues]
        else:
            issues = list(raw_issues)

        enriched_results['scanners'].append({
            'name': scanner_name,
            'issues': issues,
            'file_count': scanner_data.get('file_count', 0),
            'total_issues': scanner_data.get('total_issues', 0),
        })

    # Convert the AI review result into the common issue format.
    if 'ai' in scan_results:
        ai_result = scan_results['ai']
        if not isinstance(ai_result, dict):
            # e.g. ``None`` when the AI review produced nothing usable.
            ai_result = {}
        enriched_results['ai'] = {
            'name': 'ai',
            'issues': convert_ai_reviews_to_issues(ai_result, parser),
            'summary': ai_result.get('summary', ''),
            'files_reviewed': ai_result.get('files_reviewed', 0),
        }

    return enriched_results
2026-03-12 14:42:23 +08:00
def _find_ai_diff_path(file_path: str, diff_paths) -> Optional[str]:
    """Pick the diff path that best corresponds to ``file_path``.

    Preference order: exact match, then a suffix match on a ``/`` boundary,
    then the loose suffix/substring rule as a last resort.
    """
    candidates = list(diff_paths)
    if file_path in candidates:
        return file_path
    for path in candidates:
        if file_path.endswith('/' + path) or path.endswith('/' + file_path):
            return path
    for path in candidates:
        if file_path.endswith(path) or file_path in path:
            return path
    return None


def convert_ai_reviews_to_issues(ai_result: Dict[str, Any], parser: Optional[DiffParser] = None) -> List[Dict[str, Any]]:
    """Convert an AI review result (``issues`` format) into unified issues.

    Args:
        ai_result: Dict whose ``issues`` entry is a list of dicts with
            ``file``, ``line``, ``severity``, ``message`` and
            ``defect_reason`` keys. A missing or ``None`` list is treated as
            empty; entries that are not dicts or have no ``file`` are skipped.
        parser: Optional parsed diff used to attach a preview (at most the
            first 10 new-content lines) of the matching file.

    Returns:
        A list of issue dicts with ``category`` set to ``'ai'``;
        ``code_context`` is ``None`` when no diff content matches.
    """
    issues = []
    for issue in (ai_result or {}).get('issues') or []:
        if not isinstance(issue, dict):
            continue
        file_path = issue.get('file', '')
        if not file_path:
            continue

        code_context = None
        if parser is not None:
            matched_path = _find_ai_diff_path(file_path, parser.files)
            chunk = parser.get_file_content(matched_path) if matched_path else None
            if chunk and chunk.new_content:
                content_lines = chunk.new_content.split('\n')
                if content_lines[-1] == '':
                    # The trailing newline yields an empty last element; it
                    # is not a line and must not count towards ``has_more``.
                    content_lines.pop()
                code_context = {
                    'file': matched_path,
                    'line': issue.get('line', 1),
                    'preview': '\n'.join(content_lines[:10]),
                    'has_more': len(content_lines) > 10,
                }

        sev = issue.get('severity', 'warning')
        sev = sev.lower() if isinstance(sev, str) else 'warning'
        issues.append({
            'file': file_path,
            'line': issue.get('line', 1),
            'severity': sev,
            'message': issue.get('message', ''),
            'category': 'ai',
            'code_context': code_context,
            'defect_reason': issue.get('defect_reason', ''),
        })
    return issues
2026-03-11 21:16:47 +08:00
def _find_issue_diff_path(file_path: str, diff_paths) -> Optional[str]:
    """Pick the diff path that best corresponds to ``file_path``.

    Preference order: exact match, then a suffix match on a ``/`` boundary,
    then the loose suffix/substring rule as a last resort.
    """
    candidates = list(diff_paths)
    if file_path in candidates:
        return file_path
    for path in candidates:
        if file_path.endswith('/' + path) or path.endswith('/' + file_path):
            return path
    for path in candidates:
        if file_path.endswith(path) or file_path in path:
            return path
    return None


def enrich_issue_with_code(issue: Dict[str, Any], parser: DiffParser) -> Dict[str, Any]:
    """Return a copy of ``issue`` with a code snippet from the diff attached.

    Args:
        issue: Scanner issue; ``file`` selects the diff entry and ``line``
            (or a ``line N`` mention in ``description``/``message``) selects
            the position.
        parser: Parsed diff to look the file up in.

    Returns:
        A shallow copy of ``issue``. When a diff path matches, ``file`` is
        replaced by that path and ``code_context`` is set to the line context
        reported by the parser, or - if none is available and the issue has
        no ``code_context`` yet - to a preview of at most the first 10
        new-content lines. The input dict is never modified.
    """
    enriched = issue.copy()
    file_path = issue.get('file', '')
    if not file_path:
        return enriched

    # Scanners may emit the line as a string or omit it; normalise to int.
    try:
        line_number = int(issue.get('line') or 0)
    except (TypeError, ValueError):
        line_number = 0

    if not line_number:
        desc = issue.get('description', '') or issue.get('message', '') or ''
        line_match = re.search(r'line[:#]?\s*(\d+)', str(desc), re.IGNORECASE)
        if line_match:
            line_number = int(line_match.group(1))

    matched_path = _find_issue_diff_path(file_path, parser.files)
    if not matched_path:
        return enriched
    enriched['file'] = matched_path

    if line_number:
        context = parser.get_line_context(matched_path, line_number)
        if context:
            enriched['code_context'] = context

    if 'code_context' not in enriched:
        chunk = parser.get_file_content(matched_path)
        if chunk and chunk.new_content:
            content_lines = chunk.new_content.split('\n')
            if content_lines[-1] == '':
                # The trailing newline yields an empty last element; it is
                # not a line and must not count towards ``has_more``.
                content_lines.pop()
            enriched['code_context'] = {
                'file': matched_path,
                'line': line_number or 1,
                'preview': '\n'.join(content_lines[:10]),
                'has_more': len(content_lines) > 10,
            }
    return enriched