Files
code_scan/scanner/diff_parser.py
Dang Zerong cb90b66f09 代码测试
2026-03-13 11:26:01 +08:00

221 lines
7.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Diff 解析器 - 将扫描问题与代码片段关联
"""
import re
import logging
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
@dataclass
class CodeChunk:
    """Diff content collected for a single file.

    Only ``file_path`` is required; every other field starts out empty or
    zero and is meant to be filled in while a diff is being parsed.
    """

    # Path of the file this chunk belongs to.
    file_path: str
    # Text of the file on the "old" side of the diff.
    old_content: str = ""
    # Text of the file on the "new" side of the diff.
    new_content: str = ""
    # Starting line number on the old side, as read from a hunk header.
    old_start: int = 0
    # Starting line number on the new side, as read from a hunk header.
    new_start: int = 0
    # Per-hunk dictionaries; each instance gets its own fresh list.
    hunks: List[Dict] = field(default_factory=list)
class DiffParser:
    """Parse a git unified diff into one ``CodeChunk`` per file.

    A file section starts at a ``diff --git a/<path> b/<path>`` line and is
    stored in ``files`` under the ``a/`` path. Text before the first such
    line is ignored. Inside hunks, added and context lines are appended to
    ``new_content`` and removed and context lines to ``old_content`` (the
    one-character prefix is dropped and each line is newline-terminated).

    ``old_start`` / ``new_start`` on a chunk are overwritten by every hunk
    header, so they hold the start of the last hunk seen; the individual
    hunk ranges are recorded in ``CodeChunk.hunks``.

    NOTE(review): for renamed files the ``a/`` (old) path is used as the key;
    confirm that this is what callers expect.
    """

    _FILE_HEADER_RE = re.compile(r'diff --git a/(.+) b/(.+)')
    _HUNK_HEADER_RE = re.compile(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@')

    def __init__(self, diff_text: str):
        """Store *diff_text* and parse it immediately."""
        self.diff_text = diff_text
        self.files: Dict[str, CodeChunk] = {}
        # file path -> [(line number in the new file, line text), ...]
        self._new_lines: Dict[str, List[tuple]] = {}
        self._parse()

    def _parse(self):
        """Populate ``files`` from ``diff_text``."""
        if not self.diff_text:
            return

        old_parts: Dict[str, List[str]] = {}
        new_parts: Dict[str, List[str]] = {}
        current_chunk = None
        old_buf: List[str] = []
        new_buf: List[str] = []
        numbered: List[tuple] = []
        in_hunk = False
        new_lineno = 0

        for line in self.diff_text.split('\n'):
            file_match = self._FILE_HEADER_RE.match(line)
            if file_match:
                file_path = file_match.group(1)
                current_chunk = CodeChunk(file_path=file_path)
                self.files[file_path] = current_chunk
                old_buf = old_parts[file_path] = []
                new_buf = new_parts[file_path] = []
                numbered = self._new_lines[file_path] = []
                in_hunk = False
                continue

            if current_chunk is None:
                continue

            hunk_match = self._HUNK_HEADER_RE.match(line)
            if hunk_match:
                old_start, old_count, new_start, new_count = hunk_match.groups()
                current_chunk.old_start = int(old_start)
                current_chunk.new_start = int(new_start)
                # An omitted count in a hunk header means a single line.
                current_chunk.hunks.append({
                    'old_start': int(old_start),
                    'old_count': 1 if old_count is None else int(old_count),
                    'new_start': int(new_start),
                    'new_count': 1 if new_count is None else int(new_count),
                })
                new_lineno = int(new_start)
                in_hunk = True
                continue

            # The "---"/"+++" file headers only occur before the first hunk.
            # Once inside a hunk every "+"/"-" line is content, even when the
            # content itself starts with "++" or "--".
            if not in_hunk or not line:
                continue

            marker, text = line[0], line[1:]
            if marker == '+':
                new_buf.append(text + '\n')
                numbered.append((new_lineno, text))
                new_lineno += 1
            elif marker == '-':
                old_buf.append(text + '\n')
            elif marker == ' ':
                old_buf.append(text + '\n')
                new_buf.append(text + '\n')
                numbered.append((new_lineno, text))
                new_lineno += 1

        for path, chunk in self.files.items():
            chunk.old_content = ''.join(old_parts[path])
            chunk.new_content = ''.join(new_parts[path])

    def get_file_content(self, file_path: str) -> Optional[CodeChunk]:
        """Return the chunk parsed for *file_path*, or ``None`` if unknown."""
        return self.files.get(file_path)

    def get_line_context(self, file_path: str, line_number: int, context_lines: int = 3) -> Optional[Dict[str, Any]]:
        """Return the diff lines surrounding *line_number* of the new file.

        Args:
            file_path: Key of the file in ``files``.
            line_number: Line number in the new version of the file, as
                numbered by the hunk headers.
            context_lines: How many lines to include before and after.

        Returns:
            ``{'file', 'line', 'context'}`` where ``context`` is a list of
            ``{'line_number', 'code', 'is_issue_line'}`` entries, or ``None``
            when the file is unknown or the line is not part of the diff.
        """
        chunk = self.files.get(file_path)
        if not chunk:
            return None

        numbered = self._new_lines.get(file_path, [])
        issue_index = next(
            (i for i, (number, _) in enumerate(numbered) if number == line_number),
            None,
        )
        if issue_index is None:
            return None

        span = max(0, context_lines)
        first = max(0, issue_index - span)
        last = min(len(numbered), issue_index + span + 1)

        context = []
        for i in range(first, last):
            number, code = numbered[i]
            # Skip lines from a neighbouring hunk: their numbers are not
            # contiguous with the requested line.
            if number - line_number != i - issue_index:
                continue
            context.append({
                'line_number': number,
                'code': code,
                'is_issue_line': i == issue_index
            })

        return {
            'file': file_path,
            'line': line_number,
            'context': context
        }
def merge_issues_with_code(scan_results: Dict[str, Any], diff: str) -> Dict[str, Any]:
    """Attach code snippets from *diff* to every issue in *scan_results*.

    Each dict-valued entry of *scan_results* other than ``summary``,
    ``total_issues`` and ``ai`` is treated as one scanner's output and is
    copied into the ``scanners`` list of the result. When *diff* is empty
    the issues are passed through unchanged. An ``ai`` entry, if present,
    is converted to the common issue format and returned under ``ai``.
    """
    parser = None
    if diff:
        parser = DiffParser(diff)

    merged = {
        'scanners': [],
        'summary': scan_results.get('summary', {}),
        'total_issues': scan_results.get('total_issues', 0)
    }

    reserved_keys = ('summary', 'total_issues', 'ai')
    for name, data in scan_results.items():
        if name in reserved_keys or not isinstance(data, dict):
            continue

        scanner_entry = {
            'name': name,
            'issues': [],
            'file_count': data.get('file_count', 0),
            'total_issues': data.get('total_issues', 0)
        }
        scanner_entry['issues'].extend(
            enrich_issue_with_code(item, parser) if parser else item
            for item in data.get('issues', [])
        )
        merged['scanners'].append(scanner_entry)

    # Convert AI review results into the common issue format.
    if 'ai' in scan_results:
        ai_result = scan_results['ai']
        merged['ai'] = {
            'name': 'ai',
            'issues': convert_ai_reviews_to_issues(ai_result, parser),
            'summary': ai_result.get('summary', ''),
            'files_reviewed': ai_result.get('files_reviewed', 0)
        }

    return merged
def convert_ai_reviews_to_issues(ai_result: Dict[str, Any], parser: Optional["DiffParser"] = None) -> List[Dict[str, Any]]:
    """Convert AI review output (``issues`` format) to the common issue format.

    Args:
        ai_result: AI review result; its ``issues`` entry is a list of dicts
            with ``file`` and optionally ``line``, ``severity``, ``message``
            and ``defect_reason``. A missing or ``None`` list yields ``[]``.
        parser: Optional diff parser used to attach a preview of the file's
            new content to each issue.

    Returns:
        One dict per issue that names a file, with keys ``file``, ``line``,
        ``severity`` (lower-cased; ``'warning'`` when not a string),
        ``message``, ``category`` (always ``'ai'``), ``code_context`` (a
        preview dict or ``None``) and ``defect_reason``.
    """
    preview_limit = 10
    issues = []

    for issue in ai_result.get('issues') or []:
        file_path = issue.get('file', '')
        if not file_path:
            continue

        code_context = None
        if parser:
            # Loose match: the reported path may be longer or shorter than
            # the path recorded in the diff.
            matched_path = next(
                (path for path in parser.files
                 if file_path.endswith(path) or file_path in path),
                None,
            )
            chunk = parser.get_file_content(matched_path) if matched_path else None
            if chunk and chunk.new_content:
                all_lines = chunk.new_content.split('\n')
                # A trailing newline produces an empty last item that is not
                # a real line; leave it out of the count.
                line_count = len(all_lines) - (1 if all_lines[-1] == '' else 0)
                code_context = {
                    'file': matched_path,
                    'line': issue.get('line', 1),
                    'preview': '\n'.join(all_lines[:preview_limit]),
                    'has_more': line_count > preview_limit
                }

        severity = issue.get('severity', 'warning')
        severity = severity.lower() if isinstance(severity, str) else 'warning'

        issues.append({
            'file': file_path,
            'line': issue.get('line', 1),
            'severity': severity,
            'message': issue.get('message', ''),
            'category': 'ai',
            'code_context': code_context,
            'defect_reason': issue.get('defect_reason', '')
        })

    return issues
def enrich_issue_with_code(issue: Dict[str, Any], parser: "DiffParser") -> Dict[str, Any]:
    """Return a copy of *issue* with a code snippet attached.

    Args:
        issue: Scanner issue; ``file`` names the file and ``line`` the line.
            When ``line`` is missing or zero, a number following the word
            "line" in ``description`` / ``message`` is used instead.
        parser: Diff parser whose files are searched for the issue's file.

    Returns:
        A shallow copy of *issue*. When the file is found in the diff,
        ``file`` is replaced by the diff's path and ``code_context`` is set
        to the line context from the parser, or, failing that, to a preview
        of the first lines of the file's new content.
    """
    preview_limit = 10
    enriched = issue.copy()
    file_path = issue.get('file', '')
    line_number = issue.get('line', 0)

    if not file_path:
        return enriched

    if not line_number:
        desc = issue.get('description', '') or issue.get('message', '') or ''
        line_match = re.search(r'line[:#]?\s*(\d+)', desc, re.IGNORECASE)
        if line_match:
            line_number = int(line_match.group(1))

    # Loose match: the reported path may be longer or shorter than the path
    # recorded in the diff.
    matched_path = next(
        (path for path in parser.files
         if file_path.endswith(path) or file_path in path),
        None,
    )
    if not matched_path:
        return enriched

    enriched['file'] = matched_path

    if line_number:
        context = parser.get_line_context(matched_path, line_number)
        if context:
            enriched['code_context'] = context

    if 'code_context' not in enriched:
        chunk = parser.get_file_content(matched_path)
        if chunk and chunk.new_content:
            all_lines = chunk.new_content.split('\n')
            # A trailing newline produces an empty last item that is not a
            # real line; leave it out of the count.
            line_count = len(all_lines) - (1 if all_lines[-1] == '' else 0)
            enriched['code_context'] = {
                'file': matched_path,
                'line': line_number or 1,
                'preview': '\n'.join(all_lines[:preview_limit]),
                'has_more': line_count > preview_limit
            }

    return enriched