# code_scan/scanner/diff_parser.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Diff 解析器 - 将扫描问题与代码片段关联
"""
import re
import logging
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)


@dataclass
class CodeChunk:
    """Changed content of a single file, as extracted from a unified diff."""
    # Path of the file on the post-change (``b/``) side of the ``diff --git`` header.
    file_path: str
    # Removed + context lines of every hunk, concatenated, each terminated by "\n".
    old_content: str = ""
    # Added + context lines of every hunk, concatenated, each terminated by "\n".
    new_content: str = ""
    # Old-file line number from the FIRST hunk header (0 if no hunk was parsed).
    old_start: int = 0
    # New-file line number from the FIRST hunk header (0 if no hunk was parsed).
    new_start: int = 0
    # One dict per hunk, in diff order, with the keys
    # 'old_start', 'new_start', 'old_lines' and 'new_lines'.
    # ``new_lines[k]`` is line ``new_start + k`` of the new file.
    hunks: List[Dict] = field(default_factory=list)


class DiffParser:
    """Parse ``git diff`` text into per-file :class:`CodeChunk` objects.

    Only files introduced by a ``diff --git a/... b/...`` header are
    recognised. Parsed files are available through ``self.files``, keyed by
    the post-change path.
    """

    _FILE_HEADER = re.compile(r'diff --git a/(.+) b/(.+)')
    _HUNK_HEADER = re.compile(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@')

    def __init__(self, diff_text: str):
        """Store *diff_text* and parse it immediately."""
        self.diff_text = diff_text
        self.files: Dict[str, CodeChunk] = {}
        self._parse()

    def _parse(self) -> None:
        """Populate ``self.files`` from ``self.diff_text``.

        The line counts in each ``@@`` header decide where a hunk ends, so
        file headers (``---``/``+++``) and trailing text after the last hunk
        are never mistaken for content, while content lines that merely look
        like headers (an added ``++i`` shows up as ``+++i``) are kept.
        """
        if not self.diff_text:
            return

        chunk: Optional[CodeChunk] = None
        hunk: Optional[Dict[str, Any]] = None
        old_left = new_left = 0  # lines still expected on each side of the hunk

        for line in self.diff_text.split('\n'):
            # Headers are checked first even inside a hunk: content lines
            # always start with '+', '-' or ' ', so they can never match, and
            # a truncated hunk must not hide the next file.
            file_match = self._FILE_HEADER.match(line)
            if file_match:
                # Key by the new path: that is the file scanners report on.
                path = file_match.group(2)
                chunk = CodeChunk(file_path=path)
                self.files[path] = chunk
                hunk = None
                continue

            if chunk is None:
                continue

            hunk_match = self._HUNK_HEADER.match(line)
            if hunk_match:
                old_start = int(hunk_match.group(1))
                new_start = int(hunk_match.group(3))
                # An omitted count means exactly one line.
                old_left = 1 if hunk_match.group(2) is None else int(hunk_match.group(2))
                new_left = 1 if hunk_match.group(4) is None else int(hunk_match.group(4))
                hunk = {
                    'old_start': old_start,
                    'new_start': new_start,
                    'old_lines': [],
                    'new_lines': [],
                }
                if not chunk.hunks:
                    # The concatenated content begins at the first hunk.
                    chunk.old_start = old_start
                    chunk.new_start = new_start
                chunk.hunks.append(hunk)
                continue

            if hunk is None or (old_left <= 0 and new_left <= 0):
                # Extended headers, '---'/'+++' lines, patch signatures, ...
                continue

            marker, text = line[:1], line[1:]
            if marker == '+':
                hunk['new_lines'].append(text)
                new_left -= 1
            elif marker == '-':
                hunk['old_lines'].append(text)
                old_left -= 1
            elif marker == ' ' or (marker == '' and old_left > 0 and new_left > 0):
                # A completely empty line inside a hunk is a blank context
                # line whose leading space was stripped in transit.
                hunk['old_lines'].append(text)
                hunk['new_lines'].append(text)
                old_left -= 1
                new_left -= 1
            # Anything else (e.g. "\ No newline at end of file") is ignored.

        for parsed in self.files.values():
            parsed.old_content = ''.join(
                text + '\n' for part in parsed.hunks for text in part['old_lines']
            )
            parsed.new_content = ''.join(
                text + '\n' for part in parsed.hunks for text in part['new_lines']
            )

    @staticmethod
    def _numbered_new_lines(chunk: CodeChunk) -> Dict[int, str]:
        """Map new-file line numbers to code for every line present in the diff."""
        numbered: Dict[int, str] = {}
        for part in chunk.hunks:
            first = part.get('new_start', 0)
            for offset, code in enumerate(part.get('new_lines', ())):
                numbered[first + offset] = code
        return numbered

    def get_file_content(self, file_path: str) -> Optional[CodeChunk]:
        """Return the chunk stored under exactly *file_path*, or ``None``."""
        return self.files.get(file_path)

    def get_line_context(self, file_path: str, line_number: int, context_lines: int = 3) -> Optional[Dict[str, Any]]:
        """Return the diff lines surrounding a line of the new file.

        Args:
            file_path: Key of the file in ``self.files``.
            line_number: Line number in the post-change file, as numbered by
                the hunk headers.
            context_lines: How many lines before and after to include.

        Returns:
            ``{'file', 'line', 'context'}`` where ``context`` is a list of
            ``{'line_number', 'code', 'is_issue_line'}`` dicts in ascending
            line order, limited to lines that appear in the diff. ``None`` if
            the file is unknown or the line is not part of any hunk.
        """
        chunk = self.files.get(file_path)
        if not chunk:
            return None

        numbered = self._numbered_new_lines(chunk)
        if line_number not in numbered:
            return None

        context = [
            {
                'line_number': number,
                'code': numbered[number],
                'is_issue_line': number == line_number,
            }
            for number in range(line_number - context_lines, line_number + context_lines + 1)
            if number in numbered
        ]

        return {
            'file': file_path,
            'line': line_number,
            'context': context
        }


def merge_issues_with_code(scan_results: Dict[str, Any], diff: str) -> Dict[str, Any]:
    """Attach code snippets from *diff* to every issue in *scan_results*.

    Each dict-valued entry of *scan_results* (other than the reserved
    ``summary``, ``total_issues`` and ``ai`` keys) is treated as one scanner
    and copied into the ``scanners`` list of the result. When *diff* is
    empty, issues are passed through unchanged. An ``ai`` entry, if present,
    is converted separately and stored under the ``ai`` key.
    """
    reserved_keys = ('summary', 'total_issues', 'ai')

    parser = None
    if diff:
        parser = DiffParser(diff)

    scanners = []
    for scanner_name, scanner_data in scan_results.items():
        if scanner_name in reserved_keys or not isinstance(scanner_data, dict):
            continue

        raw_issues = scanner_data.get('issues', [])
        if parser:
            scanner_issues = [enrich_issue_with_code(item, parser) for item in raw_issues]
        else:
            scanner_issues = list(raw_issues)

        scanners.append({
            'name': scanner_name,
            'issues': scanner_issues,
            'file_count': scanner_data.get('file_count', 0),
            'total_issues': scanner_data.get('total_issues', 0),
        })

    enriched_results = {
        'scanners': scanners,
        'summary': scan_results.get('summary', {}),
        'total_issues': scan_results.get('total_issues', 0),
    }

    # Convert the AI review results into the common issue format.
    if 'ai' in scan_results:
        ai_section = scan_results['ai']
        enriched_results['ai'] = {
            'name': 'ai',
            'issues': convert_ai_reviews_to_issues(ai_section, parser),
            'summary': ai_section.get('summary', ''),
            'files_reviewed': ai_section.get('files_reviewed', 0),
        }

    return enriched_results


_PREVIEW_LINE_LIMIT = 10


def _normalize_path(path: str) -> str:
    """Return *path* with forward slashes and without leading ``./`` segments."""
    normalized = path.replace('\\', '/')
    while normalized.startswith('./'):
        normalized = normalized[2:]
    return normalized


def _match_diff_path(file_path: str, parser: 'DiffParser') -> Optional[str]:
    """Find the key of ``parser.files`` that refers to the same file as *file_path*.

    Two paths match when they are equal, or when one is a suffix of the other
    starting at a directory boundary (``/repo/src/a.py`` vs ``src/a.py``).
    A plain substring or character-level suffix is NOT a match, so ``a.py``
    never resolves to ``src/data.py``. Among several suffix matches the one
    sharing the longest tail wins; ties keep the first in diff order.
    """
    wanted = _normalize_path(file_path)
    best_path = None
    best_overlap = -1
    for path in parser.files:
        candidate = _normalize_path(path)
        if not candidate:
            continue
        if candidate == wanted:
            return path
        if wanted.endswith('/' + candidate) or candidate.endswith('/' + wanted):
            overlap = min(len(candidate), len(wanted))
            if overlap > best_overlap:
                best_path, best_overlap = path, overlap
    return best_path


def _to_line_number(value: Any) -> int:
    """Coerce a reported line (int or numeric string) to a positive int, else 0."""
    try:
        number = int(value)
    except (TypeError, ValueError):
        return 0
    return number if number > 0 else 0


def _build_preview(chunk: Any, matched_path: str, line: Any) -> Optional[Dict[str, Any]]:
    """Build the fallback ``code_context``: the first lines of the new content.

    Returns ``None`` when *chunk* is missing or has no new content.
    """
    if not chunk or not chunk.new_content:
        return None
    lines = chunk.new_content.split('\n')
    if lines and lines[-1] == '':
        # The terminating newline is not an extra line; without this,
        # content of exactly the limit would report has_more=True.
        lines.pop()
    return {
        'file': matched_path,
        'line': line,
        'preview': '\n'.join(lines[:_PREVIEW_LINE_LIMIT]),
        'has_more': len(lines) > _PREVIEW_LINE_LIMIT
    }


def convert_ai_reviews_to_issues(ai_result: Dict[str, Any], parser: Optional['DiffParser'] = None) -> List[Dict[str, Any]]:
    """Convert AI review output (``{'issues': [...]}``) to the common issue format.

    Args:
        ai_result: AI review result; its ``issues`` list is read. ``None`` or
            a missing/empty list yields an empty result.
        parser: Optional parsed diff used to attach a code preview.

    Returns:
        One dict per AI issue that names a file, with the keys ``file``,
        ``line``, ``severity`` (lower-cased, default ``warning``),
        ``message``, ``category`` (always ``ai``), ``code_context`` (preview
        dict or ``None``) and ``defect_reason``. Entries without a file, or
        that are not dicts, are skipped.
    """
    issues = []

    for issue in (ai_result or {}).get('issues') or []:
        if not isinstance(issue, dict):
            continue
        file_path = issue.get('file', '')
        if not file_path:
            continue

        line = issue.get('line', 1)
        code_context = None
        if parser:
            matched_path = _match_diff_path(file_path, parser)
            if matched_path:
                code_context = _build_preview(
                    parser.get_file_content(matched_path), matched_path, line
                )

        sev = issue.get('severity', 'warning')
        sev = sev.lower() if isinstance(sev, str) else 'warning'
        issues.append({
            'file': file_path,
            'line': line,
            'severity': sev,
            'message': issue.get('message', ''),
            'category': 'ai',
            'code_context': code_context,
            'defect_reason': issue.get('defect_reason', '')
        })

    return issues


def enrich_issue_with_code(issue: Dict[str, Any], parser: 'DiffParser') -> Dict[str, Any]:
    """Return a copy of *issue* with a ``code_context`` taken from the diff.

    The line is read from ``issue['line']`` or, failing that, from a
    ``line N`` mention in the description/message. If the issue's file maps
    to a file in the diff, ``file`` is rewritten to the diff path and
    ``code_context`` is set to the surrounding lines from
    ``parser.get_line_context``. When no line context is available and the
    issue has no ``code_context`` yet, a preview of the file's first changed
    lines is used instead. Issues whose file is not part of the diff are
    returned unchanged (as a copy).
    """
    enriched = issue.copy()

    file_path = issue.get('file', '')
    if not file_path:
        return enriched

    line_number = _to_line_number(issue.get('line', 0))
    if not line_number:
        desc = issue.get('description') or issue.get('message') or ''
        line_match = re.search(r'line[:#]?\s*(\d+)', desc, re.IGNORECASE) if isinstance(desc, str) else None
        if line_match:
            line_number = int(line_match.group(1))

    matched_path = _match_diff_path(file_path, parser)
    if not matched_path:
        return enriched

    enriched['file'] = matched_path
    if line_number:
        context = parser.get_line_context(matched_path, line_number)
        if context:
            enriched['code_context'] = context

    if 'code_context' not in enriched:
        preview = _build_preview(
            parser.get_file_content(matched_path), matched_path, line_number or 1
        )
        if preview:
            enriched['code_context'] = preview

    return enriched