# code_scan/scanner/diff_parser.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Diff 解析器 - 将扫描问题与代码片段关联
"""
import re
import logging
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)


@dataclass
class CodeChunk:
    """Changed content of a single file, collected from a unified diff."""
    # Path of the file as written on the ``b/`` side of the ``diff --git`` header.
    file_path: str
    # Removed + context lines of every hunk, in diff order, one newline per line.
    old_content: str = ""
    # Added + context lines of every hunk, in diff order, one newline per line.
    new_content: str = ""
    # Old-file / new-file line number at which the FIRST hunk starts
    # (both stay 0 when the file has no hunk).
    old_start: int = 0
    new_start: int = 0
    # One dict per hunk with the keys ``old_start``, ``new_start`` (ints) and
    # ``old_lines``, ``new_lines`` (lists of line texts without the diff marker).
    hunks: List[Dict] = field(default_factory=list)


class DiffParser:
    """Parse ``git diff`` output into one :class:`CodeChunk` per changed file.

    The parsed files are available in ``self.files``, keyed by the post-change
    (``b/``) path so that renamed files are found under the name a scanner
    running on the new tree reports.
    """

    # Compiled once; both patterns are applied to every line of the diff.
    _FILE_HEADER = re.compile(r'diff --git a/(.+) b/(.+)')
    _HUNK_HEADER = re.compile(r'@@ -(\d+),?\d* \+(\d+),?\d* @@')

    def __init__(self, diff_text: str):
        """Store ``diff_text`` and parse it immediately."""
        self.diff_text = diff_text
        self.files: Dict[str, CodeChunk] = {}
        self._parse()

    def _parse(self):
        """Fill ``self.files`` from ``self.diff_text``.

        Lines are only treated as content once a ``@@`` hunk header has been
        seen for the current file. That keeps the ``---``/``+++`` file headers
        out of the content without dropping real added/removed lines whose
        text happens to start with ``++`` or ``--``.
        """
        if not self.diff_text:
            return

        current_chunk = None
        current_hunk = None
        for line in self.diff_text.split('\n'):
            file_match = self._FILE_HEADER.match(line)
            if file_match:
                file_path = file_match.group(2)
                current_chunk = CodeChunk(file_path=file_path)
                self.files[file_path] = current_chunk
                current_hunk = None  # headers of the new file follow
                continue

            hunk_match = self._HUNK_HEADER.match(line)
            if hunk_match and current_chunk is not None:
                current_hunk = {
                    'old_start': int(hunk_match.group(1)),
                    'new_start': int(hunk_match.group(2)),
                    'old_lines': [],
                    'new_lines': [],
                }
                if not current_chunk.hunks:
                    # The concatenated content begins with the first hunk, so
                    # the chunk-level start positions describe that hunk.
                    current_chunk.old_start = current_hunk['old_start']
                    current_chunk.new_start = current_hunk['new_start']
                current_chunk.hunks.append(current_hunk)
                continue

            if current_hunk is None or not line:
                continue

            marker, text = line[0], line[1:]
            if marker == '+':
                current_hunk['new_lines'].append(text)
            elif marker == '-':
                current_hunk['old_lines'].append(text)
            elif marker == ' ':
                current_hunk['old_lines'].append(text)
                current_hunk['new_lines'].append(text)
            # Anything else (e.g. "\ No newline at end of file") is ignored.

        for chunk in self.files.values():
            chunk.old_content = self._join_side(chunk, 'old_lines')
            chunk.new_content = self._join_side(chunk, 'new_lines')

    @staticmethod
    def _join_side(chunk: CodeChunk, key: str) -> str:
        """Concatenate one side of all hunks, terminating every line with a newline."""
        return ''.join(text + '\n' for hunk in chunk.hunks for text in hunk[key])

    def get_file_content(self, file_path: str) -> Optional[CodeChunk]:
        """Return the chunk parsed for ``file_path``, or ``None`` if it is not in the diff."""
        return self.files.get(file_path)

    def get_line_context(self, file_path: str, line_number: int, context_lines: int = 3) -> Optional[Dict[str, Any]]:
        """Return the diff lines surrounding a line of the post-change file.

        Args:
            file_path: Key of the file in ``self.files``.
            line_number: 1-based line number in the NEW version of the file.
            context_lines: How many lines before and after to include.

        Returns:
            ``{'file', 'line', 'context'}`` where ``context`` is a list of
            ``{'line_number', 'code', 'is_issue_line'}`` dicts carrying real
            new-file line numbers, or ``None`` when the file is unknown or the
            requested line is not covered by any hunk of the diff.
        """
        chunk = self.files.get(file_path)
        if chunk is None:
            return None

        # Map absolute new-file line numbers to their text across all hunks.
        numbered = {
            hunk['new_start'] + offset: text
            for hunk in chunk.hunks
            for offset, text in enumerate(hunk['new_lines'])
        }
        if line_number not in numbered:
            return None

        context = [
            {
                'line_number': number,
                'code': numbered[number],
                'is_issue_line': number == line_number,
            }
            for number in range(line_number - context_lines, line_number + context_lines + 1)
            if number in numbered
        ]

        return {
            'file': file_path,
            'line': line_number,
            'context': context
        }


def merge_issues_with_code(scan_results: Dict[str, Any], diff: str) -> Dict[str, Any]:
    """Attach code snippets from ``diff`` to every issue in ``scan_results``.

    When ``diff`` is empty the input is returned unchanged. Otherwise a new
    dict is built with a ``scanners`` list (one entry per dict-valued scanner
    result, each issue passed through ``enrich_issue_with_code``), the original
    ``summary`` and ``total_issues``, and the ``ai`` entry if one is present.
    """
    if not diff:
        return scan_results

    parser = DiffParser(diff)
    # Top-level keys that are carried over instead of being treated as scanners.
    reserved_keys = ('summary', 'total_issues', 'ai')

    scanners = []
    for scanner_name, scanner_data in scan_results.items():
        if scanner_name in reserved_keys or not isinstance(scanner_data, dict):
            continue
        scanners.append({
            'name': scanner_name,
            'issues': [
                enrich_issue_with_code(issue, parser)
                for issue in scanner_data.get('issues', [])
            ],
            'file_count': scanner_data.get('file_count', 0),
            'total_issues': scanner_data.get('total_issues', 0),
        })

    merged = {
        'scanners': scanners,
        'summary': scan_results.get('summary', {}),
        'total_issues': scan_results.get('total_issues', 0),
    }
    if 'ai' in scan_results:
        merged['ai'] = scan_results['ai']
    return merged


def _coerce_line_number(value: Any) -> int:
    """Return ``value`` as a positive int, or 0 when it is missing or unusable."""
    try:
        number = int(value)
    except (TypeError, ValueError):
        return 0
    return number if number > 0 else 0


def _normalize_path(path: str) -> str:
    """Use forward slashes and drop leading ``./`` so paths compare consistently."""
    path = path.replace('\\', '/')
    while path.startswith('./'):
        path = path[2:]
    return path


def _match_diff_path(file_path: str, candidates: List[str]) -> Optional[str]:
    """Pick the diff path that best corresponds to a scanner-reported path.

    Tried in order: exact match, suffix match on a ``/`` boundary (either
    direction, so absolute and repo-relative paths line up), and finally the
    loose suffix/substring test, so anything that matched before still matches.
    """
    wanted = _normalize_path(file_path)
    if wanted:
        for path in candidates:
            if path == wanted:
                return path
        for path in candidates:
            if wanted.endswith('/' + path) or path.endswith('/' + wanted):
                return path
    for path in candidates:
        if file_path.endswith(path) or path.endswith(file_path) or file_path in path:
            return path
    return None


def enrich_issue_with_code(issue: Dict[str, Any], parser: "DiffParser") -> Dict[str, Any]:
    """Return a copy of ``issue`` with a code snippet from the diff attached.

    Args:
        issue: Scanner issue; reads ``file``, ``line`` and, when no usable line
            is given, ``description``/``message`` (searched for "line N").
        parser: Parsed diff providing ``files``, ``get_line_context`` and
            ``get_file_content``.

    Returns:
        A shallow copy of ``issue``. When the file is found in the diff, its
        ``file`` is replaced by the diff path and ``code_context`` is added:
        the line context if the line can be located, otherwise a preview of
        the first 10 changed lines (``preview``/``has_more``).
    """
    enriched = issue.copy()

    file_path = issue.get('file', '')
    if not file_path:
        return enriched

    # Scanners may report the line as a string; unusable values count as "no line".
    line_number = _coerce_line_number(issue.get('line', 0))
    if not line_number:
        desc = issue.get('description') or issue.get('message') or ''
        line_match = re.search(r'line[:#]?\s*(\d+)', str(desc), re.IGNORECASE)
        if line_match:
            line_number = int(line_match.group(1))

    matched_path = _match_diff_path(file_path, list(parser.files))
    if matched_path is None:
        return enriched

    enriched['file'] = matched_path
    if line_number:
        context = parser.get_line_context(matched_path, line_number)
        if context:
            enriched['code_context'] = context

    if 'code_context' not in enriched:
        chunk = parser.get_file_content(matched_path)
        if chunk and chunk.new_content:
            lines = chunk.new_content.split('\n')
            # Content ends with a newline, which leaves an empty trailing
            # element; drop it so it is not counted as an extra line.
            if lines and lines[-1] == '':
                lines.pop()
            enriched['code_context'] = {
                'file': matched_path,
                'line': line_number or 1,
                'preview': '\n'.join(lines[:10]),
                'has_more': len(lines) > 10
            }

    return enriched
add web 2026-03-11 21:16:47 +08:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`
			`"""`
			`Diff 解析器 - 将扫描问题与代码片段关联`
			`"""`
			`import re`
			`import logging`
			`from typing import Dict, List, Any, Optional`
			`from dataclasses import dataclass, field`

			`logger = logging.getLogger(__name__)`


			`@dataclass`
			`class CodeChunk:`
			`"""代码块"""`
			`file_path: str`
			`old_content: str = ""`
			`new_content: str = ""`
			`old_start: int = 0`
			`new_start: int = 0`
			`hunks: List[Dict] = field(default_factory=list)`


			`class DiffParser:`
			`"""Diff 解析器"""`

			`def __init__(self, diff_text: str):`
			`self.diff_text = diff_text`
			`self.files: Dict[str, CodeChunk] = {}`
			`self._parse()`

			`def _parse(self):`
			`"""解析 diff 文本"""`
			`if not self.diff_text:`
			`return`

			`current_chunk = None`
			`lines = self.diff_text.split('\n')`
			`for line in lines:`
			`diff_match = re.match(r'diff --git a/(.+) b/(.+)', line)`
			`if diff_match:`
			`file_path = diff_match.group(1)`
			`current_chunk = CodeChunk(file_path=file_path)`
			`self.files[file_path] = current_chunk`
			`continue`

			`hunk_match = re.match(r'@@ -(\d+),?\d* \+(\d+),?\d* @@', line)`
			`if hunk_match and current_chunk:`
			`current_chunk.old_start = int(hunk_match.group(1))`
			`current_chunk.new_start = int(hunk_match.group(2))`
			`continue`

			`if current_chunk and line:`
			`if line.startswith('+') and not line.startswith('+++'):`
			`current_chunk.new_content += line[1:] + '\n'`
			`elif line.startswith('-') and not line.startswith('---'):`
			`current_chunk.old_content += line[1:] + '\n'`
			`elif line.startswith(' '):`
			`current_chunk.old_content += line[1:] + '\n'`
			`current_chunk.new_content += line[1:] + '\n'`

			`def get_file_content(self, file_path: str) -> Optional[CodeChunk]:`
			`return self.files.get(file_path)`

			`def get_line_context(self, file_path: str, line_number: int, context_lines: int = 3) -> Optional[Dict[str, Any]]:`
			`chunk = self.files.get(file_path)`
			`if not chunk:`
			`return None`

			`new_lines = chunk.new_content.split('\n')`
			`if line_number > len(new_lines):`
			`return None`

			`start = max(0, line_number - context_lines - 1)`
			`end = min(len(new_lines), line_number + context_lines)`

			`context = []`
			`for i in range(start, end):`
			`code = new_lines[i].rstrip('\n')`
			`is_current_line = (i == line_number - 1)`
			`context.append({`
			`'line_number': chunk.new_start + i,`
			`'code': code,`
			`'is_issue_line': is_current_line`
			`})`

			`return {`
			`'file': file_path,`
			`'line': line_number,`
			`'context': context`
			`}`


			`def merge_issues_with_code(scan_results: Dict[str, Any], diff: str) -> Dict[str, Any]:`
			`"""将扫描问题与代码片段关联"""`
			`if not diff:`
			`return scan_results`

			`parser = DiffParser(diff)`
			`enriched_results = {`
			`'scanners': [],`
			`'summary': scan_results.get('summary', {}),`
			`'total_issues': scan_results.get('total_issues', 0)`
			`}`

			`for scanner_name, scanner_data in scan_results.items():`
			`if scanner_name in ['summary', 'total_issues', 'ai']:`
			`continue`

			`if isinstance(scanner_data, dict):`
			`enriched_scanner = {`
			`'name': scanner_name,`
			`'issues': [],`
			`'file_count': scanner_data.get('file_count', 0),`
			`'total_issues': scanner_data.get('total_issues', 0)`
			`}`

			`issues = scanner_data.get('issues', [])`
			`for issue in issues:`
			`enriched_issue = enrich_issue_with_code(issue, parser)`
			`enriched_scanner['issues'].append(enriched_issue)`

			`enriched_results['scanners'].append(enriched_scanner)`

			`if 'ai' in scan_results:`
			`enriched_results['ai'] = scan_results['ai']`

			`return enriched_results`


			`def enrich_issue_with_code(issue: Dict[str, Any], parser: DiffParser) -> Dict[str, Any]:`
			`"""为单个问题添加代码片段"""`
			`enriched = issue.copy()`

			`file_path = issue.get('file', '')`
			`line_number = issue.get('line', 0)`

			`if not file_path:`
			`return enriched`

			`if not line_number:`
			`desc = issue.get('description', '') or issue.get('message', '')`
			`line_match = re.search(r'line[:#]?\s*(\d+)', desc, re.IGNORECASE)`
			`if line_match:`
			`line_number = int(line_match.group(1))`

			`matched_path = None`
			`for path in parser.files.keys():`
			`if file_path.endswith(path) or path.endswith(file_path) or file_path in path:`
			`matched_path = path`
			`break`

			`if matched_path:`
			`enriched['file'] = matched_path`
			`if matched_path and line_number:`
			`context = parser.get_line_context(matched_path, line_number)`
			`if context:`
			`enriched['code_context'] = context`

			`if 'code_context' not in enriched and matched_path:`
			`chunk = parser.get_file_content(matched_path)`
			`if chunk and chunk.new_content:`
			`lines = chunk.new_content.split('\n')[:10]`
			`enriched['code_context'] = {`
			`'file': matched_path,`
			`'line': line_number or 1,`
			`'preview': '\n'.join(lines),`
			`'has_more': len(chunk.new_content.split('\n')) > 10`
			`}`

			`return enriched`