# AIRegulation-DocAnalysis/backend/app/application/compliance/pipeline.py

"""Compliance analysis pipeline helpers.

All functions are synchronous — call them via asyncio.to_thread() in async SSE generators.
"""

from __future__ import annotations

import json
import os
import re
import tempfile
from typing import TYPE_CHECKING

from loguru import logger

if TYPE_CHECKING:
    from app.application.knowledge import KnowledgeRetrievalService
    from app.domain.retrieval import RetrievedChunk
    from app.services.llm.base_client import BaseLLMClient


def _extract_json(text: str) -> object:
    """Extract a JSON value from an LLM response, tolerating markdown wrappers.

    Parsing is attempted in this order:

    1. The whole response, or the contents of the first fenced code block
       when one is present.
    2. The widest ``[...]`` and ``{...}`` spans found in that text, trying
       first whichever span *starts* earlier. Ordering by position keeps the
       outermost structure: an object that merely contains an array is
       returned as the object, not as its inner array.

    Args:
        text: Raw LLM response text.

    Returns:
        The decoded JSON value (normally a ``dict`` or ``list``).

    Raises:
        ValueError: If no valid JSON could be decoded from the response.
    """
    stripped = text.strip()
    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)```", stripped)
    if fenced:
        stripped = fenced.group(1).strip()
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        pass

    # Greedy spans from the first opening bracket to the last closing one.
    candidates = []
    for pattern in (r"\[[\s\S]*\]", r"\{[\s\S]*\}"):
        found = re.search(pattern, stripped)
        if found:
            candidates.append(found)

    # The span that opens first is the outer structure; try it first.
    # sorted() is stable, so equal starts cannot reorder (and cannot happen).
    for found in sorted(candidates, key=lambda m: m.start()):
        try:
            return json.loads(found.group(0))
        except json.JSONDecodeError:
            continue
    raise ValueError(f"No valid JSON found in LLM response: {text[:300]}")


def extract_text_from_doc_id(doc_id: str) -> str:
    """Assemble analysis text for an already-registered document.

    Looks the document up, runs a retrieval query using the document's name
    (top 30 hits), and joins the text of at most 15 chunks with blank lines.
    Hits whose ``doc_id`` matches are preferred; when none match, the leading
    hits are used regardless of which document they came from.

    Args:
        doc_id: Identifier of the document to load.

    Returns:
        The selected chunk texts joined by blank lines.

    Raises:
        ValueError: If the document query service returns nothing for ``doc_id``.
    """
    from app.shared.bootstrap import get_document_query_service, get_retrieval_service

    document = get_document_query_service().get(doc_id)
    if not document:
        raise ValueError(f"Document '{doc_id}' not found")

    hits = get_retrieval_service().retrieve(query=document.doc_name, top_k=30)
    own_hits = [hit for hit in hits if hit.doc_id == doc_id]
    # Fall back to the unfiltered hits when nothing belongs to this document.
    selected = own_hits[:15] if own_hits else hits[:15]
    return "\n\n".join(hit.text for hit in selected)


def extract_text_from_file(content: bytes, filename: str) -> str:
    """Parse an uploaded file and return up to 4000 characters of its text.

    The bytes are written to a temporary file (the project parser is called
    with a file path), parsed, and the temporary file is removed afterwards
    whether or not parsing succeeded.

    Args:
        content: Raw file bytes.
        filename: Original file name; only its extension is used for the
            temporary file. A missing name or extension defaults to ``.pdf``.

    Returns:
        ``parsed.raw_text`` truncated to 4000 characters when non-empty,
        otherwise the newline-joined ``text`` of the first 30 semantic blocks
        (also truncated to 4000). Returns ``""`` if anything fails; the
        failure is logged as a warning rather than raised.
    """
    from app.shared.bootstrap import get_document_command_service

    suffix = os.path.splitext(filename or "doc.pdf")[1] or ".pdf"
    tmp_path = ""
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # Record the path before writing: with delete=False a failed
            # write would otherwise leave the file behind with no way for
            # the finally-clause to find it.
            tmp_path = tmp.name
            tmp.write(content)
        service = get_document_command_service()
        parsed = service.parser.parse(file_path=tmp_path, doc_id="tmp_analysis", doc_name=filename)
        if parsed.raw_text:
            return parsed.raw_text[:4000]
        return "\n".join(
            b.get("text", "") for b in parsed.semantic_blocks[:30] if b.get("text")
        )[:4000]
    except Exception as exc:
        # Deliberate best-effort: callers receive an empty string on failure.
        logger.warning("File text extraction failed: {}", exc)
        return ""
    finally:
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Cleanup is best-effort; the file may already be gone.
                pass


def split_into_clauses(text: str, client: "BaseLLMClient") -> list[str]:
    """Split text into compliance clauses, preferring the LLM's segmentation.

    The first 2000 characters of ``text`` are sent to the LLM with a request
    for a JSON array of 3-8 clause strings. If the call succeeds and yields a
    non-empty list, at most 8 stripped, non-blank entries are returned.

    In every other case (failed call, unparsable reply, non-list or empty
    result) the full text is split on sentence punctuation and newlines, and
    the first 6 fragments longer than 20 characters are returned.

    Args:
        text: Source text to segment.
        client: LLM client exposing ``chat(messages, max_tokens=...)``.

    Returns:
        A list of clause strings; may be empty.
    """
    instructions = "\n".join(
        [
            "You are a compliance analysis expert. Split the following text into 3-8 "
            "semantically complete compliance clauses. Each clause should be an independent "
            "compliance requirement or technical statement.",
            "Return as JSON array of strings, e.g.:",
            '["Clause one...", "Clause two..."]',
            "Return ONLY the JSON array.",
            "",
            "Text:",
            text[:2000],
        ]
    )
    reply = client.chat([{"role": "user", "content": instructions}], max_tokens=1000)

    if reply.is_success:
        try:
            parsed = _extract_json(reply.content)
            if isinstance(parsed, list):
                llm_clauses = []
                for item in parsed:
                    cleaned = str(item).strip()
                    if cleaned:
                        llm_clauses.append(cleaned)
                if llm_clauses:
                    return llm_clauses[:8]
        except (ValueError, TypeError):
            logger.warning("Clause split JSON parse failed, using fallback")

    # Heuristic fallback: punctuation-based sentence split of the full text.
    fragments = []
    for piece in re.split(r"[.?!;\n]+", text):
        piece = piece.strip()
        if len(piece) > 20:
            fragments.append(piece)
    return fragments[:6]


def retrieve_for_clause(
    clause: str,
    retrieval_service: "KnowledgeRetrievalService",
    top_k: int = 5,
    domains: str | None = None,
) -> list["RetrievedChunk"]:
    return retrieval_service.retrieve(query=clause, top_k=top_k, filters=domains)


def check_clause_compliance(
    clause: str,
    chunks: list["RetrievedChunk"],
    client: "BaseLLMClient",
) -> dict | None:
    if not chunks:
        return None
    reg_context = "\n".join(
        f"[{i+1}] {c.doc_title} {c.section_title or ''}: {c.text[:300]}"
        for i, c in enumerate(chunks[:5])
    )
    prompt = (
        "You are a compliance expert. Judge whether the following business clause "
        "complies with the retrieved regulations.\n\n"
        f"Business clause:\n{clause}\n\n"
        f"Retrieved regulations:\n{reg_context}\n\n"
        "Return JSON:\n"
        "{\n"
        '  "status": "ok" | "warn" | "risk",\n'
        '  "title": "Short finding title (max 30 chars)",\n'
        '  "desc": "Description (50-120 chars)",\n'
        '  "clause_ref": "Regulation clause reference e.g. Art.9.1 or Sec.3.1"\n'
        "}\n"
        "status: ok=compliant, warn=gap exists, risk=critical/missing\n"
        "Return ONLY the JSON object."
    )
    response = client.chat([{"role": "user", "content": prompt}], max_tokens=500)
    if not response.is_success:
        return None
    try:
        result = _extract_json(response.content)
        if isinstance(result, dict) and "status" in result:
            return {
                "title": str(result.get("title", "Compliance finding")),
                "desc": str(result.get("desc", "")),
                "status": result.get("status", "info"),
                "clause_ref": result.get("clause_ref"),
            }
    except (ValueError, TypeError) as exc:
        logger.warning("Gap check JSON parse failed: {}", exc)
    return None


def _coerce_risk_score(value: object, default: int = 60) -> int:
    """Convert an LLM-supplied risk score to an int clamped to 0-100.

    Accepts ints, floats and numeric strings (an optional trailing ``%`` is
    ignored). Anything that cannot be read as a finite number yields
    ``default``.
    """
    if isinstance(value, str):
        value = value.strip().rstrip("%")
    try:
        score = int(float(value))
    except (TypeError, ValueError, OverflowError):
        # Covers None, non-numeric text, NaN, infinity and oversized ints.
        return default
    return max(0, min(100, score))


def synthesize_conclusion(
    para_text: str,
    findings: list[dict],
    client: "BaseLLMClient",
) -> dict:
    """Summarise per-clause findings into an overall compliance report.

    With no findings a fixed low-risk report is returned without calling the
    LLM. Otherwise the findings and the first 600 characters of the text are
    sent to the LLM. A canned medium-risk fallback report is returned when
    the call fails or the reply is not a JSON object.

    Individual fields of a parsed reply are validated separately, so one
    malformed field does not discard the rest of the report:

    * ``risk_score`` is coerced to an int in 0-100 (60 if unreadable).
    * ``actions`` must be a list, otherwise the fallback actions are used.
    * ``highlight_terms`` must be a list, otherwise ``[]`` is used.
    * a missing, null or empty ``conclusion`` / ``para_text`` falls back to
      the canned conclusion / the first 800 characters of ``para_text``.

    Args:
        para_text: Text that was analysed.
        findings: Finding dicts, expected to carry ``status``, ``title`` and
            ``desc``. Missing keys or a non-string status are tolerated when
            rendering the prompt.
        client: LLM client exposing ``chat(messages, max_tokens=...)``.

    Returns:
        A dict with keys ``conclusion``, ``actions``, ``risk_score``,
        ``highlight_terms`` and ``para_text``.
    """
    if not findings:
        return {
            "conclusion": "No significant compliance gaps found. Continue monitoring regulation updates.",
            "actions": [{"label": "Next action", "value": "Monitor regulation updates"}],
            "risk_score": 10,
            "highlight_terms": [],
            "para_text": para_text[:800],
        }
    # Render defensively: a finding whose status is null or non-string must
    # not abort the whole summary.
    findings_text = "\n".join(
        f"- [{str(f.get('status') or 'info').upper()}] {f.get('title', '')}: {f.get('desc', '')}"
        for f in findings
    )
    prompt = (
        "You are a compliance analysis expert. Generate a summary report "
        "based on the following compliance findings.\n\n"
        f"Original text (first 600 chars):\n{para_text[:600]}\n\n"
        f"Findings:\n{findings_text}\n\n"
        "Return JSON:\n"
        "{\n"
        '  "conclusion": "Overall compliance conclusion (100-200 chars)",\n'
        '  "actions": [\n'
        '    {"label": "Action label", "value": "Description"},\n'
        '    {"label": "Priority", "value": "High/Medium/Low", "risk": true}\n'
        '  ],\n'
        '  "risk_score": 0-100 (integer, higher=riskier),\n'
        '  "highlight_terms": ["Key terms to highlight, max 10 terms"],\n'
        '  "para_text": "Original text or summary (max 600 chars)"\n'
        "}\n"
        "Return ONLY the JSON object."
    )
    response = client.chat([{"role": "user", "content": prompt}], max_tokens=1200)
    fallback = {
        "conclusion": "Compliance analysis complete. Review findings and create remediation plan.",
        "actions": [
            {"label": "Next action", "value": "Review critical findings"},
            {"label": "Escalation", "value": "Legal review required", "risk": True},
        ],
        "risk_score": 60,
        "highlight_terms": [],
        "para_text": para_text[:800],
    }
    if not response.is_success:
        return fallback
    try:
        result = _extract_json(response.content)
    except (ValueError, TypeError) as exc:
        logger.warning("Conclusion synthesis JSON parse failed: {}", exc)
        return fallback
    if not isinstance(result, dict):
        return fallback

    actions = result.get("actions")
    terms = result.get("highlight_terms")
    return {
        "conclusion": str(result.get("conclusion") or fallback["conclusion"]),
        "actions": actions if isinstance(actions, list) else fallback["actions"],
        "risk_score": _coerce_risk_score(result.get("risk_score"), fallback["risk_score"]),
        "highlight_terms": terms if isinstance(terms, list) else [],
        "para_text": str(result.get("para_text") or para_text[:800]),
    }