"""Compliance analysis pipeline helpers. All functions are synchronous — call them via asyncio.to_thread() in async SSE generators. """ from __future__ import annotations import json import os import re import tempfile from typing import TYPE_CHECKING from loguru import logger if TYPE_CHECKING: from app.application.knowledge import KnowledgeRetrievalService from app.domain.retrieval import RetrievedChunk from app.services.llm.base_client import BaseLLMClient def _extract_json(text: str): """Extract JSON from LLM response, tolerating markdown wrappers.""" stripped = text.strip() match = re.search(r"```(?:json)?\s*([\s\S]*?)```", stripped) if match: stripped = match.group(1).strip() try: return json.loads(stripped) except json.JSONDecodeError: pass for pattern in (r"(\[[\s\S]*\])", r"(\{[\s\S]*\})"): m = re.search(pattern, stripped) if m: try: return json.loads(m.group(1)) except json.JSONDecodeError: continue raise ValueError(f"No valid JSON found in LLM response: {text[:300]}") def extract_text_from_doc_id(doc_id: str) -> str: from app.shared.bootstrap import get_document_query_service, get_retrieval_service doc = get_document_query_service().get(doc_id) if not doc: raise ValueError(f"Document '{doc_id}' not found") service = get_retrieval_service() chunks = service.retrieve(query=doc.doc_name, top_k=30) doc_chunks = [c for c in chunks if c.doc_id == doc_id] if not doc_chunks: doc_chunks = chunks[:15] return "\n\n".join(c.text for c in doc_chunks[:15]) def extract_text_from_file(content: bytes, filename: str) -> str: from app.shared.bootstrap import get_document_command_service suffix = os.path.splitext(filename or "doc.pdf")[1] or ".pdf" tmp_path = "" try: with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: tmp.write(content) tmp_path = tmp.name service = get_document_command_service() parsed = service.parser.parse(file_path=tmp_path, doc_id="tmp_analysis", doc_name=filename) if parsed.raw_text: return parsed.raw_text[:4000] return "\n".join( b.get("text", "") for b in parsed.semantic_blocks[:30] if b.get("text") )[:4000] except Exception as exc: logger.warning("File text extraction failed: {}", exc) return "" finally: if tmp_path: try: os.unlink(tmp_path) except OSError: pass def split_into_clauses(text: str, client: "BaseLLMClient") -> list[str]: prompt = ( "You are a compliance analysis expert. Split the following text into 3-8 " "semantically complete compliance clauses. Each clause should be an independent " "compliance requirement or technical statement.\n" "Return as JSON array of strings, e.g.:\n" '["Clause one...", "Clause two..."]\n' "Return ONLY the JSON array.\n\n" f"Text:\n{text[:2000]}" ) response = client.chat([{"role": "user", "content": prompt}], max_tokens=1000) if response.is_success: try: result = _extract_json(response.content) if isinstance(result, list): clauses = [str(c).strip() for c in result if str(c).strip()] if clauses: return clauses[:8] except (ValueError, TypeError): logger.warning("Clause split JSON parse failed, using fallback") sentences = re.split(r"[.?!;\n]+", text) return [s.strip() for s in sentences if len(s.strip()) > 20][:6] def retrieve_for_clause( clause: str, retrieval_service: "KnowledgeRetrievalService", top_k: int = 5, domains: str | None = None, ) -> list["RetrievedChunk"]: return retrieval_service.retrieve(query=clause, top_k=top_k, filters=domains) def check_clause_compliance( clause: str, chunks: list["RetrievedChunk"], client: "BaseLLMClient", ) -> dict | None: if not chunks: return None reg_context = "\n".join( f"[{i+1}] {c.doc_title} {c.section_title or ''}: {c.text[:300]}" for i, c in enumerate(chunks[:5]) ) prompt = ( "You are a compliance expert. Judge whether the following business clause " "complies with the retrieved regulations.\n\n" f"Business clause:\n{clause}\n\n" f"Retrieved regulations:\n{reg_context}\n\n" "Return JSON:\n" "{\n" ' "status": "ok" | "warn" | "risk",\n' ' "title": "Short finding title (max 30 chars)",\n' ' "desc": "Description (50-120 chars)",\n' ' "clause_ref": "Regulation clause reference e.g. Art.9.1 or Sec.3.1"\n' "}\n" "status: ok=compliant, warn=gap exists, risk=critical/missing\n" "Return ONLY the JSON object." ) response = client.chat([{"role": "user", "content": prompt}], max_tokens=500) if not response.is_success: return None try: result = _extract_json(response.content) if isinstance(result, dict) and "status" in result: return { "title": str(result.get("title", "Compliance finding")), "desc": str(result.get("desc", "")), "status": result.get("status", "info"), "clause_ref": result.get("clause_ref"), } except (ValueError, TypeError) as exc: logger.warning("Gap check JSON parse failed: {}", exc) return None def synthesize_conclusion( para_text: str, findings: list[dict], client: "BaseLLMClient", ) -> dict: if not findings: return { "conclusion": "No significant compliance gaps found. Continue monitoring regulation updates.", "actions": [{"label": "Next action", "value": "Monitor regulation updates"}], "risk_score": 10, "highlight_terms": [], "para_text": para_text[:800], } findings_text = "\n".join( f"- [{f['status'].upper()}] {f['title']}: {f['desc']}" for f in findings ) prompt = ( "You are a compliance analysis expert. Generate a summary report " "based on the following compliance findings.\n\n" f"Original text (first 600 chars):\n{para_text[:600]}\n\n" f"Findings:\n{findings_text}\n\n" "Return JSON:\n" "{\n" ' "conclusion": "Overall compliance conclusion (100-200 chars)",\n' ' "actions": [\n' ' {"label": "Action label", "value": "Description"},\n' ' {"label": "Priority", "value": "High/Medium/Low", "risk": true}\n' ' ],\n' ' "risk_score": 0-100 (integer, higher=riskier),\n' ' "highlight_terms": ["Key terms to highlight, max 10 terms"],\n' ' "para_text": "Original text or summary (max 600 chars)"\n' "}\n" "Return ONLY the JSON object." ) response = client.chat([{"role": "user", "content": prompt}], max_tokens=1200) fallback = { "conclusion": "Compliance analysis complete. Review findings and create remediation plan.", "actions": [ {"label": "Next action", "value": "Review critical findings"}, {"label": "Escalation", "value": "Legal review required", "risk": True}, ], "risk_score": 60, "highlight_terms": [], "para_text": para_text[:800], } if not response.is_success: return fallback try: result = _extract_json(response.content) if isinstance(result, dict): return { "conclusion": str(result.get("conclusion", fallback["conclusion"])), "actions": result.get("actions", fallback["actions"]), "risk_score": int(result.get("risk_score", 60)), "highlight_terms": result.get("highlight_terms", []), "para_text": str(result.get("para_text", para_text[:800])), } except (ValueError, TypeError) as exc: logger.warning("Conclusion synthesis JSON parse failed: {}", exc) return fallback