From e7963b267e6c17b2bf1a681908c86f3ffdfab38b Mon Sep 17 00:00:00 2001 From: wangwei Date: Mon, 8 Jun 2026 11:16:28 +0800 Subject: [PATCH] fix somethings --- .env | 5 + .env.example | 5 + backend/app/api/routes/perception.py | 80 +- .../application/perception/crawl_service.py | 147 + .../app/application/perception/services.py | 4 +- backend/app/config/settings.py | 12 + .../perception/base_event_store.py | 39 + .../perception/crawlers/__init__.py | 0 .../perception/crawlers/_utils.py | 43 + .../perception/crawlers/base.py | 32 + .../perception/crawlers/catarc_crawler.py | 83 + .../perception/crawlers/eurlex_crawler.py | 117 + .../perception/crawlers/guobiao_crawler.py | 92 + .../infrastructure/perception/llm_pipeline.py | 241 ++ .../perception/mock_event_store.py | 40 +- .../perception/postgres_event_store.py | 225 ++ backend/app/shared/bootstrap.py | 37 +- backend/requirements.txt | 2 + backend/tests/perception/__init__.py | 0 .../tests/perception/test_base_event_store.py | 95 + .../tests/perception/test_crawl_service.py | 111 + backend/tests/perception/test_crawlers.py | 127 + backend/tests/perception/test_llm_pipeline.py | 77 + .../perception/test_postgres_event_store.py | 98 + .../2026-06-05-perception-intelligence.md | 2500 +++++++++++++++++ ...26-06-05-perception-intelligence-design.md | 328 +++ frontend/src/App.tsx | 6 +- frontend/src/contexts/PageStateContext.tsx | 211 ++ frontend/src/contexts/index.ts | 15 + .../pages/Compliance/useComplianceAnalysis.ts | 95 +- .../src/pages/Perception/PerceptionPage.tsx | 394 ++- frontend/src/pages/RagChat/RagChatPage.tsx | 148 +- frontend/src/styles/globals.css | 30 + pyproject.toml | 2 + 34 files changed, 5195 insertions(+), 246 deletions(-) create mode 100644 backend/app/application/perception/crawl_service.py create mode 100644 backend/app/infrastructure/perception/base_event_store.py create mode 100644 backend/app/infrastructure/perception/crawlers/__init__.py create mode 100644 
backend/app/infrastructure/perception/crawlers/_utils.py create mode 100644 backend/app/infrastructure/perception/crawlers/base.py create mode 100644 backend/app/infrastructure/perception/crawlers/catarc_crawler.py create mode 100644 backend/app/infrastructure/perception/crawlers/eurlex_crawler.py create mode 100644 backend/app/infrastructure/perception/crawlers/guobiao_crawler.py create mode 100644 backend/app/infrastructure/perception/llm_pipeline.py create mode 100644 backend/app/infrastructure/perception/postgres_event_store.py create mode 100644 backend/tests/perception/__init__.py create mode 100644 backend/tests/perception/test_base_event_store.py create mode 100644 backend/tests/perception/test_crawl_service.py create mode 100644 backend/tests/perception/test_crawlers.py create mode 100644 backend/tests/perception/test_llm_pipeline.py create mode 100644 backend/tests/perception/test_postgres_event_store.py create mode 100644 docs/superpowers/plans/2026-06-05-perception-intelligence.md create mode 100644 docs/superpowers/specs/2026-06-05-perception-intelligence-design.md create mode 100644 frontend/src/contexts/PageStateContext.tsx diff --git a/.env b/.env index 7cef945..a92f5b7 100644 --- a/.env +++ b/.env @@ -54,6 +54,11 @@ DOCUMENT_REPOSITORY_BACKEND=json # Default false: processing runs in FastAPI's threadpool — no external worker needed. USE_CELERY_WORKER=false +# ===== 法规感知爬取配置 ===== +PERCEPTION_CRAWL_TIMEOUT_SECONDS=120 +PERCEPTION_MAX_EVENTS_PER_SOURCE=100 +PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85 + # ===== API配置 ===== API_HOST=0.0.0.0 API_PORT=8000 diff --git a/.env.example b/.env.example index 26131db..13a7539 100644 --- a/.env.example +++ b/.env.example @@ -55,6 +55,11 @@ DOCUMENT_REPOSITORY_BACKEND=json # Default false: document processing runs in FastAPI's threadpool (no external worker needed). 
USE_CELERY_WORKER=false +# ===== 法规感知爬取配置 ===== +PERCEPTION_CRAWL_TIMEOUT_SECONDS=120 +PERCEPTION_MAX_EVENTS_PER_SOURCE=100 +PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85 + # ===== 阿里云文档解析 ===== ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret diff --git a/backend/app/api/routes/perception.py b/backend/app/api/routes/perception.py index 7470234..e398e49 100644 --- a/backend/app/api/routes/perception.py +++ b/backend/app/api/routes/perception.py @@ -4,10 +4,12 @@ from __future__ import annotations import json -from fastapi import APIRouter, Query +from fastapi import APIRouter, Depends, Query from fastapi.responses import StreamingResponse -from app.shared.bootstrap import get_perception_service +from app.shared.bootstrap import get_crawl_service, get_event_store, get_perception_service +from app.api.dependencies.auth import get_current_user +from app.domain.auth.models import UserClaims from app.shared.async_utils import iter_in_thread router = APIRouter(prefix="/perception", tags=["智能感知"]) @@ -65,3 +67,77 @@ async def analyze_event(event_id: str): "X-Accel-Buffering": "no", }, ) + + +@router.post("/crawl") +async def run_crawl( + body: dict = None, + current_user: UserClaims = Depends(get_current_user), +): + """Trigger manual crawl of regulatory sources. Streams SSE progress. + + Body (optional): {"sources": ["CATARC", "国标委·强制性", "EUR-Lex"]} + Omit sources to crawl all registered sources. 
+ """ + sources: list[str] | None = (body or {}).get("sources") + crawl_svc = get_crawl_service() + + async def crawl_stream(): + async for item in iter_in_thread(crawl_svc.run_crawl(sources=sources)): + event_name = item.get("event", "message") + data = item.get("data", "") + if isinstance(data, (dict, list)): + data = json.dumps(data, ensure_ascii=False) + yield f"event: {event_name}\ndata: {data}\n\n" + + return StreamingResponse( + crawl_stream(), + media_type="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, + ) + + +@router.post("/events/{event_id}/process") +async def process_event( + event_id: str, + current_user: UserClaims = Depends(get_current_user), +): + """Trigger LLM pipeline (extract + assess + diff) for a single event.""" + from datetime import UTC, datetime + from app.infrastructure.perception.llm_pipeline import LlmPipeline + from app.shared.bootstrap import get_retrieval_service + + event = get_perception_service().get_event(event_id) + if not event: + from fastapi import HTTPException + raise HTTPException(status_code=404, detail=f"Event {event_id} not found") + + store = get_event_store() + pipeline = LlmPipeline() + + structure = pipeline.extract_structure(event) + event.update(structure) + event["affected_docs"] = pipeline.assess_impact(event, get_retrieval_service()) + event["processed_at"] = datetime.now(UTC).isoformat() + store.upsert(event) + + return {"status": "ok", "event_id": event_id, "processed_at": event["processed_at"]} + + +@router.get("/events/{event_id}/diff") +async def get_event_diff(event_id: str): + """Return semantic diff detail for an event (only available if previously crawled twice).""" + event = get_perception_service().get_event(event_id) + if not event: + from fastapi import HTTPException + raise HTTPException(status_code=404, detail=f"Event {event_id} not found") + if not event.get("change_summary"): + from fastapi import HTTPException + raise HTTPException(status_code=404, 
detail="No diff available for this event") + return { + "event_id": event_id, + "change_summary": event.get("change_summary"), + "changed_sections": event.get("changed_sections") or [], + "previous_hash": event.get("previous_hash"), + "content_hash": event.get("content_hash"), + } diff --git a/backend/app/application/perception/crawl_service.py b/backend/app/application/perception/crawl_service.py new file mode 100644 index 0000000..afcc452 --- /dev/null +++ b/backend/app/application/perception/crawl_service.py @@ -0,0 +1,147 @@ +"""Orchestrates regulatory source crawlers and LLM enrichment pipeline.""" + +from __future__ import annotations + +import hashlib +from typing import Any, Generator + +from loguru import logger + +from app.infrastructure.perception.base_event_store import BaseEventStore +from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent +from app.infrastructure.perception.llm_pipeline import LlmPipeline + + +def _event_id(source: str, standard_code: str) -> str: + """Deterministic 12-char ID from source + standard_code.""" + return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12] + + +def _content_hash(raw_text: str) -> str: + return hashlib.sha256(raw_text.encode()).hexdigest() + + +def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict: + return { + "id": event_id, + "source": raw.source, + "source_label": raw.source_label, + "standard_code": raw.standard_code, + "title": raw.title, + "summary": raw.summary, + "full_text_url": raw.full_text_url, + "status": raw.status, + "impact_level": "medium", + "published_at": raw.published_at, + "effective_at": raw.effective_at, + "category": raw.category, + "tags": raw.tags, + "content_hash": content_hash, + "previous_hash": None, + } + + +class CrawlService: + """Orchestrate crawlers, hash-based change detection, and LLM enrichment.""" + + def __init__( + self, + crawlers: dict[str, BaseCrawler], + event_store: BaseEventStore, + llm_pipeline: 
LlmPipeline, + retrieval_service: Any, + ) -> None: + self._crawlers = crawlers + self._store = event_store + self._pipeline = llm_pipeline + self._retrieval = retrieval_service + + def run_crawl( + self, sources: list[str] | None = None + ) -> Generator[dict, None, None]: + """Run crawl for selected sources. Yields SSE-ready progress dicts.""" + targets = sources or list(self._crawlers.keys()) + total_new = 0 + total_updated = 0 + + for source_key in targets: + crawler = self._crawlers.get(source_key) + if not crawler: + yield {"event": "error", "data": f"Unknown source: {source_key}"} + continue + + yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}} + try: + raw_events = crawler.fetch(limit=100) + except Exception as exc: + logger.exception("Crawler failed source={}", source_key) + yield {"event": "error", "data": {"source": source_key, "message": str(exc)}} + continue + + yield { + "event": "progress", + "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)}, + } + + new_count = 0 + updated_count = 0 + + for raw in raw_events: + eid = _event_id(raw.source, raw.standard_code) + new_hash = _content_hash(raw.raw_text or raw.title) + existing = self._store.get(eid) + + if existing and existing.get("content_hash") == new_hash: + continue + + is_update = existing is not None + old_text = existing.get("summary", "") if is_update else "" + previous_hash = existing.get("content_hash") if is_update else None + + event_dict = _raw_to_dict(raw, eid, new_hash) + event_dict["previous_hash"] = previous_hash + + try: + structure = self._pipeline.extract_structure(event_dict) + event_dict.update(structure) + except Exception as exc: + logger.warning("Structure extraction failed id={} err={}", eid, exc) + + try: + affected = self._pipeline.assess_impact(event_dict, self._retrieval) + event_dict["affected_docs"] = affected + except Exception as exc: + logger.warning("Impact assessment failed id={} err={}", eid, exc) + + if 
is_update and old_text and raw.raw_text: + try: + diff = self._pipeline.compute_diff(old_text, raw.raw_text) + event_dict["change_summary"] = diff.get("change_summary") + event_dict["changed_sections"] = diff.get("changed_sections") + except Exception as exc: + logger.warning("Diff failed id={} err={}", eid, exc) + + self._store.upsert(event_dict) + + if is_update: + updated_count += 1 + else: + new_count += 1 + + total_new += new_count + total_updated += updated_count + + yield { + "event": "progress", + "data": { + "source": source_key, + "stage": "done", + "new": new_count, + "updated": updated_count, + }, + } + + yield { + "event": "done", + "data": {"total_new": total_new, "total_updated": total_updated}, + } diff --git a/backend/app/application/perception/services.py b/backend/app/application/perception/services.py index bda2f56..c49cd15 100644 --- a/backend/app/application/perception/services.py +++ b/backend/app/application/perception/services.py @@ -6,7 +6,7 @@ import json from typing import Generator from app.application.knowledge.services import KnowledgeRetrievalService -from app.infrastructure.perception.mock_event_store import MockEventStore +from app.infrastructure.perception.base_event_store import BaseEventStore from app.services.llm.llm_factory import get_llm_client from app.config.settings import settings @@ -22,7 +22,7 @@ class PerceptionService: def __init__( self, - event_store: MockEventStore, + event_store: BaseEventStore, retrieval_service: KnowledgeRetrievalService, ) -> None: self._store = event_store diff --git a/backend/app/config/settings.py b/backend/app/config/settings.py index ffdd480..917ab51 100644 --- a/backend/app/config/settings.py +++ b/backend/app/config/settings.py @@ -87,6 +87,18 @@ class Settings(BaseSettings): # no external worker needed. Switch to True only when a Celery worker is running. 
use_celery_worker: bool = Field(default=False, description="使用 Celery Worker 异步处理文档 (需要 Worker 运行中)") + # ── Perception crawl ────────────────────────────────────────────────────── + perception_crawl_timeout_seconds: int = Field( + default=120, description="HTTP timeout for regulatory source crawlers." + ) + perception_max_events_per_source: int = Field( + default=100, description="Maximum events fetched per source per crawl run." + ) + perception_diff_similarity_threshold: float = Field( + default=0.85, + description="Cosine similarity below which a paragraph is flagged as changed.", + ) + # Keep configuration setup explicit so runtime behavior is easy to reason about. api_host: str = Field(default="0.0.0.0", description="API服务地址") api_port: int = Field(default=8000, description="API服务端口") diff --git a/backend/app/infrastructure/perception/base_event_store.py b/backend/app/infrastructure/perception/base_event_store.py new file mode 100644 index 0000000..2314424 --- /dev/null +++ b/backend/app/infrastructure/perception/base_event_store.py @@ -0,0 +1,39 @@ +"""Abstract base class for regulatory event stores.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod + + +class BaseEventStore(ABC): + """Port interface for regulatory event persistence.""" + + @abstractmethod + def all(self) -> list[dict]: + """Return all events, most-recent first.""" + + @abstractmethod + def get(self, event_id: str) -> dict | None: + """Return a single event by ID, or None.""" + + @abstractmethod + def filter( + self, + *, + source: str | None = None, + impact_level: str | None = None, + limit: int = 50, + ) -> list[dict]: + """Return filtered events sorted by published_at descending.""" + + @abstractmethod + def stats(self) -> dict: + """Return {total, high_impact, medium_impact, low_impact, recent_90d}.""" + + @abstractmethod + def upsert(self, event: dict) -> None: + """Insert or update an event record.""" + + @abstractmethod + def get_by_standard_code(self, 
standard_code: str) -> dict | None: + """Return the most-recent event with matching standard_code, or None.""" diff --git a/backend/app/infrastructure/perception/crawlers/__init__.py b/backend/app/infrastructure/perception/crawlers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/infrastructure/perception/crawlers/_utils.py b/backend/app/infrastructure/perception/crawlers/_utils.py new file mode 100644 index 0000000..d2f96b1 --- /dev/null +++ b/backend/app/infrastructure/perception/crawlers/_utils.py @@ -0,0 +1,43 @@ +"""Shared utility functions for crawlers.""" + +from __future__ import annotations + +import re +from datetime import date + + +def parse_date(text: str) -> str: + """Return YYYY-MM-DD from common Chinese date formats, or today's date.""" + text = text.strip() + if not text: + return date.today().isoformat() + m = re.search(r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})", text) + if m: + try: + return date(int(m.group(1)), int(m.group(2)), int(m.group(3))).isoformat() + except ValueError: + pass + m2 = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日?", text) + if m2: + try: + return date(int(m2.group(1)), int(m2.group(2)), int(m2.group(3))).isoformat() + except ValueError: + pass + return date.today().isoformat() + + +def extract_tags(standard_code: str, title: str) -> list[str]: + """Derive simple keyword tags from standard code and title.""" + tags: list[str] = [] + code_upper = standard_code.upper() + if "GB" in code_upper: + tags.append("国家标准") + if "/T" in code_upper: + tags.append("推荐性") + else: + tags.append("强制性") + keywords = ["电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全"] + for kw in keywords: + if kw in title: + tags.append(kw) + return tags[:5] diff --git a/backend/app/infrastructure/perception/crawlers/base.py b/backend/app/infrastructure/perception/crawlers/base.py new file mode 100644 index 0000000..b359a5c --- /dev/null +++ b/backend/app/infrastructure/perception/crawlers/base.py @@ -0,0 +1,32 @@ +"""Shared 
contracts for regulatory source crawlers.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field + + +@dataclass +class RawEvent: + """Raw regulatory event returned by a crawler before enrichment.""" + + source: str + source_label: str + standard_code: str + title: str + summary: str + full_text_url: str + status: str # 'enacted' | 'draft' | 'consultation' + published_at: str # YYYY-MM-DD string + effective_at: str | None + category: str + tags: list[str] = field(default_factory=list) + raw_text: str = "" # full crawled text for hashing + LLM + + +class BaseCrawler(ABC): + """Abstract regulatory source crawler.""" + + @abstractmethod + def fetch(self, limit: int = 50) -> list[RawEvent]: + """Fetch up to `limit` recent events from the data source.""" diff --git a/backend/app/infrastructure/perception/crawlers/catarc_crawler.py b/backend/app/infrastructure/perception/crawlers/catarc_crawler.py new file mode 100644 index 0000000..3ff5dd9 --- /dev/null +++ b/backend/app/infrastructure/perception/crawlers/catarc_crawler.py @@ -0,0 +1,83 @@ +"""Crawler for CATARC automotive standard catalogue.""" + +from __future__ import annotations + +from urllib.parse import urljoin + +import httpx +from bs4 import BeautifulSoup +from loguru import logger + +from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent +from ._utils import extract_tags, parse_date + +_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html" +_HOST = "https://www.catarc.org.cn" + +_STATUS_MAP = { + "现行": "enacted", + "即将实施": "enacted", + "废止": "enacted", + "征求意见": "consultation", + "报批": "draft", +} + + +class CatarcCrawler(BaseCrawler): + """Scrape the CATARC automotive standard list page.""" + + def fetch(self, limit: int = 50) -> list[RawEvent]: + events: list[RawEvent] = [] + page = 1 + max_pages = max(10, limit) + while len(events) < limit and page <= max_pages: + url = f"{_BASE_URL}?page={page}" + try: + resp = 
httpx.get(url, timeout=30, follow_redirects=True) + resp.raise_for_status() + except Exception as exc: + logger.warning("CATARC fetch failed page={} err={}", page, exc) + break + + soup = BeautifulSoup(resp.text, "lxml") + rows = soup.select("table tr") + if not rows: + break + + batch: list[RawEvent] = [] + for row in rows: + cells = row.find_all("td") + if len(cells) < 3: + continue + link = cells[0].find("a") + standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True) + title = cells[1].get_text(strip=True) if len(cells) > 1 else standard_code + date_text = cells[2].get_text(strip=True) if len(cells) > 2 else "" + published_at = parse_date(date_text) + status_text = cells[3].get_text(strip=True) if len(cells) > 3 else "" + status = _STATUS_MAP.get(status_text, "enacted") + detail_url = urljoin(_HOST, link["href"]) if link and link.get("href") else url + raw_text = f"{standard_code} {title}" + batch.append(RawEvent( + source="CATARC", + source_label="全国汽车标准化技术委员会", + standard_code=standard_code, + title=title, + summary=title, + full_text_url=detail_url, + status=status, + published_at=published_at, + effective_at=None, + category="汽车标准", + tags=extract_tags(standard_code, title), + raw_text=raw_text, + )) + + if not batch: + break + events.extend(batch) + page += 1 + + return events[:limit] + + diff --git a/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py b/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py new file mode 100644 index 0000000..3f5fdd2 --- /dev/null +++ b/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py @@ -0,0 +1,117 @@ +"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations.""" + +from __future__ import annotations + +import re +from email.utils import parsedate_to_datetime + +import httpx +from bs4 import BeautifulSoup +from loguru import logger + +from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent +from ._utils import 
parse_date + +_EURLEX_RSS_URLS = [ + "https://eur-lex.europa.eu/rss-feed/OJ-L.rss", +] + +_AUTOMOTIVE_KEYWORDS = [ + "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous", + "AI Act", "artificial intelligence", "cybersecurity", "software update", + "R155", "R156", "汽车", "车辆", +] + + +_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS] + + +def _is_automotive_relevant(title: str, description: str) -> bool: + combined = (title + " " + description).lower() + return any(kw in combined for kw in _AUTOMOTIVE_KEYWORDS_LOWER) + + +def _extract_celex(url: str) -> str: + m = re.search(r"CELEX[:/]([0-9A-Z]+)", url) + return m.group(1) if m else "" + + +def _parse_rss_date(rfc2822: str) -> str: + try: + dt = parsedate_to_datetime(rfc2822) + return dt.date().isoformat() + except Exception: + return parse_date(rfc2822) + + +class EurlexCrawler(BaseCrawler): + """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds.""" + + def fetch(self, limit: int = 50) -> list[RawEvent]: + events: list[RawEvent] = [] + for rss_url in _EURLEX_RSS_URLS: + if len(events) >= limit: + break + try: + resp = httpx.get(rss_url, timeout=30, follow_redirects=True) + resp.raise_for_status() + except Exception as exc: + logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc) + continue + + soup = BeautifulSoup(resp.content, "lxml-xml") + for item in soup.find_all("item"): + if len(events) >= limit: + break + title_tag = item.find("title") + title = title_tag.get_text(strip=True) if title_tag else "" + desc_tag = item.find("description") + description = desc_tag.get_text(strip=True) if desc_tag else "" + link_tag = item.find("link") + link = link_tag.get_text(strip=True) if link_tag else "" + pub_date_tag = item.find("pubDate") + pub_date = pub_date_tag.get_text(strip=True) if pub_date_tag else "" + + if not _is_automotive_relevant(title, description): + continue + + celex = _extract_celex(link) + standard_code = celex if celex else 
title[:60] + published_at = _parse_rss_date(pub_date) if pub_date else "" + + events.append(RawEvent( + source="EUR-Lex", + source_label="欧盟官方公报", + standard_code=standard_code, + title=title, + summary=description[:500], + full_text_url=link, + status="enacted", + published_at=published_at, + effective_at=None, + category="EU法规", + tags=_extract_eurlex_tags(title, description), + raw_text=f"{title}\n{description}", + )) + + return events[:limit] + + +def _extract_eurlex_tags(title: str, description: str) -> list[str]: + combined = title + " " + description + tag_map = { + "AI Act": "EU AI Act", + "artificial intelligence": "EU AI Act", + "R155": "UN R155", + "R156": "UN R156", + "cybersecurity": "网络安全", + "emission": "排放", + "autonomous": "自动驾驶", + "ADAS": "ADAS", + } + combined_lower = combined.lower() + tags = [] + for kw, tag in tag_map.items(): + if kw.lower() in combined_lower: + tags.append(tag) + return tags[:5] diff --git a/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py b/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py new file mode 100644 index 0000000..77c5b7b --- /dev/null +++ b/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py @@ -0,0 +1,92 @@ +"""Crawlers for the 国标委 (SAMR) standard information platform.""" + +from __future__ import annotations + +import httpx +from loguru import logger + +from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent +from ._utils import extract_tags, parse_date + +_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type" +_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"} + + +def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]: + params = { + "p.p1": std_type, + "p.p2": "车", + "p.p90": "circulation_date", + "p.p91": "desc", + "p.p6": page, + "p.p7": page_size, + } + try: + resp = httpx.get(_BASE_URL, params=params, headers=_HEADERS, timeout=30) + resp.raise_for_status() + data = resp.json() + return 
data.get("rows", []) or [] + except Exception as exc: + logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc) + return [] + + +def _row_to_raw_event(row: dict, source_label: str) -> RawEvent: + standard_code = row.get("std_code", "") + title = row.get("std_name", standard_code) + published_at = parse_date(row.get("release_date", "")) + effective_at_raw = row.get("implement_date", "") + effective_at = parse_date(effective_at_raw) if effective_at_raw else None + status_text = row.get("std_status", "") + if "征求意见" in status_text: + status = "consultation" + elif "报批" in status_text or "草案" in status_text: + status = "draft" + else: + status = "enacted" + return RawEvent( + source="国标委", + source_label=source_label, + standard_code=standard_code, + title=title, + summary=title, + full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row.get('id', '')}", + status=status, + published_at=published_at, + effective_at=effective_at, + category=row.get("std_type", "国家标准"), + tags=extract_tags(standard_code, title), + raw_text=f"{standard_code} {title}", + ) + + +class GuobiaoMandatoryCrawler(BaseCrawler): + """Fetch mandatory national standards (强制性) related to vehicles.""" + + def fetch(self, limit: int = 50) -> list[RawEvent]: + events: list[RawEvent] = [] + page = 1 + max_pages = max(10, limit) + while len(events) < limit and page <= max_pages: + rows = _fetch_page(std_type=1, page=page, page_size=20) + if not rows: + break + events.extend(_row_to_raw_event(r, "国标委·强制性") for r in rows) + page += 1 + return events[:limit] + + +class GuobiaoRecommendedCrawler(BaseCrawler): + """Fetch recommended national standards (推荐性) related to vehicles.""" + + def fetch(self, limit: int = 50) -> list[RawEvent]: + events: list[RawEvent] = [] + page = 1 + max_pages = max(10, limit) + while len(events) < limit and page <= max_pages: + rows = _fetch_page(std_type=2, page=page, page_size=20) + if not rows: + break + events.extend(_row_to_raw_event(r, 
"国标委·推荐性") for r in rows) + page += 1 + return events[:limit] diff --git a/backend/app/infrastructure/perception/llm_pipeline.py b/backend/app/infrastructure/perception/llm_pipeline.py new file mode 100644 index 0000000..37cdce5 --- /dev/null +++ b/backend/app/infrastructure/perception/llm_pipeline.py @@ -0,0 +1,241 @@ +"""LLM-driven pipeline for regulatory event enrichment.""" + +from __future__ import annotations + +import json +import math +from typing import Any + +from loguru import logger + +from app.config.settings import settings +from app.infrastructure.embedding.openai_compatible_embedding_provider import ( + OpenAICompatibleEmbeddingProvider, +) +from app.services.llm.llm_factory import get_llm_client + +_EXTRACT_SYSTEM = ( + "You are a regulatory compliance expert specialising in automotive standards " + "(GB, UN-ECE, ISO, EU). Extract structured information from regulation text. " + "Return valid JSON only — no markdown fences, no extra keys." +) + +_ASSESS_SYSTEM = ( + "You are an automotive compliance analyst. Given a regulation and related document excerpts, " + "identify which documents are affected and what actions are required. " + "Return a JSON array only." +) + +_DIFF_SYSTEM = ( + "You are a regulatory change analyst. Given an old and new version of a regulation paragraph, " + "classify the type of change and summarise it. 
" + "Return JSON only: {\"change_type\": \"tightened|relaxed|added|removed\", \"summary\": \"...\"}" +) + +_SIMILARITY_THRESHOLD = 0.85 + + +def _cosine(a: list[float], b: list[float]) -> float: + dot = sum(x * y for x, y in zip(a, b)) + norm_a = math.sqrt(sum(x * x for x in a)) + norm_b = math.sqrt(sum(x * x for x in b)) + if norm_a == 0 or norm_b == 0: + return 0.0 + return dot / (norm_a * norm_b) + + +def _llm_json(client: Any, messages: list[dict]) -> Any: + """Call LLM and parse JSON response; return None on failure.""" + try: + resp = client.chat(messages) + text = (resp.content or "").strip() + if text.startswith("```"): + text = text.split("```")[1] + if text.startswith("json"): + text = text[4:] + return json.loads(text) + except Exception as exc: + logger.warning("LLM JSON parse failed: {}", exc) + return None + + +class LlmPipeline: + """Three-step enrichment pipeline for crawled regulatory events.""" + + def __init__(self) -> None: + self._client = get_llm_client( + provider=settings.llm_provider, + model=settings.llm_model, + ) + self._embedder = OpenAICompatibleEmbeddingProvider() + + # ------------------------------------------------------------------ + # Step 1: Structure extraction + # ------------------------------------------------------------------ + + def extract_structure(self, event: dict) -> dict: + """Extract obligations, deadlines, scope, penalties, impact_level from event text.""" + prompt = f"""Extract structured compliance information from this regulation: + +Standard: {event.get('standard_code', '')} +Title: {event.get('title', '')} +Source: {event.get('source_label', '')} +Summary: {event.get('summary', '')} +Tags: {', '.join(event.get('tags') or [])} + +Return JSON with exactly these keys: +{{ + "obligations": [{{"text": "...", "deontic": "must|shall|may|prohibited", "subject": "...", "object": "...", "condition": ""}}], + "deadlines": [{{"date": "YYYY-MM-DD or null", "description": "..."}}], + "scope": "one sentence describing 
who/what this applies to", + "penalties": "one sentence on consequences of non-compliance, or null", + "impact_level": "high|medium|low" +}}""" + + messages = [ + {"role": "system", "content": _EXTRACT_SYSTEM}, + {"role": "user", "content": prompt}, + ] + result = _llm_json(self._client, messages) + if not isinstance(result, dict): + return { + "obligations": [], + "deadlines": [], + "scope": "", + "penalties": "", + "impact_level": "medium", + } + return result + + # ------------------------------------------------------------------ + # Step 2: Impact assessment + # ------------------------------------------------------------------ + + def assess_impact(self, event: dict, retrieval_service: Any) -> list[dict]: + """Use RAG to find affected documents and generate recommendations.""" + obligations = event.get("obligations") or [] + obligation_texts = " ".join(o.get("text", "") for o in obligations[:3]) + query = f"{event.get('standard_code', '')} {event.get('title', '')} {obligation_texts}" + + try: + chunks = retrieval_service.retrieve(query=query, top_k=5) + except Exception as exc: + logger.warning("RAG retrieval failed: {}", exc) + return [] + + if not chunks: + return [] + + seen: set[str] = set() + doc_excerpts: list[dict] = [] + for chunk in chunks: + if chunk.doc_id not in seen: + seen.add(chunk.doc_id) + doc_excerpts.append({ + "doc_id": chunk.doc_id, + "doc_name": chunk.doc_title, + "score": round(float(chunk.score if chunk.score is not None else 0), 4), + "snippet": (chunk.text or "")[:300], + "clause": getattr(chunk, "section_title", "") or "", + }) + + context = "\n".join( + f"[{d['doc_name']} {d['clause']}] score={d['score']}: {d['snippet']}" + for d in doc_excerpts + ) + prompt = f"""Regulation: {event.get('standard_code')} — {event.get('title')} +Obligations: {obligation_texts or event.get('summary', '')} + +Affected documents found in knowledge base: +{context} + +For each document, assess impact and recommend action. 
Return JSON array:
+[{{"doc_id":"...","doc_name":"...","score":0.0,"key_clauses":"...","recommendation":"one sentence action"}}]"""
+
+        messages = [
+            {"role": "system", "content": _ASSESS_SYSTEM},
+            {"role": "user", "content": prompt},
+        ]
+        result = _llm_json(self._client, messages)
+        if isinstance(result, list):
+            score_map = {d["doc_id"]: d["score"] for d in doc_excerpts}
+            for item in result:
+                if isinstance(item, dict) and item.get("doc_id") in score_map:
+                    item["score"] = score_map[item["doc_id"]]
+            return result
+        return doc_excerpts
+
+    # ------------------------------------------------------------------
+    # Step 3: Semantic diff
+    # ------------------------------------------------------------------
+
+    def compute_diff(self, old_text: str, new_text: str) -> dict:
+        """Compare old/new text paragraph-by-position; return {"changed_sections", "change_summary"}."""
+        old_paras = [p.strip() for p in old_text.split("\n") if p.strip()]
+        new_paras = [p.strip() for p in new_text.split("\n") if p.strip()]
+
+        if not old_paras or not new_paras:
+            return {"changed_sections": [], "change_summary": "No comparable text."}
+
+        all_paras = old_paras + new_paras
+        try:
+            all_embeddings = self._embedder.embed_texts(all_paras)
+        except Exception as exc:
+            logger.warning("Embedding for diff failed: {}", exc)
+            return {"changed_sections": [], "change_summary": "Diff unavailable (embedding error)."}
+
+        old_embeddings = all_embeddings[: len(old_paras)]
+        new_embeddings = all_embeddings[len(old_paras):]
+
+        changed_sections: list[dict] = []
+        max_len = max(len(old_paras), len(new_paras))
+
+        for i in range(max_len):
+            if i >= len(old_paras):
+                # New paragraph added
+                changed_sections.append({
+                    "old_text": "",
+                    "new_text": new_paras[i][:300],
+                    "similarity": 0.0,
+                    "change_type": "added",
+                    "summary": "New paragraph added.",
+                })
+                continue
+            if i >= len(new_paras):
+                # Old paragraph removed
+                changed_sections.append({
+                    "old_text": old_paras[i][:300],
+                    "new_text": "",
+                    "similarity": 0.0,
+                    "change_type": "removed",
+                    "summary": "Paragraph removed.",
+                })
+                continue
+            # Both exist — compare via embeddings
+            sim = _cosine(old_embeddings[i], new_embeddings[i])
+            if sim < _SIMILARITY_THRESHOLD:
+                messages = [
+                    {"role": "system", "content": _DIFF_SYSTEM},
+                    {"role": "user", "content": f"OLD: {old_paras[i][:500]}\nNEW: {new_paras[i][:500]}"},
+                ]
+                classification = _llm_json(self._client, messages) or {}
+                changed_sections.append({
+                    "old_text": old_paras[i][:300],
+                    "new_text": new_paras[i][:300],
+                    "similarity": round(sim, 3),
+                    "change_type": classification.get("change_type", "modified"),
+                    "summary": classification.get("summary", ""),
+                })
+
+        if not changed_sections:
+            change_summary = "No substantive changes detected between versions."
+        else:
+            types = [s["change_type"] for s in changed_sections]
+            change_summary = (
+                f"{len(changed_sections)} paragraph(s) changed: "
+                + ", ".join(str(t) for t in dict.fromkeys(types))  # first-seen order; set() order varies per run
+                + ". "
+                + str(changed_sections[0].get("summary") or "")  # LLM may return null for summary
+            )
+
+        return {"changed_sections": changed_sections, "change_summary": change_summary}
diff --git a/backend/app/infrastructure/perception/mock_event_store.py b/backend/app/infrastructure/perception/mock_event_store.py
index a927cee..71a8e60 100644
--- a/backend/app/infrastructure/perception/mock_event_store.py
+++ b/backend/app/infrastructure/perception/mock_event_store.py
@@ -4,6 +4,8 @@ from __future__ import annotations
 
 from typing import Any
 
+from app.infrastructure.perception.base_event_store import BaseEventStore
+
 MOCK_EVENTS: list[dict[str, Any]] = [
     # ------------------------------------------------------------------ HIGH
     {
@@ -379,18 +381,18 @@ MOCK_EVENTS: list[dict[str, Any]] = [
     },
 ]
 
-# Index for fast lookup
-_EVENT_INDEX: dict[str, dict] = {e["id"]: e for e in MOCK_EVENTS}
-
-
-class MockEventStore:
+class MockEventStore(BaseEventStore):
     """In-memory mock store for regulatory events."""
 
+    def __init__(self) -> None:
+        self._events: list[dict] = 
[dict(e) for e in MOCK_EVENTS]
+        self._index: dict[str, dict] = {e["id"]: e for e in self._events}
+
     def all(self) -> list[dict]:
-        return list(MOCK_EVENTS)
+        return list(self._events)
 
     def get(self, event_id: str) -> dict | None:
-        return _EVENT_INDEX.get(event_id)
+        return self._index.get(event_id)
 
     def filter(
         self,
@@ -399,23 +401,42 @@ class MockEventStore:
         impact_level: str | None = None,
         limit: int = 50,
     ) -> list[dict]:
-        events = list(MOCK_EVENTS)
+        events = list(self._events)
         if source:
             events = [e for e in events if e["source"] == source]
         if impact_level:
             events = [e for e in events if e["impact_level"] == impact_level]
-        events.sort(key=lambda e: e["published_at"], reverse=True)
+        events.sort(key=lambda e: e.get("published_at") or "", reverse=True)
         return events[:limit]
 
     def stats(self) -> dict:
         from datetime import date, timedelta
 
-        events = MOCK_EVENTS
+        events = self._events
         cutoff = (date.today() - timedelta(days=90)).isoformat()
         return {
             "total": len(events),
             "high_impact": sum(1 for e in events if e["impact_level"] == "high"),
             "medium_impact": sum(1 for e in events if e["impact_level"] == "medium"),
             "low_impact": sum(1 for e in events if e["impact_level"] == "low"),
-            "recent_90d": sum(1 for e in events if e["published_at"] >= cutoff),
+            "recent_90d": sum(1 for e in events if (e.get("published_at") or "") >= cutoff),
         }
+
+    def upsert(self, event: dict) -> None:
+        """Insert or update event in the in-memory list (used in tests)."""
+        existing = self._index.get(event["id"])
+        if existing is not None:
+            existing.update(event)
+        else:
+            # Store a copy (as __init__ does) so later caller-side mutation cannot leak into the store.
+            stored = dict(event)
+            self._events.append(stored)
+            self._index[stored["id"]] = stored
+
+    def get_by_standard_code(self, standard_code: str) -> dict | None:
+        """Return most-recent event with matching standard_code."""
+        matches = [e for e in self._events if e.get("standard_code") == standard_code]
+        if not matches:
+            return None
+        # published_at may be present but None; coerce as filter() does so max() never compares None with str.
+        return max(matches, key=lambda e: e.get("published_at") or "")
diff --git 
a/backend/app/infrastructure/perception/postgres_event_store.py b/backend/app/infrastructure/perception/postgres_event_store.py
new file mode 100644
index 0000000..4782ae0
--- /dev/null
+++ b/backend/app/infrastructure/perception/postgres_event_store.py
@@ -0,0 +1,234 @@
+"""PostgreSQL-backed regulatory event store."""
+
+from __future__ import annotations
+
+import json
+from contextlib import contextmanager
+from datetime import UTC, date, datetime, timedelta
+from typing import Any
+
+import psycopg2
+import psycopg2.extras
+from psycopg2.pool import ThreadedConnectionPool
+
+from app.config.settings import settings
+from app.infrastructure.perception.base_event_store import BaseEventStore
+
+_CREATE_TABLE = """
+CREATE TABLE IF NOT EXISTS regulation_events (
+    id TEXT PRIMARY KEY,
+    source TEXT NOT NULL,
+    source_label TEXT,
+    standard_code TEXT NOT NULL,
+    title TEXT NOT NULL,
+    summary TEXT,
+    full_text_url TEXT,
+    status TEXT,
+    impact_level TEXT,
+    published_at DATE,
+    effective_at DATE,
+    category TEXT,
+    tags TEXT[],
+    obligations JSONB,
+    deadlines JSONB,
+    scope TEXT,
+    penalties TEXT,
+    content_hash TEXT,
+    previous_hash TEXT,
+    change_summary TEXT,
+    changed_sections JSONB,
+    affected_docs JSONB,
+    crawled_at TIMESTAMPTZ DEFAULT now(),
+    processed_at TIMESTAMPTZ,
+    raw_storage_key TEXT
+);
+CREATE INDEX IF NOT EXISTS reg_events_source_date
+    ON regulation_events (source, published_at DESC);
+CREATE INDEX IF NOT EXISTS reg_events_impact_date
+    ON regulation_events (impact_level, published_at DESC);
+"""
+
+_ALL_COLUMNS = (
+    "id", "source", "source_label", "standard_code", "title", "summary",
+    "full_text_url", "status", "impact_level", "published_at", "effective_at",
+    "category", "tags", "obligations", "deadlines", "scope", "penalties",
+    "content_hash", "previous_hash", "change_summary", "changed_sections",
+    "affected_docs", "crawled_at", "processed_at", "raw_storage_key",
+)
+
+
+def _row_to_dict(row: dict[str, Any]) -> dict:
+    """Convert a psycopg2 RealDictRow to a plain dict with JSON fields decoded and dates as ISO strings."""
+    d = dict(row)
+    for field in ("obligations", "deadlines", "changed_sections", "affected_docs"):
+        val = d.get(field)
+        if isinstance(val, str):
+            d[field] = json.loads(val)
+    for date_field in ("published_at", "effective_at"):
+        val = d.get(date_field)
+        if isinstance(val, datetime):
+            d[date_field] = val.date().isoformat()
+        elif isinstance(val, date):
+            d[date_field] = val.isoformat()
+    for ts_field in ("crawled_at", "processed_at"):
+        val = d.get(ts_field)
+        if isinstance(val, datetime):
+            d[ts_field] = val.isoformat()
+    return d
+
+
+class PostgresEventStore(BaseEventStore):
+    """Regulatory event store backed by PostgreSQL."""
+
+    def __init__(self) -> None:
+        self._pool = ThreadedConnectionPool(
+            minconn=1,
+            maxconn=5,
+            host=settings.postgres_host,
+            port=settings.postgres_port,
+            user=settings.postgres_user,
+            password=settings.postgres_password,
+            dbname=settings.postgres_db,
+        )
+        self._ensure_schema()
+
+    def _ensure_schema(self) -> None:
+        with self._conn() as conn:
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(_CREATE_TABLE)
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
+    @contextmanager
+    def _conn(self):
+        conn = None
+        try:
+            conn = self._pool.getconn()
+            yield conn
+        finally:
+            if conn is not None:
+                self._pool.putconn(conn)
+
+    def all(self) -> list[dict]:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    "SELECT * FROM regulation_events ORDER BY published_at DESC NULLS LAST"
+                )
+                return [_row_to_dict(r) for r in cur.fetchall()]
+
+    def get(self, event_id: str) -> dict | None:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    "SELECT * FROM regulation_events WHERE id = %s", (event_id,)
+                )
+                row = cur.fetchone()
+                return _row_to_dict(row) if row else None
+
+    def filter(
+        self,
+        *,
+        source: str | None = None,
+        impact_level: str | None = None,
+        limit: int = 50,
+    ) -> list[dict]:
+        conditions: list[str] = []
+        params: list[Any] = []
+        if source:
+            conditions.append("source = %s")
+            params.append(source)
+        if impact_level:
+            conditions.append("impact_level = %s")
+            params.append(impact_level)
+        where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
+        params.append(limit)
+        sql = f"""
+            SELECT * FROM regulation_events
+            {where}
+            ORDER BY published_at DESC NULLS LAST
+            LIMIT %s
+        """
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(sql, params)
+                return [_row_to_dict(r) for r in cur.fetchall()]
+
+    def stats(self) -> dict:
+        """Return total, per-impact-level and last-90-day counts (same keys as MockEventStore.stats)."""
+        cutoff = (date.today() - timedelta(days=90)).isoformat()
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute("SELECT COUNT(*) AS count FROM regulation_events")
+                total = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'high'"
+                )
+                high = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'medium'"
+                )
+                medium = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'low'"
+                )
+                low = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE published_at >= %s",
+                    (cutoff,),
+                )
+                recent = (cur.fetchone() or {}).get("count", 0)
+        return {
+            "total": int(total),
+            "high_impact": int(high),
+            "medium_impact": int(medium),
+            "low_impact": int(low),
+            "recent_90d": int(recent),
+        }
+
+    def upsert(self, event: dict) -> None:
+        """Insert or update a regulation event."""
+        cols = [c for c in _ALL_COLUMNS if c in event]
+        placeholders = ", ".join(f"%({c})s" for c in cols)
+        updates = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols if c != "id")
+        # An id-only payload has nothing to update, and "DO UPDATE SET" with an empty list is invalid SQL.
+        on_conflict = f"DO UPDATE SET {updates}" if updates else "DO NOTHING"
+        sql = f"""
+            INSERT INTO regulation_events ({', '.join(cols)})
+            VALUES ({placeholders})
+            ON CONFLICT (id) {on_conflict}
+        """
+        row: dict[str, Any] = {}
+        for c in cols:
+            val = event.get(c)
+            if c in ("obligations", "deadlines", "changed_sections", "affected_docs") and val is not None:
+                row[c] = json.dumps(val, ensure_ascii=False)
+            elif c in ("published_at", "effective_at") and val == "":
+                # DATE columns reject '', so a missing date is stored as NULL.
+                row[c] = None
+            else:
+                row[c] = val
+        with self._conn() as conn:
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(sql, row)
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
+    def get_by_standard_code(self, standard_code: str) -> dict | None:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    """SELECT * FROM regulation_events
+                       WHERE standard_code = %s
+                       ORDER BY published_at DESC NULLS LAST
+                       LIMIT 1""",
+                    (standard_code,),
+                )
+                row = cur.fetchone()
+                return _row_to_dict(row) if row else None
diff --git a/backend/app/shared/bootstrap.py b/backend/app/shared/bootstrap.py
index 7821924..1f2d981 100644
--- a/backend/app/shared/bootstrap.py
+++ b/backend/app/shared/bootstrap.py
@@ -19,6 +19,15 @@ from app.infrastructure.parser.local_chunk_builder import LocalRegulationChunkBu
 from app.infrastructure.parser.local_document_parser import LocalDocumentParser
 from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder
 from app.infrastructure.perception.mock_event_store import MockEventStore
+from app.application.perception.crawl_service import CrawlService
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
+from app.infrastructure.perception.crawlers.guobiao_crawler import (
+    GuobiaoMandatoryCrawler,
+    GuobiaoRecommendedCrawler,
+)
+from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
+from app.infrastructure.perception.llm_pipeline import LlmPipeline
 from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore
 from app.infrastructure.storage.json_document_processing_store import 
JsonDocumentProcessingStore from app.infrastructure.storage.json_document_repository import JsonDocumentRepository @@ -293,11 +302,35 @@ def get_agent_conversation_service() -> AgentConversationService: ) +@lru_cache +def get_event_store() -> BaseEventStore: + """Return event store selected by DOCUMENT_REPOSITORY_BACKEND setting.""" + if settings.document_repository_backend == "postgres": + from app.infrastructure.perception.postgres_event_store import PostgresEventStore + return PostgresEventStore() + return MockEventStore() + + @lru_cache def get_perception_service() -> PerceptionService: - """Return perception service for regulatory intelligence.""" return PerceptionService( - event_store=MockEventStore(), + event_store=get_event_store(), + retrieval_service=get_retrieval_service(), + ) + + +@lru_cache +def get_crawl_service() -> CrawlService: + crawlers = { + "CATARC": CatarcCrawler(), + "国标委·强制性": GuobiaoMandatoryCrawler(), + "国标委·推荐性": GuobiaoRecommendedCrawler(), + "EUR-Lex": EurlexCrawler(), + } + return CrawlService( + crawlers=crawlers, + event_store=get_event_store(), + llm_pipeline=LlmPipeline(), retrieval_service=get_retrieval_service(), ) diff --git a/backend/requirements.txt b/backend/requirements.txt index b75a8f0..5150ad0 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -9,6 +9,8 @@ pydantic-settings>=2.0.0 python-dotenv>=1.0.0 loguru>=0.7.0 httpx>=0.25.0 +beautifulsoup4>=4.12.0 +lxml>=5.0.0 tiktoken>=0.5.0 tenacity>=8.2.0 diff --git a/backend/tests/perception/__init__.py b/backend/tests/perception/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tests/perception/test_base_event_store.py b/backend/tests/perception/test_base_event_store.py new file mode 100644 index 0000000..ebc4e1d --- /dev/null +++ b/backend/tests/perception/test_base_event_store.py @@ -0,0 +1,95 @@ +"""Contract tests: any BaseEventStore implementation must pass these.""" +from app.infrastructure.perception.base_event_store import 
BaseEventStore +from app.infrastructure.perception.mock_event_store import MockEventStore + + +def _store() -> BaseEventStore: + return MockEventStore() + + +def test_is_base_event_store(): + assert isinstance(_store(), BaseEventStore) + + +def test_all_returns_list(): + result = _store().all() + assert isinstance(result, list) + assert len(result) > 0 + + +def test_get_known_id(): + store = _store() + first = store.all()[0] + result = store.get(first["id"]) + assert result is not None + assert result["id"] == first["id"] + + +def test_get_unknown_returns_none(): + assert _store().get("does-not-exist") is None + + +def test_filter_by_impact(): + store = _store() + highs = store.filter(impact_level="high", limit=100) + assert all(e["impact_level"] == "high" for e in highs) + + +def test_filter_limit(): + store = _store() + result = store.filter(limit=3) + assert len(result) <= 3 + + +def test_stats_keys(): + stats = _store().stats() + for key in ("total", "high_impact", "medium_impact", "recent_90d"): + assert key in stats, f"missing key: {key}" + + +def test_upsert_and_get(): + store = _store() + event = { + "id": "test-upsert-001", + "source": "TEST", + "source_label": "Test Source", + "standard_code": "TST-001", + "title": "Test Event", + "summary": "A test event", + "full_text_url": "https://example.com", + "status": "draft", + "impact_level": "low", + "published_at": "2026-01-01", + "effective_at": None, + "category": "test", + "tags": ["test"], + "content_hash": "abc123", + "previous_hash": None, + } + store.upsert(event) + result = store.get("test-upsert-001") + assert result is not None + assert result["title"] == "Test Event" + + +def test_get_by_standard_code(): + store = _store() + first = store.all()[0] + result = store.get_by_standard_code(first["standard_code"]) + assert result is not None + assert result["standard_code"] == first["standard_code"] + + +def test_upsert_updates_existing(): + store = _store() + first = store.all()[0] + original_id = 
first["id"] + store.upsert({"id": original_id, "title": "Updated Title", "impact_level": first["impact_level"], + "standard_code": first.get("standard_code", ""), "source": first["source"], + "source_label": first.get("source_label", ""), "summary": "Updated", + "full_text_url": "", "status": first["status"], "published_at": first.get("published_at", ""), + "effective_at": None, "category": first.get("category", ""), "tags": [], + "content_hash": "newhash", "previous_hash": None}) + result = store.get(original_id) + assert result is not None + assert result["title"] == "Updated Title" diff --git a/backend/tests/perception/test_crawl_service.py b/backend/tests/perception/test_crawl_service.py new file mode 100644 index 0000000..50dec69 --- /dev/null +++ b/backend/tests/perception/test_crawl_service.py @@ -0,0 +1,111 @@ +"""Integration tests for CrawlService.""" +from __future__ import annotations +from unittest.mock import MagicMock +import hashlib +import pytest + +from app.infrastructure.perception.crawlers.base import RawEvent +from app.infrastructure.perception.mock_event_store import MockEventStore + + +def _make_raw_event(code="TST-001"): + return RawEvent( + source="TEST", source_label="Test", standard_code=code, + title=f"Test {code}", summary="Summary", full_text_url="https://example.com", + status="enacted", published_at="2026-01-01", effective_at=None, + category="test", tags=["test"], raw_text="full text", + ) + + +def _make_service(raw_events): + from app.application.perception.crawl_service import CrawlService + + mock_crawler = MagicMock() + mock_crawler.fetch.return_value = raw_events + + mock_pipeline = MagicMock() + mock_pipeline.extract_structure.return_value = { + "obligations": [], "deadlines": [], "scope": "test", + "penalties": None, "impact_level": "low", + } + mock_pipeline.assess_impact.return_value = [] + mock_pipeline.compute_diff.return_value = { + "changed_sections": [], "change_summary": "No changes.", + } + + mock_retrieval = 
MagicMock() + store = MockEventStore() + + return CrawlService( + crawlers={"TEST": mock_crawler}, + event_store=store, + llm_pipeline=mock_pipeline, + retrieval_service=mock_retrieval, + ) + + +def test_crawl_yields_progress_and_done(): + svc = _make_service([_make_raw_event("TST-001")]) + events = list(svc.run_crawl()) + event_types = [e.get("event") for e in events] + assert "done" in event_types + + +def test_crawl_upserts_to_store(): + store = MockEventStore() + from app.application.perception.crawl_service import CrawlService + mock_crawler = MagicMock() + mock_crawler.fetch.return_value = [_make_raw_event("NEW-001")] + mock_pipeline = MagicMock() + mock_pipeline.extract_structure.return_value = { + "obligations": [], "deadlines": [], "scope": "", + "penalties": None, "impact_level": "medium", + } + mock_pipeline.assess_impact.return_value = [] + mock_pipeline.compute_diff.return_value = { + "changed_sections": [], "change_summary": "", + } + svc = CrawlService( + crawlers={"TEST": mock_crawler}, + event_store=store, + llm_pipeline=mock_pipeline, + retrieval_service=MagicMock(), + ) + list(svc.run_crawl()) + result = store.get_by_standard_code("NEW-001") + assert result is not None + assert result["title"] == "Test NEW-001" + + +def test_crawl_skips_unchanged_events(): + store = MockEventStore() + raw = _make_raw_event("SKIP-001") + content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest() + store.upsert({ + "id": hashlib.sha256(f"TEST-SKIP-001".encode()).hexdigest()[:12], + "standard_code": "SKIP-001", + "source": "TEST", + "source_label": "Test", + "title": "Test SKIP-001", + "summary": "", + "full_text_url": "", + "status": "enacted", + "impact_level": "low", + "published_at": "2026-01-01", + "effective_at": None, + "category": "test", + "tags": [], + "content_hash": content_hash, + }) + mock_pipeline = MagicMock() + from app.application.perception.crawl_service import CrawlService + mock_crawler = MagicMock() + mock_crawler.fetch.return_value = 
[raw] + svc = CrawlService( + crawlers={"TEST": mock_crawler}, + event_store=store, + llm_pipeline=mock_pipeline, + retrieval_service=MagicMock(), + ) + list(svc.run_crawl()) + mock_pipeline.extract_structure.assert_not_called() diff --git a/backend/tests/perception/test_crawlers.py b/backend/tests/perception/test_crawlers.py new file mode 100644 index 0000000..2aa43b6 --- /dev/null +++ b/backend/tests/perception/test_crawlers.py @@ -0,0 +1,127 @@ +"""Unit tests for crawlers — mock httpx responses.""" +from __future__ import annotations +from unittest.mock import MagicMock, patch +import pytest + +from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler + + +def test_raw_event_fields(): + ev = RawEvent( + source="TEST", + source_label="Test", + standard_code="TST-001", + title="Test", + summary="Summary", + full_text_url="https://example.com", + status="enacted", + published_at="2026-01-01", + effective_at=None, + category="test", + tags=["a"], + raw_text="full text here", + ) + assert ev.source == "TEST" + assert ev.tags == ["a"] + + +CATARC_HTML = """ + + + + + + + + + + + + + + +
GB 18384-2025电动汽车安全要求2025-11-15现行
GB/T 40429-2026汽车驾驶自动化分级2026-02-01即将实施
+ +""" + + +def test_catarc_crawler_parses_html(): + from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = CATARC_HTML + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_resp): + crawler = CatarcCrawler() + events = crawler.fetch(limit=10) + + assert isinstance(events, list) + assert len(events) >= 1 + assert all(isinstance(e, RawEvent) for e in events) + codes = [e.standard_code for e in events] + assert "GB 18384-2025" in codes + + +GUOBIAO_JSON = { + "rows": [ + { + "std_code": "GB 18384-2025", + "std_name": "电动汽车安全要求", + "release_date": "2025-11-15", + "implement_date": "2026-07-01", + "std_status": "现行", + "std_type": "强制性", + }, + ] +} + + +def test_guobiao_crawler_parses_json(): + from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = GUOBIAO_JSON + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_resp): + crawler = GuobiaoMandatoryCrawler() + events = crawler.fetch(limit=10) + + assert len(events) >= 1 + assert events[0].source == "国标委" + assert events[0].standard_code == "GB 18384-2025" + + +EURLEX_RSS = """ + + + EUR-Lex + + Regulation (EU) 2024/1689 — AI Act + https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689 + The EU Artificial Intelligence Act enters into force. 
+ Fri, 12 Jul 2024 00:00:00 GMT + + +""" + + +def test_eurlex_crawler_parses_rss(): + from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = EURLEX_RSS + mock_resp.content = EURLEX_RSS + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_resp): + crawler = EurlexCrawler() + events = crawler.fetch(limit=5) + + assert isinstance(events, list) + assert len(events) >= 1 + assert events[0].source == "EUR-Lex" diff --git a/backend/tests/perception/test_llm_pipeline.py b/backend/tests/perception/test_llm_pipeline.py new file mode 100644 index 0000000..f828f01 --- /dev/null +++ b/backend/tests/perception/test_llm_pipeline.py @@ -0,0 +1,77 @@ +"""Unit tests for LlmPipeline — mock LLM client and embedding provider.""" +from __future__ import annotations +from unittest.mock import MagicMock, patch +import json +import pytest + + +def _make_pipeline(): + with patch("app.infrastructure.perception.llm_pipeline.get_llm_client") as mock_llm_fn, \ + patch("app.infrastructure.perception.llm_pipeline.OpenAICompatibleEmbeddingProvider") as mock_emb_cls: + + mock_client = MagicMock() + mock_client.chat.return_value = MagicMock(content='{"obligations":[{"text":"test obligation","deontic":"must","subject":"OEM","object":"system","condition":""}],"deadlines":[{"date":"2026-07-01","description":"实施截止"}],"scope":"适用于M1类车辆","penalties":"罚款","impact_level":"high"}') + mock_llm_fn.return_value = mock_client + + mock_emb = MagicMock() + mock_emb.embed_texts.return_value = [[0.1] * 1024, [0.9] * 1024] + mock_emb_cls.return_value = mock_emb + + from app.infrastructure.perception.llm_pipeline import LlmPipeline + return LlmPipeline(), mock_client, mock_emb + + +def test_extract_structure_returns_dict(): + pipeline, mock_client, _ = _make_pipeline() + event = { + "id": "evt-001", + "standard_code": "GB 18384-2025", + "title": "电动汽车安全要求", + "summary": "新增 IP67 
级别防护", + "source_label": "CATARC", + "tags": ["电池安全"], + } + result = pipeline.extract_structure(event) + assert isinstance(result, dict) + assert "obligations" in result + assert "impact_level" in result + + +def test_assess_impact_returns_list(): + pipeline, mock_client, _ = _make_pipeline() + mock_client.chat.return_value = MagicMock(content='[{"doc_id":"d1","doc_name":"Safety Manual","score":0.85,"key_clauses":"§4.2","recommendation":"更新第4章"}]') + mock_retrieval = MagicMock() + chunk = MagicMock() + chunk.doc_id = "d1" + chunk.doc_title = "Safety Manual" + chunk.score = 0.85 + chunk.text = "relevant text" + chunk.section_title = "§4.2" + mock_retrieval.retrieve.return_value = [chunk] + event = { + "standard_code": "GB 18384-2025", + "title": "电动汽车安全要求", + "obligations": [{"text": "OEM shall comply"}], + } + result = pipeline.assess_impact(event, mock_retrieval) + assert isinstance(result, list) + + +def test_compute_diff_no_change(): + pipeline, _, mock_emb = _make_pipeline() + mock_emb.embed_texts.return_value = [[0.5] * 1024, [0.5] * 1024] + result = pipeline.compute_diff("paragraph one", "paragraph one") + assert isinstance(result, dict) + assert "changed_sections" in result + assert "change_summary" in result + + +def test_compute_diff_detects_change(): + pipeline, mock_client, mock_emb = _make_pipeline() + mock_emb.embed_texts.return_value = [ + [1.0] + [0.0] * 1023, + [0.0] + [1.0] + [0.0] * 1022, + ] + mock_client.chat.return_value = MagicMock(content='{"change_type":"tightened","summary":"Requirement tightened"}') + result = pipeline.compute_diff("old paragraph text", "new tighter requirement text") + assert isinstance(result["changed_sections"], list) diff --git a/backend/tests/perception/test_postgres_event_store.py b/backend/tests/perception/test_postgres_event_store.py new file mode 100644 index 0000000..95957bf --- /dev/null +++ b/backend/tests/perception/test_postgres_event_store.py @@ -0,0 +1,98 @@ +"""Unit tests for PostgresEventStore using a 
mocked psycopg2 pool.""" +from __future__ import annotations +import json +from unittest.mock import MagicMock, patch +import pytest + +# Patch psycopg2 before importing the module under test +import sys +mock_psycopg2 = MagicMock() +mock_psycopg2.extras = MagicMock() +sys.modules.setdefault("psycopg2", mock_psycopg2) +sys.modules.setdefault("psycopg2.extras", mock_psycopg2.extras) +sys.modules.setdefault("psycopg2.pool", MagicMock()) + +from app.infrastructure.perception.base_event_store import BaseEventStore + + +SAMPLE_ROW = { + "id": "pg-001", + "source": "国标委", + "source_label": "国家标准化管理委员会", + "standard_code": "GB 18384-2025", + "title": "电动汽车安全要求", + "summary": "新增要求", + "full_text_url": "https://openstd.samr.gov.cn", + "status": "enacted", + "impact_level": "high", + "published_at": "2025-11-15", + "effective_at": "2026-07-01", + "category": "电动汽车安全", + "tags": ["电池安全"], + "obligations": None, + "deadlines": None, + "scope": None, + "penalties": None, + "content_hash": "abc123", + "previous_hash": None, + "change_summary": None, + "changed_sections": None, + "affected_docs": None, + "crawled_at": "2026-06-05T10:00:00+00:00", + "processed_at": None, + "raw_storage_key": None, +} + + +def _make_store_with_pool(mock_pool): + with patch("psycopg2.pool.ThreadedConnectionPool", return_value=mock_pool): + with patch( + "app.infrastructure.perception.postgres_event_store.PostgresEventStore._ensure_schema" + ): + from app.infrastructure.perception.postgres_event_store import PostgresEventStore + return PostgresEventStore() + + +def _cursor_returning(rows): + cursor = MagicMock() + cursor.__enter__ = lambda s: s + cursor.__exit__ = MagicMock(return_value=False) + cursor.fetchall.return_value = rows + cursor.fetchone.return_value = rows[0] if rows else None + return cursor + + +def test_is_base_event_store(): + mock_pool = MagicMock() + store = _make_store_with_pool(mock_pool) + assert isinstance(store, BaseEventStore) + + +def test_filter_returns_list(): + mock_pool 
= MagicMock() + conn = MagicMock() + conn.__enter__ = lambda s: s + conn.__exit__ = MagicMock(return_value=False) + cursor = _cursor_returning([SAMPLE_ROW]) + conn.cursor.return_value = cursor + mock_pool.getconn.return_value = conn + store = _make_store_with_pool(mock_pool) + result = store.filter(limit=10) + assert isinstance(result, list) + + +def test_stats_returns_correct_keys(): + mock_pool = MagicMock() + conn = MagicMock() + conn.__enter__ = lambda s: s + conn.__exit__ = MagicMock(return_value=False) + cursor = MagicMock() + cursor.__enter__ = lambda s: s + cursor.__exit__ = MagicMock(return_value=False) + cursor.fetchone.return_value = {"count": 5} + conn.cursor.return_value = cursor + mock_pool.getconn.return_value = conn + store = _make_store_with_pool(mock_pool) + stats = store.stats() + for key in ("total", "high_impact", "medium_impact", "recent_90d"): + assert key in stats diff --git a/docs/superpowers/plans/2026-06-05-perception-intelligence.md b/docs/superpowers/plans/2026-06-05-perception-intelligence.md new file mode 100644 index 0000000..319404b --- /dev/null +++ b/docs/superpowers/plans/2026-06-05-perception-intelligence.md @@ -0,0 +1,2500 @@ +# Regulatory Signals Intelligence Enhancement — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace MockEventStore with real regulatory data from CATARC / 国标委 / EUR-Lex / UN-ECE, add LLM-driven structure extraction + impact assessment + semantic diff, and expose all of this through a manual-trigger crawl UI. 
+ +**Architecture:** New `BaseEventStore` ABC → `PostgresEventStore` implementation (psycopg2, same pattern as `PostgresDocumentRepository`) → `CrawlService` orchestrates 4 crawlers + `LlmPipeline` → 3 new API endpoints (SSE crawl progress, single-event process, diff detail) → `bootstrap.py` selects store by `DOCUMENT_REPOSITORY_BACKEND` → frontend adds crawl bar + detail tabs. + +**Tech Stack:** httpx (already in requirements), BeautifulSoup4 + lxml (new), psycopg2-binary (already present), existing LLM factory (`app.services.llm.llm_factory`), existing `OpenAICompatibleEmbeddingProvider` for semantic diff, FastAPI SSE (existing pattern from `perception.py` + `async_utils.iter_in_thread`). + +--- + +## File Map + +| Action | Path | Purpose | +|--------|------|---------| +| Create | `backend/app/infrastructure/perception/base_event_store.py` | ABC with `all/get/filter/stats/upsert/get_by_standard_code` | +| Modify | `backend/app/infrastructure/perception/mock_event_store.py` | Inherit `BaseEventStore` | +| Create | `backend/app/infrastructure/perception/postgres_event_store.py` | PostgreSQL-backed store | +| Create | `backend/app/infrastructure/perception/crawlers/__init__.py` | Package init | +| Create | `backend/app/infrastructure/perception/crawlers/base.py` | `RawEvent` dataclass + `BaseCrawler` ABC | +| Create | `backend/app/infrastructure/perception/crawlers/catarc_crawler.py` | CATARC scraper | +| Create | `backend/app/infrastructure/perception/crawlers/guobiao_crawler.py` | 国标委 JSON API crawler | +| Create | `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py` | EUR-Lex RSS + CELLAR | +| Create | `backend/app/infrastructure/perception/llm_pipeline.py` | Extract / assess / diff | +| Create | `backend/app/application/perception/crawl_service.py` | Orchestrates crawlers + pipeline | +| Modify | `backend/app/application/perception/services.py` | Type hint: `BaseEventStore` instead of `MockEventStore` | +| Modify | 
`backend/app/api/routes/perception.py` | Add 3 new endpoints | +| Modify | `backend/app/shared/bootstrap.py` | Wire new classes; add `get_crawl_service()` | +| Modify | `backend/app/config/settings.py` | 3 new perception settings | +| Modify | `backend/.env` + `.env.example` | New env vars | +| Modify | `backend/requirements.txt` | Add beautifulsoup4, lxml | +| Modify | `frontend/src/pages/Perception/PerceptionPage.tsx` | Crawl bar + detail tabs | +| Create | `backend/tests/perception/__init__.py` | Test package | +| Create | `backend/tests/perception/test_base_event_store.py` | BaseEventStore contract tests | +| Create | `backend/tests/perception/test_postgres_event_store.py` | PostgresEventStore unit tests (mock psycopg2) | +| Create | `backend/tests/perception/test_crawlers.py` | Crawler unit tests (mock httpx) | +| Create | `backend/tests/perception/test_llm_pipeline.py` | Pipeline unit tests (mock LLM + embed) | +| Create | `backend/tests/perception/test_crawl_service.py` | CrawlService integration tests | + +--- + +## Task 1: BaseEventStore ABC + MockEventStore implements it + +**Files:** +- Create: `backend/app/infrastructure/perception/base_event_store.py` +- Modify: `backend/app/infrastructure/perception/mock_event_store.py` +- Create: `backend/tests/perception/__init__.py` +- Create: `backend/tests/perception/test_base_event_store.py` + +- [ ] **Step 1: Write the failing test** + +```python +# backend/tests/perception/__init__.py +# (empty) +``` + +```python +# backend/tests/perception/test_base_event_store.py +"""Contract tests: any BaseEventStore implementation must pass these.""" +from app.infrastructure.perception.base_event_store import BaseEventStore +from app.infrastructure.perception.mock_event_store import MockEventStore + + +def _store() -> BaseEventStore: + return MockEventStore() + + +def test_is_base_event_store(): + assert isinstance(_store(), BaseEventStore) + + +def test_all_returns_list(): + result = _store().all() + assert 
isinstance(result, list) + assert len(result) > 0 + + +def test_get_known_id(): + store = _store() + first = store.all()[0] + result = store.get(first["id"]) + assert result is not None + assert result["id"] == first["id"] + + +def test_get_unknown_returns_none(): + assert _store().get("does-not-exist") is None + + +def test_filter_by_impact(): + store = _store() + highs = store.filter(impact_level="high", limit=100) + assert all(e["impact_level"] == "high" for e in highs) + + +def test_filter_limit(): + store = _store() + result = store.filter(limit=3) + assert len(result) <= 3 + + +def test_stats_keys(): + stats = _store().stats() + for key in ("total", "high_impact", "medium_impact", "recent_90d"): + assert key in stats, f"missing key: {key}" + + +def test_upsert_and_get(): + store = _store() + event = { + "id": "test-upsert-001", + "source": "TEST", + "source_label": "Test Source", + "standard_code": "TST-001", + "title": "Test Event", + "summary": "A test event", + "full_text_url": "https://example.com", + "status": "draft", + "impact_level": "low", + "published_at": "2026-01-01", + "effective_at": None, + "category": "test", + "tags": ["test"], + "content_hash": "abc123", + "previous_hash": None, + } + store.upsert(event) + result = store.get("test-upsert-001") + assert result is not None + assert result["title"] == "Test Event" + + +def test_get_by_standard_code(): + store = _store() + first = store.all()[0] + result = store.get_by_standard_code(first["standard_code"]) + assert result is not None + assert result["standard_code"] == first["standard_code"] +``` + +- [ ] **Step 2: Run test to verify it fails** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_base_event_store.py -v +``` +Expected: ImportError on `base_event_store` + +- [ ] **Step 3: Create BaseEventStore ABC** + +```python +# backend/app/infrastructure/perception/base_event_store.py +"""Abstract base class for regulatory event stores.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod + + +class BaseEventStore(ABC): + """Port interface for regulatory event persistence.""" + + @abstractmethod + def all(self) -> list[dict]: + """Return all events, most-recent first.""" + + @abstractmethod + def get(self, event_id: str) -> dict | None: + """Return a single event by ID, or None.""" + + @abstractmethod + def filter( + self, + *, + source: str | None = None, + impact_level: str | None = None, + limit: int = 50, + ) -> list[dict]: + """Return filtered events sorted by published_at descending.""" + + @abstractmethod + def stats(self) -> dict: + """Return {total, high_impact, medium_impact, recent_90d}.""" + + @abstractmethod + def upsert(self, event: dict) -> None: + """Insert or update an event record.""" + + @abstractmethod + def get_by_standard_code(self, standard_code: str) -> dict | None: + """Return the most-recent event with matching standard_code, or None.""" +``` + +- [ ] **Step 4: Patch MockEventStore to inherit BaseEventStore and add new methods** + +Open `backend/app/infrastructure/perception/mock_event_store.py`. 
+ +Add at the top (after existing imports): +```python +from app.infrastructure.perception.base_event_store import BaseEventStore +``` + +Change class definition from: +```python +class MockEventStore: +``` +to: +```python +class MockEventStore(BaseEventStore): +``` + +Add these two methods at the end of `MockEventStore`, after `stats()`: +```python + def upsert(self, event: dict) -> None: + """Insert or update event in the in-memory list (used in tests).""" + existing = _EVENT_INDEX.get(event["id"]) + if existing: + existing.update(event) + else: + MOCK_EVENTS.append(event) + _EVENT_INDEX[event["id"]] = event + + def get_by_standard_code(self, standard_code: str) -> dict | None: + """Return most-recent event with matching standard_code.""" + matches = [e for e in MOCK_EVENTS if e.get("standard_code") == standard_code] + if not matches: + return None + return max(matches, key=lambda e: e.get("published_at", "")) +``` + +- [ ] **Step 5: Run tests — expect PASS** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_base_event_store.py -v +``` +Expected: 9 tests PASS + +--- + +## Task 2: PostgresEventStore + +**Files:** +- Create: `backend/app/infrastructure/perception/postgres_event_store.py` +- Create: `backend/tests/perception/test_postgres_event_store.py` + +- [ ] **Step 1: Write the failing test (mock psycopg2)** + +```python +# backend/tests/perception/test_postgres_event_store.py +"""Unit tests for PostgresEventStore using a mocked psycopg2 pool.""" +from __future__ import annotations +import json +from unittest.mock import MagicMock, patch, call +import pytest + +# Patch psycopg2 before importing the module under test +import sys +mock_psycopg2 = MagicMock() +mock_psycopg2.extras = MagicMock() +sys.modules.setdefault("psycopg2", mock_psycopg2) +sys.modules.setdefault("psycopg2.extras", mock_psycopg2.extras) +sys.modules.setdefault("psycopg2.pool", MagicMock()) + +from app.infrastructure.perception.base_event_store import BaseEventStore + + +SAMPLE_ROW = { + "id": "pg-001", + "source": "国标委", + "source_label": "国家标准化管理委员会", + "standard_code": "GB 18384-2025", + "title": "电动汽车安全要求", + "summary": "新增要求", + "full_text_url": "https://openstd.samr.gov.cn", + "status": "enacted", + "impact_level": "high", + "published_at": "2025-11-15", + "effective_at": "2026-07-01", + "category": "电动汽车安全", + "tags": ["电池安全"], + "obligations": None, + "deadlines": None, + "scope": None, + "penalties": None, + "content_hash": "abc123", + "previous_hash": None, + "change_summary": None, + "changed_sections": None, + "affected_docs": None, + "crawled_at": "2026-06-05T10:00:00+00:00", + "processed_at": None, + "raw_storage_key": None, +} + + +def _make_store_with_pool(mock_pool): + with patch("psycopg2.pool.ThreadedConnectionPool", return_value=mock_pool): + with patch( + "app.infrastructure.perception.postgres_event_store.PostgresEventStore._ensure_schema" + ): + from app.infrastructure.perception.postgres_event_store import PostgresEventStore + return
PostgresEventStore() + + +def _cursor_returning(rows): + cursor = MagicMock() + cursor.__enter__ = lambda s: s + cursor.__exit__ = MagicMock(return_value=False) + cursor.fetchall.return_value = rows + cursor.fetchone.return_value = rows[0] if rows else None + return cursor + + +def test_is_base_event_store(): + mock_pool = MagicMock() + store = _make_store_with_pool(mock_pool) + assert isinstance(store, BaseEventStore) + + +def test_filter_returns_list(): + mock_pool = MagicMock() + conn = MagicMock() + conn.__enter__ = lambda s: s + conn.__exit__ = MagicMock(return_value=False) + cursor = _cursor_returning([SAMPLE_ROW]) + conn.cursor.return_value = cursor + mock_pool.getconn.return_value = conn + store = _make_store_with_pool(mock_pool) + result = store.filter(limit=10) + assert isinstance(result, list) + + +def test_stats_returns_correct_keys(): + mock_pool = MagicMock() + conn = MagicMock() + conn.__enter__ = lambda s: s + conn.__exit__ = MagicMock(return_value=False) + # stats runs 4 queries + cursor = MagicMock() + cursor.__enter__ = lambda s: s + cursor.__exit__ = MagicMock(return_value=False) + cursor.fetchone.return_value = {"count": 5} + conn.cursor.return_value = cursor + mock_pool.getconn.return_value = conn + store = _make_store_with_pool(mock_pool) + stats = store.stats() + for key in ("total", "high_impact", "medium_impact", "recent_90d"): + assert key in stats +``` + +- [ ] **Step 2: Run test to verify it fails** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_postgres_event_store.py -v +``` +Expected: ImportError on `postgres_event_store` + +- [ ] **Step 3: Implement PostgresEventStore** + +```python +# backend/app/infrastructure/perception/postgres_event_store.py +"""PostgreSQL-backed regulatory event store.""" + +from __future__ import annotations + +import json +from contextlib import contextmanager +from datetime import UTC, date, datetime, timedelta +from typing import Any + +import psycopg2 +import psycopg2.extras +from psycopg2.pool import ThreadedConnectionPool + +from app.config.settings import settings +from app.infrastructure.perception.base_event_store import BaseEventStore + +_CREATE_TABLE = """ +CREATE TABLE IF NOT EXISTS regulation_events ( + id TEXT PRIMARY KEY, + source TEXT NOT NULL, + source_label TEXT, + standard_code TEXT NOT NULL, + title TEXT NOT NULL, + summary TEXT, + full_text_url TEXT, + status TEXT, + impact_level TEXT, + published_at DATE, + effective_at DATE, + category TEXT, + tags TEXT[], + obligations JSONB, + deadlines JSONB, + scope TEXT, + penalties TEXT, + content_hash TEXT, + previous_hash TEXT, + change_summary TEXT, + changed_sections JSONB, + affected_docs JSONB, + crawled_at TIMESTAMPTZ DEFAULT now(), + processed_at TIMESTAMPTZ, + raw_storage_key TEXT +); +CREATE INDEX IF NOT EXISTS reg_events_source_date + ON regulation_events (source, published_at DESC); +CREATE INDEX IF NOT EXISTS reg_events_impact_date + ON regulation_events (impact_level, published_at DESC); +""" + +_ALL_COLUMNS = ( + "id", "source", "source_label", "standard_code", "title", "summary", + "full_text_url", "status", "impact_level", "published_at", "effective_at", + "category", "tags", "obligations", "deadlines", "scope", "penalties", + "content_hash", "previous_hash", "change_summary", "changed_sections", + "affected_docs", "crawled_at", "processed_at", "raw_storage_key", +) + + +def _row_to_dict(row: dict[str, Any]) -> dict: + """Convert a psycopg2 RealDictRow to a plain dict 
with serialized JSON fields.""" + d = dict(row) + for field in ("obligations", "deadlines", "changed_sections", "affected_docs"): + val = d.get(field) + if isinstance(val, str): + d[field] = json.loads(val) + for date_field in ("published_at", "effective_at"): + val = d.get(date_field) + if isinstance(val, date): + d[date_field] = val.isoformat() + for ts_field in ("crawled_at", "processed_at"): + val = d.get(ts_field) + if isinstance(val, datetime): + d[ts_field] = val.isoformat() + return d + + +class PostgresEventStore(BaseEventStore): + """Regulatory event store backed by PostgreSQL.""" + + def __init__(self) -> None: + self._pool = ThreadedConnectionPool( + minconn=1, + maxconn=5, + host=settings.postgres_host, + port=settings.postgres_port, + user=settings.postgres_user, + password=settings.postgres_password, + dbname=settings.postgres_db, + ) + self._ensure_schema() + + def _ensure_schema(self) -> None: + with self._conn() as conn: + with conn.cursor() as cur: + cur.execute(_CREATE_TABLE) + conn.commit() + + @contextmanager + def _conn(self): + conn = self._pool.getconn() + try: + yield conn + finally: + self._pool.putconn(conn) + + def all(self) -> list[dict]: + with self._conn() as conn: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute( + "SELECT * FROM regulation_events ORDER BY published_at DESC NULLS LAST" + ) + return [_row_to_dict(r) for r in cur.fetchall()] + + def get(self, event_id: str) -> dict | None: + with self._conn() as conn: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute( + "SELECT * FROM regulation_events WHERE id = %s", (event_id,) + ) + row = cur.fetchone() + return _row_to_dict(row) if row else None + + def filter( + self, + *, + source: str | None = None, + impact_level: str | None = None, + limit: int = 50, + ) -> list[dict]: + conditions: list[str] = [] + params: list[Any] = [] + if source: + conditions.append("source = %s") + params.append(source) + if 
impact_level: + conditions.append("impact_level = %s") + params.append(impact_level) + where = ("WHERE " + " AND ".join(conditions)) if conditions else "" + params.append(limit) + sql = f""" + SELECT * FROM regulation_events + {where} + ORDER BY published_at DESC NULLS LAST + LIMIT %s + """ + with self._conn() as conn: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(sql, params) + return [_row_to_dict(r) for r in cur.fetchall()] + + def stats(self) -> dict: + cutoff = (date.today() - timedelta(days=90)).isoformat() + with self._conn() as conn: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute("SELECT COUNT(*) AS count FROM regulation_events") + total = (cur.fetchone() or {}).get("count", 0) + cur.execute( + "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'high'" + ) + high = (cur.fetchone() or {}).get("count", 0) + cur.execute( + "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'medium'" + ) + medium = (cur.fetchone() or {}).get("count", 0) + cur.execute( + "SELECT COUNT(*) AS count FROM regulation_events WHERE published_at >= %s", + (cutoff,), + ) + recent = (cur.fetchone() or {}).get("count", 0) + return { + "total": int(total), + "high_impact": int(high), + "medium_impact": int(medium), + "recent_90d": int(recent), + } + + def upsert(self, event: dict) -> None: + """Insert or update a regulation event.""" + cols = [c for c in _ALL_COLUMNS if c in event] + placeholders = ", ".join(f"%({c})s" for c in cols) + updates = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols if c != "id") + sql = f""" + INSERT INTO regulation_events ({', '.join(cols)}) + VALUES ({placeholders}) + ON CONFLICT (id) DO UPDATE SET {updates} + """ + row: dict[str, Any] = {} + for c in cols: + val = event.get(c) + if c in ("obligations", "deadlines", "changed_sections", "affected_docs") and val is not None: + row[c] = json.dumps(val, ensure_ascii=False) + elif c == "tags" and 
isinstance(val, list): + row[c] = val # psycopg2 handles list→array + else: + row[c] = val + with self._conn() as conn: + with conn.cursor() as cur: + cur.execute(sql, row) + conn.commit() + + def get_by_standard_code(self, standard_code: str) -> dict | None: + with self._conn() as conn: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute( + """SELECT * FROM regulation_events + WHERE standard_code = %s + ORDER BY published_at DESC NULLS LAST + LIMIT 1""", + (standard_code,), + ) + row = cur.fetchone() + return _row_to_dict(row) if row else None +``` + +- [ ] **Step 4: Run tests — expect PASS** + +``` +cd backend && PYTHONPATH=. pytest tests/perception/test_postgres_event_store.py -v +``` +Expected: 3 tests PASS + +--- + +## Task 3: Crawler base + CATARC crawler + +**Files:** +- Create: `backend/app/infrastructure/perception/crawlers/__init__.py` +- Create: `backend/app/infrastructure/perception/crawlers/base.py` +- Create: `backend/app/infrastructure/perception/crawlers/catarc_crawler.py` +- Create: `backend/tests/perception/test_crawlers.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/perception/test_crawlers.py +"""Unit tests for crawlers — mock httpx responses.""" +from __future__ import annotations +from unittest.mock import MagicMock, patch +import pytest + +from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler + + +def test_raw_event_fields(): + ev = RawEvent( + source="TEST", + source_label="Test", + standard_code="TST-001", + title="Test", + summary="Summary", + full_text_url="https://example.com", + status="enacted", + published_at="2026-01-01", + effective_at=None, + category="test", + tags=["a"], + raw_text="full text here", + ) + assert ev.source == "TEST" + assert ev.tags == ["a"] + + +CATARC_HTML = """ + + + + + + + + + + + + + + +
GB 18384-2025电动汽车安全要求2025-11-15现行
GB/T 40429-2026汽车驾驶自动化分级2026-02-01即将实施
+ +""" + + +def test_catarc_crawler_parses_html(): + from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = CATARC_HTML + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_resp): + crawler = CatarcCrawler() + events = crawler.fetch(limit=10) + + assert isinstance(events, list) + assert len(events) >= 1 + assert all(isinstance(e, RawEvent) for e in events) + codes = [e.standard_code for e in events] + assert "GB 18384-2025" in codes + + +GUOBIAO_JSON = { + "rows": [ + { + "std_code": "GB 18384-2025", + "std_name": "电动汽车安全要求", + "release_date": "2025-11-15", + "implement_date": "2026-07-01", + "std_status": "现行", + "std_type": "强制性", + }, + ] +} + + +def test_guobiao_crawler_parses_json(): + from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = GUOBIAO_JSON + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_resp): + crawler = GuobiaoMandatoryCrawler() + events = crawler.fetch(limit=10) + + assert len(events) >= 1 + assert events[0].source == "国标委" + assert events[0].standard_code == "GB 18384-2025" +``` + +- [ ] **Step 2: Run test to verify it fails** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_crawlers.py -v +``` +Expected: ImportError + +- [ ] **Step 3: Create crawler base** + +```python +# backend/app/infrastructure/perception/crawlers/__init__.py +``` + +```python +# backend/app/infrastructure/perception/crawlers/base.py +"""Shared contracts for regulatory source crawlers.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field + + +@dataclass +class RawEvent: + """Raw regulatory event returned by a crawler before enrichment.""" + + source: str + source_label: str + standard_code: str + title: str + summary: str + full_text_url: str + status: str # 'enacted' | 'draft' | 'consultation' + published_at: str # YYYY-MM-DD string + effective_at: str | None + category: str + tags: list[str] = field(default_factory=list) + raw_text: str = "" # full crawled text for hashing + LLM + + +class BaseCrawler(ABC): + """Abstract regulatory source crawler.""" + + @abstractmethod + def fetch(self, limit: int = 50) -> list[RawEvent]: + """Fetch up to `limit` recent events from the data source.""" +``` + +- [ ] **Step 4: Create CATARC crawler** + +```python +# backend/app/infrastructure/perception/crawlers/catarc_crawler.py +"""Crawler for CATARC automotive standard catalogue.""" + +from __future__ import annotations + +import hashlib + +import httpx +from bs4 import BeautifulSoup +from loguru import logger + +from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent + +_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html" +_HOST = "https://www.catarc.org.cn" + +# Status strings appearing on the CATARC site mapped to our vocabulary. 
+_STATUS_MAP = { + "现行": "enacted", + "即将实施": "enacted", + "废止": "enacted", + "征求意见": "consultation", + "报批": "draft", +} + + +class CatarcCrawler(BaseCrawler): + """Scrape the CATARC automotive standard list page.""" + + def fetch(self, limit: int = 50) -> list[RawEvent]: + events: list[RawEvent] = [] + page = 1 + while len(events) < limit: + url = f"{_BASE_URL}?page={page}" + try: + resp = httpx.get(url, timeout=30, follow_redirects=True) + resp.raise_for_status() + except Exception as exc: + logger.warning("CATARC fetch failed page={} err={}", page, exc) + break + + soup = BeautifulSoup(resp.text, "lxml") + rows = soup.select("table tr") + if not rows: + break + + batch: list[RawEvent] = [] + for row in rows: + cells = row.find_all("td") + if len(cells) < 3: + continue + link = cells[0].find("a") + standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True) + title = cells[1].get_text(strip=True) if len(cells) > 1 else standard_code + date_text = cells[2].get_text(strip=True) if len(cells) > 2 else "" + published_at = _parse_date(date_text) + status_text = cells[3].get_text(strip=True) if len(cells) > 3 else "" + status = _STATUS_MAP.get(status_text, "enacted") + detail_url = (_HOST + link["href"]) if link and link.get("href") else url + raw_text = f"{standard_code} {title}" + batch.append(RawEvent( + source="CATARC", + source_label="全国汽车标准化技术委员会", + standard_code=standard_code, + title=title, + summary=title, + full_text_url=detail_url, + status=status, + published_at=published_at, + effective_at=None, + category="汽车标准", + tags=_extract_tags(standard_code, title), + raw_text=raw_text, + )) + + if not batch: + break + events.extend(batch) + page += 1 + + return events[:limit] + + +def _parse_date(text: str) -> str: + """Return YYYY-MM-DD from common Chinese date formats, or today's date.""" + import re + from datetime import date + text = text.strip() + m = re.search(r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})", text) + if m: + y, mo, d = 
m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + return f"{y}-{mo}-{d}" + m2 = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日?", text) + if m2: + y, mo, d = m2.group(1), m2.group(2).zfill(2), m2.group(3).zfill(2) + return f"{y}-{mo}-{d}" + return date.today().isoformat() + + +def _extract_tags(standard_code: str, title: str) -> list[str]: + """Derive simple keyword tags from standard code and title.""" + tags: list[str] = [] + code_upper = standard_code.upper() + if "GB" in code_upper: + tags.append("国家标准") + if "/T" in code_upper: + tags.append("推荐性") + else: + tags.append("强制性") + keywords = ["电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全"] + for kw in keywords: + if kw in title: + tags.append(kw) + return tags[:5] +``` + +- [ ] **Step 5: Create 国标委 crawler** + +```python +# backend/app/infrastructure/perception/crawlers/guobiao_crawler.py +"""Crawlers for the 国标委 (SAMR) standard information platform.""" + +from __future__ import annotations + +import httpx +from loguru import logger + +from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent +from app.infrastructure.perception.crawlers.catarc_crawler import _parse_date, _extract_tags + +# p.p1=1 → mandatory (强制性); p.p1=2 → recommended (推荐性) +_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type" +_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"} + + +def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]: + params = { + "p.p1": std_type, + "p.p2": "车", + "p.p90": "circulation_date", + "p.p91": "desc", + "p.p6": page, + "p.p7": page_size, + } + try: + resp = httpx.get(_BASE_URL, params=params, headers=_HEADERS, timeout=30) + resp.raise_for_status() + data = resp.json() + return data.get("rows", []) or [] + except Exception as exc: + logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc) + return [] + + +def _row_to_raw_event(row: dict, source_label: str) -> RawEvent: + standard_code = row.get("std_code", "") + 
title = row.get("std_name", standard_code) + published_at = _parse_date(row.get("release_date", "")) + effective_at_raw = row.get("implement_date", "") + effective_at = _parse_date(effective_at_raw) if effective_at_raw else None + status_text = row.get("std_status", "") + if "征求意见" in status_text: + status = "consultation" + elif "报批" in status_text or "草案" in status_text: + status = "draft" + else: + status = "enacted" + return RawEvent( + source="国标委", + source_label=source_label, + standard_code=standard_code, + title=title, + summary=title, + full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row.get('id', '')}", + status=status, + published_at=published_at, + effective_at=effective_at, + category=row.get("std_type", "国家标准"), + tags=_extract_tags(standard_code, title), + raw_text=f"{standard_code} {title}", + ) + + +class GuobiaoMandatoryCrawler(BaseCrawler): + """Fetch mandatory national standards (强制性) related to vehicles.""" + + def fetch(self, limit: int = 50) -> list[RawEvent]: + events: list[RawEvent] = [] + page = 1 + while len(events) < limit: + rows = _fetch_page(std_type=1, page=page, page_size=20) + if not rows: + break + events.extend(_row_to_raw_event(r, "国标委·强制性") for r in rows) + page += 1 + return events[:limit] + + +class GuobiaoRecommendedCrawler(BaseCrawler): + """Fetch recommended national standards (推荐性) related to vehicles.""" + + def fetch(self, limit: int = 50) -> list[RawEvent]: + events: list[RawEvent] = [] + page = 1 + while len(events) < limit: + rows = _fetch_page(std_type=2, page=page, page_size=20) + if not rows: + break + events.extend(_row_to_raw_event(r, "国标委·推荐性") for r in rows) + page += 1 + return events[:limit] +``` + +- [ ] **Step 6: Run tests** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_crawlers.py -v +``` +Expected: 3 tests PASS + +--- + +## Task 4: EUR-Lex + UN-ECE crawler + +**Files:** +- Create: `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py` + +(Tests already created in `test_crawlers.py` — add to existing file) + +- [ ] **Step 1: Add EUR-Lex test to existing test file** + +Append to `backend/tests/perception/test_crawlers.py`: + +```python +EURLEX_RSS = """ + + + EUR-Lex + + Regulation (EU) 2024/1689 — AI Act + https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689 + The EU Artificial Intelligence Act enters into force. + Fri, 12 Jul 2024 00:00:00 GMT + + +""" + + +def test_eurlex_crawler_parses_rss(): + from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = EURLEX_RSS + mock_resp.raise_for_status = MagicMock() + + with patch("httpx.get", return_value=mock_resp): + crawler = EurlexCrawler() + events = crawler.fetch(limit=5) + + assert isinstance(events, list) + assert len(events) >= 1 + assert events[0].source == "EUR-Lex" +``` + +- [ ] **Step 2: Run to verify it fails** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_crawlers.py::test_eurlex_crawler_parses_rss -v +``` +Expected: ImportError + +- [ ] **Step 3: Implement EUR-Lex + UN-ECE crawler** + +```python +# backend/app/infrastructure/perception/crawlers/eurlex_crawler.py +"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations.""" + +from __future__ import annotations + +import re +from email.utils import parsedate_to_datetime + +import httpx +from bs4 import BeautifulSoup +from loguru import logger + +from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent +from app.infrastructure.perception.crawlers.catarc_crawler import _parse_date + +# EUR-Lex predefined RSS: legislation in force (OJ L series) +_EURLEX_RSS_URLS = [ + # EU AI Act + automotive-related OJ publications + "https://eur-lex.europa.eu/rss-feed/OJ-L.rss", +] + +# UN-ECE automotive regulations via EUR-Lex CELLAR +_UNECE_CELEX = [ + "32024R0001", # UN R155 cybersecurity (representative CELEX; adjust as needed) + "32024R0002", # UN R156 software updates +] + +_AUTOMOTIVE_KEYWORDS = [ + "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous", + "AI Act", "artificial intelligence", "cybersecurity", "software update", + "R155", "R156", "汽车", "车辆", +] + + +def _is_automotive_relevant(title: str, description: str) -> bool: + combined = (title + " " + description).lower() + return any(kw.lower() in combined for kw in _AUTOMOTIVE_KEYWORDS) + + +def _extract_celex(url: str) -> str: + """Extract CELEX number from EUR-Lex URL, or return empty string.""" + m = re.search(r"CELEX[:/]([0-9A-Z]+)", url) + return m.group(1) if m else "" + + +def _parse_rss_date(rfc2822: str) -> str: + """Parse RFC-2822 date string → YYYY-MM-DD.""" + try: + dt = parsedate_to_datetime(rfc2822) + return dt.date().isoformat() + except Exception: + return _parse_date(rfc2822) + + +class EurlexCrawler(BaseCrawler): + """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds.""" + + def fetch(self, 
limit: int = 50) -> list[RawEvent]: + events: list[RawEvent] = [] + for rss_url in _EURLEX_RSS_URLS: + if len(events) >= limit: + break + try: + resp = httpx.get(rss_url, timeout=30, follow_redirects=True) + resp.raise_for_status() + except Exception as exc: + logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc) + continue + + soup = BeautifulSoup(resp.text, "lxml-xml") + for item in soup.find_all("item"): + if len(events) >= limit: + break + title = (item.find("title") or {}).get_text(strip=True) + description = (item.find("description") or {}).get_text(strip=True) + link = (item.find("link") or {}).get_text(strip=True) + pub_date = (item.find("pubDate") or {}).get_text(strip=True) + + if not _is_automotive_relevant(title, description): + continue + + celex = _extract_celex(link) + standard_code = celex if celex else title[:60] + published_at = _parse_rss_date(pub_date) if pub_date else _parse_date("") + + events.append(RawEvent( + source="EUR-Lex", + source_label="欧盟官方公报", + standard_code=standard_code, + title=title, + summary=description[:500], + full_text_url=link, + status="enacted", + published_at=published_at, + effective_at=None, + category="EU法规", + tags=_extract_eurlex_tags(title, description), + raw_text=f"{title}\n{description}", + )) + + return events[:limit] + + +def _extract_eurlex_tags(title: str, description: str) -> list[str]: + combined = title + " " + description + tag_map = { + "AI Act": "EU AI Act", + "artificial intelligence": "EU AI Act", + "R155": "UN R155", + "R156": "UN R156", + "cybersecurity": "网络安全", + "emission": "排放", + "autonomous": "自动驾驶", + "ADAS": "ADAS", + } + tags = [] + for kw, tag in tag_map.items(): + if kw.lower() in combined.lower(): + tags.append(tag) + return tags[:5] +``` + +- [ ] **Step 4: Run tests** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_crawlers.py -v +``` +Expected: 4 tests PASS + +--- + +## Task 5: LLM Pipeline (extract + assess + diff) + +**Files:** +- Create: `backend/app/infrastructure/perception/llm_pipeline.py` +- Create: `backend/tests/perception/test_llm_pipeline.py` + +- [ ] **Step 1: Write the failing test** + +```python +# backend/tests/perception/test_llm_pipeline.py +"""Unit tests for LlmPipeline — mock LLM client and embedding provider.""" +from __future__ import annotations +from unittest.mock import MagicMock, patch +import json +import pytest + + +def _make_pipeline(): + with patch("app.infrastructure.perception.llm_pipeline.get_llm_client") as mock_llm_fn, \ + patch("app.infrastructure.perception.llm_pipeline.OpenAICompatibleEmbeddingProvider") as mock_emb_cls: + + mock_client = MagicMock() + mock_client.chat.return_value = MagicMock(content='{"obligations":[{"text":"test obligation","deontic":"must","subject":"OEM","object":"system","condition":""}],"deadlines":[{"date":"2026-07-01","description":"实施截止"}],"scope":"适用于M1类车辆","penalties":"罚款","impact_level":"high"}') + mock_llm_fn.return_value = mock_client + + mock_emb = MagicMock() + mock_emb.embed_texts.return_value = [[0.1] * 1024, [0.9] * 1024] + mock_emb_cls.return_value = mock_emb + + from app.infrastructure.perception.llm_pipeline import LlmPipeline + return LlmPipeline(), mock_client, mock_emb + + +def test_extract_structure_returns_dict(): + pipeline, mock_client, _ = _make_pipeline() + event = { + "id": "evt-001", + "standard_code": "GB 18384-2025", + "title": "电动汽车安全要求", + "summary": "新增 IP67 级别防护", + "source_label": "CATARC", + "tags": ["电池安全"], + } + result = pipeline.extract_structure(event) + assert isinstance(result, dict) + assert "obligations" in result + assert "impact_level" in result + + +def test_assess_impact_returns_list(): + pipeline, mock_client, _ = _make_pipeline() + mock_client.chat.return_value = MagicMock(content='[{"doc_id":"d1","doc_name":"Safety 
Manual","score":0.85,"key_clauses":"§4.2","recommendation":"更新第4章"}]') + mock_retrieval = MagicMock() + chunk = MagicMock() + chunk.doc_id = "d1" + chunk.doc_title = "Safety Manual" + chunk.score = 0.85 + chunk.text = "relevant text" + chunk.section_title = "§4.2" + mock_retrieval.retrieve.return_value = [chunk] + event = { + "standard_code": "GB 18384-2025", + "title": "电动汽车安全要求", + "obligations": [{"text": "OEM shall comply"}], + } + result = pipeline.assess_impact(event, mock_retrieval) + assert isinstance(result, list) + + +def test_compute_diff_no_change(): + pipeline, _, mock_emb = _make_pipeline() + # identical texts → cosine similarity = 1.0 → no changes + mock_emb.embed_texts.return_value = [[0.5] * 1024, [0.5] * 1024] + result = pipeline.compute_diff("paragraph one", "paragraph one") + assert isinstance(result, dict) + assert "changed_sections" in result + assert "change_summary" in result + + +def test_compute_diff_detects_change(): + pipeline, mock_client, mock_emb = _make_pipeline() + # low cosine similarity → change detected + import numpy as np + mock_emb.embed_texts.return_value = [ + [1.0] + [0.0] * 1023, + [0.0] + [1.0] + [0.0] * 1022, + ] + mock_client.chat.return_value = MagicMock(content='{"change_type":"tightened","summary":"Requirement tightened"}') + result = pipeline.compute_diff("old paragraph text", "new tighter requirement text") + assert isinstance(result["changed_sections"], list) +``` + +- [ ] **Step 2: Run to verify it fails** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_llm_pipeline.py -v +``` +Expected: ImportError + +- [ ] **Step 3: Implement LlmPipeline** + +```python +# backend/app/infrastructure/perception/llm_pipeline.py +"""LLM-driven pipeline for regulatory event enrichment.""" + +from __future__ import annotations + +import json +import math +from typing import Any + +from loguru import logger + +from app.config.settings import settings +from app.infrastructure.embedding.openai_compatible_embedding_provider import ( + OpenAICompatibleEmbeddingProvider, +) +from app.services.llm.llm_factory import get_llm_client + +_EXTRACT_SYSTEM = ( + "You are a regulatory compliance expert specialising in automotive standards " + "(GB, UN-ECE, ISO, EU). Extract structured information from regulation text. " + "Return valid JSON only — no markdown fences, no extra keys." +) + +_ASSESS_SYSTEM = ( + "You are an automotive compliance analyst. Given a regulation and related document excerpts, " + "identify which documents are affected and what actions are required. " + "Return a JSON array only." +) + +_DIFF_SYSTEM = ( + "You are a regulatory change analyst. Given an old and new version of a regulation paragraph, " + "classify the type of change and summarise it. 
" + "Return JSON only: {\"change_type\": \"tightened|relaxed|added|removed\", \"summary\": \"...\"}" +) + +_SIMILARITY_THRESHOLD = 0.85 + + +def _cosine(a: list[float], b: list[float]) -> float: + dot = sum(x * y for x, y in zip(a, b)) + norm_a = math.sqrt(sum(x * x for x in a)) + norm_b = math.sqrt(sum(x * x for x in b)) + if norm_a == 0 or norm_b == 0: + return 1.0 + return dot / (norm_a * norm_b) + + +def _llm_json(client: Any, messages: list[dict]) -> Any: + """Call LLM and parse JSON response; return None on failure.""" + try: + resp = client.chat(messages) + text = (resp.content or "").strip() + # strip markdown fences if model added them despite instructions + if text.startswith("```"): + text = text.split("```")[1] + if text.startswith("json"): + text = text[4:] + return json.loads(text) + except Exception as exc: + logger.warning("LLM JSON parse failed: {}", exc) + return None + + +class LlmPipeline: + """Three-step enrichment pipeline for crawled regulatory events.""" + + def __init__(self) -> None: + self._client = get_llm_client( + provider=settings.llm_provider, + model=settings.llm_model, + ) + self._embedder = OpenAICompatibleEmbeddingProvider() + + # ------------------------------------------------------------------ + # Step 1: Structure extraction + # ------------------------------------------------------------------ + + def extract_structure(self, event: dict) -> dict: + """Extract obligations, deadlines, scope, penalties, impact_level from event text.""" + prompt = f"""Extract structured compliance information from this regulation: + +Standard: {event.get('standard_code', '')} +Title: {event.get('title', '')} +Source: {event.get('source_label', '')} +Summary: {event.get('summary', '')} +Tags: {', '.join(event.get('tags', []))} + +Return JSON with exactly these keys: +{{ + "obligations": [{{"text": "...", "deontic": "must|shall|may|prohibited", "subject": "...", "object": "...", "condition": ""}}], + "deadlines": [{{"date": "YYYY-MM-DD or null", 
"description": "..."}}], + "scope": "one sentence describing who/what this applies to", + "penalties": "one sentence on consequences of non-compliance, or null", + "impact_level": "high|medium|low" +}}""" + + messages = [ + {"role": "system", "content": _EXTRACT_SYSTEM}, + {"role": "user", "content": prompt}, + ] + result = _llm_json(self._client, messages) + if not isinstance(result, dict): + return { + "obligations": [], + "deadlines": [], + "scope": "", + "penalties": "", + "impact_level": "medium", + } + return result + + # ------------------------------------------------------------------ + # Step 2: Impact assessment + # ------------------------------------------------------------------ + + def assess_impact(self, event: dict, retrieval_service: Any) -> list[dict]: + """Use RAG to find affected documents and generate recommendations.""" + obligations = event.get("obligations") or [] + obligation_texts = " ".join(o.get("text", "") for o in obligations[:3]) + query = f"{event.get('standard_code', '')} {event.get('title', '')} {obligation_texts}" + + try: + chunks = retrieval_service.retrieve(query=query, top_k=5) + except Exception as exc: + logger.warning("RAG retrieval failed: {}", exc) + return [] + + if not chunks: + return [] + + seen: set[str] = set() + doc_excerpts: list[dict] = [] + for chunk in chunks: + if chunk.doc_id not in seen: + seen.add(chunk.doc_id) + doc_excerpts.append({ + "doc_id": chunk.doc_id, + "doc_name": chunk.doc_title, + "score": round(float(chunk.score), 4), + "snippet": (chunk.text or "")[:300], + "clause": getattr(chunk, "section_title", "") or "", + }) + + context = "\n".join( + f"[{d['doc_name']} {d['clause']}] score={d['score']}: {d['snippet']}" + for d in doc_excerpts + ) + prompt = f"""Regulation: {event.get('standard_code')} — {event.get('title')} +Obligations: {obligation_texts or event.get('summary', '')} + +Affected documents found in knowledge base: +{context} + +For each document, assess impact and recommend action. 
Return JSON array: +[{{"doc_id":"...","doc_name":"...","score":0.0,"key_clauses":"...","recommendation":"one sentence action"}}]""" + + messages = [ + {"role": "system", "content": _ASSESS_SYSTEM}, + {"role": "user", "content": prompt}, + ] + result = _llm_json(self._client, messages) + if isinstance(result, list): + # merge score from retrieval (more reliable than LLM-invented scores) + score_map = {d["doc_id"]: d["score"] for d in doc_excerpts} + for item in result: + if isinstance(item, dict) and item.get("doc_id") in score_map: + item["score"] = score_map[item["doc_id"]] + return result + return doc_excerpts # fallback: return retrieval results without LLM recommendation + + # ------------------------------------------------------------------ + # Step 3: Semantic diff + # ------------------------------------------------------------------ + + def compute_diff(self, old_text: str, new_text: str) -> dict: + """Compare old and new regulation text; return changed sections and summary.""" + old_paras = [p.strip() for p in old_text.split("\n") if p.strip()] + new_paras = [p.strip() for p in new_text.split("\n") if p.strip()] + + if not old_paras or not new_paras: + return {"changed_sections": [], "change_summary": "No comparable text."} + + all_paras = old_paras + new_paras + try: + all_embeddings = self._embedder.embed_texts(all_paras) + except Exception as exc: + logger.warning("Embedding for diff failed: {}", exc) + return {"changed_sections": [], "change_summary": "Diff unavailable (embedding error)."} + + old_embeddings = all_embeddings[: len(old_paras)] + new_embeddings = all_embeddings[len(old_paras):] + + # Pair paragraphs by position (zip — handles length differences) + changed_sections: list[dict] = [] + for i, (old_emb, new_emb, old_p, new_p) in enumerate( + zip(old_embeddings, new_embeddings, old_paras, new_paras) + ): + sim = _cosine(old_emb, new_emb) + if sim < _SIMILARITY_THRESHOLD: + messages = [ + {"role": "system", "content": _DIFF_SYSTEM}, + 
{"role": "user", "content": f"OLD: {old_p[:500]}\nNEW: {new_p[:500]}"}, + ] + classification = _llm_json(self._client, messages) or {} + changed_sections.append({ + "old_text": old_p[:300], + "new_text": new_p[:300], + "similarity": round(sim, 3), + "change_type": classification.get("change_type", "modified"), + "summary": classification.get("summary", ""), + }) + + if not changed_sections: + change_summary = "No substantive changes detected between versions." + else: + types = [s["change_type"] for s in changed_sections] + change_summary = ( + f"{len(changed_sections)} paragraph(s) changed: " + + ", ".join(f"{t}" for t in set(types)) + + ". " + + (changed_sections[0].get("summary", "") if changed_sections else "") + ) + + return {"changed_sections": changed_sections, "change_summary": change_summary} +``` + +- [ ] **Step 4: Run tests** + +``` +cd backend && PYTHONPATH=. pytest tests/perception/test_llm_pipeline.py -v +``` +Expected: 4 tests PASS + +--- + +## Task 6: CrawlService + +**Files:** +- Create: `backend/app/application/perception/crawl_service.py` +- Create: `backend/tests/perception/test_crawl_service.py` + +- [ ] **Step 1: Write the failing test** + +```python +# backend/tests/perception/test_crawl_service.py +"""Integration tests for CrawlService.""" +from __future__ import annotations +from unittest.mock import MagicMock +import hashlib +import pytest + +from app.infrastructure.perception.crawlers.base import RawEvent +from app.infrastructure.perception.mock_event_store import MockEventStore + + +def _make_raw_event(code="TST-001"): + return RawEvent( + source="TEST", source_label="Test", standard_code=code, + title=f"Test {code}", summary="Summary", full_text_url="https://example.com", + status="enacted", published_at="2026-01-01", effective_at=None, + category="test", tags=["test"], raw_text="full text", + ) + + +def _make_service(raw_events): + from app.application.perception.crawl_service import CrawlService + + mock_crawler = MagicMock() + 
mock_crawler.fetch.return_value = raw_events + + mock_pipeline = MagicMock() + mock_pipeline.extract_structure.return_value = { + "obligations": [], "deadlines": [], "scope": "test", + "penalties": None, "impact_level": "low", + } + mock_pipeline.assess_impact.return_value = [] + mock_pipeline.compute_diff.return_value = { + "changed_sections": [], "change_summary": "No changes.", + } + + mock_retrieval = MagicMock() + store = MockEventStore() + + return CrawlService( + crawlers={"TEST": mock_crawler}, + event_store=store, + llm_pipeline=mock_pipeline, + retrieval_service=mock_retrieval, + ) + + +def test_crawl_yields_progress_and_done(): + svc = _make_service([_make_raw_event("TST-001")]) + events = list(svc.run_crawl()) + event_types = [e.get("event") for e in events] + assert "done" in event_types + + +def test_crawl_upserts_to_store(): + store = MockEventStore() + from app.application.perception.crawl_service import CrawlService + mock_crawler = MagicMock() + mock_crawler.fetch.return_value = [_make_raw_event("NEW-001")] + mock_pipeline = MagicMock() + mock_pipeline.extract_structure.return_value = { + "obligations": [], "deadlines": [], "scope": "", + "penalties": None, "impact_level": "medium", + } + mock_pipeline.assess_impact.return_value = [] + mock_pipeline.compute_diff.return_value = { + "changed_sections": [], "change_summary": "", + } + svc = CrawlService( + crawlers={"TEST": mock_crawler}, + event_store=store, + llm_pipeline=mock_pipeline, + retrieval_service=MagicMock(), + ) + list(svc.run_crawl()) + result = store.get_by_standard_code("NEW-001") + assert result is not None + assert result["title"] == "Test NEW-001" + + +def test_crawl_skips_unchanged_events(): + store = MockEventStore() + raw = _make_raw_event("SKIP-001") + content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest() + # Pre-seed with same hash + store.upsert({ + "id": hashlib.sha256(f"TEST-SKIP-001".encode()).hexdigest()[:12], + "standard_code": "SKIP-001", + "source": "TEST", 
+ "source_label": "Test", + "title": "Test SKIP-001", + "summary": "", + "full_text_url": "", + "status": "enacted", + "impact_level": "low", + "published_at": "2026-01-01", + "effective_at": None, + "category": "test", + "tags": [], + "content_hash": content_hash, + }) + mock_pipeline = MagicMock() + from app.application.perception.crawl_service import CrawlService + mock_crawler = MagicMock() + mock_crawler.fetch.return_value = [raw] + svc = CrawlService( + crawlers={"TEST": mock_crawler}, + event_store=store, + llm_pipeline=mock_pipeline, + retrieval_service=MagicMock(), + ) + list(svc.run_crawl()) + # pipeline should NOT have been called for unchanged event + mock_pipeline.extract_structure.assert_not_called() +``` + +- [ ] **Step 2: Run to verify it fails** + +``` +cd backend && PYTHONPATH=. pytest tests/perception/test_crawl_service.py -v +``` +Expected: ImportError + +- [ ] **Step 3: Implement CrawlService** + +```python +# backend/app/application/perception/crawl_service.py +"""Orchestrates regulatory source crawlers and LLM enrichment pipeline.""" + +from __future__ import annotations + +import hashlib +from typing import Any, Generator + +from loguru import logger + +from app.infrastructure.perception.base_event_store import BaseEventStore +from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent +from app.infrastructure.perception.llm_pipeline import LlmPipeline + + +def _event_id(source: str, standard_code: str) -> str: + """Deterministic 12-char ID from source + standard_code.""" + return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12] + + +def _content_hash(raw_text: str) -> str: + return hashlib.sha256(raw_text.encode()).hexdigest() + + +def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict: + return { + "id": event_id, + "source": raw.source, + "source_label": raw.source_label, + "standard_code": raw.standard_code, + "title": raw.title, + "summary": raw.summary, + "full_text_url": 
raw.full_text_url, + "status": raw.status, + "impact_level": "medium", # updated by LLM pipeline + "published_at": raw.published_at, + "effective_at": raw.effective_at, + "category": raw.category, + "tags": raw.tags, + "content_hash": content_hash, + "previous_hash": None, + } + + +class CrawlService: + """Orchestrate crawlers, hash-based change detection, and LLM enrichment.""" + + def __init__( + self, + crawlers: dict[str, BaseCrawler], + event_store: BaseEventStore, + llm_pipeline: LlmPipeline, + retrieval_service: Any, + ) -> None: + self._crawlers = crawlers + self._store = event_store + self._pipeline = llm_pipeline + self._retrieval = retrieval_service + + def run_crawl( + self, sources: list[str] | None = None + ) -> Generator[dict, None, None]: + """Run crawl for selected sources. Yields SSE-ready progress dicts.""" + targets = sources or list(self._crawlers.keys()) + total_new = 0 + total_updated = 0 + + for source_key in targets: + crawler = self._crawlers.get(source_key) + if not crawler: + yield {"event": "error", "data": f"Unknown source: {source_key}"} + continue + + yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}} + try: + raw_events = crawler.fetch(limit=100) + except Exception as exc: + logger.exception("Crawler failed source={}", source_key) + yield {"event": "error", "data": {"source": source_key, "message": str(exc)}} + continue + + yield { + "event": "progress", + "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)}, + } + + new_count = 0 + updated_count = 0 + + for raw in raw_events: + eid = _event_id(raw.source, raw.standard_code) + new_hash = _content_hash(raw.raw_text or raw.title) + existing = self._store.get(eid) + + if existing and existing.get("content_hash") == new_hash: + # Unchanged — skip LLM processing + continue + + is_update = existing is not None + old_text = existing.get("summary", "") if is_update else "" + previous_hash = existing.get("content_hash") if is_update 
else None + + event_dict = _raw_to_dict(raw, eid, new_hash) + event_dict["previous_hash"] = previous_hash + + # Step 1: Structure extraction + try: + structure = self._pipeline.extract_structure(event_dict) + event_dict.update(structure) + except Exception as exc: + logger.warning("Structure extraction failed id={} err={}", eid, exc) + + # Step 2: Impact assessment + try: + affected = self._pipeline.assess_impact(event_dict, self._retrieval) + event_dict["affected_docs"] = affected + except Exception as exc: + logger.warning("Impact assessment failed id={} err={}", eid, exc) + + # Step 3: Semantic diff (only when updating existing event) + if is_update and old_text and raw.raw_text: + try: + diff = self._pipeline.compute_diff(old_text, raw.raw_text) + event_dict["change_summary"] = diff.get("change_summary") + event_dict["changed_sections"] = diff.get("changed_sections") + except Exception as exc: + logger.warning("Diff failed id={} err={}", eid, exc) + + self._store.upsert(event_dict) + + if is_update: + updated_count += 1 + else: + new_count += 1 + + total_new += new_count + total_updated += updated_count + + yield { + "event": "progress", + "data": { + "source": source_key, + "stage": "done", + "new": new_count, + "updated": updated_count, + }, + } + + yield { + "event": "done", + "data": {"total_new": total_new, "total_updated": total_updated}, + } +``` + +- [ ] **Step 4: Run tests** + +``` +cd backend && PYTHONPATH=. 
pytest tests/perception/test_crawl_service.py -v +``` +Expected: 3 tests PASS + +--- + +## Task 7: Wire bootstrap + add settings + update PerceptionService type hint + +**Files:** +- Modify: `backend/app/config/settings.py` +- Modify: `backend/app/shared/bootstrap.py` +- Modify: `backend/app/application/perception/services.py` +- Modify: `backend/requirements.txt` +- Modify: `.env` (repository root) +- Modify: `.env.example` (repository root) + +- [ ] **Step 1: Add settings** + +In `backend/app/config/settings.py`, after the `use_celery_worker` field (line ~88), add: + +```python + # ── Perception crawl ────────────────────────────────────────────────────── + perception_crawl_timeout_seconds: int = Field( + default=120, description="HTTP timeout for regulatory source crawlers." + ) + perception_max_events_per_source: int = Field( + default=100, description="Maximum events fetched per source per crawl run." + ) + perception_diff_similarity_threshold: float = Field( + default=0.85, + description="Cosine similarity below which a paragraph is flagged as changed.", + ) +``` + +- [ ] **Step 2: Add env vars to .env and .env.example** + +Add to `.env` in the repository root (after `USE_CELERY_WORKER=false`): +``` +PERCEPTION_CRAWL_TIMEOUT_SECONDS=120 +PERCEPTION_MAX_EVENTS_PER_SOURCE=100 +PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85 +``` + +Add the same block to `.env.example`, also in the repository root. 
+ +- [ ] **Step 3: Fix type hint in PerceptionService** + +In `backend/app/application/perception/services.py`, change: + +```python +from app.infrastructure.perception.mock_event_store import MockEventStore +``` +to: +```python +from app.infrastructure.perception.base_event_store import BaseEventStore +``` + +Change constructor type hint from: +```python + def __init__( + self, + event_store: MockEventStore, + retrieval_service: KnowledgeRetrievalService, + ) -> None: +``` +to: +```python + def __init__( + self, + event_store: BaseEventStore, + retrieval_service: KnowledgeRetrievalService, + ) -> None: +``` + +- [ ] **Step 4: Wire bootstrap.py** + +At the top of `backend/app/shared/bootstrap.py`, after existing imports, add: + +```python +from app.application.perception.crawl_service import CrawlService +from app.infrastructure.perception.base_event_store import BaseEventStore +from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler +from app.infrastructure.perception.crawlers.guobiao_crawler import ( + GuobiaoMandatoryCrawler, + GuobiaoRecommendedCrawler, +) +from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler +from app.infrastructure.perception.llm_pipeline import LlmPipeline +``` + +Replace the existing `get_perception_service()` function: + +```python +@lru_cache +def _get_event_store() -> BaseEventStore: + """Return event store selected by DOCUMENT_REPOSITORY_BACKEND setting.""" + if settings.document_repository_backend == "postgres": + from app.infrastructure.perception.postgres_event_store import PostgresEventStore + return PostgresEventStore() + return MockEventStore() + + +@lru_cache +def get_perception_service() -> PerceptionService: + """Return perception service for regulatory intelligence.""" + return PerceptionService( + event_store=_get_event_store(), + retrieval_service=get_retrieval_service(), + ) + + +@lru_cache +def get_crawl_service() -> CrawlService: + """Return CrawlService wired with all 
registered crawlers and LLM pipeline.""" + crawlers = { + "CATARC": CatarcCrawler(), + "国标委·强制性": GuobiaoMandatoryCrawler(), + "国标委·推荐性": GuobiaoRecommendedCrawler(), + "EUR-Lex": EurlexCrawler(), + } + return CrawlService( + crawlers=crawlers, + event_store=_get_event_store(), + llm_pipeline=LlmPipeline(), + retrieval_service=get_retrieval_service(), + ) +``` + +- [ ] **Step 5: Add beautifulsoup4 + lxml to requirements.txt** + +After the `httpx>=0.25.0` line in `backend/requirements.txt`, add: + +``` +beautifulsoup4>=4.12.0 +lxml>=5.0.0 +``` + +- [ ] **Step 6: Verify imports work** + +``` +cd backend && PYTHONPATH=. python -c "from app.shared.bootstrap import get_crawl_service; print('ok')" +``` +Expected: `ok` + +--- + +## Task 8: New API endpoints (crawl + process + diff) + +**Files:** +- Modify: `backend/app/api/routes/perception.py` + +- [ ] **Step 1: Add three new endpoints** + +Open `backend/app/api/routes/perception.py`. After the existing `analyze_event` endpoint, add: + +```python +from fastapi import Depends +from app.api.dependencies.auth import get_current_user +from app.domain.auth.models import UserClaims +from app.shared.bootstrap import get_crawl_service + + +@router.post("/crawl") +async def run_crawl( + body: dict = None, + current_user: UserClaims = Depends(get_current_user), +): + """Trigger manual crawl of regulatory sources. Streams SSE progress. + + Body (optional): {"sources": ["CATARC", "国标委·强制性", "EUR-Lex"]} + Omit sources to crawl all registered sources. 
+ """ + sources: list[str] | None = (body or {}).get("sources") + crawl_svc = get_crawl_service() + + async def crawl_stream(): + async for item in iter_in_thread(crawl_svc.run_crawl(sources=sources)): + event_name = item.get("event", "message") + # Always JSON-encode the payload: the client JSON.parse()s every frame, + # so a bare string (e.g. "Unknown source: ...") would be silently dropped. + data = json.dumps(item.get("data", ""), ensure_ascii=False) + yield f"event: {event_name}\ndata: {data}\n\n" + + return StreamingResponse( + crawl_stream(), + media_type="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, + ) + + +@router.post("/events/{event_id}/process") +async def process_event( + event_id: str, + current_user: UserClaims = Depends(get_current_user), +): + """Trigger LLM pipeline (extract + assess + diff) for a single event.""" + from datetime import UTC, datetime + from app.infrastructure.perception.llm_pipeline import LlmPipeline + from app.shared.bootstrap import get_retrieval_service + + event = get_perception_service().get_event(event_id) + if not event: + from fastapi import HTTPException + raise HTTPException(status_code=404, detail=f"Event {event_id} not found") + + store = get_crawl_service()._store # share the same store instance + pipeline = LlmPipeline() + + structure = pipeline.extract_structure(event) + event.update(structure) + event["affected_docs"] = pipeline.assess_impact(event, get_retrieval_service()) + event["processed_at"] = datetime.now(UTC).isoformat() + store.upsert(event) + + return {"status": "ok", "event_id": event_id, "processed_at": event["processed_at"]} + + +@router.get("/events/{event_id}/diff") +async def get_event_diff(event_id: str): + """Return semantic diff detail for an event (only available if previously crawled twice).""" + event = get_perception_service().get_event(event_id) + if not event: + from fastapi import HTTPException + raise HTTPException(status_code=404, detail=f"Event {event_id} not found") + if not event.get("change_summary"): + from fastapi import HTTPException 
+ raise HTTPException(status_code=404, detail="No diff available for this event") + return { + "event_id": event_id, + "change_summary": event.get("change_summary"), + "changed_sections": event.get("changed_sections") or [], + "previous_hash": event.get("previous_hash"), + "content_hash": event.get("content_hash"), + } +``` + +- [ ] **Step 2: Smoke test with curl (backend running)** + +```bash +# With backend running (./dev.sh start api): +curl -s -H "Authorization: Bearer $TOKEN" \ + http://localhost:8000/api/v1/perception/stats | python -m json.tool +``` +Expected: JSON with `total`, `high_impact`, `medium_impact`, `recent_90d`. + +--- + +## Task 9: Frontend — Crawl Bar + Detail Tabs + +**Files:** +- Modify: `frontend/src/pages/Perception/PerceptionPage.tsx` + +- [ ] **Step 1: Add CrawlBar state and handler at the top of PerceptionPage** + +In `PerceptionPage.tsx`, after the existing `abortRef` line (~line 107), add: + +```tsx + const [crawling, setCrawling] = useState(false); + const [crawlStatus, setCrawlStatus] = useState(''); + const [detailTab, setDetailTab] = useState<'overview'|'obligations'|'assessment'|'diff'>('overview'); + + // Extended signal shape from DB (populated after crawl) + const [selectedFull, setSelectedFull] = useState | null>(null); + + async function fetchFullEvent(id: string) { + try { + const res = await fetch(`/api/v1/perception/events/${id}`, { headers: authHeader() }); + if (res.ok) setSelectedFull(await res.json()); + } catch { /* ignore */ } + } +``` + +- [ ] **Step 2: Add runCrawl function** + +After `stopAnalysis()`, add: + +```tsx + async function runCrawl() { + setCrawling(true); + setCrawlStatus('正在连接数据源...'); + try { + const res = await fetch('/api/v1/perception/crawl', { + method: 'POST', + headers: { 'Content-Type': 'application/json', ...authHeader() }, + body: JSON.stringify({}), + }); + if (!res.body) { setCrawlStatus('No stream'); setCrawling(false); return; } + const reader = res.body.getReader(); + const dec = new 
TextDecoder(); + let buf = ''; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + // stream: true keeps a multi-byte UTF-8 character that straddles two chunks intact + buf += dec.decode(value, { stream: true }); + const parts = buf.split('\n\n'); + buf = parts.pop() ?? ''; + for (const block of parts) { + const eventLine = block.split('\n').find(l => l.startsWith('event: ')); + const dataLine = block.split('\n').find(l => l.startsWith('data: ')); + const evtName = eventLine?.slice(7).trim(); + const raw = dataLine?.slice(6).trim(); + if (!raw) continue; + try { + const d = JSON.parse(raw); + if (evtName === 'progress') { + setCrawlStatus(`${d.source}: ${d.stage === 'fetching' ? '抓取中...' : d.stage === 'processing' ? `处理 ${d.fetched} 条...` : `完成 +${d.new} 条`}`); + } else if (evtName === 'done') { + setCrawlStatus(`更新完成 — 新增 ${d.total_new} 条,更新 ${d.total_updated} 条`); + // refresh event list + fetch('/api/v1/perception/events?limit=100', { headers: authHeader() }) + .then(r => r.json()) + .then(d2 => { if (Array.isArray(d2?.events)) setSignals(d2.events.map(mapEvent)); }); + } else if (evtName === 'error') { + setCrawlStatus(`错误: ${typeof d === 'string' ? d : d.message}`); + } + } catch { /* ignore */ } + } + } + } catch (e: unknown) { + setCrawlStatus(`连接失败: ${e instanceof Error ? e.message : String(e)}`); + } + setCrawling(false); + } +``` + +- [ ] **Step 3: Update selectSignal to also fetch full event** + +Replace: +```tsx + function selectSignal(sig: Signal) { + setSelected(sig); + setAiOutput(''); + setStreaming(false); + } +``` +with: +```tsx + function selectSignal(sig: Signal) { + setSelected(sig); + setSelectedFull(null); + setAiOutput(''); + setStreaming(false); + setDetailTab('overview'); + fetchFullEvent(sig.id); + } +``` + +- [ ] **Step 4: Replace Topbar Refresh button with CrawlBar** + +Replace the existing: +```tsx + +``` +with: +```tsx + + {crawlStatus && {crawlStatus}} +``` + +- [ ] **Step 5: Replace right panel with tabbed detail view** + +Replace the entire right panel section (the `
` block, roughly lines 267–319) with: + +```tsx +
+ {!selected ? ( +
+
+

Select a signal to run impact analysis

+
+ ) : ( + <> + {/* ── Detail header card ── */} +
+
+ {selected.source} + {selected.standard} + + {selected.status === 'risk' ? 'Urgent' : selected.status === 'warn' ? 'Draft' : 'Published'} + + {selectedFull?.change_summary && ( + CHANGED + )} +
+
{selected.title}
+

{selected.summary}

+
+ {!streaming + ? + : + } + {selected && ( + + Source + + )} +
+
+ + {/* ── Tab bar ── */} +
+ {(['overview', 'obligations', 'assessment', 'diff'] as const).map(tab => ( + + ))} +
+ + {/* ── Tab content ── */} + {detailTab === 'overview' && ( +
+
Scope & Summary
+

+ {(selectedFull?.scope as string) || selected.summary} +

+ {selectedFull?.penalties && ( +

+ ⚠ {selectedFull.penalties as string} +

+ )} +
+ )} + + {detailTab === 'obligations' && ( +
+
义务条款
+ {(() => { + const obs = (selectedFull?.obligations as Array>) || []; + const deadlines = (selectedFull?.deadlines as Array>) || []; + return obs.length === 0 && deadlines.length === 0 ? ( +

暂无结构化数据。点击右上角"Run impact analysis"触发提取。

+ ) : ( + <> + {obs.length > 0 && ( + + + + + + + + + + {obs.map((ob, i) => ( + + + + + + ))} + +
义务描述主体类型
{ob.text}{ob.subject} + + {ob.deontic} + +
+ )} + {deadlines.length > 0 && ( +
+
截止日期
+ {deadlines.map((d, i) => ( +
+ {d.date || '待定'} + {d.description} +
+ ))} +
+ )} + + ); + })()} +
+ )} + + {detailTab === 'assessment' && ( +
+
Affected documents
+ {(() => { + const docs = (selectedFull?.affected_docs as Array>) || MOCK_DOCS.map(d => ({ doc_name: d.name, score: d.score / 100, key_clauses: d.clause, snippet: d.snippet, recommendation: '' })); + return docs.length === 0 + ?

No affected documents found.

+ : docs.map((d, i) => ( +
+ {Math.round(Number(d.score ?? 0) * 100)}% +
+
+ {String(d.doc_name || '')} + {String(d.key_clauses || d.clause || '')} +
+ {d.snippet &&
{String(d.snippet)}
} + {d.recommendation && ( +
→ {String(d.recommendation)}
+ )} +
+
+ )); + })()} +
+ )} + + {detailTab === 'diff' && selectedFull?.change_summary && ( +
+
变更对比
+

+ {selectedFull.change_summary as string} +

+ {(() => { + const sections = (selectedFull.changed_sections as Array>) || []; + return sections.map((s, i) => ( +
+
+ + {String(s.change_type)} + + cosine: {String(s.similarity)} +
+
+
+
旧版
+ {String(s.old_text)} +
+
+
新版
+ {String(s.new_text)} +
+
+ {s.summary &&

{String(s.summary)}

} +
+ )); + })()} +
+ )} + + {/* ── AI Analysis card (unchanged) ── */} + {(aiOutput || streaming) && ( +
+
AI Impact Analysis
+
+ {aiOutput} + {streaming && } +
+
+ )} + + )} +
+``` + +- [ ] **Step 6: Add CSS for tabs and spin animation** + +In `frontend/src/styles/globals.css`, append at the end: + +```css +/* ── Perception detail tabs ── */ +.detail-tabs { + display: flex; + gap: 2px; + margin: 8px 0 0; + border-bottom: 1px solid var(--border); + padding-bottom: 0; +} +.detail-tab { + background: none; + border: none; + border-bottom: 2px solid transparent; + padding: 6px 14px; + font-size: 13px; + color: var(--text-secondary); + cursor: pointer; + transition: color 0.15s, border-color 0.15s; +} +.detail-tab:hover { color: var(--text); } +.detail-tab.active { + color: var(--accent); + border-bottom-color: var(--accent); + font-weight: 600; +} +.detail-tab.disabled { + opacity: 0.35; + cursor: not-allowed; +} + +/* ── Spin animation for crawl refresh icon ── */ +@keyframes spin { from { transform: rotate(0deg); } to { transform: rotate(360deg); } } +.spin { animation: spin 1s linear infinite; } +``` + +- [ ] **Step 7: Verify TypeScript compiles** + +``` +cd frontend && npx tsc --noEmit +``` +Expected: no errors (or only pre-existing errors unrelated to PerceptionPage) + +--- + +## Task 10: Install new Python dependencies + +**Files:** +- Modify: `backend/requirements.txt` (already done in Task 7) + +- [ ] **Step 1: Install on server** + +```bash +# On the server (in project root); quote the specifiers so the shell does not treat ">=" as a redirection: +.venv/bin/pip install 'beautifulsoup4>=4.12.0' 'lxml>=5.0.0' +``` + +- [ ] **Step 2: Verify import** + +```bash +PYTHONPATH=backend .venv/bin/python -c "from bs4 import BeautifulSoup; print('ok')" +``` +Expected: `ok` + +- [ ] **Step 3: Run all perception tests** + +``` +cd backend && PYTHONPATH=.
pytest tests/perception/ -v +``` +Expected: all tests PASS + +--- + +## Task 11: End-to-end verification + +- [ ] **Step 1: Start backend** + +```bash +./dev.sh start api +``` + +- [ ] **Step 2: Verify stats endpoint still works (export `ADMIN_PASSWORD` in your shell first; do not commit credentials)** + +```bash +TOKEN=$(curl -s -X POST http://localhost:8000/api/v1/auth/login \ + -H "Content-Type: application/json" \ + -d "{\"username\":\"admin\",\"password\":\"$ADMIN_PASSWORD\"}" | python -m json.tool | grep access_token | cut -d'"' -f4) + +curl -s -H "Authorization: Bearer $TOKEN" \ + http://localhost:8000/api/v1/perception/stats | python -m json.tool +``` +Expected: `{"total": ..., "high_impact": ..., ...}` + +- [ ] **Step 3: Trigger manual crawl (with DOCUMENT_REPOSITORY_BACKEND=json, uses MockEventStore)** + +```bash +curl -s -X POST \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + http://localhost:8000/api/v1/perception/crawl \ + -d '{"sources":["CATARC"]}' --no-buffer +``` +Expected: SSE stream with `event: progress` lines followed by `event: done` + +- [ ] **Step 4: Switch to postgres backend and re-verify (if PostgreSQL available)** + +In `.env`, set `DOCUMENT_REPOSITORY_BACKEND=postgres`, restart API, then repeat Steps 2 and 3.
Verify events appear in `regulation_events` table: + +```bash +psql -h 6.86.80.8 -U postgresql -d compliance_db -c "SELECT COUNT(*) FROM regulation_events;" +``` + +- [ ] **Step 5: Build frontend on server** + +```bash +cd frontend && npm install && npm run build +``` +Expected: build succeeds + +- [ ] **Step 6: Open browser, navigate to Regulatory Signals page** + +Verify: +- Stats bar shows real counts +- "刷新数据源" button is visible in topbar +- Clicking a signal shows 概览 / 义务条款 / 影响评估 / 变更对比 tabs +- 变更对比 tab is greyed out until a second crawl detects a change + +--- + +## Self-Review + +**Spec coverage check:** + +| Spec requirement | Task | +|-----------------|------| +| Replace MockEventStore → PostgresEventStore | Tasks 1, 2, 7 | +| BaseEventStore ABC as port | Task 1 | +| CATARC crawler | Task 3 | +| 国标委 strong + recommended crawlers | Task 3 | +| EUR-Lex RSS crawler | Task 4 | +| LLM structure extraction | Task 5 | +| LLM impact assessment (RAG) | Task 5 | +| Semantic diff via embedding | Task 5 | +| CrawlService with hash-based skip | Task 6 | +| bootstrap.py wiring + settings | Task 7 | +| POST /crawl SSE endpoint | Task 8 | +| POST /events/{id}/process endpoint | Task 8 | +| GET /events/{id}/diff endpoint | Task 8 | +| Frontend crawl bar + progress | Task 9 | +| Frontend detail tabs (4 tabs) | Task 9 | +| Changed badge on signal cards | Task 9 (CHANGED badge in header) | +| Real affected_docs replacing MOCK_DOCS | Task 9 | +| New Python dependencies | Task 10 | +| E2E verification | Task 11 | + +All spec requirements covered. No placeholders found. 
diff --git a/docs/superpowers/specs/2026-06-05-perception-intelligence-design.md b/docs/superpowers/specs/2026-06-05-perception-intelligence-design.md new file mode 100644 index 0000000..575f76c --- /dev/null +++ b/docs/superpowers/specs/2026-06-05-perception-intelligence-design.md @@ -0,0 +1,328 @@ +# Regulatory Signals Intelligence Enhancement — Design Spec + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the 20-item hardcoded MockEventStore with real regulatory data from Chinese and international sources, add LLM-driven structured extraction, impact assessment, and semantic change diff — all accessible through a manual-trigger crawl in the frontend. + +**Architecture:** Crawler Service (httpx + BeautifulSoup) → PostgreSQL EventStore → LLM Pipeline (extract → assess → diff) → existing PerceptionService interface. New code follows `api → application → domain ports → infrastructure` layering; no new files in `services/*` or `workflows/*`; `shared/bootstrap.py` is the composition root. + +**Tech Stack:** httpx, BeautifulSoup4, existing EmbeddingService (text-embedding-v3 API, for diff), existing LLM factory (deepseek/qwen), existing KnowledgeRetrievalService (RAG), PostgreSQL (already available), existing SSE infrastructure. + +--- + +## 1.
Data Sources + +| Source | URL | Method | Coverage | +|--------|-----|--------|----------| +| CATARC 汽车标准 | `https://www.catarc.org.cn/bzzxd/qcbz/index.html` | httpx + BeautifulSoup (static pages) | 国家/行业汽车标准列表 | +| 国标委强制性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=1&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 强制性国家标准,按"车"过滤 | +| 国标委推荐性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=2&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 推荐性国家标准,按"车"过滤 | +| EUR-Lex | RSS + CELLAR REST API | pyeurlex / httpx | EU AI Act, automotive directives | +| UN R155/R156 | CELLAR REST API (CELEX lookup) | httpx | UN-ECE cybersecurity/OTA regulations | + +Crawl is **manual-trigger only** — no cron/Celery Beat. Admin clicks "刷新数据源" in the frontend UI. + +--- + +## 2. Database Schema + +### New table: `regulation_events` + +```sql +CREATE TABLE IF NOT EXISTS regulation_events ( + id TEXT PRIMARY KEY, -- sha256(source + standard_code)[:12] + source TEXT NOT NULL, -- 'CATARC' | '国标委' | 'EUR-Lex' | 'UN-ECE' + source_label TEXT, -- Human-readable source label + standard_code TEXT NOT NULL, -- e.g. 
"GB 18384-2025", "EU/2024/1689" + title TEXT NOT NULL, + summary TEXT, -- Crawled abstract or first paragraph + full_text_url TEXT, -- Original page URL + status TEXT, -- 'enacted' | 'draft' | 'consultation' + impact_level TEXT, -- 'high' | 'medium' | 'low' (LLM-assigned) + published_at DATE, + effective_at DATE, + category TEXT, + tags TEXT[], + -- LLM structured extraction + obligations JSONB, -- [{text, deontic, subject, object, condition}] + deadlines JSONB, -- [{date, description}] + scope TEXT, -- Applicability scope summary + penalties TEXT, -- Penalty / consequence summary + -- Change tracking + content_hash TEXT, -- SHA256 of crawled full text + previous_hash TEXT, -- Hash from prior crawl (NULL on first crawl) + change_summary TEXT, -- LLM-generated description of changes + changed_sections JSONB, -- [{old_text, new_text, change_type}] where cosine<0.85 + -- Impact assessment + affected_docs JSONB, -- [{doc_id, doc_name, score, key_clauses, recommendation}] + -- Metadata + crawled_at TIMESTAMPTZ DEFAULT now(), + processed_at TIMESTAMPTZ, + raw_storage_key TEXT -- MinIO path for raw HTML/PDF (optional) +); + +CREATE INDEX IF NOT EXISTS regulation_events_source_date + ON regulation_events (source, published_at DESC); +CREATE INDEX IF NOT EXISTS regulation_events_impact_date + ON regulation_events (impact_level, published_at DESC); +CREATE INDEX IF NOT EXISTS regulation_events_tags + ON regulation_events USING gin(tags); +``` + +--- + +## 3. 
Backend Architecture + +### 3.1 File Map + +**New files (infrastructure layer):** +- `backend/app/infrastructure/perception/crawlers/catarc_crawler.py` — CATARC scraper +- `backend/app/infrastructure/perception/crawlers/guobiao_crawler.py` — 国标委 JSON API crawler +- `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py` — EUR-Lex RSS + CELLAR +- `backend/app/infrastructure/perception/crawlers/base.py` — Abstract base class +- `backend/app/infrastructure/perception/postgres_event_store.py` — PostgresEventStore (replaces MockEventStore) +- `backend/app/infrastructure/perception/llm_pipeline.py` — Extract / assess / diff pipeline + +**New files (application layer):** +- `backend/app/application/perception/crawl_service.py` — Orchestrates crawlers + LLM pipeline, exposes `run_crawl(sources)` + progress generator + +**Modified files:** +- `backend/app/api/routes/perception.py` — Add `POST /crawl` (SSE progress stream), `POST /events/{id}/process`, `GET /events/{id}/diff` +- `backend/app/shared/bootstrap.py` — Wire `PostgresEventStore` + `CrawlService` + `LlmPipeline` when `DOCUMENT_REPOSITORY_BACKEND=postgres`; fallback to `MockEventStore` when `json` +- `backend/app/config/settings.py` — Add `perception_crawl_timeout_seconds`, `perception_max_events_per_source`, `perception_diff_similarity_threshold` + +**Unchanged files:** +- `backend/app/application/perception/services.py` — `PerceptionService` interface unchanged; only `_store` swap +- `backend/app/infrastructure/perception/mock_event_store.py` — Kept for `json` backend mode + +### 3.2 Domain Port (Abstract Interface) + +```python +# backend/app/infrastructure/perception/base_event_store.py +from abc import ABC, abstractmethod + +class BaseEventStore(ABC): + @abstractmethod + def all(self) -> list[dict]: ... + @abstractmethod + def get(self, event_id: str) -> dict | None: ... + @abstractmethod + def filter(self, source=None, impact_level=None, limit=50) -> list[dict]: ... + @abstractmethod + def stats(self) -> dict: ...
+ @abstractmethod + def upsert(self, event: dict) -> None: ... # new — needed for crawl writes + @abstractmethod + def get_by_standard_code(self, code: str) -> dict | None: ... # for change detection +``` + +`MockEventStore` and `PostgresEventStore` both implement this interface. + +### 3.3 Crawler Base Contract + +```python +# backend/app/infrastructure/perception/crawlers/base.py +from abc import ABC, abstractmethod +from dataclasses import dataclass + +@dataclass +class RawEvent: + source: str + source_label: str + standard_code: str + title: str + summary: str + full_text_url: str + status: str # 'enacted' | 'draft' | 'consultation' + published_at: str # YYYY-MM-DD string + effective_at: str | None + category: str + tags: list[str] + raw_text: str # full crawled text for hashing + LLM + +class BaseCrawler(ABC): + @abstractmethod + def fetch(self, limit: int = 50) -> list[RawEvent]: ... +``` + +### 3.4 LLM Pipeline + +```python +# backend/app/infrastructure/perception/llm_pipeline.py + +class LlmPipeline: + """Runs three sequential LLM steps on a regulation event.""" + + def extract_structure(self, event: dict) -> dict: + """Step 1: Extract obligations, deadlines, scope, penalties, impact_level. + + Returns dict with keys: obligations, deadlines, scope, penalties, impact_level. + Uses JSON-mode or structured prompt; model retries once on parse failure. + """ + + def assess_impact(self, event: dict, retrieval_service) -> list[dict]: + """Step 2: RAG-based impact on existing knowledge base documents. + + Query = standard_code + title + first obligation texts. + Returns list of {doc_id, doc_name, score, key_clauses, recommendation}. + """ + + def compute_diff(self, old_text: str, new_text: str) -> dict: + """Step 3: Semantic diff between old and new regulation text. + + Splits both texts by paragraph. Calls existing EmbeddingService (text-embedding-v3 + via EMBEDDING_BASE_URL) to embed each paragraph, then computes cosine similarity. 
+ Changed paragraphs (cosine < 0.85) sent to LLM for change_type classification: + 'tightened' | 'relaxed' | 'added' | 'removed' + Returns {changed_sections: [...], change_summary: str}. + Only called when content_hash differs from previous_hash. + """ +``` + +### 3.5 CrawlService + +```python +# backend/app/application/perception/crawl_service.py + +class CrawlService: + def __init__(self, crawlers, event_store, llm_pipeline, retrieval_service): ... + + def run_crawl(self, sources: list[str] | None = None) -> Generator[dict, None, None]: + """Manual-trigger crawl. Yields progress SSE dicts: + {event: 'progress', data: {source, fetched, new, updated, stage}} + {event: 'done', data: {total_new, total_updated, duration_ms}} + {event: 'error', data: {source, message}} + + For each crawler: + 1. fetch() RawEvents + 2. hash check vs stored event → skip if unchanged + 3. upsert raw event to DB + 4. run LLM pipeline (extract → assess → diff) + 5. upsert enriched event to DB + 6. yield progress + """ +``` + +--- + +## 4. API Endpoints + +### Existing (unchanged interface, new store backend) +- `GET /api/v1/perception/stats` +- `GET /api/v1/perception/events` +- `GET /api/v1/perception/events/{id}` +- `POST /api/v1/perception/events/{id}/analyze` (streaming) + +### New endpoints + +``` +POST /api/v1/perception/crawl + Body: { sources?: ["CATARC", "国标委", "EUR-Lex", "UN-ECE"] } + Response: text/event-stream (SSE) + Auth: requires current_user (admin/legal role) + Streams progress events until done or error. + +POST /api/v1/perception/events/{id}/process + Trigger LLM pipeline for a single already-crawled event. + Response: { status: "ok", processed_at: "..." } + Auth: requires current_user + +GET /api/v1/perception/events/{id}/diff + Returns: { changed_sections: [...], change_summary: str, previous_hash: str } + Returns 404 if no diff available (first crawl or no change detected). +``` + +--- + +## 5. 
Frontend Changes + +### 5.1 New: Crawl Control Bar (top of PerceptionPage) + +Above the stats-bar, add a `` component: +- "刷新数据源" button — triggers `POST /crawl` (all sources) +- Inline progress display: shows SSE progress events as a mini status line + - e.g. "CATARC: 抓取中… | 国标委: 12 条新增 | EUR-Lex: 等待中" +- On completion: shows "更新完成 — 新增 N 条,更新 M 条" +- Disabled while crawl is in progress (prevents double-trigger) + +### 5.2 Signal Card Enhancement + +Existing cards get two new indicators: +- **NEW badge** — shown when `crawled_at` is within last 24h (green dot) +- **CHANGED badge** — shown when `previous_hash != content_hash` and `change_summary` exists + +### 5.3 Right Panel — Structured Tab + +Right detail panel adds a tab bar: **概览 | 义务条款 | 影响评估 | 变更对比** + +**义务条款 tab:** +- Table: 义务描述 | 主体 | 对象 | 截止日期 +- Tags for deontic type: 强制 / 禁止 / 允许 +- Shows `obligations[]` + `deadlines[]` from DB + +**影响评估 tab:** +- Replaces hardcoded MOCK_DOCS with real `affected_docs[]` from DB +- Each row: document name, similarity score (%), key clause excerpt, LLM recommendation +- "Run fresh assessment" button → triggers `POST /events/{id}/process` + +**变更对比 tab:** +- Only visible when `change_summary` is non-null +- Top: `change_summary` text (LLM prose) +- Below: diff table with old/new paragraph pairs, change_type badge per row +- Hidden (tab disabled) on first-crawl events with no prior version + +### 5.4 Existing behavior preserved +- `analyze` streaming (AI analysis) unchanged +- Search/filter (source, impact) unchanged — now hits real DB data +- Stats bar — now reflects real counts from PostgreSQL + +--- + +## 6. Settings Additions + +```python +# backend/app/config/settings.py additions +perception_crawl_timeout_seconds: int = Field(default=120, ...) +perception_max_events_per_source: int = Field(default=100, ...) +perception_diff_similarity_threshold: float = Field(default=0.85, ...) 
+``` + +```env +# .env additions +PERCEPTION_CRAWL_TIMEOUT_SECONDS=120 +PERCEPTION_MAX_EVENTS_PER_SOURCE=100 +PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85 +``` + +--- + +## 7. Dependencies + +``` +# requirements.txt additions +httpx>=0.27.0 # already likely present; confirm +beautifulsoup4>=4.12.0 # HTML parsing for CATARC +lxml>=5.0.0 # BeautifulSoup parser backend +# sentence-transformers NOT added — diff uses existing text-embedding-v3 API (EMBEDDING_BASE_URL) +``` + +No new infrastructure required (PostgreSQL + MinIO + Milvus already available). + +--- + +## 8. Backward Compatibility + +- `DOCUMENT_REPOSITORY_BACKEND=json` → `bootstrap.py` uses `MockEventStore` (unchanged behavior) +- `DOCUMENT_REPOSITORY_BACKEND=postgres` → uses `PostgresEventStore` +- Migration: run `CREATE TABLE` SQL on first startup (idempotent `CREATE TABLE IF NOT EXISTS`) +- Existing 20 mock events are not seeded to PostgreSQL; PostgreSQL starts empty until first crawl + +--- + +## 9. Out of Scope (this phase) + +- Automatic/scheduled crawling (Celery Beat) — manual trigger only +- Playwright-based JS-rendered pages — all target sites work with httpx +- Knowledge Graph (Neo4j / LightRAG) — future phase +- Email/Slack webhook notifications — future phase +- User-facing diff history (versioning beyond one prior snapshot) — future phase diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index d6f8d06..4dbc9c1 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -1,12 +1,14 @@ import './styles/globals.css'; -import { ThemeProvider, AuthProvider } from './contexts'; +import { ThemeProvider, AuthProvider, PageStateProvider } from './contexts'; import { AppRouter } from './router/AppRouter'; function App() { return ( - + + + ); diff --git a/frontend/src/contexts/PageStateContext.tsx b/frontend/src/contexts/PageStateContext.tsx new file mode 100644 index 0000000..ba22cc9 --- /dev/null +++ b/frontend/src/contexts/PageStateContext.tsx @@ -0,0 +1,211 @@ +/** + * PageStateContext — 
preserves page-level session state across route changes. + * + * When React Router unmounts a page component, all its useState values are lost. + * This context lives above the router and holds the state that must survive + * navigation so users can switch modules and return without losing their work. + * + * Covered pages: + * - RagChat: message history, citation rail, sessionId, input draft + * - Compliance: analysis result (sources, findings, conclusion, meta) + * - Perception: selected signal, filter state, AI analysis output + */ + +import React, { createContext, useContext, useState, useCallback, useRef } from 'react'; + +// ── RagChat types ───────────────────────────────────────────────────────────── + +export interface RagMessage { + id: string; + role: 'user' | 'assistant'; + text: string; + citationRefs?: number[]; +} + +export interface RagCitation { + index: number; + score: number; + name: string; + clause: string; + snippet: string; + docId?: string; +} + +export interface RagChatState { + messages: RagMessage[]; + citations: RagCitation[]; + sessionId: string | null; + inputDraft: string; +} + +const RAG_INIT: RagChatState = { + messages: [ + { + id: 'init', + role: 'assistant', + text: 'Hello! I can answer questions about your indexed regulations and compliance documents. 
Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.', + }, + ], + citations: [], + sessionId: null, + inputDraft: '', +}; + +// ── Compliance types ────────────────────────────────────────────────────────── + +export interface ComplianceSourceEvent { + standard: string; + clause: string; + score: number; + status: string; + full_content: string; +} + +export interface ComplianceFindingEvent { + title: string; + desc: string; + status: 'ok' | 'warn' | 'risk'; + clause_ref?: string; +} + +export interface ComplianceActionItem { + label: string; + value: string; + risk?: boolean; +} + +export interface ComplianceDonePayload { + conclusion: string; + actions: ComplianceActionItem[]; + risk_score: number; + highlight_terms: string[]; + para_text: string; +} + +export interface ComplianceMeta { + title: string; + sourceType: 'text' | 'doc' | 'upload'; + startedAt: string; +} + +export type ComplianceStatus = 'idle' | 'streaming' | 'done' | 'error'; + +export interface ComplianceState { + status: ComplianceStatus; + stageLabel: string; + stageKey: string; + meta: ComplianceMeta | null; + sources: ComplianceSourceEvent[]; + findings: ComplianceFindingEvent[]; + done: ComplianceDonePayload | null; + errorText: string; +} + +const COMPLIANCE_INIT: ComplianceState = { + status: 'idle', + stageLabel: '', + stageKey: '', + meta: null, + sources: [], + findings: [], + done: null, + errorText: '', +}; + +// ── Perception types ────────────────────────────────────────────────────────── + +export interface PerceptionSignal { + id: string; + source: string; + standard: string; + status: 'ok' | 'warn' | 'risk' | 'info'; + title: string; + summary: string; + date: string; + tags: string[]; + impact: 'High' | 'Medium' | 'Low'; +} + +export interface PerceptionPageState { + signals: PerceptionSignal[]; + searchQuery: string; + sourceFilter: string; + impactFilter: string; + selectedId: string | null; + aiOutput: string; + detailTab: 'overview' | 'obligations' | 
'assessment' | 'diff'; + crawlStatus: string; +} + +const PERCEPTION_INIT: PerceptionPageState = { + signals: [], + searchQuery: '', + sourceFilter: 'All', + impactFilter: 'All', + selectedId: null, + aiOutput: '', + detailTab: 'overview', + crawlStatus: '', +}; + +// ── Context value ───────────────────────────────────────────────────────────── + +interface PageStateContextValue { + // RagChat + ragState: RagChatState; + setRagState: React.Dispatch>; + ragStreamingRef: React.MutableRefObject; + ragAbortRef: React.MutableRefObject; + + // Compliance + complianceState: ComplianceState; + setComplianceState: React.Dispatch>; + complianceAbortRef: React.MutableRefObject; + resetCompliance: () => void; + + // Perception + perceptionState: PerceptionPageState; + setPerceptionState: React.Dispatch>; + perceptionAbortRef: React.MutableRefObject; + perceptionCrawlAbortRef: React.MutableRefObject; +} + +const PageStateContext = createContext(null); + +// ── Provider ────────────────────────────────────────────────────────────────── + +export function PageStateProvider({ children }: { children: React.ReactNode }) { + const [ragState, setRagState] = useState(RAG_INIT); + const ragStreamingRef = useRef(false); + const ragAbortRef = useRef(null); + + const [complianceState, setComplianceState] = useState(COMPLIANCE_INIT); + const complianceAbortRef = useRef(null); + + const resetCompliance = useCallback(() => { + complianceAbortRef.current?.abort(); + setComplianceState(COMPLIANCE_INIT); + }, []); + + const [perceptionState, setPerceptionState] = useState(PERCEPTION_INIT); + const perceptionAbortRef = useRef(null); + const perceptionCrawlAbortRef = useRef(null); + + return ( + + {children} + + ); +} + +// ── Hook ────────────────────────────────────────────────────────────────────── + +export function usePageState() { + const ctx = useContext(PageStateContext); + if (!ctx) throw new Error('usePageState must be used inside PageStateProvider'); + return ctx; +} diff --git 
a/frontend/src/contexts/index.ts b/frontend/src/contexts/index.ts index 5267afb..693c63f 100644 --- a/frontend/src/contexts/index.ts +++ b/frontend/src/contexts/index.ts @@ -1,3 +1,18 @@ export { ThemeProvider, useTheme } from './ThemeContext'; export { AuthProvider, useAuth } from './AuthContext'; export type { AuthUser } from './AuthContext'; +export { PageStateProvider, usePageState } from './PageStateContext'; +export type { + RagChatState, + RagMessage, + RagCitation, + ComplianceState, + ComplianceStatus, + ComplianceSourceEvent, + ComplianceFindingEvent, + ComplianceDonePayload, + ComplianceMeta, + ComplianceActionItem, + PerceptionPageState, + PerceptionSignal, +} from './PageStateContext'; diff --git a/frontend/src/pages/Compliance/useComplianceAnalysis.ts b/frontend/src/pages/Compliance/useComplianceAnalysis.ts index 312c43e..9a63d71 100644 --- a/frontend/src/pages/Compliance/useComplianceAnalysis.ts +++ b/frontend/src/pages/Compliance/useComplianceAnalysis.ts @@ -1,4 +1,25 @@ -import { useState, useCallback, useRef } from 'react'; +/** + * useComplianceAnalysis — compliance analysis state wired to PageStateContext. + * + * State is stored in the global context so it persists when the user navigates + * to another module and returns. The `run` and `reset` actions are identical + * to the previous hook API so CompliancePage needs no structural changes. 
+ */ + +import { useCallback } from 'react'; +import { usePageState } from '../../contexts'; +import type { + ComplianceMeta, + ComplianceState, + ComplianceSourceEvent, + ComplianceFindingEvent, + ComplianceDonePayload, +} from '../../contexts'; + +export type { ComplianceMeta, ComplianceState, ComplianceSourceEvent as SourceEvent, ComplianceFindingEvent as FindingEvent, ComplianceDonePayload as DonePayload }; +export type { ComplianceActionItem as ActionItem } from '../../contexts'; +export type AnalysisStatus = import('../../contexts').ComplianceStatus; +export type AnalysisMeta = ComplianceMeta; const TOKEN_KEY = 'auth_token'; function authHeader(): Record { @@ -6,55 +27,7 @@ function authHeader(): Record { return t ? { Authorization: `Bearer ${t}` } : {}; } -export type AnalysisStatus = 'idle' | 'streaming' | 'done' | 'error'; - -export interface SourceEvent { - standard: string; - clause: string; - score: number; - status: string; - full_content: string; -} - -export interface FindingEvent { - title: string; - desc: string; - status: 'ok' | 'warn' | 'risk'; - clause_ref?: string; -} - -export interface ActionItem { - label: string; - value: string; - risk?: boolean; -} - -export interface DonePayload { - conclusion: string; - actions: ActionItem[]; - risk_score: number; - highlight_terms: string[]; - para_text: string; -} - -export interface AnalysisMeta { - title: string; - sourceType: 'text' | 'doc' | 'upload'; - startedAt: string; // ISO timestamp -} - -export interface AnalysisState { - status: AnalysisStatus; - stageLabel: string; - stageKey: string; - meta: AnalysisMeta | null; - sources: SourceEvent[]; - findings: FindingEvent[]; - done: DonePayload | null; - errorText: string; -} - -const INITIAL_STATE: AnalysisState = { +const INITIAL_STATE: ComplianceState = { status: 'idle', stageLabel: '', stageKey: '', @@ -66,18 +39,12 @@ const INITIAL_STATE: AnalysisState = { }; export function useComplianceAnalysis() { - const [state, setState] = 
useState(INITIAL_STATE); - const abortRef = useRef(null); + const { complianceState: state, setComplianceState: setState, complianceAbortRef, resetCompliance: reset } = usePageState(); - const reset = useCallback(() => { - abortRef.current?.abort(); - setState(INITIAL_STATE); - }, []); - - const run = useCallback(async (formData: FormData, meta: AnalysisMeta) => { - abortRef.current?.abort(); + const run = useCallback(async (formData: FormData, meta: ComplianceMeta) => { + complianceAbortRef.current?.abort(); const ctrl = new AbortController(); - abortRef.current = ctrl; + complianceAbortRef.current = ctrl; setState({ ...INITIAL_STATE, status: 'streaming', stageLabel: 'Starting…', meta }); @@ -124,7 +91,7 @@ export function useComplianceAnalysis() { if (j.type === 'stage') { setState(s => ({ ...s, stageLabel: j.label ?? '', stageKey: j.stage ?? '' })); } else if (j.type === 'source') { - const src: SourceEvent = { + const src: ComplianceSourceEvent = { standard: j.standard ?? '', clause: j.clause ?? '', score: j.score ?? 0, @@ -133,7 +100,7 @@ export function useComplianceAnalysis() { }; setState(s => ({ ...s, sources: [...s.sources, src] })); } else if (j.type === 'finding') { - const finding: FindingEvent = { + const finding: ComplianceFindingEvent = { title: j.title ?? '', desc: j.desc ?? '', status: j.status ?? 'info', @@ -141,7 +108,7 @@ export function useComplianceAnalysis() { }; setState(s => ({ ...s, findings: [...s.findings, finding] })); } else if (j.type === 'done') { - const payload: DonePayload = { + const payload: ComplianceDonePayload = { conclusion: j.conclusion ?? '', actions: j.actions ?? [], risk_score: j.risk_score ?? 
0, @@ -162,7 +129,7 @@ export function useComplianceAnalysis() { if (e instanceof Error && e.name === 'AbortError') return; setState(s => ({ ...s, status: 'error', errorText: String(e) })); } - }, []); + }, [setState, complianceAbortRef]); return { state, run, reset }; } diff --git a/frontend/src/pages/Perception/PerceptionPage.tsx b/frontend/src/pages/Perception/PerceptionPage.tsx index 7a342b3..226cfdc 100644 --- a/frontend/src/pages/Perception/PerceptionPage.tsx +++ b/frontend/src/pages/Perception/PerceptionPage.tsx @@ -1,6 +1,8 @@ import { useState, useEffect, useRef } from 'react'; import { Topbar } from '../../components/layout/Topbar'; import { RefreshCw, Play, Square, ExternalLink } from 'lucide-react'; +import { usePageState } from '../../contexts'; +import type { PerceptionSignal } from '../../contexts'; const TOKEN_KEY = 'auth_token'; function authHeader(): Record { @@ -8,18 +10,6 @@ function authHeader(): Record { return t ? { Authorization: `Bearer ${t}` } : {}; } -interface Signal { - id: string; - source: string; - standard: string; - status: 'ok' | 'warn' | 'risk' | 'info'; - title: string; - summary: string; - date: string; - tags: string[]; - impact: 'High' | 'Medium' | 'Low'; -} - interface Stats { total: number; high_impact: number; @@ -27,29 +17,17 @@ interface Stats { last_90_days: number; } -interface DocResult { - score: number; - name: string; - clause: string; - snippet: string; -} - const SOURCES = ['All', 'MIIT', 'UN-ECE', 'ISO', 'GB Comm.', 'EUR-Lex', 'IATF']; const IMPACTS = ['All', 'High', 'Medium', 'Low']; -// Backend /api/v1/perception/stats returns: -// { total, high_impact, medium_impact, last_90_days } — field names match, ✓ - -// Backend /api/v1/perception/events returns: -// { events: [{ id, title, summary, source, standard, impact_level, published_at, tags, status }] } -// Map backend event fields → frontend Signal shape -function mapEvent(e: Record): Signal { +// Backend event → Signal +function mapEvent(e: Record): 
PerceptionSignal { const impact = String(e.impact_level ?? '').toLowerCase(); const backendStatus = String(e.status ?? '').toLowerCase(); return { id: String(e.id ?? e.event_id ?? ''), source: String(e.source ?? ''), - standard: String(e.standard ?? e.regulation_id ?? ''), + standard: String(e.standard ?? e.standard_code ?? e.regulation_id ?? ''), status: backendStatus === 'high' || backendStatus === 'urgent' ? 'risk' : backendStatus === 'medium' || backendStatus === 'draft' ? 'warn' : backendStatus === 'low' || backendStatus === 'final' ? 'ok' @@ -62,50 +40,40 @@ function mapEvent(e: Record): Signal { }; } -const MOCK_SIGNALS: Signal[] = [ +const MOCK_SIGNALS: PerceptionSignal[] = [ { id: '1', source: 'EUR-Lex', standard: 'EU/2024/1689', status: 'risk', title: 'EU AI Act — High-risk AI in vehicles', summary: 'Article 9 mandates risk management systems for automotive AI classifying as high-risk under Annex III point 3.', - date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High' + date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High', }, { id: '2', source: 'MIIT', standard: 'Draft-2025-08', status: 'warn', title: 'MIIT Draft — in-vehicle AI training data', summary: 'Draft regulation requires OEM data provenance documentation and OTA audit trails for AI systems.', - date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High' + date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High', }, { id: '3', source: 'ISO', standard: 'ISO/SAE 21434:2021/Amd1', status: 'info', title: 'ISO/SAE 21434 Amendment 1', summary: 'Amendment clarifies CSMS scope for software-only updates and vulnerability disclosure timelines.', - date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium' + date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium', }, - { - id: '4', source: 'UN-ECE', standard: 'UNECE WP.29 R155', status: 'ok', - title: 'UNECE R155 Corrigendum', 
- summary: 'Editorial corrections to cybersecurity management system requirements. No substantive changes.', - date: '2025-09-12', tags: ['type-approval', 'UNECE'], impact: 'Low' - }, -]; - -const MOCK_DOCS: DocResult[] = [ - { score: 94, name: 'Vehicle AI Safety Manual v3.2', clause: '§4.2.1', snippet: 'The risk management process shall identify and evaluate risks arising from AI system decisions in safety-critical scenarios...' }, - { score: 87, name: 'ADAS System Requirements', clause: '§7.1', snippet: 'Automated driving functions must document training data lineage and model performance envelopes prior to deployment.' }, - { score: 71, name: 'Type Approval Documentation', clause: 'Annex B', snippet: 'Cybersecurity management system certification requires third-party audit of AI decision audit logs retention policy.' }, ]; export function PerceptionPage() { - const [stats, setStats] = useState(null); - const [signals, setSignals] = useState(MOCK_SIGNALS); - const [searchQuery, setSearchQuery] = useState(''); - const [sourceFilter, setSourceFilter] = useState('All'); - const [impactFilter, setImpactFilter] = useState('All'); - const [selected, setSelected] = useState(null); - const [streaming, setStreaming] = useState(false); - const [aiOutput, setAiOutput] = useState(''); - const abortRef = useRef(null); + // Persistent state lives in PageStateContext — survives route changes + const { perceptionState, setPerceptionState, perceptionAbortRef, perceptionCrawlAbortRef } = usePageState(); + const { signals, searchQuery, sourceFilter, impactFilter, selectedId, aiOutput, detailTab, crawlStatus } = perceptionState; + // Stats and selectedFull are lightweight to re-fetch on mount + const [stats, setStats] = useState(null); + const [streaming, setStreaming] = useState(false); + const [crawling, setCrawling] = useState(false); + // Full event detail — re-fetched when selected changes or page mounts with a selection + const [selectedFull, setSelectedFull] = useState | 
null>(null); + + // Re-fetch stats every time the page mounts useEffect(() => { fetch('/api/v1/perception/stats', { headers: authHeader() }) .then(r => r.json()) @@ -113,16 +81,36 @@ export function PerceptionPage() { .catch(() => setStats({ total: 47, high_impact: 7, medium_impact: 18, last_90_days: 14 })); }, []); + // Fetch signal list on first mount only (if empty), otherwise preserve context state useEffect(() => { + if (signals.length > 0) return; // already loaded fetch('/api/v1/perception/events?limit=100', { headers: authHeader() }) .then(r => r.json()) .then(d => { if (Array.isArray(d?.events) && d.events.length > 0) { - setSignals(d.events.map(mapEvent)); + setPerceptionState(s => ({ ...s, signals: d.events.map(mapEvent) })); + } else { + setPerceptionState(s => ({ ...s, signals: MOCK_SIGNALS })); } }) - .catch(() => { /* keep mock data on error */ }); - }, []); + .catch(() => { + setPerceptionState(s => ({ ...s, signals: s.signals.length > 0 ? s.signals : MOCK_SIGNALS })); + }); + }, []); // eslint-disable-line react-hooks/exhaustive-deps + + // Re-fetch full event detail when navigating back with a selected signal + useEffect(() => { + if (selectedId) { + fetch(`/api/v1/perception/events/${selectedId}`, { headers: authHeader() }) + .then(r => r.ok ? r.json() : null) + .then(d => { if (d) setSelectedFull(d); }) + .catch(() => {}); + } else { + setSelectedFull(null); + } + }, [selectedId]); + + const selected = signals.find(s => s.id === selectedId) ?? 
null; const filtered = signals.filter(s => { if (sourceFilter !== 'All' && s.source !== sourceFilter) return false; @@ -137,13 +125,20 @@ export function PerceptionPage() { function runAnalysis() { if (!selected) return; setStreaming(true); - setAiOutput(''); + setPerceptionState(s => ({ ...s, aiOutput: '' })); const ctrl = new AbortController(); - abortRef.current = ctrl; - // Backend: POST /api/v1/perception/events/{id}/analyze → SSE stream - fetch(`/api/v1/perception/events/${selected.id}/analyze`, { method: 'POST', headers: authHeader(), signal: ctrl.signal }) + perceptionAbortRef.current = ctrl; + fetch(`/api/v1/perception/events/${selected.id}/analyze`, { + method: 'POST', + headers: authHeader(), + signal: ctrl.signal, + }) .then(async res => { - if (!res.body) { setAiOutput('No stream available.'); setStreaming(false); return; } + if (!res.body) { + setPerceptionState(s => ({ ...s, aiOutput: 'No stream available.' })); + setStreaming(false); + return; + } const reader = res.body.getReader(); const dec = new TextDecoder(); let buf = ''; @@ -160,30 +155,99 @@ export function PerceptionPage() { if (!raw || raw === '[DONE]') continue; try { const j = JSON.parse(raw); - if (j.text) setAiOutput(p => p + j.text); - else if (typeof j === 'string') setAiOutput(p => p + j); + if (j.text) setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j.text })); + else if (typeof j === 'string') setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j })); } catch { - setAiOutput(p => p + raw); + setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + raw })); } } } setStreaming(false); }) .catch(e => { - if (e.name !== 'AbortError') setAiOutput('Analysis failed. Check API connection.'); + if (e.name !== 'AbortError') setPerceptionState(s => ({ ...s, aiOutput: 'Analysis failed. Check API connection.' 
})); setStreaming(false); }); } function stopAnalysis() { - abortRef.current?.abort(); + perceptionAbortRef.current?.abort(); setStreaming(false); } - function selectSignal(sig: Signal) { - setSelected(sig); - setAiOutput(''); + async function runCrawl() { + setCrawling(true); + setPerceptionState(s => ({ ...s, crawlStatus: '正在连接数据源...' })); + try { + const res = await fetch('/api/v1/perception/crawl', { + method: 'POST', + headers: { 'Content-Type': 'application/json', ...authHeader() }, + body: JSON.stringify({}), + }); + if (!res.body) { + setPerceptionState(s => ({ ...s, crawlStatus: 'No stream' })); + setCrawling(false); + return; + } + const reader = res.body.getReader(); + const dec = new TextDecoder(); + let buf = ''; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + buf += dec.decode(value); + const parts = buf.split('\n\n'); + buf = parts.pop() ?? ''; + for (const block of parts) { + const eventLine = block.split('\n').find(l => l.startsWith('event: ')); + const dataLine = block.split('\n').find(l => l.startsWith('data: ')); + const evtName = eventLine?.slice(7).trim(); + const raw = dataLine?.slice(6).trim(); + if (!raw) continue; + try { + const d = JSON.parse(raw); + if (evtName === 'progress') { + setPerceptionState(s => ({ + ...s, + crawlStatus: `${d.source}: ${d.stage === 'fetching' ? '抓取中...' : d.stage === 'processing' ? `处理 ${d.fetched} 条...` : `完成 +${d.new} 条`}`, + })); + } else if (evtName === 'done') { + setPerceptionState(s => ({ ...s, crawlStatus: `更新完成 — 新增 ${d.total_new} 条,更新 ${d.total_updated} 条` })); + fetch('/api/v1/perception/events?limit=100', { headers: authHeader() }) + .then(r => r.json()) + .then(d2 => { + if (Array.isArray(d2?.events)) { + setPerceptionState(s => ({ ...s, signals: d2.events.map(mapEvent) })); + } + }); + } else if (evtName === 'error') { + setPerceptionState(s => ({ + ...s, + crawlStatus: `错误: ${typeof d === 'string' ? 
d : d.message}`, + })); + } + } catch { /* ignore */ } + } + } + } catch (e: unknown) { + setPerceptionState(s => ({ + ...s, + crawlStatus: `连接失败: ${e instanceof Error ? e.message : String(e)}`, + })); + } + setCrawling(false); + } + + function selectSignal(sig: PerceptionSignal) { + setPerceptionState(s => ({ + ...s, + selectedId: sig.id, + aiOutput: '', + detailTab: 'overview', + })); + setSelectedFull(null); setStreaming(false); + perceptionAbortRef.current?.abort(); } return ( @@ -197,10 +261,18 @@ export function PerceptionPage() { setSearchQuery(e.target.value)} + onChange={e => setPerceptionState(s => ({ ...s, searchQuery: e.target.value }))} />
- + + {crawlStatus && ( + + {crawlStatus} + + )} } /> @@ -227,13 +299,25 @@ export function PerceptionPage() {
{SOURCES.map(s => ( - + ))}
{IMPACTS.map(i => ( - + ))}
@@ -243,7 +327,7 @@ export function PerceptionPage() { {filtered.map(sig => (
selectSignal(sig)} >
@@ -277,8 +361,11 @@ export function PerceptionPage() { {selected.source} {selected.standard} - {selected.status === 'risk' ? 'Urgent' : 'Published'} + {selected.status === 'risk' ? 'Urgent' : selected.status === 'warn' ? 'Draft' : 'Published'} + {selectedFull?.change_summary && ( + CHANGED + )}
{selected.title}

{selected.summary}

@@ -287,23 +374,160 @@ export function PerceptionPage() { ? : } - + {selected && ( + + Source + + )}
-
-
Affected documents
- {MOCK_DOCS.map(d => ( -
- {d.score}% -
-
{d.name} {d.clause}
-
{d.snippet}
-
-
+
+ {(['overview', 'obligations', 'assessment', 'diff'] as const).map(tab => ( + ))}
+ {detailTab === 'overview' && ( +
+
Scope & Summary
+

+ {(selectedFull?.scope as string) || selected.summary} +

+ {selectedFull?.penalties && ( +

+ ⚠ {selectedFull.penalties as string} +

+ )} +
+ )} + + {detailTab === 'obligations' && ( +
+
义务条款
+ {(() => { + const obs = (selectedFull?.obligations as Array>) || []; + const deadlines = (selectedFull?.deadlines as Array>) || []; + return obs.length === 0 && deadlines.length === 0 ? ( +

暂无结构化数据。点击右上角"Run impact analysis"触发提取。

+ ) : ( + <> + {obs.length > 0 && ( + + + + + + + + + + {obs.map((ob, i) => ( + + + + + + ))} + +
义务描述主体类型
{ob.text}{ob.subject} + + {ob.deontic} + +
+ )} + {deadlines.length > 0 && ( +
+
截止日期
+ {deadlines.map((d, i) => ( +
+ {d.date || '待定'} + {d.description} +
+ ))} +
+ )} + + ); + })()} +
+ )} + + {detailTab === 'assessment' && ( +
+
Affected documents
+ {(() => { + const docs = (selectedFull?.affected_docs as Array>); + const displayDocs = docs && docs.length > 0 ? docs : []; + return displayDocs.length === 0 + ?

No affected documents found.

+ : displayDocs.map((d, i) => ( +
+ {Math.round(Number(d.score ?? 0) * 100)}% +
+
+ {String(d.doc_name || '')} + {String(d.key_clauses || d.clause || '')} +
+ {d.snippet &&
{String(d.snippet)}
} + {d.recommendation && ( +
→ {String(d.recommendation)}
+ )} +
+
+ )); + })()} +
+ )} + + {detailTab === 'diff' && selectedFull?.change_summary && ( +
+
变更对比
+

+ {selectedFull.change_summary as string} +

+ {(() => { + const sections = (selectedFull.changed_sections as Array>) || []; + return sections.map((s, i) => ( +
+
+ + {String(s.change_type)} + + cosine: {String(s.similarity)} +
+
+
+
旧版
+ {String(s.old_text || '')} +
+
+
新版
+ {String(s.new_text || '')} +
+
+ {s.summary &&

{String(s.summary)}

} +
+ )); + })()} +
+ )} + {(aiOutput || streaming) && (
AI Impact Analysis
diff --git a/frontend/src/pages/RagChat/RagChatPage.tsx b/frontend/src/pages/RagChat/RagChatPage.tsx index ad508d4..c3a5681 100644 --- a/frontend/src/pages/RagChat/RagChatPage.tsx +++ b/frontend/src/pages/RagChat/RagChatPage.tsx @@ -1,6 +1,8 @@ -import { useState, useRef, useEffect, useCallback } from 'react'; +import { useRef, useEffect, useCallback, useState } from 'react'; import { Topbar } from '../../components/layout/Topbar'; import { Send, Download } from 'lucide-react'; +import { usePageState } from '../../contexts'; +import type { RagCitation } from '../../contexts'; const TOKEN_KEY = 'auth_token'; function authHeader(): Record { @@ -8,26 +10,8 @@ function authHeader(): Record { return t ? { Authorization: `Bearer ${t}` } : {}; } -interface Message { - id: string; - role: 'user' | 'assistant'; - text: string; - // citation indices mentioned in this assistant message (1-based, matching citations array) - citationRefs?: number[]; -} - -interface Citation { - index: number; // 1-based, matches [N] markers in text - score: number; // 0–100 display percentage - name: string; // doc_name - clause: string; // section_title or clause - snippet: string; // preview text - docId?: string; -} - // Map a raw source doc from the backend "retrieved" event to our Citation shape. -// Backend fields: { id, score(0-1), preview, doc_name, clause, doc_id } -function mapSource(s: Record, idx: number): Citation { +function mapSource(s: Record, idx: number): RagCitation { const rawScore = typeof s.score === 'number' ? s.score : 0; const displayScore = rawScore <= 1 ? Math.round(rawScore * 100) : Math.round(rawScore); return { @@ -73,25 +57,21 @@ const MOCK_QUICK = [ ]; export function RagChatPage() { - const [messages, setMessages] = useState([ - { - id: 'init', role: 'assistant', - text: 'Hello! I can answer questions about your indexed regulations and compliance documents. 
Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.', - } - ]); - const [quickPrompts, setQuickPrompts] = useState(MOCK_QUICK); - const [input, setInput] = useState(''); - const [streaming, setStreaming] = useState(false); - const [citations, setCitations] = useState([]); + // All persistent state lives in PageStateContext — survives route changes + const { ragState, setRagState, ragStreamingRef, ragAbortRef } = usePageState(); + const { messages, citations, sessionId, inputDraft } = ragState; + + // Local-only UI state: highlighted citation and streaming indicator + // These are fine to reset on navigation since they're transient UI feedback const [highlightedCit, setHighlightedCit] = useState(null); - const [sessionId, setSessionId] = useState(null); + const [streaming, setStreaming] = useState(ragStreamingRef.current); + const [quickPrompts, setQuickPrompts] = useState(MOCK_QUICK); const bottomRef = useRef(null); const citRailRef = useRef(null); const citItemRefs = useRef>({}); - const abortRef = useRef(null); - // Fetch quick questions from backend on mount + // Fetch quick questions from backend on mount (only once per session) useEffect(() => { fetch('/api/v1/rag/quick-questions', { headers: authHeader() }) .then(r => r.json()) @@ -115,26 +95,33 @@ export function RagChatPage() { if (el) { el.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); } - // Clear highlight after 3s setTimeout(() => setHighlightedCit(h => h === n ? null : h), 3000); }, []); async function send(text?: string) { - const q = (text ?? input).trim(); - if (!q || streaming) return; - setInput(''); - - const userMsg: Message = { id: Date.now().toString(), role: 'user', text: q }; - setMessages(m => [...m, userMsg]); + const q = (text ?? 
inputDraft).trim(); + if (!q || ragStreamingRef.current) return; + setRagState(s => ({ ...s, inputDraft: '' })); + const userMsgId = Date.now().toString(); const assistantId = (Date.now() + 1).toString(); - setMessages(m => [...m, { id: assistantId, role: 'assistant', text: '' }]); + + setRagState(s => ({ + ...s, + messages: [ + ...s.messages, + { id: userMsgId, role: 'user', text: q }, + { id: assistantId, role: 'assistant', text: '' }, + ], + citations: [], + })); + + ragStreamingRef.current = true; setStreaming(true); - setCitations([]); setHighlightedCit(null); const ctrl = new AbortController(); - abortRef.current = ctrl; + ragAbortRef.current = ctrl; try { const body: Record = { query: q, top_k: 5 }; @@ -151,14 +138,13 @@ export function RagChatPage() { const reader = res.body.getReader(); const dec = new TextDecoder(); let buffer = ''; - const newCitations: Citation[] = []; + const newCitations: RagCitation[] = []; while (true) { const { done, value } = await reader.read(); if (done) break; buffer += dec.decode(value, { stream: true }); - // SSE blocks separated by double newline const blocks = buffer.split('\n\n'); buffer = blocks.pop() ?? ''; @@ -171,56 +157,62 @@ export function RagChatPage() { const j = JSON.parse(raw); if (j.type === 'session') { - // Backend assigned a session_id — persist for next request - if (j.session_id) setSessionId(j.session_id); + if (j.session_id) setRagState(s => ({ ...s, sessionId: j.session_id })); } else if (j.type === 'retrieved' && Array.isArray(j.docs)) { - // Sources arrive before the answer starts const mapped = j.docs.map((d: Record, i: number) => mapSource(d, i + 1)); newCitations.push(...mapped); - setCitations([...mapped]); + setRagState(s => ({ ...s, citations: [...mapped] })); } else if (j.type === 'chunk' && j.text) { - setMessages(m => m.map(msg => - msg.id === assistantId - ? { ...msg, text: msg.text + (j.text as string) } - : msg - )); - - } else if (j.type === 'status') { - // Status message (e.g. 
"找到N条相关法规…") — could show in UI if desired - // For now we ignore it to keep the bubble clean + setRagState(s => ({ + ...s, + messages: s.messages.map(msg => + msg.id === assistantId + ? { ...msg, text: msg.text + (j.text as string) } + : msg + ), + })); } else if (j.type === 'done') { - // Extract which citation numbers appear in the final answer - setMessages(m => m.map(msg => { - if (msg.id !== assistantId) return msg; - const refs = [...new Set( - [...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10)) - )].filter(n => n >= 1 && n <= newCitations.length); - return { ...msg, citationRefs: refs }; + setRagState(s => ({ + ...s, + messages: s.messages.map(msg => { + if (msg.id !== assistantId) return msg; + const refs = [...new Set( + [...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10)) + )].filter(n => n >= 1 && n <= newCitations.length); + return { ...msg, citationRefs: refs }; + }), })); break; } else if (j.type === 'error') { - setMessages(m => m.map(msg => - msg.id === assistantId - ? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` } - : msg - )); + setRagState(s => ({ + ...s, + messages: s.messages.map(msg => + msg.id === assistantId + ? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` } + : msg + ), + })); } } catch { /* malformed JSON chunk, skip */ } } } } catch (e: unknown) { if (e instanceof Error && e.name !== 'AbortError') { - setMessages(m => m.map(msg => - msg.id === assistantId - ? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' } - : msg - )); + setRagState(s => ({ + ...s, + messages: s.messages.map(msg => + msg.id === assistantId + ? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' } + : msg + ), + })); } } finally { + ragStreamingRef.current = false; setStreaming(false); } } @@ -291,15 +283,15 @@ export function RagChatPage() {