Fix some things
This commit is contained in:
5
.env
5
.env
@@ -54,6 +54,11 @@ DOCUMENT_REPOSITORY_BACKEND=json
|
|||||||
# Default false: processing runs in FastAPI's threadpool — no external worker needed.
|
# Default false: processing runs in FastAPI's threadpool — no external worker needed.
|
||||||
USE_CELERY_WORKER=false
|
USE_CELERY_WORKER=false
|
||||||
|
|
||||||
|
# ===== 法规感知爬取配置 =====
|
||||||
|
PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
|
||||||
|
PERCEPTION_MAX_EVENTS_PER_SOURCE=100
|
||||||
|
PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
|
||||||
|
|
||||||
# ===== API配置 =====
|
# ===== API配置 =====
|
||||||
API_HOST=0.0.0.0
|
API_HOST=0.0.0.0
|
||||||
API_PORT=8000
|
API_PORT=8000
|
||||||
|
|||||||
@@ -55,6 +55,11 @@ DOCUMENT_REPOSITORY_BACKEND=json
|
|||||||
# Default false: document processing runs in FastAPI's threadpool (no external worker needed).
|
# Default false: document processing runs in FastAPI's threadpool (no external worker needed).
|
||||||
USE_CELERY_WORKER=false
|
USE_CELERY_WORKER=false
|
||||||
|
|
||||||
|
# ===== 法规感知爬取配置 =====
|
||||||
|
PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
|
||||||
|
PERCEPTION_MAX_EVENTS_PER_SOURCE=100
|
||||||
|
PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
|
||||||
|
|
||||||
# ===== 阿里云文档解析 =====
|
# ===== 阿里云文档解析 =====
|
||||||
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
|
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
|
||||||
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
|
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
|
||||||
|
|||||||
@@ -4,10 +4,12 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from fastapi import APIRouter, Query
|
from fastapi import APIRouter, Depends, Query
|
||||||
from fastapi.responses import StreamingResponse
|
from fastapi.responses import StreamingResponse
|
||||||
|
|
||||||
from app.shared.bootstrap import get_perception_service
|
from app.shared.bootstrap import get_crawl_service, get_event_store, get_perception_service
|
||||||
|
from app.api.dependencies.auth import get_current_user
|
||||||
|
from app.domain.auth.models import UserClaims
|
||||||
from app.shared.async_utils import iter_in_thread
|
from app.shared.async_utils import iter_in_thread
|
||||||
|
|
||||||
router = APIRouter(prefix="/perception", tags=["智能感知"])
|
router = APIRouter(prefix="/perception", tags=["智能感知"])
|
||||||
@@ -65,3 +67,77 @@ async def analyze_event(event_id: str):
|
|||||||
"X-Accel-Buffering": "no",
|
"X-Accel-Buffering": "no",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/crawl")
async def run_crawl(
    body: dict | None = None,
    current_user: UserClaims = Depends(get_current_user),
):
    """Trigger a manual crawl of regulatory sources and stream SSE progress.

    Body (optional): {"sources": ["CATARC", "国标委·强制性", "EUR-Lex"]}
    Omit ``sources`` (or send no body) to crawl all registered sources.

    Every dict yielded by the crawl service becomes one SSE frame: its
    ``event`` key is used as the SSE event name (default ``message``) and its
    ``data`` key as the payload. dict/list payloads are JSON-encoded.
    """
    sources: list[str] | None = (body or {}).get("sources")
    crawl_svc = get_crawl_service()

    async def crawl_stream():
        async for item in iter_in_thread(crawl_svc.run_crawl(sources=sources)):
            event_name = item.get("event", "message")
            data = item.get("data", "")
            if isinstance(data, (dict, list)):
                data = json.dumps(data, ensure_ascii=False)
            # A raw line break inside the payload would terminate the `data`
            # field early and let the remainder be parsed as extra SSE fields
            # (string payloads can embed client-supplied source names), so
            # every payload line is emitted as its own `data:` line.
            payload_lines = (
                str(data).replace("\r\n", "\n").replace("\r", "\n").split("\n")
            )
            data_block = "\n".join(f"data: {line}" for line in payload_lines)
            yield f"event: {event_name}\n{data_block}\n\n"

    return StreamingResponse(
        crawl_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/events/{event_id}/process")
async def process_event(
    event_id: str,
    current_user: UserClaims = Depends(get_current_user),
):
    """Run LLM enrichment (structure extraction + impact assessment) for one event.

    The event is looked up through the perception service, enriched in place,
    stamped with ``processed_at`` (UTC, ISO-8601) and written back through the
    event store. No diff is computed here.

    Raises:
        HTTPException: 404 when no event with ``event_id`` exists.
    """
    import asyncio
    from datetime import UTC, datetime

    from fastapi import HTTPException

    from app.infrastructure.perception.llm_pipeline import LlmPipeline
    from app.shared.bootstrap import get_retrieval_service

    event = get_perception_service().get_event(event_id)
    if not event:
        raise HTTPException(status_code=404, detail=f"Event {event_id} not found")

    store = get_event_store()

    def _enrich_and_store() -> str:
        """Run the synchronous pipeline steps and persist the result."""
        pipeline = LlmPipeline()
        structure = pipeline.extract_structure(event)
        event.update(structure)
        event["affected_docs"] = pipeline.assess_impact(event, get_retrieval_service())
        event["processed_at"] = datetime.now(UTC).isoformat()
        store.upsert(event)
        return event["processed_at"]

    # The pipeline calls are synchronous; running them directly inside this
    # coroutine would stall the event loop for every other request until the
    # LLM answers, so they are pushed to a worker thread.
    processed_at = await asyncio.to_thread(_enrich_and_store)

    return {"status": "ok", "event_id": event_id, "processed_at": processed_at}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/events/{event_id}/diff")
async def get_event_diff(event_id: str):
    """Return the stored diff details of an event.

    Responds with 404 when the event does not exist, and also when the event
    carries no ``change_summary`` (i.e. no diff has been recorded for it).
    """
    from fastapi import HTTPException

    event = get_perception_service().get_event(event_id)
    if not event:
        raise HTTPException(status_code=404, detail=f"Event {event_id} not found")

    summary = event.get("change_summary")
    if not summary:
        raise HTTPException(status_code=404, detail="No diff available for this event")

    sections = event.get("changed_sections") or []
    return {
        "event_id": event_id,
        "change_summary": summary,
        "changed_sections": sections,
        "previous_hash": event.get("previous_hash"),
        "content_hash": event.get("content_hash"),
    }
|
||||||
|
|||||||
147
backend/app/application/perception/crawl_service.py
Normal file
147
backend/app/application/perception/crawl_service.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
from typing import Any, Generator
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||||||
|
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||||
|
from app.infrastructure.perception.llm_pipeline import LlmPipeline
|
||||||
|
|
||||||
|
|
||||||
|
def _event_id(source: str, standard_code: str) -> str:
|
||||||
|
"""Deterministic 12-char ID from source + standard_code."""
|
||||||
|
return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12]
|
||||||
|
|
||||||
|
|
||||||
|
def _content_hash(raw_text: str) -> str:
|
||||||
|
return hashlib.sha256(raw_text.encode()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
|
||||||
|
return {
|
||||||
|
"id": event_id,
|
||||||
|
"source": raw.source,
|
||||||
|
"source_label": raw.source_label,
|
||||||
|
"standard_code": raw.standard_code,
|
||||||
|
"title": raw.title,
|
||||||
|
"summary": raw.summary,
|
||||||
|
"full_text_url": raw.full_text_url,
|
||||||
|
"status": raw.status,
|
||||||
|
"impact_level": "medium",
|
||||||
|
"published_at": raw.published_at,
|
||||||
|
"effective_at": raw.effective_at,
|
||||||
|
"category": raw.category,
|
||||||
|
"tags": raw.tags,
|
||||||
|
"content_hash": content_hash,
|
||||||
|
"previous_hash": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlService:
    """Orchestrate crawlers, hash-based change detection, and LLM enrichment."""

    def __init__(
        self,
        crawlers: dict[str, BaseCrawler],
        event_store: BaseEventStore,
        llm_pipeline: LlmPipeline,
        retrieval_service: Any,
        max_events_per_source: int = 100,
    ) -> None:
        """Wire up the collaborators.

        Args:
            crawlers: Crawler instances keyed by the source name clients use.
            event_store: Store used to look up and upsert event records.
            llm_pipeline: Pipeline providing ``extract_structure``,
                ``assess_impact`` and ``compute_diff``.
            retrieval_service: Passed through to ``assess_impact``.
            max_events_per_source: ``limit`` handed to each crawler's
                ``fetch``; defaults to the previously hard-coded 100.
        """
        self._crawlers = crawlers
        self._store = event_store
        self._pipeline = llm_pipeline
        self._retrieval = retrieval_service
        self._max_events_per_source = max_events_per_source

    def run_crawl(
        self, sources: list[str] | None = None
    ) -> Generator[dict, None, None]:
        """Run the crawl for the selected sources, yielding SSE-ready dicts.

        Args:
            sources: Source keys to crawl; ``None`` or an empty list means
                every registered crawler.

        Yields:
            ``{"event": ..., "data": ...}`` dicts: ``progress`` events for the
            ``fetching`` / ``processing`` / ``done`` stages of each source,
            ``error`` events for unknown sources and failed fetches, and one
            final ``done`` event carrying the overall totals.
        """
        targets = sources or list(self._crawlers.keys())
        total_new = 0
        total_updated = 0

        for source_key in targets:
            crawler = self._crawlers.get(source_key)
            if crawler is None:
                yield {"event": "error", "data": f"Unknown source: {source_key}"}
                continue

            yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
            try:
                raw_events = crawler.fetch(limit=self._max_events_per_source)
            except Exception as exc:
                # One broken source must not abort the remaining ones.
                logger.exception("Crawler failed source={}", source_key)
                yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
                continue

            yield {
                "event": "progress",
                "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
            }

            new_count = 0
            updated_count = 0
            for raw in raw_events:
                outcome = self._ingest(raw)
                if outcome == "updated":
                    updated_count += 1
                elif outcome == "new":
                    new_count += 1

            total_new += new_count
            total_updated += updated_count

            yield {
                "event": "progress",
                "data": {
                    "source": source_key,
                    "stage": "done",
                    "new": new_count,
                    "updated": updated_count,
                },
            }

        yield {
            "event": "done",
            "data": {"total_new": total_new, "total_updated": total_updated},
        }

    def _ingest(self, raw: RawEvent) -> str | None:
        """Enrich and store one crawled event.

        Returns:
            ``"new"`` or ``"updated"`` when a record was written, ``None`` when
            the stored content hash already matches and nothing was done.
        """
        eid = _event_id(raw.source, raw.standard_code)
        new_hash = _content_hash(raw.raw_text or raw.title)
        existing = self._store.get(eid)

        if existing and existing.get("content_hash") == new_hash:
            return None

        is_update = existing is not None
        # NOTE(review): the diff below compares the previously stored
        # *summary* with the new raw text — confirm this is the intended
        # baseline.
        old_text = existing.get("summary", "") if is_update else ""
        previous_hash = existing.get("content_hash") if is_update else None

        event_dict = _raw_to_dict(raw, eid, new_hash)
        event_dict["previous_hash"] = previous_hash

        # Each enrichment step is best-effort: a failure is logged and the
        # event is still stored with whatever fields were obtained.
        try:
            structure = self._pipeline.extract_structure(event_dict)
            event_dict.update(structure)
        except Exception as exc:
            logger.warning("Structure extraction failed id={} err={}", eid, exc)

        try:
            affected = self._pipeline.assess_impact(event_dict, self._retrieval)
            event_dict["affected_docs"] = affected
        except Exception as exc:
            logger.warning("Impact assessment failed id={} err={}", eid, exc)

        if is_update and old_text and raw.raw_text:
            try:
                diff = self._pipeline.compute_diff(old_text, raw.raw_text)
                event_dict["change_summary"] = diff.get("change_summary")
                event_dict["changed_sections"] = diff.get("changed_sections")
            except Exception as exc:
                logger.warning("Diff failed id={} err={}", eid, exc)

        self._store.upsert(event_dict)
        return "updated" if is_update else "new"
|
||||||
@@ -6,7 +6,7 @@ import json
|
|||||||
from typing import Generator
|
from typing import Generator
|
||||||
|
|
||||||
from app.application.knowledge.services import KnowledgeRetrievalService
|
from app.application.knowledge.services import KnowledgeRetrievalService
|
||||||
from app.infrastructure.perception.mock_event_store import MockEventStore
|
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||||||
from app.services.llm.llm_factory import get_llm_client
|
from app.services.llm.llm_factory import get_llm_client
|
||||||
from app.config.settings import settings
|
from app.config.settings import settings
|
||||||
|
|
||||||
@@ -22,7 +22,7 @@ class PerceptionService:
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
event_store: MockEventStore,
|
event_store: BaseEventStore,
|
||||||
retrieval_service: KnowledgeRetrievalService,
|
retrieval_service: KnowledgeRetrievalService,
|
||||||
) -> None:
|
) -> None:
|
||||||
self._store = event_store
|
self._store = event_store
|
||||||
|
|||||||
@@ -87,6 +87,18 @@ class Settings(BaseSettings):
|
|||||||
# no external worker needed. Switch to True only when a Celery worker is running.
|
# no external worker needed. Switch to True only when a Celery worker is running.
|
||||||
use_celery_worker: bool = Field(default=False, description="使用 Celery Worker 异步处理文档 (需要 Worker 运行中)")
|
use_celery_worker: bool = Field(default=False, description="使用 Celery Worker 异步处理文档 (需要 Worker 运行中)")
|
||||||
|
|
||||||
|
# ── Perception crawl ──────────────────────────────────────────────────────
|
||||||
|
perception_crawl_timeout_seconds: int = Field(
|
||||||
|
default=120, description="HTTP timeout for regulatory source crawlers."
|
||||||
|
)
|
||||||
|
perception_max_events_per_source: int = Field(
|
||||||
|
default=100, description="Maximum events fetched per source per crawl run."
|
||||||
|
)
|
||||||
|
perception_diff_similarity_threshold: float = Field(
|
||||||
|
default=0.85,
|
||||||
|
description="Cosine similarity below which a paragraph is flagged as changed.",
|
||||||
|
)
|
||||||
|
|
||||||
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
# Keep configuration setup explicit so runtime behavior is easy to reason about.
|
||||||
api_host: str = Field(default="0.0.0.0", description="API服务地址")
|
api_host: str = Field(default="0.0.0.0", description="API服务地址")
|
||||||
api_port: int = Field(default=8000, description="API服务端口")
|
api_port: int = Field(default=8000, description="API服务端口")
|
||||||
|
|||||||
39
backend/app/infrastructure/perception/base_event_store.py
Normal file
39
backend/app/infrastructure/perception/base_event_store.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""Abstract base class for regulatory event stores."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
|
||||||
|
class BaseEventStore(ABC):
    """Persistence port for regulatory events.

    Events are exchanged as plain dicts. Concrete stores must implement every
    method below; the class cannot be instantiated directly.
    """

    @abstractmethod
    def all(self) -> list[dict]:
        """List every stored event, newest first."""

    @abstractmethod
    def get(self, event_id: str) -> dict | None:
        """Look up one event by its ID; ``None`` when it does not exist."""

    @abstractmethod
    def filter(
        self,
        *,
        source: str | None = None,
        impact_level: str | None = None,
        limit: int = 50,
    ) -> list[dict]:
        """List events matching the given filters, sorted by ``published_at`` descending.

        All arguments are keyword-only.
        """

    @abstractmethod
    def stats(self) -> dict:
        """Return {total, high_impact, medium_impact, low_impact, recent_90d}."""

    @abstractmethod
    def upsert(self, event: dict) -> None:
        """Create the event record, or replace it when it already exists."""

    @abstractmethod
    def get_by_standard_code(self, standard_code: str) -> dict | None:
        """Return the newest event whose ``standard_code`` matches, else ``None``."""
|
||||||
43
backend/app/infrastructure/perception/crawlers/_utils.py
Normal file
43
backend/app/infrastructure/perception/crawlers/_utils.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
"""Shared utility functions for crawlers."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
|
||||||
|
# Patterns are tried in order; each captures (year, month, day).
_DATE_PATTERNS = (
    re.compile(r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})"),
    re.compile(r"(\d{4})年(\d{1,2})月(\d{1,2})日?"),
)


def parse_date(text: str | None) -> str:
    """Return YYYY-MM-DD from common Chinese date formats, or today's date.

    Recognises ``2024-01-05`` / ``2024/1/5`` and ``2024年1月5日`` anywhere in
    ``text``. Today's date is returned when ``text`` is ``None``, empty, or
    holds no valid calendar date, so the function never raises on missing
    values coming from crawled pages or JSON payloads.
    """
    today = date.today().isoformat()
    if text is None:
        return today

    # str() keeps non-string values (e.g. numbers from JSON) from crashing.
    text = str(text).strip()
    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue
        year, month, day = (int(part) for part in match.groups())
        try:
            return date(year, month, day).isoformat()
        except ValueError:
            # Digits matched but are not a real date (e.g. month 13);
            # give the next pattern a chance.
            continue
    return today
|
||||||
|
|
||||||
|
|
||||||
|
def extract_tags(standard_code: str, title: str) -> list[str]:
    """Build up to five keyword tags from a standard code and its title.

    Codes containing ``GB`` (case-insensitive) are tagged as national
    standards and classified as recommended when they also contain ``/T``,
    mandatory otherwise. Topic keywords found in ``title`` are appended in a
    fixed order.
    """
    normalized_code = standard_code.upper()
    collected: list[str] = []

    # NOTE(review): the recommended/mandatory split is applied only to GB
    # codes here — confirm against the original file's indentation.
    if "GB" in normalized_code:
        collected.append("国家标准")
        collected.append("推荐性" if "/T" in normalized_code else "强制性")

    topic_keywords = ("电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全")
    collected.extend(word for word in topic_keywords if word in title)
    return collected[:5]
|
||||||
32
backend/app/infrastructure/perception/crawlers/base.py
Normal file
32
backend/app/infrastructure/perception/crawlers/base.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""Shared contracts for regulatory source crawlers."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RawEvent:
    """A regulatory event exactly as a crawler produced it, prior to enrichment."""

    source: str
    source_label: str
    standard_code: str
    title: str
    summary: str
    full_text_url: str
    # One of: 'enacted' | 'draft' | 'consultation'
    status: str
    # Publication date as a YYYY-MM-DD string
    published_at: str
    effective_at: str | None
    category: str
    tags: list[str] = field(default_factory=list)
    # Full crawled text, used for hashing and as LLM input
    raw_text: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class BaseCrawler(ABC):
    """Contract implemented by every regulatory source crawler."""

    @abstractmethod
    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return at most ``limit`` recent events from the data source."""
|
||||||
@@ -0,0 +1,83 @@
|
|||||||
|
"""Crawler for CATARC automotive standard catalogue."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||||
|
from ._utils import extract_tags, parse_date
|
||||||
|
|
||||||
|
_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
|
||||||
|
_HOST = "https://www.catarc.org.cn"
|
||||||
|
|
||||||
|
_STATUS_MAP = {
|
||||||
|
"现行": "enacted",
|
||||||
|
"即将实施": "enacted",
|
||||||
|
"废止": "enacted",
|
||||||
|
"征求意见": "consultation",
|
||||||
|
"报批": "draft",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class CatarcCrawler(BaseCrawler):
    """Scrape the CATARC automotive standard list page."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Collect up to ``limit`` standards from the paginated list page.

        Pages are requested as ``?page=1, 2, ...`` until ``limit`` events are
        collected, a request fails, a page contributes no new standard, or the
        page cap ``max(10, limit)`` is reached. A failed request is logged and
        ends the crawl with whatever was gathered so far.
        """
        events: list[RawEvent] = []
        seen_codes: set[str] = set()
        page = 1
        max_pages = max(10, limit)

        while len(events) < limit and page <= max_pages:
            url = f"{_BASE_URL}?page={page}"
            try:
                resp = httpx.get(url, timeout=30, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                logger.warning("CATARC fetch failed page={} err={}", page, exc)
                break

            soup = BeautifulSoup(resp.text, "lxml")
            fresh: list[RawEvent] = []
            for row in soup.select("table tr"):
                event = self._parse_row(row, url)
                if event is None or event.standard_code in seen_codes:
                    continue
                seen_codes.add(event.standard_code)
                fresh.append(event)

            # Stop when a page adds nothing new. This also covers a server
            # that ignores `?page=` and keeps returning the same rows, which
            # would otherwise fill the result with duplicates.
            if not fresh:
                break
            events.extend(fresh)
            page += 1

        return events[:limit]

    @staticmethod
    def _parse_row(row, page_url: str) -> RawEvent | None:
        """Turn one table row into a ``RawEvent``; ``None`` for rows with < 3 cells."""
        cells = row.find_all("td")
        if len(cells) < 3:
            # Header rows (th-only) and malformed rows land here.
            return None

        link = cells[0].find("a")
        standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True)
        title = cells[1].get_text(strip=True)
        published_at = parse_date(cells[2].get_text(strip=True))
        status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
        status = _STATUS_MAP.get(status_text, "enacted")
        # Rows without a detail link point back at the list page itself.
        detail_url = urljoin(_HOST, link["href"]) if link and link.get("href") else page_url

        return RawEvent(
            source="CATARC",
            source_label="全国汽车标准化技术委员会",
            standard_code=standard_code,
            title=title,
            summary=title,
            full_text_url=detail_url,
            status=status,
            published_at=published_at,
            effective_at=None,
            category="汽车标准",
            tags=extract_tags(standard_code, title),
            raw_text=f"{standard_code} {title}",
        )
|
||||||
|
|
||||||
|
|
||||||
117
backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
Normal file
117
backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from email.utils import parsedate_to_datetime
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||||
|
from ._utils import parse_date
|
||||||
|
|
||||||
|
_EURLEX_RSS_URLS = [
|
||||||
|
"https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
|
||||||
|
]
|
||||||
|
|
||||||
|
_AUTOMOTIVE_KEYWORDS = [
|
||||||
|
"vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
|
||||||
|
"AI Act", "artificial intelligence", "cybersecurity", "software update",
|
||||||
|
"R155", "R156", "汽车", "车辆",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]
|
||||||
|
|
||||||
|
|
||||||
|
def _is_automotive_relevant(title: str, description: str) -> bool:
    """Tell whether the title or description mentions any automotive keyword.

    Matching is a case-insensitive substring test against
    ``_AUTOMOTIVE_KEYWORDS_LOWER``.
    """
    haystack = " ".join((title, description)).lower()
    for keyword in _AUTOMOTIVE_KEYWORDS_LOWER:
        if keyword in haystack:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_celex(url: str) -> str:
|
||||||
|
m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
|
||||||
|
return m.group(1) if m else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_rss_date(rfc2822: str) -> str:
|
||||||
|
try:
|
||||||
|
dt = parsedate_to_datetime(rfc2822)
|
||||||
|
return dt.date().isoformat()
|
||||||
|
except Exception:
|
||||||
|
return parse_date(rfc2822)
|
||||||
|
|
||||||
|
|
||||||
|
class EurlexCrawler(BaseCrawler):
    """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return up to ``limit`` automotive-relevant items from the RSS feeds.

        Feeds in ``_EURLEX_RSS_URLS`` are read in order. A feed that cannot be
        downloaded is logged and skipped. Items that match no automotive
        keyword are dropped.
        """
        events: list[RawEvent] = []
        for rss_url in _EURLEX_RSS_URLS:
            if len(events) >= limit:
                break
            try:
                resp = httpx.get(rss_url, timeout=30, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc)
                continue

            soup = BeautifulSoup(resp.content, "lxml-xml")
            for item in soup.find_all("item"):
                if len(events) >= limit:
                    break

                title = self._child_text(item, "title")
                description = self._child_text(item, "description")
                link = self._child_text(item, "link")
                pub_date = self._child_text(item, "pubDate")

                if not _is_automotive_relevant(title, description):
                    continue

                celex = _extract_celex(link)
                # Without a CELEX number the truncated title stands in as the code.
                standard_code = celex if celex else title[:60]
                # `published_at` is expected to be a YYYY-MM-DD string, so a
                # missing pubDate falls back to parse_date's default (today)
                # instead of an empty string.
                published_at = _parse_rss_date(pub_date) if pub_date else parse_date("")

                events.append(RawEvent(
                    source="EUR-Lex",
                    source_label="欧盟官方公报",
                    standard_code=standard_code,
                    title=title,
                    summary=description[:500],
                    full_text_url=link,
                    status="enacted",
                    published_at=published_at,
                    effective_at=None,
                    category="EU法规",
                    tags=_extract_eurlex_tags(title, description),
                    raw_text=f"{title}\n{description}",
                ))

        return events[:limit]

    @staticmethod
    def _child_text(item, tag_name: str) -> str:
        """Return the stripped text of ``item``'s first ``tag_name`` child, or ``""``."""
        tag = item.find(tag_name)
        return tag.get_text(strip=True) if tag else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_eurlex_tags(title: str, description: str) -> list[str]:
|
||||||
|
combined = title + " " + description
|
||||||
|
tag_map = {
|
||||||
|
"AI Act": "EU AI Act",
|
||||||
|
"artificial intelligence": "EU AI Act",
|
||||||
|
"R155": "UN R155",
|
||||||
|
"R156": "UN R156",
|
||||||
|
"cybersecurity": "网络安全",
|
||||||
|
"emission": "排放",
|
||||||
|
"autonomous": "自动驾驶",
|
||||||
|
"ADAS": "ADAS",
|
||||||
|
}
|
||||||
|
combined_lower = combined.lower()
|
||||||
|
tags = []
|
||||||
|
for kw, tag in tag_map.items():
|
||||||
|
if kw.lower() in combined_lower:
|
||||||
|
tags.append(tag)
|
||||||
|
return tags[:5]
|
||||||
@@ -0,0 +1,92 @@
|
|||||||
|
"""Crawlers for the 国标委 (SAMR) standard information platform."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||||
|
from ._utils import extract_tags, parse_date
|
||||||
|
|
||||||
|
_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
|
||||||
|
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
    """Request one result page from the 国标委 list endpoint.

    Returns the ``rows`` list of the JSON response. Any failure (network,
    HTTP status, JSON decoding, unexpected payload shape) is logged as a
    warning and reported as an empty list.
    """
    query = {
        "p.p1": std_type,
        "p.p2": "车",  # presumably the search keyword — TODO confirm
        "p.p90": "circulation_date",
        "p.p91": "desc",
        "p.p6": page,
        "p.p7": page_size,
    }
    try:
        response = httpx.get(_BASE_URL, params=query, headers=_HEADERS, timeout=30)
        response.raise_for_status()
        payload = response.json()
        # A missing or null "rows" entry counts as an empty page.
        return payload.get("rows") or []
    except Exception as exc:
        logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
    """Convert one result row from the 国标委 endpoint into a ``RawEvent``.

    Fields that are missing *or* null in the row are treated the same way, so
    a JSON ``null`` no longer crashes the conversion. The status is derived
    from ``std_status``: "征求意见" -> consultation, "报批"/"草案" -> draft,
    anything else -> enacted.
    """
    # `.get(key, default)` only covers absent keys; `or` also covers nulls.
    standard_code = row.get("std_code") or ""
    title = row.get("std_name") or standard_code
    published_at = parse_date(row.get("release_date") or "")
    effective_at_raw = row.get("implement_date") or ""
    effective_at = parse_date(effective_at_raw) if effective_at_raw else None

    status_text = row.get("std_status") or ""
    if "征求意见" in status_text:
        status = "consultation"
    elif "报批" in status_text or "草案" in status_text:
        status = "draft"
    else:
        status = "enacted"

    record_id = row.get("id")
    if record_id is None:
        record_id = ""

    return RawEvent(
        source="国标委",
        source_label=source_label,
        standard_code=standard_code,
        title=title,
        summary=title,
        full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={record_id}",
        status=status,
        published_at=published_at,
        effective_at=effective_at,
        category=row.get("std_type") or "国家标准",
        tags=extract_tags(standard_code, title),
        raw_text=f"{standard_code} {title}",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_standards(std_type: int, source_label: str, limit: int) -> list[RawEvent]:
    """Page through the 国标委 list for one standard type, up to ``limit`` events.

    Pages of 20 rows are requested until ``limit`` events are collected, a
    page comes back empty, or the page cap ``max(10, limit)`` is reached.
    """
    events: list[RawEvent] = []
    page = 1
    max_pages = max(10, limit)
    while len(events) < limit and page <= max_pages:
        rows = _fetch_page(std_type=std_type, page=page, page_size=20)
        if not rows:
            break
        events.extend(_row_to_raw_event(r, source_label) for r in rows)
        page += 1
    return events[:limit]


class GuobiaoMandatoryCrawler(BaseCrawler):
    """Fetch mandatory national standards (强制性) related to vehicles."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return up to ``limit`` events from the mandatory list (``std_type=1``)."""
        return _fetch_standards(1, "国标委·强制性", limit)


class GuobiaoRecommendedCrawler(BaseCrawler):
    """Fetch recommended national standards (推荐性) related to vehicles."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return up to ``limit`` events from the recommended list (``std_type=2``)."""
        return _fetch_standards(2, "国标委·推荐性", limit)
|
||||||
241
backend/app/infrastructure/perception/llm_pipeline.py
Normal file
241
backend/app/infrastructure/perception/llm_pipeline.py
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
"""LLM-driven pipeline for regulatory event enrichment."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from app.config.settings import settings
|
||||||
|
from app.infrastructure.embedding.openai_compatible_embedding_provider import (
|
||||||
|
OpenAICompatibleEmbeddingProvider,
|
||||||
|
)
|
||||||
|
from app.services.llm.llm_factory import get_llm_client
|
||||||
|
|
||||||
|
_EXTRACT_SYSTEM = (
|
||||||
|
"You are a regulatory compliance expert specialising in automotive standards "
|
||||||
|
"(GB, UN-ECE, ISO, EU). Extract structured information from regulation text. "
|
||||||
|
"Return valid JSON only — no markdown fences, no extra keys."
|
||||||
|
)
|
||||||
|
|
||||||
|
_ASSESS_SYSTEM = (
|
||||||
|
"You are an automotive compliance analyst. Given a regulation and related document excerpts, "
|
||||||
|
"identify which documents are affected and what actions are required. "
|
||||||
|
"Return a JSON array only."
|
||||||
|
)
|
||||||
|
|
||||||
|
_DIFF_SYSTEM = (
|
||||||
|
"You are a regulatory change analyst. Given an old and new version of a regulation paragraph, "
|
||||||
|
"classify the type of change and summarise it. "
|
||||||
|
"Return JSON only: {\"change_type\": \"tightened|relaxed|added|removed\", \"summary\": \"...\"}"
|
||||||
|
)
|
||||||
|
|
||||||
|
_SIMILARITY_THRESHOLD = 0.85
|
||||||
|
|
||||||
|
|
||||||
|
def _cosine(a: list[float], b: list[float]) -> float:
|
||||||
|
dot = sum(x * y for x, y in zip(a, b))
|
||||||
|
norm_a = math.sqrt(sum(x * x for x in a))
|
||||||
|
norm_b = math.sqrt(sum(x * x for x in b))
|
||||||
|
if norm_a == 0 or norm_b == 0:
|
||||||
|
return 0.0
|
||||||
|
return dot / (norm_a * norm_b)
|
||||||
|
|
||||||
|
|
||||||
|
def _llm_json(client: Any, messages: list[dict]) -> Any:
|
||||||
|
"""Call LLM and parse JSON response; return None on failure."""
|
||||||
|
try:
|
||||||
|
resp = client.chat(messages)
|
||||||
|
text = (resp.content or "").strip()
|
||||||
|
if text.startswith("```"):
|
||||||
|
text = text.split("```")[1]
|
||||||
|
if text.startswith("json"):
|
||||||
|
text = text[4:]
|
||||||
|
return json.loads(text)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("LLM JSON parse failed: {}", exc)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class LlmPipeline:
    """Three-step enrichment pipeline for crawled regulatory events.

    The steps are independent methods: :meth:`extract_structure` (LLM
    extraction), :meth:`assess_impact` (retrieval plus LLM assessment) and
    :meth:`compute_diff` (embedding comparison plus LLM classification).
    Parsed LLM output is validated before use because its shape is not
    guaranteed.
    """

    # Accepted values for the ``impact_level`` field.
    _IMPACT_LEVELS = ("high", "medium", "low")

    def __init__(self) -> None:
        self._client = get_llm_client(
            provider=settings.llm_provider,
            model=settings.llm_model,
        )
        self._embedder = OpenAICompatibleEmbeddingProvider()

    @staticmethod
    def _diff_threshold() -> float:
        """Return the similarity below which two paragraphs count as changed."""
        # NOTE(review): assumes PERCEPTION_DIFF_SIMILARITY_THRESHOLD is exposed
        # as this settings attribute - confirm. Falls back to the module
        # default when the attribute is absent.
        configured = getattr(settings, "perception_diff_similarity_threshold", None)
        if configured is None:
            return _SIMILARITY_THRESHOLD
        return float(configured)

    # ------------------------------------------------------------------
    # Step 1: Structure extraction
    # ------------------------------------------------------------------

    def extract_structure(self, event: dict) -> dict:
        """Extract obligations, deadlines, scope, penalties and impact level.

        Args:
            event: Event dict; ``standard_code``, ``title``, ``source_label``,
                ``summary`` and ``tags`` are read when present.

        Returns:
            A dict that always contains ``obligations`` (list), ``deadlines``
            (list), ``scope``, ``penalties`` and ``impact_level`` (one of
            ``high``/``medium``/``low``). Defaults are used for anything the
            LLM omitted or returned in an unusable shape.
        """
        prompt = f"""Extract structured compliance information from this regulation:

Standard: {event.get('standard_code', '')}
Title: {event.get('title', '')}
Source: {event.get('source_label', '')}
Summary: {event.get('summary', '')}
Tags: {', '.join(event.get('tags') or [])}

Return JSON with exactly these keys:
{{
  "obligations": [{{"text": "...", "deontic": "must|shall|may|prohibited", "subject": "...", "object": "...", "condition": ""}}],
  "deadlines": [{{"date": "YYYY-MM-DD or null", "description": "..."}}],
  "scope": "one sentence describing who/what this applies to",
  "penalties": "one sentence on consequences of non-compliance, or null",
  "impact_level": "high|medium|low"
}}"""

        messages = [
            {"role": "system", "content": _EXTRACT_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        defaults = {
            "obligations": [],
            "deadlines": [],
            "scope": "",
            "penalties": "",
            "impact_level": "medium",
        }
        result = _llm_json(self._client, messages)
        if not isinstance(result, dict):
            return defaults

        # Fill keys the model left out so callers can rely on all five.
        structured = {**defaults, **result}
        for list_key in ("obligations", "deadlines"):
            if not isinstance(structured[list_key], list):
                structured[list_key] = []
        level = str(structured["impact_level"] or "").strip().lower()
        structured["impact_level"] = level if level in self._IMPACT_LEVELS else "medium"
        return structured

    # ------------------------------------------------------------------
    # Step 2: Impact assessment
    # ------------------------------------------------------------------

    def assess_impact(self, event: dict, retrieval_service: Any) -> list[dict]:
        """Use retrieval to find affected documents and generate recommendations.

        Args:
            event: Event dict; ``obligations``, ``standard_code``, ``title``
                and ``summary`` are read when present.
            retrieval_service: Object exposing ``retrieve(query=..., top_k=...)``
                returning chunks with ``doc_id``, ``doc_title``, ``score`` and
                ``text`` attributes (``section_title`` is optional).

        Returns:
            The LLM's list of per-document assessments, with ``score``
            overwritten by the retrieval score for known documents. Falls back
            to the raw document excerpts when the LLM reply is not a list, and
            to ``[]`` when retrieval fails or finds nothing.
        """
        obligations = event.get("obligations") or []
        if not isinstance(obligations, list):
            obligations = []
        # Entries come from earlier LLM output, so skip anything that is not a dict.
        obligation_texts = " ".join(
            str(item.get("text", "")) for item in obligations[:3] if isinstance(item, dict)
        )
        query = f"{event.get('standard_code', '')} {event.get('title', '')} {obligation_texts}"

        try:
            chunks = retrieval_service.retrieve(query=query, top_k=5)
        except Exception as exc:
            logger.warning("RAG retrieval failed: {}", exc)
            return []

        if not chunks:
            return []

        # Keep only the first (highest ranked as returned) chunk per document.
        seen: set[str] = set()
        doc_excerpts: list[dict] = []
        for chunk in chunks:
            if chunk.doc_id in seen:
                continue
            seen.add(chunk.doc_id)
            doc_excerpts.append({
                "doc_id": chunk.doc_id,
                "doc_name": chunk.doc_title,
                "score": round(float(chunk.score if chunk.score is not None else 0), 4),
                "snippet": (chunk.text or "")[:300],
                "clause": getattr(chunk, "section_title", "") or "",
            })

        context = "\n".join(
            f"[{d['doc_name']} {d['clause']}] score={d['score']}: {d['snippet']}"
            for d in doc_excerpts
        )
        # Default to '' so a missing field is not rendered as the text "None".
        prompt = f"""Regulation: {event.get('standard_code', '')} — {event.get('title', '')}
Obligations: {obligation_texts or event.get('summary', '')}

Affected documents found in knowledge base:
{context}

For each document, assess impact and recommend action. Return JSON array:
[{{"doc_id":"...","doc_name":"...","score":0.0,"key_clauses":"...","recommendation":"one sentence action"}}]"""

        messages = [
            {"role": "system", "content": _ASSESS_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        result = _llm_json(self._client, messages)
        if not isinstance(result, list):
            return doc_excerpts

        # Trust the retrieval score over whatever number the model echoed back.
        score_map = {d["doc_id"]: d["score"] for d in doc_excerpts}
        for item in result:
            if isinstance(item, dict) and item.get("doc_id") in score_map:
                item["score"] = score_map[item["doc_id"]]
        return result

    # ------------------------------------------------------------------
    # Step 3: Semantic diff
    # ------------------------------------------------------------------

    def compute_diff(self, old_text: str, new_text: str) -> dict:
        """Compare old and new regulation text paragraph by paragraph.

        Paragraphs are non-empty lines and are compared by position, so an
        insertion early in the text shifts every later pair. A pair whose
        embedding similarity is below the threshold is sent to the LLM for
        classification.

        Returns:
            ``{"changed_sections": [...], "change_summary": "..."}``. Each
            section has ``old_text``, ``new_text``, ``similarity``,
            ``change_type`` and ``summary``.
        """
        old_paras = [p.strip() for p in old_text.split("\n") if p.strip()]
        new_paras = [p.strip() for p in new_text.split("\n") if p.strip()]

        if not old_paras or not new_paras:
            return {"changed_sections": [], "change_summary": "No comparable text."}

        all_paras = old_paras + new_paras
        unavailable = {
            "changed_sections": [],
            "change_summary": "Diff unavailable (embedding error).",
        }
        try:
            all_embeddings = self._embedder.embed_texts(all_paras)
        except Exception as exc:
            logger.warning("Embedding for diff failed: {}", exc)
            return unavailable
        if len(all_embeddings) != len(all_paras):
            # Indexing below relies on exactly one embedding per paragraph.
            logger.warning(
                "Embedding for diff failed: {}",
                f"expected {len(all_paras)} embeddings, got {len(all_embeddings)}",
            )
            return unavailable

        old_embeddings = all_embeddings[: len(old_paras)]
        new_embeddings = all_embeddings[len(old_paras):]
        threshold = self._diff_threshold()
        paired = min(len(old_paras), len(new_paras))

        changed_sections: list[dict] = []
        for i in range(paired):
            sim = _cosine(old_embeddings[i], new_embeddings[i])
            if sim >= threshold:
                continue
            messages = [
                {"role": "system", "content": _DIFF_SYSTEM},
                {"role": "user", "content": f"OLD: {old_paras[i][:500]}\nNEW: {new_paras[i][:500]}"},
            ]
            classification = _llm_json(self._client, messages)
            if not isinstance(classification, dict):
                # The model may answer with a list/string; treat as unclassified.
                classification = {}
            change_type = classification.get("change_type")
            if not isinstance(change_type, str) or not change_type:
                change_type = "modified"
            summary = classification.get("summary")
            changed_sections.append({
                "old_text": old_paras[i][:300],
                "new_text": new_paras[i][:300],
                "similarity": round(sim, 3),
                "change_type": change_type,
                "summary": summary if isinstance(summary, str) else "",
            })

        # Unpaired tail: only one of these two loops can run.
        for extra in new_paras[paired:]:
            changed_sections.append({
                "old_text": "",
                "new_text": extra[:300],
                "similarity": 0.0,
                "change_type": "added",
                "summary": "New paragraph added.",
            })
        for extra in old_paras[paired:]:
            changed_sections.append({
                "old_text": extra[:300],
                "new_text": "",
                "similarity": 0.0,
                "change_type": "removed",
                "summary": "Paragraph removed.",
            })

        if not changed_sections:
            change_summary = "No substantive changes detected between versions."
        else:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the summary text is stable between runs.
            distinct_types = list(dict.fromkeys(s["change_type"] for s in changed_sections))
            change_summary = (
                f"{len(changed_sections)} paragraph(s) changed: "
                + ", ".join(distinct_types)
                + ". "
                + changed_sections[0]["summary"]
            )

        return {"changed_sections": changed_sections, "change_summary": change_summary}
|
||||||
@@ -4,6 +4,8 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||||||
|
|
||||||
MOCK_EVENTS: list[dict[str, Any]] = [
|
MOCK_EVENTS: list[dict[str, Any]] = [
|
||||||
# ------------------------------------------------------------------ HIGH
|
# ------------------------------------------------------------------ HIGH
|
||||||
{
|
{
|
||||||
@@ -379,18 +381,18 @@ MOCK_EVENTS: list[dict[str, Any]] = [
|
|||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
# Index for fast lookup
|
class MockEventStore(BaseEventStore):
|
||||||
_EVENT_INDEX: dict[str, dict] = {e["id"]: e for e in MOCK_EVENTS}
|
|
||||||
|
|
||||||
|
|
||||||
class MockEventStore:
|
|
||||||
"""In-memory mock store for regulatory events."""
|
"""In-memory mock store for regulatory events."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self._events: list[dict] = [dict(e) for e in MOCK_EVENTS]
|
||||||
|
self._index: dict[str, dict] = {e["id"]: e for e in self._events}
|
||||||
|
|
||||||
def all(self) -> list[dict]:
|
def all(self) -> list[dict]:
|
||||||
return list(MOCK_EVENTS)
|
return list(self._events)
|
||||||
|
|
||||||
def get(self, event_id: str) -> dict | None:
|
def get(self, event_id: str) -> dict | None:
|
||||||
return _EVENT_INDEX.get(event_id)
|
return self._index.get(event_id)
|
||||||
|
|
||||||
def filter(
|
def filter(
|
||||||
self,
|
self,
|
||||||
@@ -399,23 +401,39 @@ class MockEventStore:
|
|||||||
impact_level: str | None = None,
|
impact_level: str | None = None,
|
||||||
limit: int = 50,
|
limit: int = 50,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
events = list(MOCK_EVENTS)
|
events = list(self._events)
|
||||||
if source:
|
if source:
|
||||||
events = [e for e in events if e["source"] == source]
|
events = [e for e in events if e["source"] == source]
|
||||||
if impact_level:
|
if impact_level:
|
||||||
events = [e for e in events if e["impact_level"] == impact_level]
|
events = [e for e in events if e["impact_level"] == impact_level]
|
||||||
events.sort(key=lambda e: e["published_at"], reverse=True)
|
events.sort(key=lambda e: e.get("published_at") or "", reverse=True)
|
||||||
return events[:limit]
|
return events[:limit]
|
||||||
|
|
||||||
def stats(self) -> dict:
|
def stats(self) -> dict:
|
||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
|
|
||||||
events = MOCK_EVENTS
|
events = self._events
|
||||||
cutoff = (date.today() - timedelta(days=90)).isoformat()
|
cutoff = (date.today() - timedelta(days=90)).isoformat()
|
||||||
return {
|
return {
|
||||||
"total": len(events),
|
"total": len(events),
|
||||||
"high_impact": sum(1 for e in events if e["impact_level"] == "high"),
|
"high_impact": sum(1 for e in events if e["impact_level"] == "high"),
|
||||||
"medium_impact": sum(1 for e in events if e["impact_level"] == "medium"),
|
"medium_impact": sum(1 for e in events if e["impact_level"] == "medium"),
|
||||||
"low_impact": sum(1 for e in events if e["impact_level"] == "low"),
|
"low_impact": sum(1 for e in events if e["impact_level"] == "low"),
|
||||||
"recent_90d": sum(1 for e in events if e["published_at"] >= cutoff),
|
"recent_90d": sum(1 for e in events if (e.get("published_at") or "") >= cutoff),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def upsert(self, event: dict) -> None:
    """Insert or update an event in the in-memory list (used in tests).

    An event whose ``id`` is already indexed is updated in place with the
    given keys. A new event is stored as a shallow copy, the same way
    ``__init__`` copies the seed events, so later changes to the caller's
    dict do not leak into the store.

    Raises:
        KeyError: If ``event`` has no ``"id"`` key.
    """
    event_id = event["id"]
    existing = self._index.get(event_id)
    if existing is not None:
        existing.update(event)
        return
    stored = dict(event)
    self._events.append(stored)
    self._index[event_id] = stored
|
||||||
|
|
||||||
|
def get_by_standard_code(self, standard_code: str) -> dict | None:
    """Return the most recently published event with this ``standard_code``.

    Returns ``None`` when no event matches. A missing or ``None``
    ``published_at`` sorts as the empty string (oldest), the same way
    ``filter`` and ``stats`` treat it, so it no longer raises a ``TypeError``
    when compared with real dates.
    """
    matches = [e for e in self._events if e.get("standard_code") == standard_code]
    if not matches:
        return None
    return max(matches, key=lambda e: e.get("published_at") or "")
|
||||||
|
|||||||
225
backend/app/infrastructure/perception/postgres_event_store.py
Normal file
225
backend/app/infrastructure/perception/postgres_event_store.py
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
"""PostgreSQL-backed regulatory event store."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from datetime import UTC, date, datetime, timedelta
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
import psycopg2.extras
|
||||||
|
from psycopg2.pool import ThreadedConnectionPool
|
||||||
|
|
||||||
|
from app.config.settings import settings
|
||||||
|
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||||||
|
|
||||||
|
_CREATE_TABLE = """
|
||||||
|
CREATE TABLE IF NOT EXISTS regulation_events (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
source TEXT NOT NULL,
|
||||||
|
source_label TEXT,
|
||||||
|
standard_code TEXT NOT NULL,
|
||||||
|
title TEXT NOT NULL,
|
||||||
|
summary TEXT,
|
||||||
|
full_text_url TEXT,
|
||||||
|
status TEXT,
|
||||||
|
impact_level TEXT,
|
||||||
|
published_at DATE,
|
||||||
|
effective_at DATE,
|
||||||
|
category TEXT,
|
||||||
|
tags TEXT[],
|
||||||
|
obligations JSONB,
|
||||||
|
deadlines JSONB,
|
||||||
|
scope TEXT,
|
||||||
|
penalties TEXT,
|
||||||
|
content_hash TEXT,
|
||||||
|
previous_hash TEXT,
|
||||||
|
change_summary TEXT,
|
||||||
|
changed_sections JSONB,
|
||||||
|
affected_docs JSONB,
|
||||||
|
crawled_at TIMESTAMPTZ DEFAULT now(),
|
||||||
|
processed_at TIMESTAMPTZ,
|
||||||
|
raw_storage_key TEXT
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS reg_events_source_date
|
||||||
|
ON regulation_events (source, published_at DESC);
|
||||||
|
CREATE INDEX IF NOT EXISTS reg_events_impact_date
|
||||||
|
ON regulation_events (impact_level, published_at DESC);
|
||||||
|
"""
|
||||||
|
|
||||||
|
_ALL_COLUMNS = (
|
||||||
|
"id", "source", "source_label", "standard_code", "title", "summary",
|
||||||
|
"full_text_url", "status", "impact_level", "published_at", "effective_at",
|
||||||
|
"category", "tags", "obligations", "deadlines", "scope", "penalties",
|
||||||
|
"content_hash", "previous_hash", "change_summary", "changed_sections",
|
||||||
|
"affected_docs", "crawled_at", "processed_at", "raw_storage_key",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _row_to_dict(row: dict[str, Any]) -> dict:
|
||||||
|
"""Convert a psycopg2 RealDictRow to a plain dict with serialized JSON fields."""
|
||||||
|
d = dict(row)
|
||||||
|
for field in ("obligations", "deadlines", "changed_sections", "affected_docs"):
|
||||||
|
val = d.get(field)
|
||||||
|
if isinstance(val, str):
|
||||||
|
d[field] = json.loads(val)
|
||||||
|
for date_field in ("published_at", "effective_at"):
|
||||||
|
val = d.get(date_field)
|
||||||
|
if isinstance(val, datetime):
|
||||||
|
d[date_field] = val.date().isoformat()
|
||||||
|
elif isinstance(val, date):
|
||||||
|
d[date_field] = val.isoformat()
|
||||||
|
for ts_field in ("crawled_at", "processed_at"):
|
||||||
|
val = d.get(ts_field)
|
||||||
|
if isinstance(val, datetime):
|
||||||
|
d[ts_field] = val.isoformat()
|
||||||
|
return d
|
||||||
|
|
||||||
|
|
||||||
|
class PostgresEventStore(BaseEventStore):
    """Regulatory event store backed by PostgreSQL.

    Connections come from a small threaded pool; the ``regulation_events``
    table and its indexes are created on construction if they do not exist.
    """

    # Columns stored as JSONB; values are serialised with json.dumps on write.
    _JSON_COLUMNS = ("obligations", "deadlines", "changed_sections", "affected_docs")

    def __init__(self) -> None:
        self._pool = ThreadedConnectionPool(
            minconn=1,
            maxconn=5,
            host=settings.postgres_host,
            port=settings.postgres_port,
            user=settings.postgres_user,
            password=settings.postgres_password,
            dbname=settings.postgres_db,
        )
        try:
            self._ensure_schema()
        except Exception:
            # Do not leave pooled connections open behind a failed constructor.
            self._pool.closeall()
            raise

    def _ensure_schema(self) -> None:
        """Create the events table and indexes when they are missing."""
        with self._conn() as conn:
            try:
                with conn.cursor() as cur:
                    cur.execute(_CREATE_TABLE)
                conn.commit()
            except Exception:
                conn.rollback()
                raise

    @contextmanager
    def _conn(self):
        """Yield a pooled connection and always hand it back to the pool."""
        conn = None
        try:
            conn = self._pool.getconn()
            yield conn
        finally:
            if conn is not None:
                self._pool.putconn(conn)

    def all(self) -> list[dict]:
        """Return every event, newest ``published_at`` first (NULLs last)."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    "SELECT * FROM regulation_events ORDER BY published_at DESC NULLS LAST"
                )
                return [_row_to_dict(r) for r in cur.fetchall()]

    def get(self, event_id: str) -> dict | None:
        """Return the event with this id, or ``None`` when it does not exist."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    "SELECT * FROM regulation_events WHERE id = %s", (event_id,)
                )
                row = cur.fetchone()
                return _row_to_dict(row) if row else None

    def filter(
        self,
        *,
        source: str | None = None,
        impact_level: str | None = None,
        limit: int = 50,
    ) -> list[dict]:
        """Return up to ``limit`` events, newest first, optionally filtered.

        Falsy ``source`` / ``impact_level`` values mean "no filter". Only
        fixed condition fragments are placed in the SQL text; all values are
        bound as parameters.
        """
        conditions: list[str] = []
        params: list[Any] = []
        if source:
            conditions.append("source = %s")
            params.append(source)
        if impact_level:
            conditions.append("impact_level = %s")
            params.append(impact_level)
        where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
        params.append(limit)
        sql = f"""
            SELECT * FROM regulation_events
            {where}
            ORDER BY published_at DESC NULLS LAST
            LIMIT %s
        """
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, params)
                return [_row_to_dict(r) for r in cur.fetchall()]

    def stats(self) -> dict:
        """Return event counts: total, per impact level, and last 90 days.

        All counts come from one statement, so they describe the same
        snapshot of the table. The keys match ``MockEventStore.stats``:
        ``total``, ``high_impact``, ``medium_impact``, ``low_impact`` and
        ``recent_90d``.
        """
        cutoff = (date.today() - timedelta(days=90)).isoformat()
        sql = """
            SELECT
                COUNT(*) AS total,
                COUNT(*) FILTER (WHERE impact_level = 'high') AS high_impact,
                COUNT(*) FILTER (WHERE impact_level = 'medium') AS medium_impact,
                COUNT(*) FILTER (WHERE impact_level = 'low') AS low_impact,
                COUNT(*) FILTER (WHERE published_at >= %s) AS recent_90d
            FROM regulation_events
        """
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, (cutoff,))
                counts = cur.fetchone() or {}
        return {
            key: int(counts.get(key) or 0)
            for key in ("total", "high_impact", "medium_impact", "low_impact", "recent_90d")
        }

    def upsert(self, event: dict) -> None:
        """Insert a regulation event, or update it when the id already exists.

        Only keys of ``event`` that are known columns are written; on conflict
        only those columns are overwritten. Column names come from the fixed
        ``_ALL_COLUMNS`` whitelist and values are bound as parameters.
        """
        cols = [c for c in _ALL_COLUMNS if c in event]
        placeholders = ", ".join(f"%({c})s" for c in cols)
        updates = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols if c != "id")
        # With nothing but the id supplied there is nothing to overwrite, and
        # an empty SET list would be a syntax error.
        conflict_action = f"DO UPDATE SET {updates}" if updates else "DO NOTHING"
        sql = f"""
            INSERT INTO regulation_events ({', '.join(cols)})
            VALUES ({placeholders})
            ON CONFLICT (id) {conflict_action}
        """
        row: dict[str, Any] = {}
        for c in cols:
            val = event.get(c)
            if c in self._JSON_COLUMNS and val is not None:
                row[c] = json.dumps(val, ensure_ascii=False)
            else:
                # Includes ``tags``: the list is passed through unchanged.
                row[c] = val
        with self._conn() as conn:
            try:
                with conn.cursor() as cur:
                    cur.execute(sql, row)
                conn.commit()
            except Exception:
                conn.rollback()
                raise

    def get_by_standard_code(self, standard_code: str) -> dict | None:
        """Return the most recently published event with this standard code."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    """SELECT * FROM regulation_events
                       WHERE standard_code = %s
                       ORDER BY published_at DESC NULLS LAST
                       LIMIT 1""",
                    (standard_code,),
                )
                row = cur.fetchone()
                return _row_to_dict(row) if row else None
|
||||||
@@ -19,6 +19,15 @@ from app.infrastructure.parser.local_chunk_builder import LocalRegulationChunkBu
|
|||||||
from app.infrastructure.parser.local_document_parser import LocalDocumentParser
|
from app.infrastructure.parser.local_document_parser import LocalDocumentParser
|
||||||
from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder
|
from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder
|
||||||
from app.infrastructure.perception.mock_event_store import MockEventStore
|
from app.infrastructure.perception.mock_event_store import MockEventStore
|
||||||
|
from app.application.perception.crawl_service import CrawlService
|
||||||
|
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||||||
|
from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
|
||||||
|
from app.infrastructure.perception.crawlers.guobiao_crawler import (
|
||||||
|
GuobiaoMandatoryCrawler,
|
||||||
|
GuobiaoRecommendedCrawler,
|
||||||
|
)
|
||||||
|
from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
|
||||||
|
from app.infrastructure.perception.llm_pipeline import LlmPipeline
|
||||||
from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore
|
from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore
|
||||||
from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore
|
from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore
|
||||||
from app.infrastructure.storage.json_document_repository import JsonDocumentRepository
|
from app.infrastructure.storage.json_document_repository import JsonDocumentRepository
|
||||||
@@ -293,11 +302,35 @@ def get_agent_conversation_service() -> AgentConversationService:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
def get_event_store() -> BaseEventStore:
    """Build the event store chosen by the DOCUMENT_REPOSITORY_BACKEND setting.

    ``"postgres"`` selects the PostgreSQL store; any other value selects the
    in-memory mock store. The result is cached, so every caller shares one
    instance.
    """
    if settings.document_repository_backend != "postgres":
        return MockEventStore()

    # Imported only on this branch, as in the original code.
    from app.infrastructure.perception.postgres_event_store import PostgresEventStore

    return PostgresEventStore()
|
||||||
|
|
||||||
|
|
||||||
@lru_cache
|
@lru_cache
|
||||||
def get_perception_service() -> PerceptionService:
|
def get_perception_service() -> PerceptionService:
|
||||||
"""Return perception service for regulatory intelligence."""
|
|
||||||
return PerceptionService(
|
return PerceptionService(
|
||||||
event_store=MockEventStore(),
|
event_store=get_event_store(),
|
||||||
|
retrieval_service=get_retrieval_service(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
|
||||||
|
def get_crawl_service() -> CrawlService:
|
||||||
|
crawlers = {
|
||||||
|
"CATARC": CatarcCrawler(),
|
||||||
|
"国标委·强制性": GuobiaoMandatoryCrawler(),
|
||||||
|
"国标委·推荐性": GuobiaoRecommendedCrawler(),
|
||||||
|
"EUR-Lex": EurlexCrawler(),
|
||||||
|
}
|
||||||
|
return CrawlService(
|
||||||
|
crawlers=crawlers,
|
||||||
|
event_store=get_event_store(),
|
||||||
|
llm_pipeline=LlmPipeline(),
|
||||||
retrieval_service=get_retrieval_service(),
|
retrieval_service=get_retrieval_service(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ pydantic-settings>=2.0.0
|
|||||||
python-dotenv>=1.0.0
|
python-dotenv>=1.0.0
|
||||||
loguru>=0.7.0
|
loguru>=0.7.0
|
||||||
httpx>=0.25.0
|
httpx>=0.25.0
|
||||||
|
beautifulsoup4>=4.12.0
|
||||||
|
lxml>=5.0.0
|
||||||
tiktoken>=0.5.0
|
tiktoken>=0.5.0
|
||||||
tenacity>=8.2.0
|
tenacity>=8.2.0
|
||||||
|
|
||||||
|
|||||||
0
backend/tests/perception/__init__.py
Normal file
0
backend/tests/perception/__init__.py
Normal file
95
backend/tests/perception/test_base_event_store.py
Normal file
95
backend/tests/perception/test_base_event_store.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
"""Contract tests: any BaseEventStore implementation must pass these."""
|
||||||
|
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||||||
|
from app.infrastructure.perception.mock_event_store import MockEventStore
|
||||||
|
|
||||||
|
|
||||||
|
def _store() -> BaseEventStore:
    """Build a fresh store instance for a single contract test."""
    store = MockEventStore()
    return store
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_base_event_store():
    """The store under test implements the BaseEventStore interface."""
    candidate = _store()
    assert isinstance(candidate, BaseEventStore)
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_returns_list():
    """all() returns a non-empty list."""
    events = _store().all()
    assert isinstance(events, list)
    assert 0 < len(events)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_known_id():
    """get() finds an event by an id taken from all()."""
    store = _store()
    known = store.all()[0]
    fetched = store.get(known["id"])
    assert fetched is not None
    assert known["id"] == fetched["id"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_unknown_returns_none():
    """get() returns None for an id that is not stored."""
    missing = _store().get("does-not-exist")
    assert missing is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_by_impact():
    """filter(impact_level=...) returns only events of that level."""
    highs = _store().filter(impact_level="high", limit=100)
    levels = [event["impact_level"] for event in highs]
    assert all(level == "high" for level in levels)
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_limit():
    """filter(limit=n) returns at most n events."""
    limited = _store().filter(limit=3)
    assert 3 >= len(limited)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stats_keys():
    """stats() exposes the counters the contract requires."""
    summary = _store().stats()
    required = ("total", "high_impact", "medium_impact", "recent_90d")
    for key in required:
        assert key in summary, f"missing key: {key}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_upsert_and_get():
    """A newly upserted event can be read back by its id."""
    store = _store()
    new_event = dict(
        id="test-upsert-001",
        source="TEST",
        source_label="Test Source",
        standard_code="TST-001",
        title="Test Event",
        summary="A test event",
        full_text_url="https://example.com",
        status="draft",
        impact_level="low",
        published_at="2026-01-01",
        effective_at=None,
        category="test",
        tags=["test"],
        content_hash="abc123",
        previous_hash=None,
    )
    store.upsert(new_event)
    fetched = store.get("test-upsert-001")
    assert fetched is not None
    assert fetched["title"] == "Test Event"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_by_standard_code():
    """get_by_standard_code() returns an event carrying the requested code."""
    store = _store()
    code = store.all()[0]["standard_code"]
    found = store.get_by_standard_code(code)
    assert found is not None
    assert found["standard_code"] == code
|
||||||
|
|
||||||
|
|
||||||
|
def test_upsert_updates_existing():
    """Upserting an existing id overwrites the stored fields."""
    store = _store()
    first = store.all()[0]
    original_id = first["id"]
    replacement = {
        "id": original_id,
        "title": "Updated Title",
        "impact_level": first["impact_level"],
        "standard_code": first.get("standard_code", ""),
        "source": first["source"],
        "source_label": first.get("source_label", ""),
        "summary": "Updated",
        "full_text_url": "",
        "status": first["status"],
        "published_at": first.get("published_at", ""),
        "effective_at": None,
        "category": first.get("category", ""),
        "tags": [],
        "content_hash": "newhash",
        "previous_hash": None,
    }
    store.upsert(replacement)
    fetched = store.get(original_id)
    assert fetched is not None
    assert fetched["title"] == "Updated Title"
|
||||||
111
backend/tests/perception/test_crawl_service.py
Normal file
111
backend/tests/perception/test_crawl_service.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
"""Integration tests for CrawlService."""
|
||||||
|
from __future__ import annotations
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
import hashlib
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.infrastructure.perception.crawlers.base import RawEvent
|
||||||
|
from app.infrastructure.perception.mock_event_store import MockEventStore
|
||||||
|
|
||||||
|
|
||||||
|
def _make_raw_event(code="TST-001"):
    """Build a RawEvent fixture whose standard code and title use *code*."""
    fields = {
        "source": "TEST",
        "source_label": "Test",
        "standard_code": code,
        "title": f"Test {code}",
        "summary": "Summary",
        "full_text_url": "https://example.com",
        "status": "enacted",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": ["test"],
        "raw_text": "full text",
    }
    return RawEvent(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_service(raw_events):
    """Build a CrawlService wired to mocks and a fresh MockEventStore.

    The single "TEST" crawler returns *raw_events*; the pipeline mock returns
    fixed, empty enrichment results.
    """
    from app.application.perception.crawl_service import CrawlService

    crawler = MagicMock()
    crawler.fetch.return_value = raw_events

    structure = {
        "obligations": [],
        "deadlines": [],
        "scope": "test",
        "penalties": None,
        "impact_level": "low",
    }
    diff = {"changed_sections": [], "change_summary": "No changes."}

    pipeline = MagicMock()
    pipeline.extract_structure.return_value = structure
    pipeline.assess_impact.return_value = []
    pipeline.compute_diff.return_value = diff

    retrieval = MagicMock()
    store = MockEventStore()

    return CrawlService(
        crawlers={"TEST": crawler},
        event_store=store,
        llm_pipeline=pipeline,
        retrieval_service=retrieval,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_crawl_yields_progress_and_done():
    """run_crawl() emits a "done" event among the events it yields."""
    service = _make_service([_make_raw_event("TST-001")])
    emitted = list(service.run_crawl())
    kinds = [item.get("event") for item in emitted]
    assert "done" in kinds
|
||||||
|
|
||||||
|
|
||||||
|
def test_crawl_upserts_to_store():
    """A crawled event ends up in the store under its standard code."""
    from app.application.perception.crawl_service import CrawlService

    store = MockEventStore()

    crawler = MagicMock()
    crawler.fetch.return_value = [_make_raw_event("NEW-001")]

    pipeline = MagicMock()
    pipeline.extract_structure.return_value = {
        "obligations": [],
        "deadlines": [],
        "scope": "",
        "penalties": None,
        "impact_level": "medium",
    }
    pipeline.assess_impact.return_value = []
    pipeline.compute_diff.return_value = {
        "changed_sections": [],
        "change_summary": "",
    }

    service = CrawlService(
        crawlers={"TEST": crawler},
        event_store=store,
        llm_pipeline=pipeline,
        retrieval_service=MagicMock(),
    )
    list(service.run_crawl())

    stored = store.get_by_standard_code("NEW-001")
    assert stored is not None
    assert stored["title"] == "Test NEW-001"
|
||||||
|
|
||||||
|
|
||||||
|
def test_crawl_skips_unchanged_events():
    """An already-stored event with an identical content hash must not be re-processed.

    The store is pre-seeded with an event whose ``content_hash`` equals the
    SHA-256 of the text the crawler is about to return, so the crawl is
    expected to skip structured extraction for it.
    """
    store = MockEventStore()
    raw = _make_raw_event("SKIP-001")
    # Hash exactly the text the mocked crawler will hand back.
    content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest()
    store.upsert({
        # Plain literal: the original used an f-string with no placeholders.
        "id": hashlib.sha256("TEST-SKIP-001".encode()).hexdigest()[:12],
        "standard_code": "SKIP-001",
        "source": "TEST",
        "source_label": "Test",
        "title": "Test SKIP-001",
        "summary": "",
        "full_text_url": "",
        "status": "enacted",
        "impact_level": "low",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": [],
        "content_hash": content_hash,
    })
    mock_pipeline = MagicMock()
    from app.application.perception.crawl_service import CrawlService

    mock_crawler = MagicMock()
    mock_crawler.fetch.return_value = [raw]
    svc = CrawlService(
        crawlers={"TEST": mock_crawler},
        event_store=store,
        llm_pipeline=mock_pipeline,
        retrieval_service=MagicMock(),
    )
    list(svc.run_crawl())
    mock_pipeline.extract_structure.assert_not_called()
|
||||||
127
backend/tests/perception/test_crawlers.py
Normal file
127
backend/tests/perception/test_crawlers.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
"""Unit tests for crawlers — mock httpx responses."""
|
||||||
|
from __future__ import annotations
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler
|
||||||
|
|
||||||
|
|
||||||
|
def test_raw_event_fields():
    """RawEvent exposes its constructor arguments as same-named attributes."""
    fields = {
        "source": "TEST",
        "source_label": "Test",
        "standard_code": "TST-001",
        "title": "Test",
        "summary": "Summary",
        "full_text_url": "https://example.com",
        "status": "enacted",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": ["a"],
        "raw_text": "full text here",
    }
    event = RawEvent(**fields)
    assert event.source == "TEST"
    assert event.tags == ["a"]
|
||||||
|
|
||||||
|
|
||||||
|
CATARC_HTML = """
|
||||||
|
<html><body>
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<td><a href="/std/detail/123">GB 18384-2025</a></td>
|
||||||
|
<td>电动汽车安全要求</td>
|
||||||
|
<td>2025-11-15</td>
|
||||||
|
<td>现行</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><a href="/std/detail/456">GB/T 40429-2026</a></td>
|
||||||
|
<td>汽车驾驶自动化分级</td>
|
||||||
|
<td>2026-02-01</td>
|
||||||
|
<td>即将实施</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_catarc_crawler_parses_html():
    """CatarcCrawler.fetch turns the sample table HTML into RawEvent objects."""
    from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler

    response = MagicMock()
    response.status_code = 200
    response.text = CATARC_HTML
    response.raise_for_status = MagicMock()

    # Serve the canned page instead of performing a real HTTP request.
    with patch("httpx.get", return_value=response):
        events = CatarcCrawler().fetch(limit=10)

    assert isinstance(events, list)
    assert len(events) >= 1
    assert all(isinstance(event, RawEvent) for event in events)
    assert "GB 18384-2025" in [event.standard_code for event in events]
|
||||||
|
|
||||||
|
|
||||||
|
GUOBIAO_JSON = {
|
||||||
|
"rows": [
|
||||||
|
{
|
||||||
|
"std_code": "GB 18384-2025",
|
||||||
|
"std_name": "电动汽车安全要求",
|
||||||
|
"release_date": "2025-11-15",
|
||||||
|
"implement_date": "2026-07-01",
|
||||||
|
"std_status": "现行",
|
||||||
|
"std_type": "强制性",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_guobiao_crawler_parses_json():
    """GuobiaoMandatoryCrawler.fetch maps the sample JSON row onto an event."""
    from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler

    response = MagicMock()
    response.status_code = 200
    response.json.return_value = GUOBIAO_JSON
    response.raise_for_status = MagicMock()

    # Serve the canned payload instead of performing a real HTTP request.
    with patch("httpx.get", return_value=response):
        events = GuobiaoMandatoryCrawler().fetch(limit=10)

    assert len(events) >= 1
    first = events[0]
    assert first.source == "国标委"
    assert first.standard_code == "GB 18384-2025"
|
||||||
|
|
||||||
|
|
||||||
|
EURLEX_RSS = """<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<rss version="2.0">
|
||||||
|
<channel>
|
||||||
|
<title>EUR-Lex</title>
|
||||||
|
<item>
|
||||||
|
<title>Regulation (EU) 2024/1689 — AI Act</title>
|
||||||
|
<link>https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689</link>
|
||||||
|
<description>The EU Artificial Intelligence Act enters into force.</description>
|
||||||
|
<pubDate>Fri, 12 Jul 2024 00:00:00 GMT</pubDate>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_eurlex_crawler_parses_rss():
    """EurlexCrawler.fetch turns the sample RSS feed into at least one event."""
    from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler

    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = EURLEX_RSS
    # httpx exposes Response.content as bytes; mirror that so the crawler is
    # exercised with the same type it receives from a real response.
    mock_resp.content = EURLEX_RSS.encode("utf-8")
    mock_resp.raise_for_status = MagicMock()

    with patch("httpx.get", return_value=mock_resp):
        crawler = EurlexCrawler()
        events = crawler.fetch(limit=5)

    assert isinstance(events, list)
    assert len(events) >= 1
    assert events[0].source == "EUR-Lex"
|
||||||
77
backend/tests/perception/test_llm_pipeline.py
Normal file
77
backend/tests/perception/test_llm_pipeline.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
"""Unit tests for LlmPipeline — mock LLM client and embedding provider."""
|
||||||
|
from __future__ import annotations
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pipeline():
    """Build an LlmPipeline with its LLM client factory and embedding provider patched.

    Returns a ``(pipeline, mock_client, mock_emb)`` tuple so each test can
    re-program the chat reply or the embedding vectors.
    """
    canned_reply = '{"obligations":[{"text":"test obligation","deontic":"must","subject":"OEM","object":"system","condition":""}],"deadlines":[{"date":"2026-07-01","description":"实施截止"}],"scope":"适用于M1类车辆","penalties":"罚款","impact_level":"high"}'

    with patch("app.infrastructure.perception.llm_pipeline.get_llm_client") as client_factory:
        with patch("app.infrastructure.perception.llm_pipeline.OpenAICompatibleEmbeddingProvider") as provider_cls:
            llm_client = MagicMock()
            llm_client.chat.return_value = MagicMock(content=canned_reply)
            client_factory.return_value = llm_client

            embedder = MagicMock()
            embedder.embed_texts.return_value = [[0.1] * 1024, [0.9] * 1024]
            provider_cls.return_value = embedder

            # Instantiate while both patches are still active.
            from app.infrastructure.perception.llm_pipeline import LlmPipeline

            return LlmPipeline(), llm_client, embedder
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_structure_returns_dict():
    """extract_structure returns a dict carrying obligations and impact_level."""
    pipeline, _client, _embedder = _make_pipeline()
    sample_event = {
        "id": "evt-001",
        "standard_code": "GB 18384-2025",
        "title": "电动汽车安全要求",
        "summary": "新增 IP67 级别防护",
        "source_label": "CATARC",
        "tags": ["电池安全"],
    }
    extracted = pipeline.extract_structure(sample_event)
    assert isinstance(extracted, dict)
    for expected_key in ("obligations", "impact_level"):
        assert expected_key in extracted
|
||||||
|
|
||||||
|
|
||||||
|
def test_assess_impact_returns_list():
    """assess_impact returns a list when retrieval yields one chunk."""
    pipeline, llm_client, _embedder = _make_pipeline()
    llm_client.chat.return_value = MagicMock(content='[{"doc_id":"d1","doc_name":"Safety Manual","score":0.85,"key_clauses":"§4.2","recommendation":"更新第4章"}]')

    # A single retrieved chunk, configured through constructor keywords.
    retrieved_chunk = MagicMock(
        doc_id="d1",
        doc_title="Safety Manual",
        score=0.85,
        text="relevant text",
        section_title="§4.2",
    )
    retrieval = MagicMock()
    retrieval.retrieve.return_value = [retrieved_chunk]

    sample_event = {
        "standard_code": "GB 18384-2025",
        "title": "电动汽车安全要求",
        "obligations": [{"text": "OEM shall comply"}],
    }
    assessment = pipeline.assess_impact(sample_event, retrieval)
    assert isinstance(assessment, list)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_diff_no_change():
    """compute_diff on identical text returns a dict with both result keys."""
    pipeline, _client, embedder = _make_pipeline()
    # Two equal embedding vectors, one per compared paragraph.
    embedder.embed_texts.return_value = [[0.5] * 1024 for _ in range(2)]
    outcome = pipeline.compute_diff("paragraph one", "paragraph one")
    assert isinstance(outcome, dict)
    assert "changed_sections" in outcome
    assert "change_summary" in outcome
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_diff_detects_change():
    """compute_diff with orthogonal embeddings returns a list of changed sections."""
    pipeline, llm_client, embedder = _make_pipeline()
    # Unit vectors along different axes, so their cosine similarity is zero.
    old_vector = [1.0] + [0.0] * 1023
    new_vector = [0.0] + [1.0] + [0.0] * 1022
    embedder.embed_texts.return_value = [old_vector, new_vector]
    llm_client.chat.return_value = MagicMock(content='{"change_type":"tightened","summary":"Requirement tightened"}')
    outcome = pipeline.compute_diff("old paragraph text", "new tighter requirement text")
    assert isinstance(outcome["changed_sections"], list)
|
||||||
98
backend/tests/perception/test_postgres_event_store.py
Normal file
98
backend/tests/perception/test_postgres_event_store.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
"""Unit tests for PostgresEventStore using a mocked psycopg2 pool."""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Patch psycopg2 before importing the module under test
|
||||||
|
import sys
|
||||||
|
mock_psycopg2 = MagicMock()
|
||||||
|
mock_psycopg2.extras = MagicMock()
|
||||||
|
sys.modules.setdefault("psycopg2", mock_psycopg2)
|
||||||
|
sys.modules.setdefault("psycopg2.extras", mock_psycopg2.extras)
|
||||||
|
sys.modules.setdefault("psycopg2.pool", MagicMock())
|
||||||
|
|
||||||
|
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_ROW = {
|
||||||
|
"id": "pg-001",
|
||||||
|
"source": "国标委",
|
||||||
|
"source_label": "国家标准化管理委员会",
|
||||||
|
"standard_code": "GB 18384-2025",
|
||||||
|
"title": "电动汽车安全要求",
|
||||||
|
"summary": "新增要求",
|
||||||
|
"full_text_url": "https://openstd.samr.gov.cn",
|
||||||
|
"status": "enacted",
|
||||||
|
"impact_level": "high",
|
||||||
|
"published_at": "2025-11-15",
|
||||||
|
"effective_at": "2026-07-01",
|
||||||
|
"category": "电动汽车安全",
|
||||||
|
"tags": ["电池安全"],
|
||||||
|
"obligations": None,
|
||||||
|
"deadlines": None,
|
||||||
|
"scope": None,
|
||||||
|
"penalties": None,
|
||||||
|
"content_hash": "abc123",
|
||||||
|
"previous_hash": None,
|
||||||
|
"change_summary": None,
|
||||||
|
"changed_sections": None,
|
||||||
|
"affected_docs": None,
|
||||||
|
"crawled_at": "2026-06-05T10:00:00+00:00",
|
||||||
|
"processed_at": None,
|
||||||
|
"raw_storage_key": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _make_store_with_pool(mock_pool):
    """Instantiate PostgresEventStore with its pool class and schema setup patched.

    ``ThreadedConnectionPool`` is patched to return ``mock_pool`` and
    ``PostgresEventStore._ensure_schema`` is replaced by a mock for the
    duration of construction.
    """
    pool_patch = patch("psycopg2.pool.ThreadedConnectionPool", return_value=mock_pool)
    schema_patch = patch(
        "app.infrastructure.perception.postgres_event_store.PostgresEventStore._ensure_schema"
    )
    with pool_patch, schema_patch:
        from app.infrastructure.perception.postgres_event_store import PostgresEventStore

        return PostgresEventStore()
|
||||||
|
|
||||||
|
|
||||||
|
def _cursor_returning(rows):
|
||||||
|
cursor = MagicMock()
|
||||||
|
cursor.__enter__ = lambda s: s
|
||||||
|
cursor.__exit__ = MagicMock(return_value=False)
|
||||||
|
cursor.fetchall.return_value = rows
|
||||||
|
cursor.fetchone.return_value = rows[0] if rows else None
|
||||||
|
return cursor
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_base_event_store():
    """PostgresEventStore instances satisfy the BaseEventStore interface."""
    store = _make_store_with_pool(MagicMock())
    assert isinstance(store, BaseEventStore)
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_returns_list():
    """filter() returns a list when the cursor serves one sample row."""
    connection = MagicMock()
    connection.__enter__ = lambda this: this
    connection.__exit__ = MagicMock(return_value=False)
    connection.cursor.return_value = _cursor_returning([SAMPLE_ROW])

    pool = MagicMock()
    pool.getconn.return_value = connection

    rows = _make_store_with_pool(pool).filter(limit=10)
    assert isinstance(rows, list)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stats_returns_correct_keys():
    """stats() exposes the total, per-impact and recent-90-day counters."""
    # Cursor whose fetchone() always reports a count of 5.
    count_cursor = MagicMock()
    count_cursor.__enter__ = lambda this: this
    count_cursor.__exit__ = MagicMock(return_value=False)
    count_cursor.fetchone.return_value = {"count": 5}

    connection = MagicMock()
    connection.__enter__ = lambda this: this
    connection.__exit__ = MagicMock(return_value=False)
    connection.cursor.return_value = count_cursor

    pool = MagicMock()
    pool.getconn.return_value = connection

    stats = _make_store_with_pool(pool).stats()
    expected_keys = ("total", "high_impact", "medium_impact", "recent_90d")
    assert all(key in stats for key in expected_keys)
|
||||||
2500
docs/superpowers/plans/2026-06-05-perception-intelligence.md
Normal file
2500
docs/superpowers/plans/2026-06-05-perception-intelligence.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,328 @@
|
|||||||
|
# Regulatory Signals Intelligence Enhancement — Design Spec
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** Replace the 20-item hardcoded MockEventStore with real regulatory data from Chinese and international sources, add LLM-driven structured extraction, impact assessment, and semantic change diff — all accessible through a manual-trigger crawl in the frontend.
|
||||||
|
|
||||||
|
**Architecture:** Crawler Service (httpx + BeautifulSoup) → PostgreSQL EventStore → LLM Pipeline (extract → assess → diff) → existing PerceptionService interface. New code follows `api → application → domain ports → infrastructure` layering; no new files in `services/*` or `workflows/*`; `shared/bootstrap.py` is the composition root.
|
||||||
|
|
||||||
|
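**Tech Stack:** httpx, BeautifulSoup4 (with lxml), the existing embedding API (text-embedding-v3 via `EMBEDDING_BASE_URL`, used for the semantic diff — sentence-transformers is not added), existing LLM factory (deepseek/qwen), existing KnowledgeRetrievalService (RAG), PostgreSQL (already available), existing SSE infrastructure.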
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Data Sources
|
||||||
|
|
||||||
|
| Source | URL | Method | Coverage |
|
||||||
|
|--------|-----|--------|----------|
|
||||||
|
| CATARC 汽车标准 | `https://www.catarc.org.cn/bzzxd/qcbz/index.html` | httpx + BeautifulSoup (static pages) | 国家/行业汽车标准列表 |
|
||||||
|
| 国标委强制性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=1&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 强制性国家标准,按"车"过滤 |
|
||||||
|
| 国标委推荐性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=2&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 推荐性国家标准,按"车"过滤 |
|
||||||
|
| EUR-Lex | RSS + CELLAR REST API | pyeurlex / httpx | EU AI Act, automotive directives |
|
||||||
|
| UN R155/R156 | CELLAR REST API (CELEX lookup) | httpx | UN-ECE cybersecurity/OTA regulations |
|
||||||
|
|
||||||
|
Crawl is **manual-trigger only** — no cron/Celery Beat. Admin clicks "刷新数据源" in the frontend UI.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Database Schema
|
||||||
|
|
||||||
|
### New table: `regulation_events`
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE IF NOT EXISTS regulation_events (
|
||||||
|
id TEXT PRIMARY KEY, -- sha256(source + standard_code)[:12]
|
||||||
|
source TEXT NOT NULL, -- 'CATARC' | '国标委' | 'EUR-Lex' | 'UN-ECE'
|
||||||
|
source_label TEXT, -- Human-readable source label
|
||||||
|
standard_code TEXT NOT NULL, -- e.g. "GB 18384-2025", "EU/2024/1689"
|
||||||
|
title TEXT NOT NULL,
|
||||||
|
summary TEXT, -- Crawled abstract or first paragraph
|
||||||
|
full_text_url TEXT, -- Original page URL
|
||||||
|
status TEXT, -- 'enacted' | 'draft' | 'consultation'
|
||||||
|
impact_level TEXT, -- 'high' | 'medium' | 'low' (LLM-assigned)
|
||||||
|
published_at DATE,
|
||||||
|
effective_at DATE,
|
||||||
|
category TEXT,
|
||||||
|
tags TEXT[],
|
||||||
|
-- LLM structured extraction
|
||||||
|
obligations JSONB, -- [{text, deontic, subject, object, condition}]
|
||||||
|
deadlines JSONB, -- [{date, description}]
|
||||||
|
scope TEXT, -- Applicability scope summary
|
||||||
|
penalties TEXT, -- Penalty / consequence summary
|
||||||
|
-- Change tracking
|
||||||
|
content_hash TEXT, -- SHA256 of crawled full text
|
||||||
|
previous_hash TEXT, -- Hash from prior crawl (NULL on first crawl)
|
||||||
|
change_summary TEXT, -- LLM-generated description of changes
|
||||||
|
changed_sections JSONB, -- [{old_text, new_text, change_type}] for paragraphs whose cosine similarity is below PERCEPTION_DIFF_SIMILARITY_THRESHOLD (default 0.85)
|
||||||
|
-- Impact assessment
|
||||||
|
affected_docs JSONB, -- [{doc_id, doc_name, score, key_clauses, recommendation}]
|
||||||
|
-- Metadata
|
||||||
|
crawled_at TIMESTAMPTZ DEFAULT now(),
|
||||||
|
processed_at TIMESTAMPTZ,
|
||||||
|
raw_storage_key TEXT -- MinIO path for raw HTML/PDF (optional)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS regulation_events_source_date
|
||||||
|
ON regulation_events (source, published_at DESC);
|
||||||
|
CREATE INDEX IF NOT EXISTS regulation_events_impact_date
|
||||||
|
ON regulation_events (impact_level, published_at DESC);
|
||||||
|
CREATE INDEX IF NOT EXISTS regulation_events_tags
|
||||||
|
ON regulation_events USING gin(tags);
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Backend Architecture
|
||||||
|
|
||||||
|
### 3.1 File Map
|
||||||
|
|
||||||
|
**New files (infrastructure layer):**
|
||||||
|
- `backend/app/infrastructure/perception/crawlers/catarc_crawler.py` — CATARC scraper
|
||||||
|
- `backend/app/infrastructure/perception/crawlers/guobiao_crawler.py` — 国标委 JSON API crawler
|
||||||
|
- `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py` — EUR-Lex RSS + CELLAR
|
||||||
|
- `backend/app/infrastructure/perception/crawlers/base.py` — Abstract base class
|
||||||
|
- `backend/app/infrastructure/perception/postgres_event_store.py` — PostgresEventStore (replaces MockEventStore)
|
||||||
|
- `backend/app/infrastructure/perception/llm_pipeline.py` — Extract / assess / diff pipeline
|
||||||
|
|
||||||
|
**New files (application layer):**
|
||||||
|
- `backend/app/application/perception/crawl_service.py` — Orchestrates crawlers + LLM pipeline, exposes `run_crawl(sources)` + progress generator
|
||||||
|
|
||||||
|
**Modified files:**
|
||||||
|
- `backend/app/api/routes/perception.py` — Add `POST /crawl`, `GET /crawl/status` (SSE), `POST /events/{id}/process`, `GET /events/{id}/diff`
|
||||||
|
- `backend/app/shared/bootstrap.py` — Wire `PostgresEventStore` + `CrawlService` + `LlmPipeline` when `DOCUMENT_REPOSITORY_BACKEND=postgres`; fallback to `MockEventStore` when `json`
|
||||||
|
- `backend/app/config/settings.py` — Add `perception_crawl_timeout_seconds`, `perception_max_events_per_source`, `perception_diff_similarity_threshold`
|
||||||
|
|
||||||
|
**Unchanged files:**
|
||||||
|
- `backend/app/application/perception/services.py` — `PerceptionService` interface unchanged; only `_store` swap
|
||||||
|
- `backend/app/infrastructure/perception/mock_event_store.py` — Kept for `json` backend mode
|
||||||
|
|
||||||
|
### 3.2 Domain Port (Abstract Interface)
|
||||||
|
|
||||||
|
```python
|
||||||
|
# backend/app/infrastructure/perception/base_event_store.py
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
class BaseEventStore(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
def all(self) -> list[dict]: ...
|
||||||
|
@abstractmethod
|
||||||
|
def get(self, event_id: str) -> dict | None: ...
|
||||||
|
@abstractmethod
|
||||||
|
def filter(self, source=None, impact_level=None, limit=50) -> list[dict]: ...
|
||||||
|
@abstractmethod
|
||||||
|
def stats(self) -> dict: ...
|
||||||
|
@abstractmethod
|
||||||
|
def upsert(self, event: dict) -> None: ... # new — needed for crawl writes
|
||||||
|
@abstractmethod
|
||||||
|
def get_by_standard_code(self, code: str) -> dict | None: ... # for change detection
|
||||||
|
```
|
||||||
|
|
||||||
|
`MockEventStore` and `PostgresEventStore` both implement this interface.
|
||||||
|
|
||||||
|
### 3.3 Crawler Base Contract
|
||||||
|
|
||||||
|
```python
|
||||||
|
# backend/app/infrastructure/perception/crawlers/base.py
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RawEvent:
|
||||||
|
source: str
|
||||||
|
source_label: str
|
||||||
|
standard_code: str
|
||||||
|
title: str
|
||||||
|
summary: str
|
||||||
|
full_text_url: str
|
||||||
|
status: str # 'enacted' | 'draft' | 'consultation'
|
||||||
|
published_at: str # YYYY-MM-DD string
|
||||||
|
effective_at: str | None
|
||||||
|
category: str
|
||||||
|
tags: list[str]
|
||||||
|
raw_text: str # full crawled text for hashing + LLM
|
||||||
|
|
||||||
|
class BaseCrawler(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
def fetch(self, limit: int = 50) -> list[RawEvent]: ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.4 LLM Pipeline
|
||||||
|
|
||||||
|
```python
|
||||||
|
# backend/app/infrastructure/perception/llm_pipeline.py
|
||||||
|
|
||||||
|
class LlmPipeline:
|
||||||
|
"""Runs three sequential LLM steps on a regulation event."""
|
||||||
|
|
||||||
|
def extract_structure(self, event: dict) -> dict:
|
||||||
|
"""Step 1: Extract obligations, deadlines, scope, penalties, impact_level.
|
||||||
|
|
||||||
|
Returns dict with keys: obligations, deadlines, scope, penalties, impact_level.
|
||||||
|
Uses JSON-mode or structured prompt; model retries once on parse failure.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def assess_impact(self, event: dict, retrieval_service) -> list[dict]:
|
||||||
|
"""Step 2: RAG-based impact on existing knowledge base documents.
|
||||||
|
|
||||||
|
Query = standard_code + title + first obligation texts.
|
||||||
|
Returns list of {doc_id, doc_name, score, key_clauses, recommendation}.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def compute_diff(self, old_text: str, new_text: str) -> dict:
|
||||||
|
"""Step 3: Semantic diff between old and new regulation text.
|
||||||
|
|
||||||
|
Splits both texts by paragraph. Calls existing EmbeddingService (text-embedding-v3
|
||||||
|
via EMBEDDING_BASE_URL) to embed each paragraph, then computes cosine similarity.
|
||||||
|
Changed paragraphs (cosine < 0.85) sent to LLM for change_type classification:
|
||||||
|
'tightened' | 'relaxed' | 'added' | 'removed'
|
||||||
|
Returns {changed_sections: [...], change_summary: str}.
|
||||||
|
Only called when content_hash differs from previous_hash.
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.5 CrawlService
|
||||||
|
|
||||||
|
```python
|
||||||
|
# backend/app/application/perception/crawl_service.py
|
||||||
|
|
||||||
|
class CrawlService:
|
||||||
|
def __init__(self, crawlers, event_store, llm_pipeline, retrieval_service): ...
|
||||||
|
|
||||||
|
def run_crawl(self, sources: list[str] | None = None) -> Generator[dict, None, None]:
|
||||||
|
"""Manual-trigger crawl. Yields progress SSE dicts:
|
||||||
|
{event: 'progress', data: {source, fetched, new, updated, stage}}
|
||||||
|
{event: 'done', data: {total_new, total_updated, duration_ms}}
|
||||||
|
{event: 'error', data: {source, message}}
|
||||||
|
|
||||||
|
For each crawler:
|
||||||
|
1. fetch() RawEvents
|
||||||
|
2. hash check vs stored event → skip if unchanged
|
||||||
|
3. upsert raw event to DB
|
||||||
|
4. run LLM pipeline (extract → assess → diff)
|
||||||
|
5. upsert enriched event to DB
|
||||||
|
6. yield progress
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. API Endpoints
|
||||||
|
|
||||||
|
### Existing (unchanged interface, new store backend)
|
||||||
|
- `GET /api/v1/perception/stats`
|
||||||
|
- `GET /api/v1/perception/events`
|
||||||
|
- `GET /api/v1/perception/events/{id}`
|
||||||
|
- `POST /api/v1/perception/events/{id}/analyze` (streaming)
|
||||||
|
|
||||||
|
### New endpoints
|
||||||
|
|
||||||
|
```
|
||||||
|
POST /api/v1/perception/crawl
|
||||||
|
Body: { sources?: ["CATARC", "国标委", "EUR-Lex", "UN-ECE"] }
|
||||||
|
Response: text/event-stream (SSE)
|
||||||
|
Auth: requires current_user (admin/legal role)
|
||||||
|
Streams progress events until done or error.
|
||||||
|
|
||||||
|
POST /api/v1/perception/events/{id}/process
|
||||||
|
Trigger LLM pipeline for a single already-crawled event.
|
||||||
|
Response: { status: "ok", processed_at: "..." }
|
||||||
|
Auth: requires current_user
|
||||||
|
|
||||||
|
GET /api/v1/perception/events/{id}/diff
|
||||||
|
Returns: { changed_sections: [...], change_summary: str, previous_hash: str }
|
||||||
|
Returns 404 if no diff available (first crawl or no change detected).
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Frontend Changes
|
||||||
|
|
||||||
|
### 5.1 New: Crawl Control Bar (top of PerceptionPage)
|
||||||
|
|
||||||
|
Above the stats-bar, add a `<CrawlBar>` component:
|
||||||
|
- "刷新数据源" button — triggers `POST /crawl` (all sources)
|
||||||
|
- Inline progress display: shows SSE progress events as a mini status line
|
||||||
|
- e.g. "CATARC: 抓取中… | 国标委: 12 条新增 | EUR-Lex: 等待中"
|
||||||
|
- On completion: shows "更新完成 — 新增 N 条,更新 M 条"
|
||||||
|
- Disabled while crawl is in progress (prevents double-trigger)
|
||||||
|
|
||||||
|
### 5.2 Signal Card Enhancement
|
||||||
|
|
||||||
|
Existing cards get two new indicators:
|
||||||
|
- **NEW badge** — shown when `crawled_at` is within last 24h (green dot)
|
||||||
|
- **CHANGED badge** — shown when `previous_hash != content_hash` and `change_summary` exists
|
||||||
|
|
||||||
|
### 5.3 Right Panel — Structured Tab
|
||||||
|
|
||||||
|
Right detail panel adds a tab bar: **概览 | 义务条款 | 影响评估 | 变更对比**
|
||||||
|
|
||||||
|
**义务条款 tab:**
|
||||||
|
- Table: 义务描述 | 主体 | 对象 | 截止日期
|
||||||
|
- Tags for deontic type: 强制 / 禁止 / 允许
|
||||||
|
- Shows `obligations[]` + `deadlines[]` from DB
|
||||||
|
|
||||||
|
**影响评估 tab:**
|
||||||
|
- Replaces hardcoded MOCK_DOCS with real `affected_docs[]` from DB
|
||||||
|
- Each row: document name, similarity score (%), key clause excerpt, LLM recommendation
|
||||||
|
- "Run fresh assessment" button → triggers `POST /events/{id}/process`
|
||||||
|
|
||||||
|
**变更对比 tab:**
|
||||||
|
- Only visible when `change_summary` is non-null
|
||||||
|
- Top: `change_summary` text (LLM prose)
|
||||||
|
- Below: diff table with old/new paragraph pairs, change_type badge per row
|
||||||
|
- Hidden (tab disabled) on first-crawl events with no prior version
|
||||||
|
|
||||||
|
### 5.4 Existing behavior preserved
|
||||||
|
- `analyze` streaming (AI analysis) unchanged
|
||||||
|
- Search/filter (source, impact) unchanged — now hits real DB data
|
||||||
|
- Stats bar — now reflects real counts from PostgreSQL
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Settings Additions
|
||||||
|
|
||||||
|
```python
|
||||||
|
# backend/app/config/settings.py additions
|
||||||
|
perception_crawl_timeout_seconds: int = Field(default=120, ...)
|
||||||
|
perception_max_events_per_source: int = Field(default=100, ...)
|
||||||
|
perception_diff_similarity_threshold: float = Field(default=0.85, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
```env
|
||||||
|
# .env additions
|
||||||
|
PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
|
||||||
|
PERCEPTION_MAX_EVENTS_PER_SOURCE=100
|
||||||
|
PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Dependencies
|
||||||
|
|
||||||
|
```
|
||||||
|
# requirements.txt additions
|
||||||
|
httpx>=0.27.0 # already likely present; confirm
|
||||||
|
beautifulsoup4>=4.12.0 # HTML parsing for CATARC
|
||||||
|
lxml>=5.0.0 # BeautifulSoup parser backend
|
||||||
|
# sentence-transformers NOT added — diff uses existing text-embedding-v3 API (EMBEDDING_BASE_URL)
|
||||||
|
```
|
||||||
|
|
||||||
|
No new infrastructure required (PostgreSQL + MinIO + Milvus already available).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Backward Compatibility
|
||||||
|
|
||||||
|
- `DOCUMENT_REPOSITORY_BACKEND=json` → `bootstrap.py` uses `MockEventStore` (unchanged behavior)
|
||||||
|
- `DOCUMENT_REPOSITORY_BACKEND=postgres` → uses `PostgresEventStore`
|
||||||
|
- Migration: run `CREATE TABLE` SQL on first startup (idempotent `CREATE TABLE IF NOT EXISTS`)
|
||||||
|
- Existing 20 mock events are not seeded to PostgreSQL; PostgreSQL starts empty until first crawl
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Out of Scope (this phase)
|
||||||
|
|
||||||
|
- Automatic/scheduled crawling (Celery Beat) — manual trigger only
|
||||||
|
- Playwright-based JS-rendered pages — all target sites work with httpx
|
||||||
|
- Knowledge Graph (Neo4j / LightRAG) — future phase
|
||||||
|
- Email/Slack webhook notifications — future phase
|
||||||
|
- User-facing diff history (versioning beyond one prior snapshot) — future phase
|
||||||
@@ -1,12 +1,14 @@
|
|||||||
import './styles/globals.css';
|
import './styles/globals.css';
|
||||||
import { ThemeProvider, AuthProvider } from './contexts';
|
import { ThemeProvider, AuthProvider, PageStateProvider } from './contexts';
|
||||||
import { AppRouter } from './router/AppRouter';
|
import { AppRouter } from './router/AppRouter';
|
||||||
|
|
||||||
function App() {
|
function App() {
|
||||||
return (
|
return (
|
||||||
<ThemeProvider>
|
<ThemeProvider>
|
||||||
<AuthProvider>
|
<AuthProvider>
|
||||||
<AppRouter />
|
<PageStateProvider>
|
||||||
|
<AppRouter />
|
||||||
|
</PageStateProvider>
|
||||||
</AuthProvider>
|
</AuthProvider>
|
||||||
</ThemeProvider>
|
</ThemeProvider>
|
||||||
);
|
);
|
||||||
|
|||||||
211
frontend/src/contexts/PageStateContext.tsx
Normal file
211
frontend/src/contexts/PageStateContext.tsx
Normal file
@@ -0,0 +1,211 @@
|
|||||||
|
/**
|
||||||
|
* PageStateContext — preserves page-level session state across route changes.
|
||||||
|
*
|
||||||
|
* When React Router unmounts a page component, all its useState values are lost.
|
||||||
|
* This context lives above the router and holds the state that must survive
|
||||||
|
* navigation so users can switch modules and return without losing their work.
|
||||||
|
*
|
||||||
|
* Covered pages:
|
||||||
|
* - RagChat: message history, citation rail, sessionId, input draft
|
||||||
|
* - Compliance: analysis result (sources, findings, conclusion, meta)
|
||||||
|
* - Perception: selected signal, filter state, AI analysis output
|
||||||
|
*/
|
||||||
|
|
||||||
|
import React, { createContext, useContext, useState, useCallback, useRef } from 'react';
|
||||||
|
|
||||||
|
// ── RagChat types ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** A single entry in the RagChat message history. */
export interface RagMessage {
  id: string;
  /** Which side of the conversation produced the message. */
  role: 'user' | 'assistant';
  text: string;
  /** Optional citation numbers — presumably matching `RagCitation.index`; confirm at the call sites. */
  citationRefs?: number[];
}
|
||||||
|
|
||||||
|
/** One entry of the RagChat citation rail. */
export interface RagCitation {
  index: number;
  score: number;
  name: string;
  clause: string;
  snippet: string;
  /** Optional identifier of the cited document. */
  docId?: string;
}
|
||||||
|
|
||||||
|
export interface RagChatState {
|
||||||
|
messages: RagMessage[];
|
||||||
|
citations: RagCitation[];
|
||||||
|
sessionId: string | null;
|
||||||
|
inputDraft: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
const RAG_INIT: RagChatState = {
|
||||||
|
messages: [
|
||||||
|
{
|
||||||
|
id: 'init',
|
||||||
|
role: 'assistant',
|
||||||
|
text: 'Hello! I can answer questions about your indexed regulations and compliance documents. Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
citations: [],
|
||||||
|
sessionId: null,
|
||||||
|
inputDraft: '',
|
||||||
|
};
|
||||||
|
|
||||||
|
// ── Compliance types ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export interface ComplianceSourceEvent {
|
||||||
|
standard: string;
|
||||||
|
clause: string;
|
||||||
|
score: number;
|
||||||
|
status: string;
|
||||||
|
full_content: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ComplianceFindingEvent {
|
||||||
|
title: string;
|
||||||
|
desc: string;
|
||||||
|
status: 'ok' | 'warn' | 'risk';
|
||||||
|
clause_ref?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ComplianceActionItem {
|
||||||
|
label: string;
|
||||||
|
value: string;
|
||||||
|
risk?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ComplianceDonePayload {
|
||||||
|
conclusion: string;
|
||||||
|
actions: ComplianceActionItem[];
|
||||||
|
risk_score: number;
|
||||||
|
highlight_terms: string[];
|
||||||
|
para_text: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ComplianceMeta {
|
||||||
|
title: string;
|
||||||
|
sourceType: 'text' | 'doc' | 'upload';
|
||||||
|
startedAt: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export type ComplianceStatus = 'idle' | 'streaming' | 'done' | 'error';
|
||||||
|
|
||||||
|
export interface ComplianceState {
|
||||||
|
status: ComplianceStatus;
|
||||||
|
stageLabel: string;
|
||||||
|
stageKey: string;
|
||||||
|
meta: ComplianceMeta | null;
|
||||||
|
sources: ComplianceSourceEvent[];
|
||||||
|
findings: ComplianceFindingEvent[];
|
||||||
|
done: ComplianceDonePayload | null;
|
||||||
|
errorText: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
const COMPLIANCE_INIT: ComplianceState = {
|
||||||
|
status: 'idle',
|
||||||
|
stageLabel: '',
|
||||||
|
stageKey: '',
|
||||||
|
meta: null,
|
||||||
|
sources: [],
|
||||||
|
findings: [],
|
||||||
|
done: null,
|
||||||
|
errorText: '',
|
||||||
|
};
|
||||||
|
|
||||||
|
// ── Perception types ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export interface PerceptionSignal {
|
||||||
|
id: string;
|
||||||
|
source: string;
|
||||||
|
standard: string;
|
||||||
|
status: 'ok' | 'warn' | 'risk' | 'info';
|
||||||
|
title: string;
|
||||||
|
summary: string;
|
||||||
|
date: string;
|
||||||
|
tags: string[];
|
||||||
|
impact: 'High' | 'Medium' | 'Low';
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface PerceptionPageState {
|
||||||
|
signals: PerceptionSignal[];
|
||||||
|
searchQuery: string;
|
||||||
|
sourceFilter: string;
|
||||||
|
impactFilter: string;
|
||||||
|
selectedId: string | null;
|
||||||
|
aiOutput: string;
|
||||||
|
detailTab: 'overview' | 'obligations' | 'assessment' | 'diff';
|
||||||
|
crawlStatus: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
const PERCEPTION_INIT: PerceptionPageState = {
|
||||||
|
signals: [],
|
||||||
|
searchQuery: '',
|
||||||
|
sourceFilter: 'All',
|
||||||
|
impactFilter: 'All',
|
||||||
|
selectedId: null,
|
||||||
|
aiOutput: '',
|
||||||
|
detailTab: 'overview',
|
||||||
|
crawlStatus: '',
|
||||||
|
};
|
||||||
|
|
||||||
|
// ── Context value ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
interface PageStateContextValue {
|
||||||
|
// RagChat
|
||||||
|
ragState: RagChatState;
|
||||||
|
setRagState: React.Dispatch<React.SetStateAction<RagChatState>>;
|
||||||
|
ragStreamingRef: React.MutableRefObject<boolean>;
|
||||||
|
ragAbortRef: React.MutableRefObject<AbortController | null>;
|
||||||
|
|
||||||
|
// Compliance
|
||||||
|
complianceState: ComplianceState;
|
||||||
|
setComplianceState: React.Dispatch<React.SetStateAction<ComplianceState>>;
|
||||||
|
complianceAbortRef: React.MutableRefObject<AbortController | null>;
|
||||||
|
resetCompliance: () => void;
|
||||||
|
|
||||||
|
// Perception
|
||||||
|
perceptionState: PerceptionPageState;
|
||||||
|
setPerceptionState: React.Dispatch<React.SetStateAction<PerceptionPageState>>;
|
||||||
|
perceptionAbortRef: React.MutableRefObject<AbortController | null>;
|
||||||
|
perceptionCrawlAbortRef: React.MutableRefObject<AbortController | null>;
|
||||||
|
}
|
||||||
|
|
||||||
|
const PageStateContext = createContext<PageStateContextValue | null>(null);
|
||||||
|
|
||||||
|
// ── Provider ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export function PageStateProvider({ children }: { children: React.ReactNode }) {
|
||||||
|
const [ragState, setRagState] = useState<RagChatState>(RAG_INIT);
|
||||||
|
const ragStreamingRef = useRef(false);
|
||||||
|
const ragAbortRef = useRef<AbortController | null>(null);
|
||||||
|
|
||||||
|
const [complianceState, setComplianceState] = useState<ComplianceState>(COMPLIANCE_INIT);
|
||||||
|
const complianceAbortRef = useRef<AbortController | null>(null);
|
||||||
|
|
||||||
|
const resetCompliance = useCallback(() => {
|
||||||
|
complianceAbortRef.current?.abort();
|
||||||
|
setComplianceState(COMPLIANCE_INIT);
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
const [perceptionState, setPerceptionState] = useState<PerceptionPageState>(PERCEPTION_INIT);
|
||||||
|
const perceptionAbortRef = useRef<AbortController | null>(null);
|
||||||
|
const perceptionCrawlAbortRef = useRef<AbortController | null>(null);
|
||||||
|
|
||||||
|
return (
|
||||||
|
<PageStateContext.Provider value={{
|
||||||
|
ragState, setRagState, ragStreamingRef, ragAbortRef,
|
||||||
|
complianceState, setComplianceState, complianceAbortRef, resetCompliance,
|
||||||
|
perceptionState, setPerceptionState, perceptionAbortRef, perceptionCrawlAbortRef,
|
||||||
|
}}>
|
||||||
|
{children}
|
||||||
|
</PageStateContext.Provider>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Hook ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export function usePageState() {
|
||||||
|
const ctx = useContext(PageStateContext);
|
||||||
|
if (!ctx) throw new Error('usePageState must be used inside PageStateProvider');
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
@@ -1,3 +1,18 @@
|
|||||||
export { ThemeProvider, useTheme } from './ThemeContext';
|
export { ThemeProvider, useTheme } from './ThemeContext';
|
||||||
export { AuthProvider, useAuth } from './AuthContext';
|
export { AuthProvider, useAuth } from './AuthContext';
|
||||||
export type { AuthUser } from './AuthContext';
|
export type { AuthUser } from './AuthContext';
|
||||||
|
export { PageStateProvider, usePageState } from './PageStateContext';
|
||||||
|
export type {
|
||||||
|
RagChatState,
|
||||||
|
RagMessage,
|
||||||
|
RagCitation,
|
||||||
|
ComplianceState,
|
||||||
|
ComplianceStatus,
|
||||||
|
ComplianceSourceEvent,
|
||||||
|
ComplianceFindingEvent,
|
||||||
|
ComplianceDonePayload,
|
||||||
|
ComplianceMeta,
|
||||||
|
ComplianceActionItem,
|
||||||
|
PerceptionPageState,
|
||||||
|
PerceptionSignal,
|
||||||
|
} from './PageStateContext';
|
||||||
|
|||||||
@@ -1,4 +1,25 @@
|
|||||||
import { useState, useCallback, useRef } from 'react';
|
/**
|
||||||
|
* useComplianceAnalysis — compliance analysis state wired to PageStateContext.
|
||||||
|
*
|
||||||
|
* State is stored in the global context so it persists when the user navigates
|
||||||
|
* to another module and returns. The `run` and `reset` actions are identical
|
||||||
|
* to the previous hook API so CompliancePage needs no structural changes.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { useCallback } from 'react';
|
||||||
|
import { usePageState } from '../../contexts';
|
||||||
|
import type {
|
||||||
|
ComplianceMeta,
|
||||||
|
ComplianceState,
|
||||||
|
ComplianceSourceEvent,
|
||||||
|
ComplianceFindingEvent,
|
||||||
|
ComplianceDonePayload,
|
||||||
|
} from '../../contexts';
|
||||||
|
|
||||||
|
export type { ComplianceMeta, ComplianceState, ComplianceSourceEvent as SourceEvent, ComplianceFindingEvent as FindingEvent, ComplianceDonePayload as DonePayload };
|
||||||
|
export type { ComplianceActionItem as ActionItem } from '../../contexts';
|
||||||
|
export type AnalysisStatus = import('../../contexts').ComplianceStatus;
|
||||||
|
export type AnalysisMeta = ComplianceMeta;
|
||||||
|
|
||||||
const TOKEN_KEY = 'auth_token';
|
const TOKEN_KEY = 'auth_token';
|
||||||
function authHeader(): Record<string, string> {
|
function authHeader(): Record<string, string> {
|
||||||
@@ -6,55 +27,7 @@ function authHeader(): Record<string, string> {
|
|||||||
return t ? { Authorization: `Bearer ${t}` } : {};
|
return t ? { Authorization: `Bearer ${t}` } : {};
|
||||||
}
|
}
|
||||||
|
|
||||||
export type AnalysisStatus = 'idle' | 'streaming' | 'done' | 'error';
|
const INITIAL_STATE: ComplianceState = {
|
||||||
|
|
||||||
export interface SourceEvent {
|
|
||||||
standard: string;
|
|
||||||
clause: string;
|
|
||||||
score: number;
|
|
||||||
status: string;
|
|
||||||
full_content: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface FindingEvent {
|
|
||||||
title: string;
|
|
||||||
desc: string;
|
|
||||||
status: 'ok' | 'warn' | 'risk';
|
|
||||||
clause_ref?: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface ActionItem {
|
|
||||||
label: string;
|
|
||||||
value: string;
|
|
||||||
risk?: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface DonePayload {
|
|
||||||
conclusion: string;
|
|
||||||
actions: ActionItem[];
|
|
||||||
risk_score: number;
|
|
||||||
highlight_terms: string[];
|
|
||||||
para_text: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface AnalysisMeta {
|
|
||||||
title: string;
|
|
||||||
sourceType: 'text' | 'doc' | 'upload';
|
|
||||||
startedAt: string; // ISO timestamp
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface AnalysisState {
|
|
||||||
status: AnalysisStatus;
|
|
||||||
stageLabel: string;
|
|
||||||
stageKey: string;
|
|
||||||
meta: AnalysisMeta | null;
|
|
||||||
sources: SourceEvent[];
|
|
||||||
findings: FindingEvent[];
|
|
||||||
done: DonePayload | null;
|
|
||||||
errorText: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
const INITIAL_STATE: AnalysisState = {
|
|
||||||
status: 'idle',
|
status: 'idle',
|
||||||
stageLabel: '',
|
stageLabel: '',
|
||||||
stageKey: '',
|
stageKey: '',
|
||||||
@@ -66,18 +39,12 @@ const INITIAL_STATE: AnalysisState = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export function useComplianceAnalysis() {
|
export function useComplianceAnalysis() {
|
||||||
const [state, setState] = useState<AnalysisState>(INITIAL_STATE);
|
const { complianceState: state, setComplianceState: setState, complianceAbortRef, resetCompliance: reset } = usePageState();
|
||||||
const abortRef = useRef<AbortController | null>(null);
|
|
||||||
|
|
||||||
const reset = useCallback(() => {
|
const run = useCallback(async (formData: FormData, meta: ComplianceMeta) => {
|
||||||
abortRef.current?.abort();
|
complianceAbortRef.current?.abort();
|
||||||
setState(INITIAL_STATE);
|
|
||||||
}, []);
|
|
||||||
|
|
||||||
const run = useCallback(async (formData: FormData, meta: AnalysisMeta) => {
|
|
||||||
abortRef.current?.abort();
|
|
||||||
const ctrl = new AbortController();
|
const ctrl = new AbortController();
|
||||||
abortRef.current = ctrl;
|
complianceAbortRef.current = ctrl;
|
||||||
|
|
||||||
setState({ ...INITIAL_STATE, status: 'streaming', stageLabel: 'Starting…', meta });
|
setState({ ...INITIAL_STATE, status: 'streaming', stageLabel: 'Starting…', meta });
|
||||||
|
|
||||||
@@ -124,7 +91,7 @@ export function useComplianceAnalysis() {
|
|||||||
if (j.type === 'stage') {
|
if (j.type === 'stage') {
|
||||||
setState(s => ({ ...s, stageLabel: j.label ?? '', stageKey: j.stage ?? '' }));
|
setState(s => ({ ...s, stageLabel: j.label ?? '', stageKey: j.stage ?? '' }));
|
||||||
} else if (j.type === 'source') {
|
} else if (j.type === 'source') {
|
||||||
const src: SourceEvent = {
|
const src: ComplianceSourceEvent = {
|
||||||
standard: j.standard ?? '',
|
standard: j.standard ?? '',
|
||||||
clause: j.clause ?? '',
|
clause: j.clause ?? '',
|
||||||
score: j.score ?? 0,
|
score: j.score ?? 0,
|
||||||
@@ -133,7 +100,7 @@ export function useComplianceAnalysis() {
|
|||||||
};
|
};
|
||||||
setState(s => ({ ...s, sources: [...s.sources, src] }));
|
setState(s => ({ ...s, sources: [...s.sources, src] }));
|
||||||
} else if (j.type === 'finding') {
|
} else if (j.type === 'finding') {
|
||||||
const finding: FindingEvent = {
|
const finding: ComplianceFindingEvent = {
|
||||||
title: j.title ?? '',
|
title: j.title ?? '',
|
||||||
desc: j.desc ?? '',
|
desc: j.desc ?? '',
|
||||||
status: j.status ?? 'info',
|
status: j.status ?? 'info',
|
||||||
@@ -141,7 +108,7 @@ export function useComplianceAnalysis() {
|
|||||||
};
|
};
|
||||||
setState(s => ({ ...s, findings: [...s.findings, finding] }));
|
setState(s => ({ ...s, findings: [...s.findings, finding] }));
|
||||||
} else if (j.type === 'done') {
|
} else if (j.type === 'done') {
|
||||||
const payload: DonePayload = {
|
const payload: ComplianceDonePayload = {
|
||||||
conclusion: j.conclusion ?? '',
|
conclusion: j.conclusion ?? '',
|
||||||
actions: j.actions ?? [],
|
actions: j.actions ?? [],
|
||||||
risk_score: j.risk_score ?? 0,
|
risk_score: j.risk_score ?? 0,
|
||||||
@@ -162,7 +129,7 @@ export function useComplianceAnalysis() {
|
|||||||
if (e instanceof Error && e.name === 'AbortError') return;
|
if (e instanceof Error && e.name === 'AbortError') return;
|
||||||
setState(s => ({ ...s, status: 'error', errorText: String(e) }));
|
setState(s => ({ ...s, status: 'error', errorText: String(e) }));
|
||||||
}
|
}
|
||||||
}, []);
|
}, [setState, complianceAbortRef]);
|
||||||
|
|
||||||
return { state, run, reset };
|
return { state, run, reset };
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
import { useState, useEffect, useRef } from 'react';
|
import { useState, useEffect, useRef } from 'react';
|
||||||
import { Topbar } from '../../components/layout/Topbar';
|
import { Topbar } from '../../components/layout/Topbar';
|
||||||
import { RefreshCw, Play, Square, ExternalLink } from 'lucide-react';
|
import { RefreshCw, Play, Square, ExternalLink } from 'lucide-react';
|
||||||
|
import { usePageState } from '../../contexts';
|
||||||
|
import type { PerceptionSignal } from '../../contexts';
|
||||||
|
|
||||||
const TOKEN_KEY = 'auth_token';
|
const TOKEN_KEY = 'auth_token';
|
||||||
function authHeader(): Record<string, string> {
|
function authHeader(): Record<string, string> {
|
||||||
@@ -8,18 +10,6 @@ function authHeader(): Record<string, string> {
|
|||||||
return t ? { Authorization: `Bearer ${t}` } : {};
|
return t ? { Authorization: `Bearer ${t}` } : {};
|
||||||
}
|
}
|
||||||
|
|
||||||
interface Signal {
|
|
||||||
id: string;
|
|
||||||
source: string;
|
|
||||||
standard: string;
|
|
||||||
status: 'ok' | 'warn' | 'risk' | 'info';
|
|
||||||
title: string;
|
|
||||||
summary: string;
|
|
||||||
date: string;
|
|
||||||
tags: string[];
|
|
||||||
impact: 'High' | 'Medium' | 'Low';
|
|
||||||
}
|
|
||||||
|
|
||||||
interface Stats {
|
interface Stats {
|
||||||
total: number;
|
total: number;
|
||||||
high_impact: number;
|
high_impact: number;
|
||||||
@@ -27,29 +17,17 @@ interface Stats {
|
|||||||
last_90_days: number;
|
last_90_days: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface DocResult {
|
|
||||||
score: number;
|
|
||||||
name: string;
|
|
||||||
clause: string;
|
|
||||||
snippet: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
const SOURCES = ['All', 'MIIT', 'UN-ECE', 'ISO', 'GB Comm.', 'EUR-Lex', 'IATF'];
|
const SOURCES = ['All', 'MIIT', 'UN-ECE', 'ISO', 'GB Comm.', 'EUR-Lex', 'IATF'];
|
||||||
const IMPACTS = ['All', 'High', 'Medium', 'Low'];
|
const IMPACTS = ['All', 'High', 'Medium', 'Low'];
|
||||||
|
|
||||||
// Backend /api/v1/perception/stats returns:
|
// Backend event → Signal
|
||||||
// { total, high_impact, medium_impact, last_90_days } — field names match, ✓
|
function mapEvent(e: Record<string, unknown>): PerceptionSignal {
|
||||||
|
|
||||||
// Backend /api/v1/perception/events returns:
|
|
||||||
// { events: [{ id, title, summary, source, standard, impact_level, published_at, tags, status }] }
|
|
||||||
// Map backend event fields → frontend Signal shape
|
|
||||||
function mapEvent(e: Record<string, unknown>): Signal {
|
|
||||||
const impact = String(e.impact_level ?? '').toLowerCase();
|
const impact = String(e.impact_level ?? '').toLowerCase();
|
||||||
const backendStatus = String(e.status ?? '').toLowerCase();
|
const backendStatus = String(e.status ?? '').toLowerCase();
|
||||||
return {
|
return {
|
||||||
id: String(e.id ?? e.event_id ?? ''),
|
id: String(e.id ?? e.event_id ?? ''),
|
||||||
source: String(e.source ?? ''),
|
source: String(e.source ?? ''),
|
||||||
standard: String(e.standard ?? e.regulation_id ?? ''),
|
standard: String(e.standard ?? e.standard_code ?? e.regulation_id ?? ''),
|
||||||
status: backendStatus === 'high' || backendStatus === 'urgent' ? 'risk'
|
status: backendStatus === 'high' || backendStatus === 'urgent' ? 'risk'
|
||||||
: backendStatus === 'medium' || backendStatus === 'draft' ? 'warn'
|
: backendStatus === 'medium' || backendStatus === 'draft' ? 'warn'
|
||||||
: backendStatus === 'low' || backendStatus === 'final' ? 'ok'
|
: backendStatus === 'low' || backendStatus === 'final' ? 'ok'
|
||||||
@@ -62,50 +40,40 @@ function mapEvent(e: Record<string, unknown>): Signal {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const MOCK_SIGNALS: Signal[] = [
|
const MOCK_SIGNALS: PerceptionSignal[] = [
|
||||||
{
|
{
|
||||||
id: '1', source: 'EUR-Lex', standard: 'EU/2024/1689', status: 'risk',
|
id: '1', source: 'EUR-Lex', standard: 'EU/2024/1689', status: 'risk',
|
||||||
title: 'EU AI Act — High-risk AI in vehicles',
|
title: 'EU AI Act — High-risk AI in vehicles',
|
||||||
summary: 'Article 9 mandates risk management systems for automotive AI classifying as high-risk under Annex III point 3.',
|
summary: 'Article 9 mandates risk management systems for automotive AI classifying as high-risk under Annex III point 3.',
|
||||||
date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High'
|
date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High',
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
id: '2', source: 'MIIT', standard: 'Draft-2025-08', status: 'warn',
|
id: '2', source: 'MIIT', standard: 'Draft-2025-08', status: 'warn',
|
||||||
title: 'MIIT Draft — in-vehicle AI training data',
|
title: 'MIIT Draft — in-vehicle AI training data',
|
||||||
summary: 'Draft regulation requires OEM data provenance documentation and OTA audit trails for AI systems.',
|
summary: 'Draft regulation requires OEM data provenance documentation and OTA audit trails for AI systems.',
|
||||||
date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High'
|
date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High',
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
id: '3', source: 'ISO', standard: 'ISO/SAE 21434:2021/Amd1', status: 'info',
|
id: '3', source: 'ISO', standard: 'ISO/SAE 21434:2021/Amd1', status: 'info',
|
||||||
title: 'ISO/SAE 21434 Amendment 1',
|
title: 'ISO/SAE 21434 Amendment 1',
|
||||||
summary: 'Amendment clarifies CSMS scope for software-only updates and vulnerability disclosure timelines.',
|
summary: 'Amendment clarifies CSMS scope for software-only updates and vulnerability disclosure timelines.',
|
||||||
date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium'
|
date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium',
|
||||||
},
|
},
|
||||||
{
|
|
||||||
id: '4', source: 'UN-ECE', standard: 'UNECE WP.29 R155', status: 'ok',
|
|
||||||
title: 'UNECE R155 Corrigendum',
|
|
||||||
summary: 'Editorial corrections to cybersecurity management system requirements. No substantive changes.',
|
|
||||||
date: '2025-09-12', tags: ['type-approval', 'UNECE'], impact: 'Low'
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const MOCK_DOCS: DocResult[] = [
|
|
||||||
{ score: 94, name: 'Vehicle AI Safety Manual v3.2', clause: '§4.2.1', snippet: 'The risk management process shall identify and evaluate risks arising from AI system decisions in safety-critical scenarios...' },
|
|
||||||
{ score: 87, name: 'ADAS System Requirements', clause: '§7.1', snippet: 'Automated driving functions must document training data lineage and model performance envelopes prior to deployment.' },
|
|
||||||
{ score: 71, name: 'Type Approval Documentation', clause: 'Annex B', snippet: 'Cybersecurity management system certification requires third-party audit of AI decision audit logs retention policy.' },
|
|
||||||
];
|
];
|
||||||
|
|
||||||
export function PerceptionPage() {
|
export function PerceptionPage() {
|
||||||
const [stats, setStats] = useState<Stats | null>(null);
|
// Persistent state lives in PageStateContext — survives route changes
|
||||||
const [signals, setSignals] = useState<Signal[]>(MOCK_SIGNALS);
|
const { perceptionState, setPerceptionState, perceptionAbortRef, perceptionCrawlAbortRef } = usePageState();
|
||||||
const [searchQuery, setSearchQuery] = useState('');
|
const { signals, searchQuery, sourceFilter, impactFilter, selectedId, aiOutput, detailTab, crawlStatus } = perceptionState;
|
||||||
const [sourceFilter, setSourceFilter] = useState('All');
|
|
||||||
const [impactFilter, setImpactFilter] = useState('All');
|
|
||||||
const [selected, setSelected] = useState<Signal | null>(null);
|
|
||||||
const [streaming, setStreaming] = useState(false);
|
|
||||||
const [aiOutput, setAiOutput] = useState('');
|
|
||||||
const abortRef = useRef<AbortController | null>(null);
|
|
||||||
|
|
||||||
|
// Stats and selectedFull are lightweight to re-fetch on mount
|
||||||
|
const [stats, setStats] = useState<Stats | null>(null);
|
||||||
|
const [streaming, setStreaming] = useState(false);
|
||||||
|
const [crawling, setCrawling] = useState(false);
|
||||||
|
// Full event detail — re-fetched when selected changes or page mounts with a selection
|
||||||
|
const [selectedFull, setSelectedFull] = useState<Record<string, unknown> | null>(null);
|
||||||
|
|
||||||
|
// Re-fetch stats every time the page mounts
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
fetch('/api/v1/perception/stats', { headers: authHeader() })
|
fetch('/api/v1/perception/stats', { headers: authHeader() })
|
||||||
.then(r => r.json())
|
.then(r => r.json())
|
||||||
@@ -113,16 +81,36 @@ export function PerceptionPage() {
|
|||||||
.catch(() => setStats({ total: 47, high_impact: 7, medium_impact: 18, last_90_days: 14 }));
|
.catch(() => setStats({ total: 47, high_impact: 7, medium_impact: 18, last_90_days: 14 }));
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
|
// Fetch signal list on first mount only (if empty), otherwise preserve context state
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
|
if (signals.length > 0) return; // already loaded
|
||||||
fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
|
fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
|
||||||
.then(r => r.json())
|
.then(r => r.json())
|
||||||
.then(d => {
|
.then(d => {
|
||||||
if (Array.isArray(d?.events) && d.events.length > 0) {
|
if (Array.isArray(d?.events) && d.events.length > 0) {
|
||||||
setSignals(d.events.map(mapEvent));
|
setPerceptionState(s => ({ ...s, signals: d.events.map(mapEvent) }));
|
||||||
|
} else {
|
||||||
|
setPerceptionState(s => ({ ...s, signals: MOCK_SIGNALS }));
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.catch(() => { /* keep mock data on error */ });
|
.catch(() => {
|
||||||
}, []);
|
setPerceptionState(s => ({ ...s, signals: s.signals.length > 0 ? s.signals : MOCK_SIGNALS }));
|
||||||
|
});
|
||||||
|
}, []); // eslint-disable-line react-hooks/exhaustive-deps
|
||||||
|
|
||||||
|
// Re-fetch full event detail when navigating back with a selected signal
|
||||||
|
useEffect(() => {
|
||||||
|
if (selectedId) {
|
||||||
|
fetch(`/api/v1/perception/events/${selectedId}`, { headers: authHeader() })
|
||||||
|
.then(r => r.ok ? r.json() : null)
|
||||||
|
.then(d => { if (d) setSelectedFull(d); })
|
||||||
|
.catch(() => {});
|
||||||
|
} else {
|
||||||
|
setSelectedFull(null);
|
||||||
|
}
|
||||||
|
}, [selectedId]);
|
||||||
|
|
||||||
|
const selected = signals.find(s => s.id === selectedId) ?? null;
|
||||||
|
|
||||||
const filtered = signals.filter(s => {
|
const filtered = signals.filter(s => {
|
||||||
if (sourceFilter !== 'All' && s.source !== sourceFilter) return false;
|
if (sourceFilter !== 'All' && s.source !== sourceFilter) return false;
|
||||||
@@ -137,13 +125,20 @@ export function PerceptionPage() {
|
|||||||
function runAnalysis() {
|
function runAnalysis() {
|
||||||
if (!selected) return;
|
if (!selected) return;
|
||||||
setStreaming(true);
|
setStreaming(true);
|
||||||
setAiOutput('');
|
setPerceptionState(s => ({ ...s, aiOutput: '' }));
|
||||||
const ctrl = new AbortController();
|
const ctrl = new AbortController();
|
||||||
abortRef.current = ctrl;
|
perceptionAbortRef.current = ctrl;
|
||||||
// Backend: POST /api/v1/perception/events/{id}/analyze → SSE stream
|
fetch(`/api/v1/perception/events/${selected.id}/analyze`, {
|
||||||
fetch(`/api/v1/perception/events/${selected.id}/analyze`, { method: 'POST', headers: authHeader(), signal: ctrl.signal })
|
method: 'POST',
|
||||||
|
headers: authHeader(),
|
||||||
|
signal: ctrl.signal,
|
||||||
|
})
|
||||||
.then(async res => {
|
.then(async res => {
|
||||||
if (!res.body) { setAiOutput('No stream available.'); setStreaming(false); return; }
|
if (!res.body) {
|
||||||
|
setPerceptionState(s => ({ ...s, aiOutput: 'No stream available.' }));
|
||||||
|
setStreaming(false);
|
||||||
|
return;
|
||||||
|
}
|
||||||
const reader = res.body.getReader();
|
const reader = res.body.getReader();
|
||||||
const dec = new TextDecoder();
|
const dec = new TextDecoder();
|
||||||
let buf = '';
|
let buf = '';
|
||||||
@@ -160,30 +155,99 @@ export function PerceptionPage() {
|
|||||||
if (!raw || raw === '[DONE]') continue;
|
if (!raw || raw === '[DONE]') continue;
|
||||||
try {
|
try {
|
||||||
const j = JSON.parse(raw);
|
const j = JSON.parse(raw);
|
||||||
if (j.text) setAiOutput(p => p + j.text);
|
if (j.text) setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j.text }));
|
||||||
else if (typeof j === 'string') setAiOutput(p => p + j);
|
else if (typeof j === 'string') setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j }));
|
||||||
} catch {
|
} catch {
|
||||||
setAiOutput(p => p + raw);
|
setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + raw }));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
setStreaming(false);
|
setStreaming(false);
|
||||||
})
|
})
|
||||||
.catch(e => {
|
.catch(e => {
|
||||||
if (e.name !== 'AbortError') setAiOutput('Analysis failed. Check API connection.');
|
if (e.name !== 'AbortError') setPerceptionState(s => ({ ...s, aiOutput: 'Analysis failed. Check API connection.' }));
|
||||||
setStreaming(false);
|
setStreaming(false);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function stopAnalysis() {
|
function stopAnalysis() {
|
||||||
abortRef.current?.abort();
|
perceptionAbortRef.current?.abort();
|
||||||
setStreaming(false);
|
setStreaming(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
function selectSignal(sig: Signal) {
|
async function runCrawl() {
|
||||||
setSelected(sig);
|
setCrawling(true);
|
||||||
setAiOutput('');
|
setPerceptionState(s => ({ ...s, crawlStatus: '正在连接数据源...' }));
|
||||||
|
try {
|
||||||
|
const res = await fetch('/api/v1/perception/crawl', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json', ...authHeader() },
|
||||||
|
body: JSON.stringify({}),
|
||||||
|
});
|
||||||
|
if (!res.body) {
|
||||||
|
setPerceptionState(s => ({ ...s, crawlStatus: 'No stream' }));
|
||||||
|
setCrawling(false);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const reader = res.body.getReader();
|
||||||
|
const dec = new TextDecoder();
|
||||||
|
let buf = '';
|
||||||
|
while (true) {
|
||||||
|
const { done, value } = await reader.read();
|
||||||
|
if (done) break;
|
||||||
|
buf += dec.decode(value);
|
||||||
|
const parts = buf.split('\n\n');
|
||||||
|
buf = parts.pop() ?? '';
|
||||||
|
for (const block of parts) {
|
||||||
|
const eventLine = block.split('\n').find(l => l.startsWith('event: '));
|
||||||
|
const dataLine = block.split('\n').find(l => l.startsWith('data: '));
|
||||||
|
const evtName = eventLine?.slice(7).trim();
|
||||||
|
const raw = dataLine?.slice(6).trim();
|
||||||
|
if (!raw) continue;
|
||||||
|
try {
|
||||||
|
const d = JSON.parse(raw);
|
||||||
|
if (evtName === 'progress') {
|
||||||
|
setPerceptionState(s => ({
|
||||||
|
...s,
|
||||||
|
crawlStatus: `${d.source}: ${d.stage === 'fetching' ? '抓取中...' : d.stage === 'processing' ? `处理 ${d.fetched} 条...` : `完成 +${d.new} 条`}`,
|
||||||
|
}));
|
||||||
|
} else if (evtName === 'done') {
|
||||||
|
setPerceptionState(s => ({ ...s, crawlStatus: `更新完成 — 新增 ${d.total_new} 条,更新 ${d.total_updated} 条` }));
|
||||||
|
fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
|
||||||
|
.then(r => r.json())
|
||||||
|
.then(d2 => {
|
||||||
|
if (Array.isArray(d2?.events)) {
|
||||||
|
setPerceptionState(s => ({ ...s, signals: d2.events.map(mapEvent) }));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} else if (evtName === 'error') {
|
||||||
|
setPerceptionState(s => ({
|
||||||
|
...s,
|
||||||
|
crawlStatus: `错误: ${typeof d === 'string' ? d : d.message}`,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
} catch { /* ignore */ }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (e: unknown) {
|
||||||
|
setPerceptionState(s => ({
|
||||||
|
...s,
|
||||||
|
crawlStatus: `连接失败: ${e instanceof Error ? e.message : String(e)}`,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
setCrawling(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
function selectSignal(sig: PerceptionSignal) {
|
||||||
|
setPerceptionState(s => ({
|
||||||
|
...s,
|
||||||
|
selectedId: sig.id,
|
||||||
|
aiOutput: '',
|
||||||
|
detailTab: 'overview',
|
||||||
|
}));
|
||||||
|
setSelectedFull(null);
|
||||||
setStreaming(false);
|
setStreaming(false);
|
||||||
|
perceptionAbortRef.current?.abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
@@ -197,10 +261,18 @@ export function PerceptionPage() {
|
|||||||
<input
|
<input
|
||||||
placeholder="Search signals..."
|
placeholder="Search signals..."
|
||||||
value={searchQuery}
|
value={searchQuery}
|
||||||
onChange={e => setSearchQuery(e.target.value)}
|
onChange={e => setPerceptionState(s => ({ ...s, searchQuery: e.target.value }))}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
<button className="btn sm"><RefreshCw size={13} />Refresh</button>
|
<button className="btn sm primary" onClick={runCrawl} disabled={crawling}>
|
||||||
|
<RefreshCw size={13} className={crawling ? 'spin' : ''} />
|
||||||
|
{crawling ? '抓取中...' : '刷新数据源'}
|
||||||
|
</button>
|
||||||
|
{crawlStatus && (
|
||||||
|
<span style={{ fontSize: 12, color: 'var(--text-secondary)', marginLeft: 8 }}>
|
||||||
|
{crawlStatus}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
</>
|
</>
|
||||||
}
|
}
|
||||||
/>
|
/>
|
||||||
@@ -227,13 +299,25 @@ export function PerceptionPage() {
|
|||||||
<div className="filter-bar">
|
<div className="filter-bar">
|
||||||
<div className="chip-group">
|
<div className="chip-group">
|
||||||
{SOURCES.map(s => (
|
{SOURCES.map(s => (
|
||||||
<button key={s} className={`chip${sourceFilter === s ? ' active' : ''}`} onClick={() => setSourceFilter(s)}>{s}</button>
|
<button
|
||||||
|
key={s}
|
||||||
|
className={`chip${sourceFilter === s ? ' active' : ''}`}
|
||||||
|
onClick={() => setPerceptionState(st => ({ ...st, sourceFilter: s }))}
|
||||||
|
>
|
||||||
|
{s}
|
||||||
|
</button>
|
||||||
))}
|
))}
|
||||||
</div>
|
</div>
|
||||||
<div className="filter-sep" />
|
<div className="filter-sep" />
|
||||||
<div className="chip-group">
|
<div className="chip-group">
|
||||||
{IMPACTS.map(i => (
|
{IMPACTS.map(i => (
|
||||||
<button key={i} className={`chip${impactFilter === i ? ' active' : ''}`} onClick={() => setImpactFilter(i)}>{i}</button>
|
<button
|
||||||
|
key={i}
|
||||||
|
className={`chip${impactFilter === i ? ' active' : ''}`}
|
||||||
|
onClick={() => setPerceptionState(st => ({ ...st, impactFilter: i }))}
|
||||||
|
>
|
||||||
|
{i}
|
||||||
|
</button>
|
||||||
))}
|
))}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -243,7 +327,7 @@ export function PerceptionPage() {
|
|||||||
{filtered.map(sig => (
|
{filtered.map(sig => (
|
||||||
<div
|
<div
|
||||||
key={sig.id}
|
key={sig.id}
|
||||||
className={`ev-card${selected?.id === sig.id ? ' selected' : ''}`}
|
className={`ev-card${selectedId === sig.id ? ' selected' : ''}`}
|
||||||
onClick={() => selectSignal(sig)}
|
onClick={() => selectSignal(sig)}
|
||||||
>
|
>
|
||||||
<div className="ev-top">
|
<div className="ev-top">
|
||||||
@@ -277,8 +361,11 @@ export function PerceptionPage() {
|
|||||||
<span className="source-tag">{selected.source}</span>
|
<span className="source-tag">{selected.source}</span>
|
||||||
<span className="ev-std">{selected.standard}</span>
|
<span className="ev-std">{selected.standard}</span>
|
||||||
<span className={`status ${selected.status}`}>
|
<span className={`status ${selected.status}`}>
|
||||||
{selected.status === 'risk' ? 'Urgent' : 'Published'}
|
{selected.status === 'risk' ? 'Urgent' : selected.status === 'warn' ? 'Draft' : 'Published'}
|
||||||
</span>
|
</span>
|
||||||
|
{selectedFull?.change_summary && (
|
||||||
|
<span className="status warn" style={{ marginLeft: 'auto' }}>CHANGED</span>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
<div className="detail-title">{selected.title}</div>
|
<div className="detail-title">{selected.title}</div>
|
||||||
<p className="detail-summary">{selected.summary}</p>
|
<p className="detail-summary">{selected.summary}</p>
|
||||||
@@ -287,23 +374,160 @@ export function PerceptionPage() {
|
|||||||
? <button className="btn sm primary" onClick={runAnalysis}><Play size={12} />Run impact analysis</button>
|
? <button className="btn sm primary" onClick={runAnalysis}><Play size={12} />Run impact analysis</button>
|
||||||
: <button className="btn sm" onClick={stopAnalysis}><Square size={12} />Stop</button>
|
: <button className="btn sm" onClick={stopAnalysis}><Square size={12} />Stop</button>
|
||||||
}
|
}
|
||||||
<button className="btn sm"><ExternalLink size={12} />Source</button>
|
{selected && (
|
||||||
|
<a
|
||||||
|
href={(selectedFull?.full_text_url as string) || '#'}
|
||||||
|
target="_blank"
|
||||||
|
rel="noopener noreferrer"
|
||||||
|
className="btn sm"
|
||||||
|
>
|
||||||
|
<ExternalLink size={12} />Source
|
||||||
|
</a>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="card docs-card">
|
<div className="detail-tabs">
|
||||||
<div className="card-header">Affected documents</div>
|
{(['overview', 'obligations', 'assessment', 'diff'] as const).map(tab => (
|
||||||
{MOCK_DOCS.map(d => (
|
<button
|
||||||
<div key={d.name} className="doc-row">
|
key={tab}
|
||||||
<span className="doc-score">{d.score}%</span>
|
className={`detail-tab${detailTab === tab ? ' active' : ''}${tab === 'diff' && !selectedFull?.change_summary ? ' disabled' : ''}`}
|
||||||
<div>
|
onClick={() => {
|
||||||
<div className="doc-name">{d.name} <span className="doc-clause">{d.clause}</span></div>
|
if (tab !== 'diff' || selectedFull?.change_summary) {
|
||||||
<div className="doc-snippet">{d.snippet}</div>
|
setPerceptionState(s => ({ ...s, detailTab: tab }));
|
||||||
</div>
|
}
|
||||||
</div>
|
}}
|
||||||
|
>
|
||||||
|
{tab === 'overview' ? '概览' : tab === 'obligations' ? '义务条款' : tab === 'assessment' ? '影响评估' : '变更对比'}
|
||||||
|
</button>
|
||||||
))}
|
))}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{detailTab === 'overview' && (
|
||||||
|
<div className="card">
|
||||||
|
<div className="card-header">Scope & Summary</div>
|
||||||
|
<p className="detail-summary" style={{ marginTop: 8 }}>
|
||||||
|
{(selectedFull?.scope as string) || selected.summary}
|
||||||
|
</p>
|
||||||
|
{selectedFull?.penalties && (
|
||||||
|
<p style={{ fontSize: 13, color: 'var(--danger)', marginTop: 6 }}>
|
||||||
|
⚠ {selectedFull.penalties as string}
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{detailTab === 'obligations' && (
|
||||||
|
<div className="card">
|
||||||
|
<div className="card-header">义务条款</div>
|
||||||
|
{(() => {
|
||||||
|
const obs = (selectedFull?.obligations as Array<Record<string, string>>) || [];
|
||||||
|
const deadlines = (selectedFull?.deadlines as Array<Record<string, string>>) || [];
|
||||||
|
return obs.length === 0 && deadlines.length === 0 ? (
|
||||||
|
<p className="detail-summary" style={{ marginTop: 8 }}>暂无结构化数据。点击右上角"Run impact analysis"触发提取。</p>
|
||||||
|
) : (
|
||||||
|
<>
|
||||||
|
{obs.length > 0 && (
|
||||||
|
<table style={{ width: '100%', fontSize: 13, borderCollapse: 'collapse', marginTop: 8 }}>
|
||||||
|
<thead>
|
||||||
|
<tr style={{ borderBottom: '1px solid var(--border)' }}>
|
||||||
|
<th style={{ textAlign: 'left', padding: '4px 8px' }}>义务描述</th>
|
||||||
|
<th style={{ textAlign: 'left', padding: '4px 8px', width: 80 }}>主体</th>
|
||||||
|
<th style={{ textAlign: 'left', padding: '4px 8px', width: 60 }}>类型</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{obs.map((ob, i) => (
|
||||||
|
<tr key={i} style={{ borderBottom: '1px solid var(--border-faint)' }}>
|
||||||
|
<td style={{ padding: '6px 8px' }}>{ob.text}</td>
|
||||||
|
<td style={{ padding: '6px 8px', color: 'var(--text-secondary)' }}>{ob.subject}</td>
|
||||||
|
<td style={{ padding: '6px 8px' }}>
|
||||||
|
<span className={`status ${ob.deontic === 'must' || ob.deontic === 'shall' ? 'risk' : ob.deontic === 'prohibited' ? 'risk' : 'info'}`}>
|
||||||
|
{ob.deontic}
|
||||||
|
</span>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
))}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
)}
|
||||||
|
{deadlines.length > 0 && (
|
||||||
|
<div style={{ marginTop: 12 }}>
|
||||||
|
<div className="card-header">截止日期</div>
|
||||||
|
{deadlines.map((d, i) => (
|
||||||
|
<div key={i} style={{ fontSize: 13, padding: '4px 0', display: 'flex', gap: 12 }}>
|
||||||
|
<span style={{ fontWeight: 600, color: 'var(--danger)' }}>{d.date || '待定'}</span>
|
||||||
|
<span style={{ color: 'var(--text-secondary)' }}>{d.description}</span>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
|
);
|
||||||
|
})()}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{detailTab === 'assessment' && (
|
||||||
|
<div className="card docs-card">
|
||||||
|
<div className="card-header">Affected documents</div>
|
||||||
|
{(() => {
|
||||||
|
const docs = (selectedFull?.affected_docs as Array<Record<string, unknown>>);
|
||||||
|
const displayDocs = docs && docs.length > 0 ? docs : [];
|
||||||
|
return displayDocs.length === 0
|
||||||
|
? <p className="detail-summary" style={{ marginTop: 8 }}>No affected documents found.</p>
|
||||||
|
: displayDocs.map((d, i) => (
|
||||||
|
<div key={i} className="doc-row">
|
||||||
|
<span className="doc-score">{Math.round(Number(d.score ?? 0) * 100)}%</span>
|
||||||
|
<div>
|
||||||
|
<div className="doc-name">
|
||||||
|
{String(d.doc_name || '')}
|
||||||
|
<span className="doc-clause">{String(d.key_clauses || d.clause || '')}</span>
|
||||||
|
</div>
|
||||||
|
{d.snippet && <div className="doc-snippet">{String(d.snippet)}</div>}
|
||||||
|
{d.recommendation && (
|
||||||
|
<div style={{ fontSize: 12, color: 'var(--accent)', marginTop: 2 }}>→ {String(d.recommendation)}</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
));
|
||||||
|
})()}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{detailTab === 'diff' && selectedFull?.change_summary && (
|
||||||
|
<div className="card">
|
||||||
|
<div className="card-header">变更对比</div>
|
||||||
|
<p style={{ fontSize: 13, color: 'var(--text-secondary)', marginTop: 8 }}>
|
||||||
|
{selectedFull.change_summary as string}
|
||||||
|
</p>
|
||||||
|
{(() => {
|
||||||
|
const sections = (selectedFull.changed_sections as Array<Record<string, unknown>>) || [];
|
||||||
|
return sections.map((s, i) => (
|
||||||
|
<div key={i} style={{ marginTop: 12, borderTop: '1px solid var(--border)', paddingTop: 10 }}>
|
||||||
|
<div style={{ display: 'flex', gap: 8, marginBottom: 6 }}>
|
||||||
|
<span className={`status ${s.change_type === 'tightened' || s.change_type === 'added' ? 'risk' : s.change_type === 'removed' ? 'warn' : 'info'}`}>
|
||||||
|
{String(s.change_type)}
|
||||||
|
</span>
|
||||||
|
<span style={{ fontSize: 12, color: 'var(--text-secondary)' }}>cosine: {String(s.similarity)}</span>
|
||||||
|
</div>
|
||||||
|
<div style={{ display: 'grid', gridTemplateColumns: '1fr 1fr', gap: 8, fontSize: 12 }}>
|
||||||
|
<div style={{ background: 'var(--danger-bg)', padding: 8, borderRadius: 4 }}>
|
||||||
|
<div style={{ fontWeight: 600, marginBottom: 4 }}>旧版</div>
|
||||||
|
{String(s.old_text || '')}
|
||||||
|
</div>
|
||||||
|
<div style={{ background: 'var(--success-bg)', padding: 8, borderRadius: 4 }}>
|
||||||
|
<div style={{ fontWeight: 600, marginBottom: 4 }}>新版</div>
|
||||||
|
{String(s.new_text || '')}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{s.summary && <p style={{ fontSize: 12, marginTop: 6, color: 'var(--text-secondary)' }}>{String(s.summary)}</p>}
|
||||||
|
</div>
|
||||||
|
));
|
||||||
|
})()}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{(aiOutput || streaming) && (
|
{(aiOutput || streaming) && (
|
||||||
<div className="card ai-card">
|
<div className="card ai-card">
|
||||||
<div className="card-header">AI Impact Analysis</div>
|
<div className="card-header">AI Impact Analysis</div>
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
import { useState, useRef, useEffect, useCallback } from 'react';
|
import { useRef, useEffect, useCallback, useState } from 'react';
|
||||||
import { Topbar } from '../../components/layout/Topbar';
|
import { Topbar } from '../../components/layout/Topbar';
|
||||||
import { Send, Download } from 'lucide-react';
|
import { Send, Download } from 'lucide-react';
|
||||||
|
import { usePageState } from '../../contexts';
|
||||||
|
import type { RagCitation } from '../../contexts';
|
||||||
|
|
||||||
const TOKEN_KEY = 'auth_token';
|
const TOKEN_KEY = 'auth_token';
|
||||||
function authHeader(): Record<string, string> {
|
function authHeader(): Record<string, string> {
|
||||||
@@ -8,26 +10,8 @@ function authHeader(): Record<string, string> {
|
|||||||
return t ? { Authorization: `Bearer ${t}` } : {};
|
return t ? { Authorization: `Bearer ${t}` } : {};
|
||||||
}
|
}
|
||||||
|
|
||||||
interface Message {
|
|
||||||
id: string;
|
|
||||||
role: 'user' | 'assistant';
|
|
||||||
text: string;
|
|
||||||
// citation indices mentioned in this assistant message (1-based, matching citations array)
|
|
||||||
citationRefs?: number[];
|
|
||||||
}
|
|
||||||
|
|
||||||
interface Citation {
|
|
||||||
index: number; // 1-based, matches [N] markers in text
|
|
||||||
score: number; // 0–100 display percentage
|
|
||||||
name: string; // doc_name
|
|
||||||
clause: string; // section_title or clause
|
|
||||||
snippet: string; // preview text
|
|
||||||
docId?: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Map a raw source doc from the backend "retrieved" event to our Citation shape.
|
// Map a raw source doc from the backend "retrieved" event to our Citation shape.
|
||||||
// Backend fields: { id, score(0-1), preview, doc_name, clause, doc_id }
|
function mapSource(s: Record<string, unknown>, idx: number): RagCitation {
|
||||||
function mapSource(s: Record<string, unknown>, idx: number): Citation {
|
|
||||||
const rawScore = typeof s.score === 'number' ? s.score : 0;
|
const rawScore = typeof s.score === 'number' ? s.score : 0;
|
||||||
const displayScore = rawScore <= 1 ? Math.round(rawScore * 100) : Math.round(rawScore);
|
const displayScore = rawScore <= 1 ? Math.round(rawScore * 100) : Math.round(rawScore);
|
||||||
return {
|
return {
|
||||||
@@ -73,25 +57,21 @@ const MOCK_QUICK = [
|
|||||||
];
|
];
|
||||||
|
|
||||||
export function RagChatPage() {
|
export function RagChatPage() {
|
||||||
const [messages, setMessages] = useState<Message[]>([
|
// All persistent state lives in PageStateContext — survives route changes
|
||||||
{
|
const { ragState, setRagState, ragStreamingRef, ragAbortRef } = usePageState();
|
||||||
id: 'init', role: 'assistant',
|
const { messages, citations, sessionId, inputDraft } = ragState;
|
||||||
text: 'Hello! I can answer questions about your indexed regulations and compliance documents. Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.',
|
|
||||||
}
|
// Local-only UI state: highlighted citation and streaming indicator
|
||||||
]);
|
// These are fine to reset on navigation since they're transient UI feedback
|
||||||
const [quickPrompts, setQuickPrompts] = useState<string[]>(MOCK_QUICK);
|
|
||||||
const [input, setInput] = useState('');
|
|
||||||
const [streaming, setStreaming] = useState(false);
|
|
||||||
const [citations, setCitations] = useState<Citation[]>([]);
|
|
||||||
const [highlightedCit, setHighlightedCit] = useState<number | null>(null);
|
const [highlightedCit, setHighlightedCit] = useState<number | null>(null);
|
||||||
const [sessionId, setSessionId] = useState<string | null>(null);
|
const [streaming, setStreaming] = useState(ragStreamingRef.current);
|
||||||
|
const [quickPrompts, setQuickPrompts] = useState<string[]>(MOCK_QUICK);
|
||||||
|
|
||||||
const bottomRef = useRef<HTMLDivElement>(null);
|
const bottomRef = useRef<HTMLDivElement>(null);
|
||||||
const citRailRef = useRef<HTMLDivElement>(null);
|
const citRailRef = useRef<HTMLDivElement>(null);
|
||||||
const citItemRefs = useRef<Record<number, HTMLDivElement | null>>({});
|
const citItemRefs = useRef<Record<number, HTMLDivElement | null>>({});
|
||||||
const abortRef = useRef<AbortController | null>(null);
|
|
||||||
|
|
||||||
// Fetch quick questions from backend on mount
|
// Fetch quick questions from backend on mount (only once per session)
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
fetch('/api/v1/rag/quick-questions', { headers: authHeader() })
|
fetch('/api/v1/rag/quick-questions', { headers: authHeader() })
|
||||||
.then(r => r.json())
|
.then(r => r.json())
|
||||||
@@ -115,26 +95,33 @@ export function RagChatPage() {
|
|||||||
if (el) {
|
if (el) {
|
||||||
el.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
|
el.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
|
||||||
}
|
}
|
||||||
// Clear highlight after 3s
|
|
||||||
setTimeout(() => setHighlightedCit(h => h === n ? null : h), 3000);
|
setTimeout(() => setHighlightedCit(h => h === n ? null : h), 3000);
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
async function send(text?: string) {
|
async function send(text?: string) {
|
||||||
const q = (text ?? input).trim();
|
const q = (text ?? inputDraft).trim();
|
||||||
if (!q || streaming) return;
|
if (!q || ragStreamingRef.current) return;
|
||||||
setInput('');
|
setRagState(s => ({ ...s, inputDraft: '' }));
|
||||||
|
|
||||||
const userMsg: Message = { id: Date.now().toString(), role: 'user', text: q };
|
|
||||||
setMessages(m => [...m, userMsg]);
|
|
||||||
|
|
||||||
|
const userMsgId = Date.now().toString();
|
||||||
const assistantId = (Date.now() + 1).toString();
|
const assistantId = (Date.now() + 1).toString();
|
||||||
setMessages(m => [...m, { id: assistantId, role: 'assistant', text: '' }]);
|
|
||||||
|
setRagState(s => ({
|
||||||
|
...s,
|
||||||
|
messages: [
|
||||||
|
...s.messages,
|
||||||
|
{ id: userMsgId, role: 'user', text: q },
|
||||||
|
{ id: assistantId, role: 'assistant', text: '' },
|
||||||
|
],
|
||||||
|
citations: [],
|
||||||
|
}));
|
||||||
|
|
||||||
|
ragStreamingRef.current = true;
|
||||||
setStreaming(true);
|
setStreaming(true);
|
||||||
setCitations([]);
|
|
||||||
setHighlightedCit(null);
|
setHighlightedCit(null);
|
||||||
|
|
||||||
const ctrl = new AbortController();
|
const ctrl = new AbortController();
|
||||||
abortRef.current = ctrl;
|
ragAbortRef.current = ctrl;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const body: Record<string, unknown> = { query: q, top_k: 5 };
|
const body: Record<string, unknown> = { query: q, top_k: 5 };
|
||||||
@@ -151,14 +138,13 @@ export function RagChatPage() {
|
|||||||
const reader = res.body.getReader();
|
const reader = res.body.getReader();
|
||||||
const dec = new TextDecoder();
|
const dec = new TextDecoder();
|
||||||
let buffer = '';
|
let buffer = '';
|
||||||
const newCitations: Citation[] = [];
|
const newCitations: RagCitation[] = [];
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
const { done, value } = await reader.read();
|
const { done, value } = await reader.read();
|
||||||
if (done) break;
|
if (done) break;
|
||||||
buffer += dec.decode(value, { stream: true });
|
buffer += dec.decode(value, { stream: true });
|
||||||
|
|
||||||
// SSE blocks separated by double newline
|
|
||||||
const blocks = buffer.split('\n\n');
|
const blocks = buffer.split('\n\n');
|
||||||
buffer = blocks.pop() ?? '';
|
buffer = blocks.pop() ?? '';
|
||||||
|
|
||||||
@@ -171,56 +157,62 @@ export function RagChatPage() {
|
|||||||
const j = JSON.parse(raw);
|
const j = JSON.parse(raw);
|
||||||
|
|
||||||
if (j.type === 'session') {
|
if (j.type === 'session') {
|
||||||
// Backend assigned a session_id — persist for next request
|
if (j.session_id) setRagState(s => ({ ...s, sessionId: j.session_id }));
|
||||||
if (j.session_id) setSessionId(j.session_id);
|
|
||||||
|
|
||||||
} else if (j.type === 'retrieved' && Array.isArray(j.docs)) {
|
} else if (j.type === 'retrieved' && Array.isArray(j.docs)) {
|
||||||
// Sources arrive before the answer starts
|
|
||||||
const mapped = j.docs.map((d: Record<string, unknown>, i: number) => mapSource(d, i + 1));
|
const mapped = j.docs.map((d: Record<string, unknown>, i: number) => mapSource(d, i + 1));
|
||||||
newCitations.push(...mapped);
|
newCitations.push(...mapped);
|
||||||
setCitations([...mapped]);
|
setRagState(s => ({ ...s, citations: [...mapped] }));
|
||||||
|
|
||||||
} else if (j.type === 'chunk' && j.text) {
|
} else if (j.type === 'chunk' && j.text) {
|
||||||
setMessages(m => m.map(msg =>
|
setRagState(s => ({
|
||||||
msg.id === assistantId
|
...s,
|
||||||
? { ...msg, text: msg.text + (j.text as string) }
|
messages: s.messages.map(msg =>
|
||||||
: msg
|
msg.id === assistantId
|
||||||
));
|
? { ...msg, text: msg.text + (j.text as string) }
|
||||||
|
: msg
|
||||||
} else if (j.type === 'status') {
|
),
|
||||||
// Status message (e.g. "找到N条相关法规…") — could show in UI if desired
|
}));
|
||||||
// For now we ignore it to keep the bubble clean
|
|
||||||
|
|
||||||
} else if (j.type === 'done') {
|
} else if (j.type === 'done') {
|
||||||
// Extract which citation numbers appear in the final answer
|
setRagState(s => ({
|
||||||
setMessages(m => m.map(msg => {
|
...s,
|
||||||
if (msg.id !== assistantId) return msg;
|
messages: s.messages.map(msg => {
|
||||||
const refs = [...new Set(
|
if (msg.id !== assistantId) return msg;
|
||||||
[...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10))
|
const refs = [...new Set(
|
||||||
)].filter(n => n >= 1 && n <= newCitations.length);
|
[...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10))
|
||||||
return { ...msg, citationRefs: refs };
|
)].filter(n => n >= 1 && n <= newCitations.length);
|
||||||
|
return { ...msg, citationRefs: refs };
|
||||||
|
}),
|
||||||
}));
|
}));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
} else if (j.type === 'error') {
|
} else if (j.type === 'error') {
|
||||||
setMessages(m => m.map(msg =>
|
setRagState(s => ({
|
||||||
msg.id === assistantId
|
...s,
|
||||||
? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` }
|
messages: s.messages.map(msg =>
|
||||||
: msg
|
msg.id === assistantId
|
||||||
));
|
? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` }
|
||||||
|
: msg
|
||||||
|
),
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
} catch { /* malformed JSON chunk, skip */ }
|
} catch { /* malformed JSON chunk, skip */ }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (e: unknown) {
|
} catch (e: unknown) {
|
||||||
if (e instanceof Error && e.name !== 'AbortError') {
|
if (e instanceof Error && e.name !== 'AbortError') {
|
||||||
setMessages(m => m.map(msg =>
|
setRagState(s => ({
|
||||||
msg.id === assistantId
|
...s,
|
||||||
? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' }
|
messages: s.messages.map(msg =>
|
||||||
: msg
|
msg.id === assistantId
|
||||||
));
|
? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' }
|
||||||
|
: msg
|
||||||
|
),
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
|
ragStreamingRef.current = false;
|
||||||
setStreaming(false);
|
setStreaming(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -291,15 +283,15 @@ export function RagChatPage() {
|
|||||||
<textarea
|
<textarea
|
||||||
className="composer-input"
|
className="composer-input"
|
||||||
placeholder="Ask about your regulations…"
|
placeholder="Ask about your regulations…"
|
||||||
value={input}
|
value={inputDraft}
|
||||||
onChange={e => setInput(e.target.value)}
|
onChange={e => setRagState(s => ({ ...s, inputDraft: e.target.value }))}
|
||||||
onKeyDown={e => { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); } }}
|
onKeyDown={e => { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); } }}
|
||||||
rows={2}
|
rows={2}
|
||||||
/>
|
/>
|
||||||
<button
|
<button
|
||||||
className="btn primary"
|
className="btn primary"
|
||||||
onClick={() => send()}
|
onClick={() => send()}
|
||||||
disabled={!input.trim() || streaming}
|
disabled={!inputDraft.trim() || streaming}
|
||||||
>
|
>
|
||||||
<Send size={14} />
|
<Send size={14} />
|
||||||
</button>
|
</button>
|
||||||
|
|||||||
@@ -1108,3 +1108,33 @@ mark.comp-highlight {
|
|||||||
transition: color 0.15s;
|
transition: color 0.15s;
|
||||||
}
|
}
|
||||||
.logout-btn:hover { color: var(--danger); }
|
.logout-btn:hover { color: var(--danger); }
|
||||||
|
|
||||||
|
/* ── Detail Tabs (Perception) ──────────────────── */
|
||||||
|
.detail-tabs {
|
||||||
|
display: flex;
|
||||||
|
gap: 2px;
|
||||||
|
margin: 8px 0 0;
|
||||||
|
border-bottom: 1px solid var(--border);
|
||||||
|
padding-bottom: 0;
|
||||||
|
}
|
||||||
|
.detail-tab {
|
||||||
|
background: none;
|
||||||
|
border: none;
|
||||||
|
border-bottom: 2px solid transparent;
|
||||||
|
padding: 6px 14px;
|
||||||
|
font-size: 13px;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
cursor: pointer;
|
||||||
|
transition: color 0.15s, border-color 0.15s;
|
||||||
|
}
|
||||||
|
.detail-tab:hover { color: var(--text); }
|
||||||
|
.detail-tab.active {
|
||||||
|
color: var(--accent);
|
||||||
|
border-bottom-color: var(--accent);
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
.detail-tab.disabled {
|
||||||
|
opacity: 0.35;
|
||||||
|
cursor: not-allowed;
|
||||||
|
}
|
||||||
|
.spin { animation: spin 1s linear infinite; }
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ dependencies = [
|
|||||||
"loguru>=0.7.0",
|
"loguru>=0.7.0",
|
||||||
"tenacity>=8.2.0",
|
"tenacity>=8.2.0",
|
||||||
"httpx>=0.24.0",
|
"httpx>=0.24.0",
|
||||||
|
"beautifulsoup4>=4.12.0",
|
||||||
|
"lxml>=5.0.0",
|
||||||
"alibabacloud-docmind-api20220711>=1.0.6",
|
"alibabacloud-docmind-api20220711>=1.0.6",
|
||||||
"alibabacloud-tea-openapi>=0.3.11",
|
"alibabacloud-tea-openapi>=0.3.11",
|
||||||
"alibabacloud-tea-util>=0.3.13",
|
"alibabacloud-tea-util>=0.3.13",
|
||||||
|
|||||||
Reference in New Issue
Block a user