fix some things

This commit is contained in:
2026-06-08 11:16:28 +08:00
parent 9fea9c6a53
commit e7963b267e
34 changed files with 5195 additions and 246 deletions

5
.env
View File

@@ -54,6 +54,11 @@ DOCUMENT_REPOSITORY_BACKEND=json
# Default false: processing runs in FastAPI's threadpool — no external worker needed. # Default false: processing runs in FastAPI's threadpool — no external worker needed.
USE_CELERY_WORKER=false USE_CELERY_WORKER=false
# ===== 法规感知爬取配置 =====
PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
PERCEPTION_MAX_EVENTS_PER_SOURCE=100
PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
# ===== API配置 ===== # ===== API配置 =====
API_HOST=0.0.0.0 API_HOST=0.0.0.0
API_PORT=8000 API_PORT=8000

View File

@@ -55,6 +55,11 @@ DOCUMENT_REPOSITORY_BACKEND=json
# Default false: document processing runs in FastAPI's threadpool (no external worker needed). # Default false: document processing runs in FastAPI's threadpool (no external worker needed).
USE_CELERY_WORKER=false USE_CELERY_WORKER=false
# ===== 法规感知爬取配置 =====
PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
PERCEPTION_MAX_EVENTS_PER_SOURCE=100
PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
# ===== 阿里云文档解析 ===== # ===== 阿里云文档解析 =====
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret

View File

@@ -4,10 +4,12 @@ from __future__ import annotations
import json import json
from fastapi import APIRouter, Query from fastapi import APIRouter, Depends, Query
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
from app.shared.bootstrap import get_perception_service from app.shared.bootstrap import get_crawl_service, get_event_store, get_perception_service
from app.api.dependencies.auth import get_current_user
from app.domain.auth.models import UserClaims
from app.shared.async_utils import iter_in_thread from app.shared.async_utils import iter_in_thread
router = APIRouter(prefix="/perception", tags=["智能感知"]) router = APIRouter(prefix="/perception", tags=["智能感知"])
@@ -65,3 +67,77 @@ async def analyze_event(event_id: str):
"X-Accel-Buffering": "no", "X-Accel-Buffering": "no",
}, },
) )
@router.post("/crawl")
async def run_crawl(
    body: dict = None,
    current_user: UserClaims = Depends(get_current_user),
):
    """Start a manual crawl and stream its progress as Server-Sent Events.

    Body (optional): {"sources": ["CATARC", "国标委·强制性", "EUR-Lex"]}
    When ``sources`` is missing or empty, every registered source is crawled.

    Each item produced by the crawl service becomes one SSE frame; dict and
    list payloads are JSON-encoded, anything else is interpolated as-is.
    """
    requested: list[str] | None = body.get("sources") if body else None
    service = get_crawl_service()

    def _to_sse(item: dict) -> str:
        # Render one progress dict as a single SSE frame.
        payload = item.get("data", "")
        if isinstance(payload, (dict, list)):
            payload = json.dumps(payload, ensure_ascii=False)
        return f"event: {item.get('event', 'message')}\ndata: {payload}\n\n"

    async def crawl_stream():
        # The crawl generator is synchronous; iter_in_thread bridges it into
        # this async generator.
        async for item in iter_in_thread(service.run_crawl(sources=requested)):
            yield _to_sse(item)

    return StreamingResponse(
        crawl_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
@router.post("/events/{event_id}/process")
async def process_event(
    event_id: str,
    current_user: UserClaims = Depends(get_current_user),
):
    """Run the LLM enrichment pipeline (extract + assess) for a single event.

    The semantic diff step is not executed here: it needs the previous version
    of the text, which is only available during a crawl.

    Raises:
        HTTPException: 404 when no event with ``event_id`` exists.

    Returns:
        ``{"status": "ok", "event_id": ..., "processed_at": <ISO-8601 UTC>}``.
    """
    from datetime import UTC, datetime

    from fastapi import HTTPException
    from fastapi.concurrency import run_in_threadpool

    from app.infrastructure.perception.llm_pipeline import LlmPipeline
    from app.shared.bootstrap import get_retrieval_service

    event = get_perception_service().get_event(event_id)
    if not event:
        raise HTTPException(status_code=404, detail=f"Event {event_id} not found")

    def _enrich() -> dict:
        # Work on a copy so a failure half-way through the pipeline does not
        # leave the stored record partially updated.
        enriched = dict(event)
        pipeline = LlmPipeline()
        enriched.update(pipeline.extract_structure(enriched))
        enriched["affected_docs"] = pipeline.assess_impact(enriched, get_retrieval_service())
        enriched["processed_at"] = datetime.now(UTC).isoformat()
        get_event_store().upsert(enriched)
        return enriched

    # The pipeline performs blocking LLM / retrieval calls; running it in the
    # threadpool keeps the event loop free to serve other requests.
    enriched = await run_in_threadpool(_enrich)
    return {"status": "ok", "event_id": event_id, "processed_at": enriched["processed_at"]}
@router.get("/events/{event_id}/diff")
async def get_event_diff(event_id: str):
    """Return the stored semantic diff for an event.

    Responds with 404 when the event does not exist, and also when the event
    has no ``change_summary`` (a diff is only recorded once an event has been
    crawled at least twice).
    """
    from fastapi import HTTPException

    event = get_perception_service().get_event(event_id)
    if not event:
        raise HTTPException(status_code=404, detail=f"Event {event_id} not found")

    summary = event.get("change_summary")
    if not summary:
        raise HTTPException(status_code=404, detail="No diff available for this event")

    return {
        "event_id": event_id,
        "change_summary": summary,
        "changed_sections": event.get("changed_sections") or [],
        "previous_hash": event.get("previous_hash"),
        "content_hash": event.get("content_hash"),
    }

View File

@@ -0,0 +1,147 @@
"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""
from __future__ import annotations
import hashlib
from typing import Any, Generator
from loguru import logger
from app.infrastructure.perception.base_event_store import BaseEventStore
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from app.infrastructure.perception.llm_pipeline import LlmPipeline
def _event_id(source: str, standard_code: str) -> str:
    """Derive a stable 12-hex-character ID from ``source`` and ``standard_code``.

    The two parts are joined with ``-`` and hashed with SHA-256; the first
    12 characters of the hex digest are returned.
    """
    key = f"{source}-{standard_code}".encode()
    digest = hashlib.sha256(key).hexdigest()
    return digest[:12]
def _content_hash(raw_text: str) -> str:
    """Return the full SHA-256 hex digest of ``raw_text`` (default str encoding)."""
    hasher = hashlib.sha256()
    hasher.update(raw_text.encode())
    return hasher.hexdigest()
def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
    """Flatten a crawled ``RawEvent`` into the dict shape handed to the event store.

    ``impact_level`` starts as ``"medium"`` and ``previous_hash`` as ``None``;
    ``raw.raw_text`` is not copied into the record.
    """
    record: dict = {"id": event_id}
    # Attributes copied verbatim, in the order the record lists them.
    for attr in (
        "source",
        "source_label",
        "standard_code",
        "title",
        "summary",
        "full_text_url",
        "status",
    ):
        record[attr] = getattr(raw, attr)
    record["impact_level"] = "medium"
    for attr in ("published_at", "effective_at", "category", "tags"):
        record[attr] = getattr(raw, attr)
    record["content_hash"] = content_hash
    record["previous_hash"] = None
    return record
class CrawlService:
    """Orchestrate crawlers, hash-based change detection, and LLM enrichment."""

    def __init__(
        self,
        crawlers: dict[str, BaseCrawler],
        event_store: BaseEventStore,
        llm_pipeline: LlmPipeline,
        retrieval_service: Any,
        max_events_per_source: int = 100,
    ) -> None:
        """Wire the service to its collaborators.

        Args:
            crawlers: Crawler instances keyed by the source name clients pass in.
            event_store: Persistence port used for lookups and upserts.
            llm_pipeline: Enrichment pipeline (structure, impact, diff).
            retrieval_service: Passed through to ``llm_pipeline.assess_impact``.
            max_events_per_source: ``limit`` handed to each crawler's ``fetch``.
                Defaults to 100, the value that used to be hard-coded.
        """
        self._crawlers = crawlers
        self._store = event_store
        self._pipeline = llm_pipeline
        self._retrieval = retrieval_service
        self._max_events_per_source = max_events_per_source

    def run_crawl(
        self, sources: list[str] | None = None
    ) -> Generator[dict, None, None]:
        """Crawl the selected sources and yield SSE-ready progress dicts.

        Args:
            sources: Source keys to crawl; ``None`` or an empty list means all
                registered crawlers.

        Yields:
            ``{"event": ..., "data": ...}`` dicts: per-source ``progress``
            (stages ``fetching``, ``processing``, ``done``), ``error`` for an
            unknown source or a failed fetch, and a final ``done`` with totals.
        """
        targets = sources or list(self._crawlers.keys())
        total_new = 0
        total_updated = 0

        for source_key in targets:
            crawler = self._crawlers.get(source_key)
            if not crawler:
                yield {"event": "error", "data": f"Unknown source: {source_key}"}
                continue

            yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
            try:
                raw_events = crawler.fetch(limit=self._max_events_per_source)
            except Exception as exc:
                # One failing source must not abort the remaining ones.
                logger.exception("Crawler failed source={}", source_key)
                yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
                continue

            yield {
                "event": "progress",
                "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
            }

            new_count = 0
            updated_count = 0
            for raw in raw_events:
                outcome = self._ingest(raw)
                if outcome == "new":
                    new_count += 1
                elif outcome == "updated":
                    updated_count += 1

            total_new += new_count
            total_updated += updated_count
            yield {
                "event": "progress",
                "data": {
                    "source": source_key,
                    "stage": "done",
                    "new": new_count,
                    "updated": updated_count,
                },
            }

        yield {
            "event": "done",
            "data": {"total_new": total_new, "total_updated": total_updated},
        }

    def _ingest(self, raw: RawEvent) -> str | None:
        """Store one raw event; return ``"new"``, ``"updated"``, or ``None`` if unchanged."""
        eid = _event_id(raw.source, raw.standard_code)
        # Fall back to the title so events without crawled text still get a hash.
        new_hash = _content_hash(raw.raw_text or raw.title)
        existing = self._store.get(eid)
        if existing and existing.get("content_hash") == new_hash:
            return None

        is_update = existing is not None
        old_text = existing.get("summary", "") if is_update else ""

        event_dict = _raw_to_dict(raw, eid, new_hash)
        event_dict["previous_hash"] = existing.get("content_hash") if is_update else None

        self._enrich(event_dict)
        if is_update and old_text and raw.raw_text:
            # The previous full text is not stored, so the diff compares the
            # previously stored summary against the new raw text.
            self._attach_diff(event_dict, old_text, raw.raw_text)

        self._store.upsert(event_dict)
        return "updated" if is_update else "new"

    def _enrich(self, event_dict: dict) -> None:
        """Best-effort LLM enrichment; a failing step is logged and skipped."""
        eid = event_dict["id"]
        try:
            event_dict.update(self._pipeline.extract_structure(event_dict))
        except Exception as exc:
            logger.warning("Structure extraction failed id={} err={}", eid, exc)
        try:
            event_dict["affected_docs"] = self._pipeline.assess_impact(event_dict, self._retrieval)
        except Exception as exc:
            logger.warning("Impact assessment failed id={} err={}", eid, exc)

    def _attach_diff(self, event_dict: dict, old_text: str, new_text: str) -> None:
        """Best-effort semantic diff; a failure is logged and the event is kept."""
        try:
            diff = self._pipeline.compute_diff(old_text, new_text)
            event_dict["change_summary"] = diff.get("change_summary")
            event_dict["changed_sections"] = diff.get("changed_sections")
        except Exception as exc:
            logger.warning("Diff failed id={} err={}", event_dict["id"], exc)

View File

@@ -6,7 +6,7 @@ import json
from typing import Generator from typing import Generator
from app.application.knowledge.services import KnowledgeRetrievalService from app.application.knowledge.services import KnowledgeRetrievalService
from app.infrastructure.perception.mock_event_store import MockEventStore from app.infrastructure.perception.base_event_store import BaseEventStore
from app.services.llm.llm_factory import get_llm_client from app.services.llm.llm_factory import get_llm_client
from app.config.settings import settings from app.config.settings import settings
@@ -22,7 +22,7 @@ class PerceptionService:
def __init__( def __init__(
self, self,
event_store: MockEventStore, event_store: BaseEventStore,
retrieval_service: KnowledgeRetrievalService, retrieval_service: KnowledgeRetrievalService,
) -> None: ) -> None:
self._store = event_store self._store = event_store

View File

@@ -87,6 +87,18 @@ class Settings(BaseSettings):
# no external worker needed. Switch to True only when a Celery worker is running. # no external worker needed. Switch to True only when a Celery worker is running.
use_celery_worker: bool = Field(default=False, description="使用 Celery Worker 异步处理文档 (需要 Worker 运行中)") use_celery_worker: bool = Field(default=False, description="使用 Celery Worker 异步处理文档 (需要 Worker 运行中)")
# ── Perception crawl ──────────────────────────────────────────────────────
perception_crawl_timeout_seconds: int = Field(
default=120, description="HTTP timeout for regulatory source crawlers."
)
perception_max_events_per_source: int = Field(
default=100, description="Maximum events fetched per source per crawl run."
)
perception_diff_similarity_threshold: float = Field(
default=0.85,
description="Cosine similarity below which a paragraph is flagged as changed.",
)
# Keep configuration setup explicit so runtime behavior is easy to reason about. # Keep configuration setup explicit so runtime behavior is easy to reason about.
api_host: str = Field(default="0.0.0.0", description="API服务地址") api_host: str = Field(default="0.0.0.0", description="API服务地址")
api_port: int = Field(default=8000, description="API服务端口") api_port: int = Field(default=8000, description="API服务端口")

View File

@@ -0,0 +1,39 @@
"""Abstract base class for regulatory event stores."""
from __future__ import annotations
from abc import ABC, abstractmethod
class BaseEventStore(ABC):
    """Port interface for regulatory event persistence.

    Events are exchanged as plain dicts. Every method is abstract, so a
    concrete store must implement all six before it can be instantiated.
    """

    @abstractmethod
    def all(self) -> list[dict]:
        """Return all events, most-recent first."""

    @abstractmethod
    def get(self, event_id: str) -> dict | None:
        """Return the event whose ID is ``event_id``, or None when absent."""

    @abstractmethod
    def filter(
        self,
        *,
        source: str | None = None,
        impact_level: str | None = None,
        limit: int = 50,
    ) -> list[dict]:
        """Return filtered events sorted by published_at descending.

        All arguments are keyword-only. ``source`` and ``impact_level`` are
        optional filters (None means "do not filter"); ``limit`` caps the
        number of events returned.
        """

    @abstractmethod
    def stats(self) -> dict:
        """Return {total, high_impact, medium_impact, low_impact, recent_90d}."""

    @abstractmethod
    def upsert(self, event: dict) -> None:
        """Insert the event, or update the stored record for the same event."""

    @abstractmethod
    def get_by_standard_code(self, standard_code: str) -> dict | None:
        """Return the most-recent event with matching standard_code, or None."""

View File

@@ -0,0 +1,43 @@
"""Shared utility functions for crawlers."""
from __future__ import annotations
import re
from datetime import date
# Date shapes recognised by parse_date, tried in order.
_DATE_PATTERNS = (
    re.compile(r"(\d{4})[/.-](\d{1,2})[/.-](\d{1,2})"),  # 2024-05-01, 2024/5/1, 2024.05.01
    re.compile(r"(\d{4})年(\d{1,2})月(\d{1,2})日?"),  # 2024年5月1日
)


def parse_date(text: str) -> str:
    """Return YYYY-MM-DD from common Chinese date formats, or today's date.

    Args:
        text: Free text that may contain a date. ``None`` and other non-string
            values (e.g. JSON nulls from an upstream API) are treated as empty.

    Returns:
        The first calendar-valid date found, as an ISO ``YYYY-MM-DD`` string.
        When nothing parses, today's date is returned rather than raising.
    """
    fallback = date.today().isoformat()
    if not isinstance(text, str):
        return fallback

    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue
        year, month, day = (int(part) for part in match.groups())
        try:
            return date(year, month, day).isoformat()
        except ValueError:
            # Digits matched but are not a real date (e.g. month 13);
            # give the next pattern a chance.
            continue
    return fallback
def extract_tags(standard_code: str, title: str) -> list[str]:
    """Build up to five keyword tags from a standard code and its title.

    Codes containing "GB" (case-insensitive) get "国家标准" plus "推荐性" when
    the code contains "/T" and "强制性" otherwise. Topic keywords found in the
    title follow, in a fixed order.
    """
    code = standard_code.upper()
    kind_tags: list[str] = []
    if "GB" in code:
        kind_tags = ["国家标准", "推荐性" if "/T" in code else "强制性"]

    topics = ("电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全")
    topic_tags = [topic for topic in topics if topic in title]
    return (kind_tags + topic_tags)[:5]

View File

@@ -0,0 +1,32 @@
"""Shared contracts for regulatory source crawlers."""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
@dataclass
class RawEvent:
    """Raw regulatory event returned by a crawler before enrichment.

    Only ``tags`` and ``raw_text`` have defaults; every other field must be
    supplied by the crawler.
    """

    source: str  # short source key, e.g. "CATARC" or "EUR-Lex"
    source_label: str  # human-readable name of the issuing body / feed
    standard_code: str  # standard or document identifier
    title: str
    summary: str
    full_text_url: str  # link to the detail page or full text
    status: str  # 'enacted' | 'draft' | 'consultation'
    published_at: str  # YYYY-MM-DD string
    effective_at: str | None  # None when the crawler has no effective date
    category: str
    tags: list[str] = field(default_factory=list)
    raw_text: str = ""  # full crawled text for hashing + LLM
class BaseCrawler(ABC):
    """Abstract regulatory source crawler.

    Subclasses implement ``fetch`` for one data source.
    """

    @abstractmethod
    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to `limit` recent events from the data source.

        Args:
            limit: Maximum number of events to return (default 50).

        Returns:
            The fetched events as ``RawEvent`` instances.
        """

View File

@@ -0,0 +1,83 @@
"""Crawler for CATARC automotive standard catalogue."""
from __future__ import annotations
from urllib.parse import urljoin
import httpx
from bs4 import BeautifulSoup
from loguru import logger
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from ._utils import extract_tags, parse_date
# Listing page; the crawler requests it with a ``?page=N`` query string.
_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
# Site root used to resolve relative detail links found in the listing table.
_HOST = "https://www.catarc.org.cn"
# Maps the status text shown in a listing row to a RawEvent.status value.
# Unknown status text falls back to "enacted" at the lookup site.
# NOTE(review): "废止" (abolished) is mapped to "enacted" — presumably because
# RawEvent.status has no value for withdrawn standards; confirm this is intended.
_STATUS_MAP = {
    "现行": "enacted",
    "即将实施": "enacted",
    "废止": "enacted",
    "征求意见": "consultation",
    "报批": "draft",
}
class CatarcCrawler(BaseCrawler):
    """Scrape the CATARC automotive standard list page."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to ``limit`` standards from the paginated listing.

        Pagination stops when the limit is reached, a request fails, the page
        cap is hit, or a page contributes no row that has not been seen
        before. The last condition guards against a server that ignores the
        ``page`` parameter and keeps returning the same listing.

        Args:
            limit: Maximum number of events to return.

        Returns:
            De-duplicated events in listing order; empty when the first
            request fails.
        """
        events: list[RawEvent] = []
        seen: set[tuple[str, str, str]] = set()
        page = 1
        max_pages = max(10, limit)

        while len(events) < limit and page <= max_pages:
            url = f"{_BASE_URL}?page={page}"
            try:
                resp = httpx.get(url, timeout=30, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                logger.warning("CATARC fetch failed page={} err={}", page, exc)
                break

            fresh: list[RawEvent] = []
            for row in BeautifulSoup(resp.text, "lxml").select("table tr"):
                event = self._parse_row(row, url)
                if event is None:
                    continue
                key = (event.standard_code, event.title, event.published_at)
                if key in seen:
                    continue
                seen.add(key)
                fresh.append(event)

            if not fresh:
                # Empty page, header-only page, or a repeat of earlier pages.
                break
            events.extend(fresh)
            page += 1

        return events[:limit]

    @staticmethod
    def _parse_row(row, page_url: str) -> RawEvent | None:
        """Convert one ``<tr>`` into a RawEvent, or None for rows with fewer than 3 cells."""
        cells = row.find_all("td")
        if len(cells) < 3:
            return None

        link = cells[0].find("a")
        standard_code = (link or cells[0]).get_text(strip=True)
        title = cells[1].get_text(strip=True)
        published_at = parse_date(cells[2].get_text(strip=True))
        # The status column is optional; unknown text defaults to "enacted".
        status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
        has_href = bool(link and link.get("href"))
        # Rows without a detail link point back at the listing page itself.
        detail_url = urljoin(_HOST, link["href"]) if has_href else page_url

        return RawEvent(
            source="CATARC",
            source_label="全国汽车标准化技术委员会",
            standard_code=standard_code,
            title=title,
            summary=title,
            full_text_url=detail_url,
            status=_STATUS_MAP.get(status_text, "enacted"),
            published_at=published_at,
            effective_at=None,
            category="汽车标准",
            tags=extract_tags(standard_code, title),
            raw_text=f"{standard_code} {title}",
        )

View File

@@ -0,0 +1,117 @@
"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
from __future__ import annotations
import re
from email.utils import parsedate_to_datetime
import httpx
from bs4 import BeautifulSoup
from loguru import logger
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from ._utils import parse_date
# RSS feeds polled by the EUR-Lex crawler, in order.
_EURLEX_RSS_URLS = [
    "https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
]
# An RSS item is kept only if its title or description contains one of these
# keywords (compared case-insensitively as plain substrings).
_AUTOMOTIVE_KEYWORDS = [
    "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
    "AI Act", "artificial intelligence", "cybersecurity", "software update",
    "R155", "R156", "汽车", "车辆",
]
# Lower-cased once at import time so the per-item check does not redo it.
_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]
def _is_automotive_relevant(title: str, description: str) -> bool:
    """Return True when the title or description mentions any tracked keyword.

    Matching is a case-insensitive substring test over the two texts joined
    by a single space.
    """
    haystack = f"{title} {description}".lower()
    for keyword in _AUTOMOTIVE_KEYWORDS_LOWER:
        if keyword in haystack:
            return True
    return False
def _extract_celex(url: str) -> str:
    """Return the CELEX number embedded in a EUR-Lex URL, or "" when absent.

    The URL is percent-decoded first so that both ``uri=CELEX:32019R2144`` and
    the encoded form ``uri=CELEX%3A32019R2144`` are recognised.
    """
    from urllib.parse import unquote

    match = re.search(r"CELEX[:/]([0-9A-Z]+)", unquote(url))
    return match.group(1) if match else ""
def _parse_rss_date(rfc2822: str) -> str:
    """Convert an RSS ``pubDate`` value to a YYYY-MM-DD string.

    The value is parsed as an RFC 2822 date first; if that raises, the shared
    ``parse_date`` helper is used instead.
    """
    try:
        published = parsedate_to_datetime(rfc2822).date()
    except Exception:
        return parse_date(rfc2822)
    return published.isoformat()
class EurlexCrawler(BaseCrawler):
    """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds."""

    @staticmethod
    def _child_text(item, tag_name: str) -> str:
        # Stripped text of the first matching child tag, "" when it is missing.
        tag = item.find(tag_name)
        return tag.get_text(strip=True) if tag else ""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return up to ``limit`` keyword-matching items from the configured feeds.

        A feed that cannot be downloaded is logged and skipped. Items whose
        title and description match no tracked keyword are dropped. The CELEX
        number from the item link is used as ``standard_code``; when the link
        has none, the first 60 characters of the title are used instead.
        """
        collected: list[RawEvent] = []

        for feed_url in _EURLEX_RSS_URLS:
            if len(collected) >= limit:
                break
            try:
                resp = httpx.get(feed_url, timeout=30, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                logger.warning("EUR-Lex RSS fetch failed url={} err={}", feed_url, exc)
                continue

            feed = BeautifulSoup(resp.content, "lxml-xml")
            for entry in feed.find_all("item"):
                if len(collected) >= limit:
                    break

                title = self._child_text(entry, "title")
                description = self._child_text(entry, "description")
                link = self._child_text(entry, "link")
                pub_date = self._child_text(entry, "pubDate")

                if not _is_automotive_relevant(title, description):
                    continue

                collected.append(RawEvent(
                    source="EUR-Lex",
                    source_label="欧盟官方公报",
                    standard_code=_extract_celex(link) or title[:60],
                    title=title,
                    summary=description[:500],
                    full_text_url=link,
                    status="enacted",
                    # Items without a pubDate keep an empty published_at.
                    published_at=_parse_rss_date(pub_date) if pub_date else "",
                    effective_at=None,
                    category="EU法规",
                    tags=_extract_eurlex_tags(title, description),
                    raw_text=f"{title}\n{description}",
                ))

        return collected[:limit]
def _extract_eurlex_tags(title: str, description: str) -> list[str]:
    """Derive up to five distinct tags from an RSS item's title and description.

    Keywords are matched case-insensitively as substrings. Several keywords
    can map to the same tag (e.g. "AI Act" and "artificial intelligence"), so
    each tag is emitted at most once, in first-match order.
    """
    haystack = f"{title} {description}".lower()
    tag_map = {
        "AI Act": "EU AI Act",
        "artificial intelligence": "EU AI Act",
        "R155": "UN R155",
        "R156": "UN R156",
        "cybersecurity": "网络安全",
        "emission": "排放",
        "autonomous": "自动驾驶",
        "ADAS": "ADAS",
    }
    tags: list[str] = []
    for keyword, tag in tag_map.items():
        if keyword.lower() in haystack and tag not in tags:
            tags.append(tag)
    return tags[:5]

View File

@@ -0,0 +1,92 @@
"""Crawlers for the 国标委 (SAMR) standard information platform."""
from __future__ import annotations
import httpx
from loguru import logger
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from ._utils import extract_tags, parse_date
# Listing endpoint queried page by page with the ``p.*`` parameters built in _fetch_page.
_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
# Sent with every request so the crawler identifies itself with a fixed User-Agent.
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
    """Request one listing page and return its ``rows`` list.

    Any failure (network error, HTTP error status, non-JSON body, unexpected
    payload shape) is logged as a warning and reported as an empty list.
    """
    query = {
        "p.p1": std_type,
        "p.p2": "",
        "p.p90": "circulation_date",
        "p.p91": "desc",
        "p.p6": page,
        "p.p7": page_size,
    }
    try:
        response = httpx.get(_BASE_URL, params=query, headers=_HEADERS, timeout=30)
        response.raise_for_status()
        rows = response.json().get("rows", [])
    except Exception as exc:
        logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
        return []
    # A present-but-null "rows" value is normalised to an empty list.
    return rows or []
def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
    """Convert one listing row into a RawEvent.

    Row values come from decoded JSON, so any field may be missing, null, or
    a non-string. Text fields are therefore coerced to ``str`` with empty
    fallbacks before they are searched or parsed.

    Args:
        row: One element of the listing's ``rows`` array.
        source_label: Label stored on the event (mandatory vs recommended list).
    """
    standard_code = str(row.get("std_code") or "")
    title = str(row.get("std_name") or standard_code)
    published_at = parse_date(str(row.get("release_date") or ""))

    effective_raw = str(row.get("implement_date") or "")
    effective_at = parse_date(effective_raw) if effective_raw else None

    status_text = str(row.get("std_status") or "")
    if "征求意见" in status_text:
        status = "consultation"
    elif "报批" in status_text or "草案" in status_text:
        status = "draft"
    else:
        status = "enacted"

    row_id = row.get("id")
    if row_id is None:
        # Keep the URL well-formed instead of embedding the text "None".
        row_id = ""

    return RawEvent(
        source="国标委",
        source_label=source_label,
        standard_code=standard_code,
        title=title,
        summary=title,
        full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row_id}",
        status=status,
        published_at=published_at,
        effective_at=effective_at,
        category=row.get("std_type") or "国家标准",
        tags=extract_tags(standard_code, title),
        raw_text=f"{standard_code} {title}",
    )
class _GuobiaoListCrawler(BaseCrawler):
    """Shared pagination loop for the 国标委 listing crawlers.

    Subclasses only set ``_std_type`` (the listing's type parameter) and
    ``_source_label`` (the label stored on each event).
    """

    _std_type: int
    _source_label: str
    _page_size: int = 20

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to ``limit`` standards, newest listing page first.

        Paging stops when enough events are collected, a page comes back
        empty (which is also how a failed request is reported), or the page
        cap of ``max(10, limit)`` is reached.
        """
        events: list[RawEvent] = []
        page = 1
        max_pages = max(10, limit)
        while len(events) < limit and page <= max_pages:
            rows = _fetch_page(std_type=self._std_type, page=page, page_size=self._page_size)
            if not rows:
                break
            events.extend(_row_to_raw_event(row, self._source_label) for row in rows)
            page += 1
        return events[:limit]


class GuobiaoMandatoryCrawler(_GuobiaoListCrawler):
    """Fetch mandatory national standards (强制性).

    No vehicle-specific filtering is applied; every row of the listing is returned.
    """

    _std_type = 1
    _source_label = "国标委·强制性"


class GuobiaoRecommendedCrawler(_GuobiaoListCrawler):
    """Fetch recommended national standards (推荐性).

    No vehicle-specific filtering is applied; every row of the listing is returned.
    """

    _std_type = 2
    _source_label = "国标委·推荐性"

View File

@@ -0,0 +1,241 @@
"""LLM-driven pipeline for regulatory event enrichment."""
from __future__ import annotations
import json
import math
from typing import Any
from loguru import logger
from app.config.settings import settings
from app.infrastructure.embedding.openai_compatible_embedding_provider import (
OpenAICompatibleEmbeddingProvider,
)
from app.services.llm.llm_factory import get_llm_client
# System prompt for the structure-extraction step.
_EXTRACT_SYSTEM = (
    "You are a regulatory compliance expert specialising in automotive standards "
    "(GB, UN-ECE, ISO, EU). Extract structured information from regulation text. "
    "Return valid JSON only — no markdown fences, no extra keys."
)
# System prompt for the impact-assessment step.
_ASSESS_SYSTEM = (
    "You are an automotive compliance analyst. Given a regulation and related document excerpts, "
    "identify which documents are affected and what actions are required. "
    "Return a JSON array only."
)
# System prompt for classifying a single changed paragraph pair in the diff step.
_DIFF_SYSTEM = (
    "You are a regulatory change analyst. Given an old and new version of a regulation paragraph, "
    "classify the type of change and summarise it. "
    "Return JSON only: {\"change_type\": \"tightened|relaxed|added|removed\", \"summary\": \"...\"}"
)
# Paragraph pairs whose embedding cosine similarity is below this value are
# treated as changed and sent to the LLM for classification.
_SIMILARITY_THRESHOLD = 0.85
def _cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two vectors; 0.0 when either has zero magnitude.

    Vectors are paired element-wise with ``zip``, so the dot product only
    covers the length of the shorter one.
    """
    magnitude_a = math.sqrt(sum(value * value for value in a))
    magnitude_b = math.sqrt(sum(value * value for value in b))
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0
    inner = sum(left * right for left, right in zip(a, b))
    return inner / (magnitude_a * magnitude_b)
def _llm_json(client: Any, messages: list[dict]) -> Any:
    """Send ``messages`` to the LLM and decode its reply as JSON.

    A reply that starts with a Markdown code fence has the fence (and an
    optional leading ``json`` tag) removed before decoding. Any exception —
    from the client call or from decoding — is logged and turned into None.
    """
    try:
        raw = (client.chat(messages).content or "").strip()
        if raw.startswith("```"):
            # Keep only what sits between the opening fence and the next one.
            raw = raw[3:].partition("```")[0]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw)
    except Exception as exc:
        logger.warning("LLM JSON parse failed: {}", exc)
        return None
    return parsed
class LlmPipeline:
    """Three-step enrichment pipeline for crawled regulatory events."""

    # Values accepted for an event's impact_level; anything else becomes "medium".
    _IMPACT_LEVELS = ("high", "medium", "low")

    def __init__(self) -> None:
        """Create the LLM client and the embedding provider from settings."""
        self._client = get_llm_client(
            provider=settings.llm_provider,
            model=settings.llm_model,
        )
        self._embedder = OpenAICompatibleEmbeddingProvider()

    # ------------------------------------------------------------------
    # Step 1: Structure extraction
    # ------------------------------------------------------------------

    def extract_structure(self, event: dict) -> dict:
        """Extract obligations, deadlines, scope, penalties, impact_level from event text.

        Callers merge the result straight into the event record, so the reply
        is reduced to exactly these five keys: an extra key from the model
        (e.g. ``id``) must never overwrite stored fields. Missing keys,
        wrongly typed lists and unknown impact levels fall back to defaults.

        Returns:
            A dict with keys ``obligations``, ``deadlines``, ``scope``,
            ``penalties`` and ``impact_level``; all defaults when the LLM
            reply is not a JSON object.
        """
        prompt = f"""Extract structured compliance information from this regulation:
Standard: {event.get('standard_code', '')}
Title: {event.get('title', '')}
Source: {event.get('source_label', '')}
Summary: {event.get('summary', '')}
Tags: {', '.join(event.get('tags') or [])}
Return JSON with exactly these keys:
{{
"obligations": [{{"text": "...", "deontic": "must|shall|may|prohibited", "subject": "...", "object": "...", "condition": ""}}],
"deadlines": [{{"date": "YYYY-MM-DD or null", "description": "..."}}],
"scope": "one sentence describing who/what this applies to",
"penalties": "one sentence on consequences of non-compliance, or null",
"impact_level": "high|medium|low"
}}"""
        messages = [
            {"role": "system", "content": _EXTRACT_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        result = _llm_json(self._client, messages)

        # Fresh defaults per call so callers never share mutable lists.
        structure: dict = {
            "obligations": [],
            "deadlines": [],
            "scope": "",
            "penalties": "",
            "impact_level": "medium",
        }
        if not isinstance(result, dict):
            return structure

        for key in structure:
            if key in result:
                structure[key] = result[key]
        for list_key in ("obligations", "deadlines"):
            if not isinstance(structure[list_key], list):
                structure[list_key] = []
        if structure["impact_level"] not in self._IMPACT_LEVELS:
            structure["impact_level"] = "medium"
        return structure

    # ------------------------------------------------------------------
    # Step 2: Impact assessment
    # ------------------------------------------------------------------

    def assess_impact(self, event: dict, retrieval_service: Any) -> list[dict]:
        """Use RAG to find affected documents and generate recommendations.

        Args:
            event: Event record; ``obligations`` (if present) sharpen the query.
            retrieval_service: Object exposing ``retrieve(query=..., top_k=...)``.

        Returns:
            The LLM's per-document assessments with scores replaced by the
            retrieval scores. When the LLM reply is not a JSON array, the raw
            retrieval excerpts are returned instead. An empty list means
            retrieval failed or found nothing.
        """
        obligations = event.get("obligations") or []
        # Obligations originate from LLM output, so tolerate malformed items.
        obligation_texts = " ".join(
            str(item.get("text") or "")
            for item in obligations[:3]
            if isinstance(item, dict)
        )
        query = f"{event.get('standard_code', '')} {event.get('title', '')} {obligation_texts}"
        try:
            chunks = retrieval_service.retrieve(query=query, top_k=5)
        except Exception as exc:
            logger.warning("RAG retrieval failed: {}", exc)
            return []
        if not chunks:
            return []

        # Keep the first chunk per document.
        seen: set[str] = set()
        doc_excerpts: list[dict] = []
        for chunk in chunks:
            if chunk.doc_id in seen:
                continue
            seen.add(chunk.doc_id)
            doc_excerpts.append({
                "doc_id": chunk.doc_id,
                "doc_name": chunk.doc_title,
                "score": round(float(chunk.score if chunk.score is not None else 0), 4),
                "snippet": (chunk.text or "")[:300],
                "clause": getattr(chunk, "section_title", "") or "",
            })

        context = "\n".join(
            f"[{d['doc_name']} {d['clause']}] score={d['score']}: {d['snippet']}"
            for d in doc_excerpts
        )
        prompt = f"""Regulation: {event.get('standard_code', '')} {event.get('title', '')}
Obligations: {obligation_texts or event.get('summary', '')}
Affected documents found in knowledge base:
{context}
For each document, assess impact and recommend action. Return JSON array:
[{{"doc_id":"...","doc_name":"...","score":0.0,"key_clauses":"...","recommendation":"one sentence action"}}]"""
        messages = [
            {"role": "system", "content": _ASSESS_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        result = _llm_json(self._client, messages)
        if not isinstance(result, list):
            return doc_excerpts

        # Trust retrieval scores over whatever number the model echoed back,
        # and drop array items that are not objects.
        score_map = {d["doc_id"]: d["score"] for d in doc_excerpts}
        assessed = [item for item in result if isinstance(item, dict)]
        for item in assessed:
            if item.get("doc_id") in score_map:
                item["score"] = score_map[item["doc_id"]]
        return assessed

    # ------------------------------------------------------------------
    # Step 3: Semantic diff
    # ------------------------------------------------------------------

    def compute_diff(self, old_text: str, new_text: str) -> dict:
        """Compare old and new regulation text; return changed sections and summary.

        Paragraphs (non-empty lines) are aligned by position. A pair counts as
        changed when the cosine similarity of its embeddings is below the
        configured threshold (``settings.perception_diff_similarity_threshold``,
        falling back to the module default); each changed pair is classified
        by the LLM. Unpaired trailing paragraphs are reported as added/removed.

        Returns:
            ``{"changed_sections": [...], "change_summary": str}``.
        """
        old_paras = [p.strip() for p in old_text.split("\n") if p.strip()]
        new_paras = [p.strip() for p in new_text.split("\n") if p.strip()]
        if not old_paras or not new_paras:
            return {"changed_sections": [], "change_summary": "No comparable text."}

        try:
            # One batched call for both versions.
            all_embeddings = self._embedder.embed_texts(old_paras + new_paras)
        except Exception as exc:
            logger.warning("Embedding for diff failed: {}", exc)
            return {"changed_sections": [], "change_summary": "Diff unavailable (embedding error)."}
        old_embeddings = all_embeddings[: len(old_paras)]
        new_embeddings = all_embeddings[len(old_paras):]

        threshold = getattr(
            settings, "perception_diff_similarity_threshold", _SIMILARITY_THRESHOLD
        )

        changed_sections: list[dict] = []
        for i in range(max(len(old_paras), len(new_paras))):
            if i >= len(old_paras):
                # New paragraph added
                changed_sections.append({
                    "old_text": "",
                    "new_text": new_paras[i][:300],
                    "similarity": 0.0,
                    "change_type": "added",
                    "summary": "New paragraph added.",
                })
                continue
            if i >= len(new_paras):
                # Old paragraph removed
                changed_sections.append({
                    "old_text": old_paras[i][:300],
                    "new_text": "",
                    "similarity": 0.0,
                    "change_type": "removed",
                    "summary": "Paragraph removed.",
                })
                continue

            # Both exist — compare via embeddings
            sim = _cosine(old_embeddings[i], new_embeddings[i])
            if sim >= threshold:
                continue
            messages = [
                {"role": "system", "content": _DIFF_SYSTEM},
                {"role": "user", "content": f"OLD: {old_paras[i][:500]}\nNEW: {new_paras[i][:500]}"},
            ]
            classification = _llm_json(self._client, messages)
            if not isinstance(classification, dict):
                # Failed call or a non-object reply: keep the section, unclassified.
                classification = {}
            changed_sections.append({
                "old_text": old_paras[i][:300],
                "new_text": new_paras[i][:300],
                "similarity": round(sim, 3),
                "change_type": str(classification.get("change_type") or "modified"),
                "summary": str(classification.get("summary") or ""),
            })

        if not changed_sections:
            change_summary = "No substantive changes detected between versions."
        else:
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # summary text is stable across runs (set ordering is not).
            change_types = dict.fromkeys(s["change_type"] for s in changed_sections)
            change_summary = (
                f"{len(changed_sections)} paragraph(s) changed: "
                + ", ".join(change_types)
                + ". "
                + changed_sections[0].get("summary", "")
            )
        return {"changed_sections": changed_sections, "change_summary": change_summary}

View File

@@ -4,6 +4,8 @@ from __future__ import annotations
from typing import Any from typing import Any
from app.infrastructure.perception.base_event_store import BaseEventStore
MOCK_EVENTS: list[dict[str, Any]] = [ MOCK_EVENTS: list[dict[str, Any]] = [
# ------------------------------------------------------------------ HIGH # ------------------------------------------------------------------ HIGH
{ {
@@ -379,18 +381,18 @@ MOCK_EVENTS: list[dict[str, Any]] = [
}, },
] ]
# Index for fast lookup class MockEventStore(BaseEventStore):
_EVENT_INDEX: dict[str, dict] = {e["id"]: e for e in MOCK_EVENTS}
class MockEventStore:
"""In-memory mock store for regulatory events.""" """In-memory mock store for regulatory events."""
def __init__(self) -> None:
self._events: list[dict] = [dict(e) for e in MOCK_EVENTS]
self._index: dict[str, dict] = {e["id"]: e for e in self._events}
def all(self) -> list[dict]: def all(self) -> list[dict]:
return list(MOCK_EVENTS) return list(self._events)
def get(self, event_id: str) -> dict | None: def get(self, event_id: str) -> dict | None:
return _EVENT_INDEX.get(event_id) return self._index.get(event_id)
def filter( def filter(
self, self,
@@ -399,23 +401,39 @@ class MockEventStore:
impact_level: str | None = None, impact_level: str | None = None,
limit: int = 50, limit: int = 50,
) -> list[dict]: ) -> list[dict]:
events = list(MOCK_EVENTS) events = list(self._events)
if source: if source:
events = [e for e in events if e["source"] == source] events = [e for e in events if e["source"] == source]
if impact_level: if impact_level:
events = [e for e in events if e["impact_level"] == impact_level] events = [e for e in events if e["impact_level"] == impact_level]
events.sort(key=lambda e: e["published_at"], reverse=True) events.sort(key=lambda e: e.get("published_at") or "", reverse=True)
return events[:limit] return events[:limit]
def stats(self) -> dict: def stats(self) -> dict:
from datetime import date, timedelta from datetime import date, timedelta
events = MOCK_EVENTS events = self._events
cutoff = (date.today() - timedelta(days=90)).isoformat() cutoff = (date.today() - timedelta(days=90)).isoformat()
return { return {
"total": len(events), "total": len(events),
"high_impact": sum(1 for e in events if e["impact_level"] == "high"), "high_impact": sum(1 for e in events if e["impact_level"] == "high"),
"medium_impact": sum(1 for e in events if e["impact_level"] == "medium"), "medium_impact": sum(1 for e in events if e["impact_level"] == "medium"),
"low_impact": sum(1 for e in events if e["impact_level"] == "low"), "low_impact": sum(1 for e in events if e["impact_level"] == "low"),
"recent_90d": sum(1 for e in events if e["published_at"] >= cutoff), "recent_90d": sum(1 for e in events if (e.get("published_at") or "") >= cutoff),
} }
def upsert(self, event: dict) -> None:
    """Insert or update an event in the in-memory list (used in tests).

    When an event with the same ``id`` is already stored, its dict is updated
    in place with the keys of *event*. Otherwise a shallow copy of *event* is
    appended and indexed, so later mutation of the caller's dict does not
    silently change the stored record (``__init__`` copies the seed events
    for the same reason).

    Raises:
        KeyError: if *event* has no ``"id"`` key.
    """
    event_id = event["id"]
    existing = self._index.get(event_id)
    if existing is not None:
        existing.update(event)
        return
    stored = dict(event)
    self._events.append(stored)
    self._index[event_id] = stored
def get_by_standard_code(self, standard_code: str) -> dict | None:
    """Return the most recently published event with a matching standard_code.

    Returns ``None`` when no stored event carries *standard_code*.
    """
    matches = [e for e in self._events if e.get("standard_code") == standard_code]
    if not matches:
        return None
    # ``or ""`` instead of a ``.get`` default: an event whose published_at is
    # explicitly None would otherwise make max() compare None with str and
    # raise TypeError. filter() and stats() normalise the value the same way.
    return max(matches, key=lambda e: e.get("published_at") or "")

View File

@@ -0,0 +1,225 @@
"""PostgreSQL-backed regulatory event store."""
from __future__ import annotations
import json
from contextlib import contextmanager
from datetime import UTC, date, datetime, timedelta
from typing import Any
import psycopg2
import psycopg2.extras
from psycopg2.pool import ThreadedConnectionPool
from app.config.settings import settings
from app.infrastructure.perception.base_event_store import BaseEventStore
# DDL executed by PostgresEventStore._ensure_schema() when a store is created.
# Every statement uses IF NOT EXISTS, so running it against an existing
# database is harmless.
_CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS regulation_events (
id TEXT PRIMARY KEY,
source TEXT NOT NULL,
source_label TEXT,
standard_code TEXT NOT NULL,
title TEXT NOT NULL,
summary TEXT,
full_text_url TEXT,
status TEXT,
impact_level TEXT,
published_at DATE,
effective_at DATE,
category TEXT,
tags TEXT[],
obligations JSONB,
deadlines JSONB,
scope TEXT,
penalties TEXT,
content_hash TEXT,
previous_hash TEXT,
change_summary TEXT,
changed_sections JSONB,
affected_docs JSONB,
crawled_at TIMESTAMPTZ DEFAULT now(),
processed_at TIMESTAMPTZ,
raw_storage_key TEXT
);
CREATE INDEX IF NOT EXISTS reg_events_source_date
ON regulation_events (source, published_at DESC);
CREATE INDEX IF NOT EXISTS reg_events_impact_date
ON regulation_events (impact_level, published_at DESC);
"""
# Column names that upsert() may write. Event keys outside this tuple are
# ignored. The names are interpolated into the SQL text, so they are taken
# from this fixed tuple and never from the event dict itself.
_ALL_COLUMNS = (
"id", "source", "source_label", "standard_code", "title", "summary",
"full_text_url", "status", "impact_level", "published_at", "effective_at",
"category", "tags", "obligations", "deadlines", "scope", "penalties",
"content_hash", "previous_hash", "change_summary", "changed_sections",
"affected_docs", "crawled_at", "processed_at", "raw_storage_key",
)
def _row_to_dict(row: dict[str, Any]) -> dict:
"""Convert a psycopg2 RealDictRow to a plain dict with serialized JSON fields."""
d = dict(row)
for field in ("obligations", "deadlines", "changed_sections", "affected_docs"):
val = d.get(field)
if isinstance(val, str):
d[field] = json.loads(val)
for date_field in ("published_at", "effective_at"):
val = d.get(date_field)
if isinstance(val, datetime):
d[date_field] = val.date().isoformat()
elif isinstance(val, date):
d[date_field] = val.isoformat()
for ts_field in ("crawled_at", "processed_at"):
val = d.get(ts_field)
if isinstance(val, datetime):
d[ts_field] = val.isoformat()
return d
class PostgresEventStore(BaseEventStore):
    """Regulatory event store backed by the PostgreSQL ``regulation_events`` table.

    Connections come from a small ``ThreadedConnectionPool``. Every public
    method borrows one connection for the duration of the call and hands it
    back afterwards. Rows are returned as plain dicts via ``_row_to_dict``.
    """

    # Columns declared JSONB in the schema; their values are JSON-encoded on write.
    _JSON_COLUMNS = ("obligations", "deadlines", "changed_sections", "affected_docs")

    # Counter keys returned by stats(); kept in sync with MockEventStore.stats().
    _STAT_KEYS = ("total", "high_impact", "medium_impact", "low_impact", "recent_90d")

    def __init__(self) -> None:
        """Open the connection pool from ``settings`` and create the schema if missing."""
        self._pool = ThreadedConnectionPool(
            minconn=1,
            maxconn=5,
            host=settings.postgres_host,
            port=settings.postgres_port,
            user=settings.postgres_user,
            password=settings.postgres_password,
            dbname=settings.postgres_db,
        )
        try:
            self._ensure_schema()
        except Exception:
            # Do not leave pooled connections open when start-up fails.
            self._pool.closeall()
            raise

    def _ensure_schema(self) -> None:
        """Run the idempotent DDL in ``_CREATE_TABLE``; roll back and re-raise on failure."""
        with self._conn() as conn:
            try:
                with conn.cursor() as cur:
                    cur.execute(_CREATE_TABLE)
                conn.commit()
            except Exception:
                conn.rollback()
                raise

    @contextmanager
    def _conn(self):
        """Yield a pooled connection and always return it to the pool afterwards."""
        conn = None
        try:
            conn = self._pool.getconn()
            yield conn
        finally:
            # conn stays None when getconn() itself failed: nothing to return.
            if conn is not None:
                self._pool.putconn(conn)

    def all(self) -> list[dict]:
        """Return every event, newest ``published_at`` first (NULL dates last)."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    "SELECT * FROM regulation_events ORDER BY published_at DESC NULLS LAST"
                )
                return [_row_to_dict(r) for r in cur.fetchall()]

    def get(self, event_id: str) -> dict | None:
        """Return the event whose primary key is *event_id*, or ``None``."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    "SELECT * FROM regulation_events WHERE id = %s", (event_id,)
                )
                row = cur.fetchone()
                return _row_to_dict(row) if row else None

    def filter(
        self,
        *,
        source: str | None = None,
        impact_level: str | None = None,
        limit: int = 50,
    ) -> list[dict]:
        """Return at most *limit* events, newest first, optionally filtered.

        Falsy *source* / *impact_level* values mean "no filter on that column".
        Only fixed SQL fragments are interpolated into the statement; all
        values travel as bound parameters.
        """
        conditions: list[str] = []
        params: list[Any] = []
        if source:
            conditions.append("source = %s")
            params.append(source)
        if impact_level:
            conditions.append("impact_level = %s")
            params.append(impact_level)
        where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
        params.append(limit)
        sql = f"""
            SELECT * FROM regulation_events
            {where}
            ORDER BY published_at DESC NULLS LAST
            LIMIT %s
        """
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, params)
                return [_row_to_dict(r) for r in cur.fetchall()]

    def stats(self) -> dict:
        """Return event counters: total, per impact level, and published in the last 90 days.

        All counters come from a single statement, so they describe one
        consistent snapshot of the table. The result carries the same keys as
        ``MockEventStore.stats()``, including ``low_impact``.
        """
        cutoff = (date.today() - timedelta(days=90)).isoformat()
        sql = """
            SELECT
                COUNT(*) AS total,
                COUNT(*) FILTER (WHERE impact_level = 'high') AS high_impact,
                COUNT(*) FILTER (WHERE impact_level = 'medium') AS medium_impact,
                COUNT(*) FILTER (WHERE impact_level = 'low') AS low_impact,
                COUNT(*) FILTER (WHERE published_at >= %s) AS recent_90d
            FROM regulation_events
        """
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, (cutoff,))
                row = cur.fetchone() or {}
        # Missing or NULL counters fall back to 0 so every key is always an int.
        return {key: int(row.get(key) or 0) for key in self._STAT_KEYS}

    def upsert(self, event: dict) -> None:
        """Insert *event*, or update the stored row with the same ``id``.

        Only keys listed in ``_ALL_COLUMNS`` are written; other keys are
        ignored. On conflict, exactly the supplied columns are overwritten.
        The transaction is committed on success and rolled back (then the
        error re-raised) on failure.
        """
        cols = [c for c in _ALL_COLUMNS if c in event]
        placeholders = ", ".join(f"%({c})s" for c in cols)
        updates = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols if c != "id")
        # With nothing but the id supplied there is nothing to overwrite, and
        # an empty "DO UPDATE SET" would be a syntax error.
        conflict_action = f"DO UPDATE SET {updates}" if updates else "DO NOTHING"
        sql = f"""
            INSERT INTO regulation_events ({', '.join(cols)})
            VALUES ({placeholders})
            ON CONFLICT (id) {conflict_action}
        """
        row: dict[str, Any] = {}
        for c in cols:
            val = event.get(c)
            if c in self._JSON_COLUMNS and val is not None:
                row[c] = json.dumps(val, ensure_ascii=False)
            else:
                # Includes ``tags``: the list is passed through unchanged.
                row[c] = val
        with self._conn() as conn:
            try:
                with conn.cursor() as cur:
                    cur.execute(sql, row)
                conn.commit()
            except Exception:
                conn.rollback()
                raise

    def get_by_standard_code(self, standard_code: str) -> dict | None:
        """Return the most recently published event with *standard_code*, or ``None``."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    """SELECT * FROM regulation_events
                    WHERE standard_code = %s
                    ORDER BY published_at DESC NULLS LAST
                    LIMIT 1""",
                    (standard_code,),
                )
                row = cur.fetchone()
                return _row_to_dict(row) if row else None

View File

@@ -19,6 +19,15 @@ from app.infrastructure.parser.local_chunk_builder import LocalRegulationChunkBu
from app.infrastructure.parser.local_document_parser import LocalDocumentParser from app.infrastructure.parser.local_document_parser import LocalDocumentParser
from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder
from app.infrastructure.perception.mock_event_store import MockEventStore from app.infrastructure.perception.mock_event_store import MockEventStore
from app.application.perception.crawl_service import CrawlService
from app.infrastructure.perception.base_event_store import BaseEventStore
from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
from app.infrastructure.perception.crawlers.guobiao_crawler import (
GuobiaoMandatoryCrawler,
GuobiaoRecommendedCrawler,
)
from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
from app.infrastructure.perception.llm_pipeline import LlmPipeline
from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore
from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore
from app.infrastructure.storage.json_document_repository import JsonDocumentRepository from app.infrastructure.storage.json_document_repository import JsonDocumentRepository
@@ -293,11 +302,35 @@ def get_agent_conversation_service() -> AgentConversationService:
) )
@lru_cache
def get_event_store() -> BaseEventStore:
    """Build the event store chosen by ``settings.document_repository_backend``.

    ``"postgres"`` selects ``PostgresEventStore``; every other value falls back
    to the in-memory ``MockEventStore``. The result is cached, so all callers
    share one instance.
    """
    if settings.document_repository_backend != "postgres":
        return MockEventStore()
    # Imported here rather than at module top; presumably so the PostgreSQL
    # driver is only loaded when that backend is actually selected.
    from app.infrastructure.perception.postgres_event_store import PostgresEventStore
    return PostgresEventStore()
@lru_cache @lru_cache
def get_perception_service() -> PerceptionService: def get_perception_service() -> PerceptionService:
"""Return perception service for regulatory intelligence."""
return PerceptionService( return PerceptionService(
event_store=MockEventStore(), event_store=get_event_store(),
retrieval_service=get_retrieval_service(),
)
@lru_cache
def get_crawl_service() -> CrawlService:
crawlers = {
"CATARC": CatarcCrawler(),
"国标委·强制性": GuobiaoMandatoryCrawler(),
"国标委·推荐性": GuobiaoRecommendedCrawler(),
"EUR-Lex": EurlexCrawler(),
}
return CrawlService(
crawlers=crawlers,
event_store=get_event_store(),
llm_pipeline=LlmPipeline(),
retrieval_service=get_retrieval_service(), retrieval_service=get_retrieval_service(),
) )

View File

@@ -9,6 +9,8 @@ pydantic-settings>=2.0.0
python-dotenv>=1.0.0 python-dotenv>=1.0.0
loguru>=0.7.0 loguru>=0.7.0
httpx>=0.25.0 httpx>=0.25.0
beautifulsoup4>=4.12.0
lxml>=5.0.0
tiktoken>=0.5.0 tiktoken>=0.5.0
tenacity>=8.2.0 tenacity>=8.2.0

View File

View File

@@ -0,0 +1,95 @@
"""Contract tests: any BaseEventStore implementation must pass these."""
from app.infrastructure.perception.base_event_store import BaseEventStore
from app.infrastructure.perception.mock_event_store import MockEventStore
def _store() -> BaseEventStore:
return MockEventStore()
def test_is_base_event_store():
assert isinstance(_store(), BaseEventStore)
def test_all_returns_list():
result = _store().all()
assert isinstance(result, list)
assert len(result) > 0
def test_get_known_id():
store = _store()
first = store.all()[0]
result = store.get(first["id"])
assert result is not None
assert result["id"] == first["id"]
def test_get_unknown_returns_none():
assert _store().get("does-not-exist") is None
def test_filter_by_impact():
store = _store()
highs = store.filter(impact_level="high", limit=100)
assert all(e["impact_level"] == "high" for e in highs)
def test_filter_limit():
store = _store()
result = store.filter(limit=3)
assert len(result) <= 3
def test_stats_keys():
    """stats() must expose every counter, including the low-impact bucket."""
    stats = _store().stats()
    # "low_impact" is part of the contract too: without it an implementation
    # can drop the key unnoticed.
    for key in ("total", "high_impact", "medium_impact", "low_impact", "recent_90d"):
        assert key in stats, f"missing key: {key}"
def test_upsert_and_get():
store = _store()
event = {
"id": "test-upsert-001",
"source": "TEST",
"source_label": "Test Source",
"standard_code": "TST-001",
"title": "Test Event",
"summary": "A test event",
"full_text_url": "https://example.com",
"status": "draft",
"impact_level": "low",
"published_at": "2026-01-01",
"effective_at": None,
"category": "test",
"tags": ["test"],
"content_hash": "abc123",
"previous_hash": None,
}
store.upsert(event)
result = store.get("test-upsert-001")
assert result is not None
assert result["title"] == "Test Event"
def test_get_by_standard_code():
store = _store()
first = store.all()[0]
result = store.get_by_standard_code(first["standard_code"])
assert result is not None
assert result["standard_code"] == first["standard_code"]
def test_upsert_updates_existing():
store = _store()
first = store.all()[0]
original_id = first["id"]
store.upsert({"id": original_id, "title": "Updated Title", "impact_level": first["impact_level"],
"standard_code": first.get("standard_code", ""), "source": first["source"],
"source_label": first.get("source_label", ""), "summary": "Updated",
"full_text_url": "", "status": first["status"], "published_at": first.get("published_at", ""),
"effective_at": None, "category": first.get("category", ""), "tags": [],
"content_hash": "newhash", "previous_hash": None})
result = store.get(original_id)
assert result is not None
assert result["title"] == "Updated Title"

View File

@@ -0,0 +1,111 @@
"""Integration tests for CrawlService."""
from __future__ import annotations
from unittest.mock import MagicMock
import hashlib
import pytest
from app.infrastructure.perception.crawlers.base import RawEvent
from app.infrastructure.perception.mock_event_store import MockEventStore
def _make_raw_event(code="TST-001"):
return RawEvent(
source="TEST", source_label="Test", standard_code=code,
title=f"Test {code}", summary="Summary", full_text_url="https://example.com",
status="enacted", published_at="2026-01-01", effective_at=None,
category="test", tags=["test"], raw_text="full text",
)
def _make_service(raw_events):
from app.application.perception.crawl_service import CrawlService
mock_crawler = MagicMock()
mock_crawler.fetch.return_value = raw_events
mock_pipeline = MagicMock()
mock_pipeline.extract_structure.return_value = {
"obligations": [], "deadlines": [], "scope": "test",
"penalties": None, "impact_level": "low",
}
mock_pipeline.assess_impact.return_value = []
mock_pipeline.compute_diff.return_value = {
"changed_sections": [], "change_summary": "No changes.",
}
mock_retrieval = MagicMock()
store = MockEventStore()
return CrawlService(
crawlers={"TEST": mock_crawler},
event_store=store,
llm_pipeline=mock_pipeline,
retrieval_service=mock_retrieval,
)
def test_crawl_yields_progress_and_done():
svc = _make_service([_make_raw_event("TST-001")])
events = list(svc.run_crawl())
event_types = [e.get("event") for e in events]
assert "done" in event_types
def test_crawl_upserts_to_store():
store = MockEventStore()
from app.application.perception.crawl_service import CrawlService
mock_crawler = MagicMock()
mock_crawler.fetch.return_value = [_make_raw_event("NEW-001")]
mock_pipeline = MagicMock()
mock_pipeline.extract_structure.return_value = {
"obligations": [], "deadlines": [], "scope": "",
"penalties": None, "impact_level": "medium",
}
mock_pipeline.assess_impact.return_value = []
mock_pipeline.compute_diff.return_value = {
"changed_sections": [], "change_summary": "",
}
svc = CrawlService(
crawlers={"TEST": mock_crawler},
event_store=store,
llm_pipeline=mock_pipeline,
retrieval_service=MagicMock(),
)
list(svc.run_crawl())
result = store.get_by_standard_code("NEW-001")
assert result is not None
assert result["title"] == "Test NEW-001"
def test_crawl_skips_unchanged_events():
    """An event whose content hash is already stored must not reach the LLM pipeline."""
    from app.application.perception.crawl_service import CrawlService

    store = MockEventStore()
    raw = _make_raw_event("SKIP-001")
    content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest()
    # Seed the store with a record carrying the same content hash as the
    # event the crawler is about to return.
    store.upsert({
        "id": hashlib.sha256("TEST-SKIP-001".encode()).hexdigest()[:12],
        "standard_code": "SKIP-001",
        "source": "TEST",
        "source_label": "Test",
        "title": "Test SKIP-001",
        "summary": "",
        "full_text_url": "",
        "status": "enacted",
        "impact_level": "low",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": [],
        "content_hash": content_hash,
    })
    mock_pipeline = MagicMock()
    mock_crawler = MagicMock()
    mock_crawler.fetch.return_value = [raw]
    svc = CrawlService(
        crawlers={"TEST": mock_crawler},
        event_store=store,
        llm_pipeline=mock_pipeline,
        retrieval_service=MagicMock(),
    )
    # run_crawl() is consumed as an iterator, so drain it to run the crawl.
    list(svc.run_crawl())
    mock_pipeline.extract_structure.assert_not_called()

View File

@@ -0,0 +1,127 @@
"""Unit tests for crawlers — mock httpx responses."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler
def test_raw_event_fields():
ev = RawEvent(
source="TEST",
source_label="Test",
standard_code="TST-001",
title="Test",
summary="Summary",
full_text_url="https://example.com",
status="enacted",
published_at="2026-01-01",
effective_at=None,
category="test",
tags=["a"],
raw_text="full text here",
)
assert ev.source == "TEST"
assert ev.tags == ["a"]
CATARC_HTML = """
<html><body>
<table>
<tr>
<td><a href="/std/detail/123">GB 18384-2025</a></td>
<td>电动汽车安全要求</td>
<td>2025-11-15</td>
<td>现行</td>
</tr>
<tr>
<td><a href="/std/detail/456">GB/T 40429-2026</a></td>
<td>汽车驾驶自动化分级</td>
<td>2026-02-01</td>
<td>即将实施</td>
</tr>
</table>
</body></html>
"""
def test_catarc_crawler_parses_html():
from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.text = CATARC_HTML
mock_resp.raise_for_status = MagicMock()
with patch("httpx.get", return_value=mock_resp):
crawler = CatarcCrawler()
events = crawler.fetch(limit=10)
assert isinstance(events, list)
assert len(events) >= 1
assert all(isinstance(e, RawEvent) for e in events)
codes = [e.standard_code for e in events]
assert "GB 18384-2025" in codes
GUOBIAO_JSON = {
"rows": [
{
"std_code": "GB 18384-2025",
"std_name": "电动汽车安全要求",
"release_date": "2025-11-15",
"implement_date": "2026-07-01",
"std_status": "现行",
"std_type": "强制性",
},
]
}
def test_guobiao_crawler_parses_json():
from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.json.return_value = GUOBIAO_JSON
mock_resp.raise_for_status = MagicMock()
with patch("httpx.get", return_value=mock_resp):
crawler = GuobiaoMandatoryCrawler()
events = crawler.fetch(limit=10)
assert len(events) >= 1
assert events[0].source == "国标委"
assert events[0].standard_code == "GB 18384-2025"
EURLEX_RSS = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>EUR-Lex</title>
<item>
<title>Regulation (EU) 2024/1689 — AI Act</title>
<link>https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689</link>
<description>The EU Artificial Intelligence Act enters into force.</description>
<pubDate>Fri, 12 Jul 2024 00:00:00 GMT</pubDate>
</item>
</channel>
</rss>"""
def test_eurlex_crawler_parses_rss():
    """EurlexCrawler.fetch() turns an RSS feed into events tagged with source "EUR-Lex"."""
    from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler

    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = EURLEX_RSS
    # httpx exposes the raw body as bytes. A str that carries an XML encoding
    # declaration is rejected by some parsers (e.g. lxml), so mirror the real
    # response type here.
    mock_resp.content = EURLEX_RSS.encode("utf-8")
    mock_resp.raise_for_status = MagicMock()
    with patch("httpx.get", return_value=mock_resp):
        crawler = EurlexCrawler()
        events = crawler.fetch(limit=5)
    assert isinstance(events, list)
    assert len(events) >= 1
    assert events[0].source == "EUR-Lex"

View File

@@ -0,0 +1,77 @@
"""Unit tests for LlmPipeline — mock LLM client and embedding provider."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import json
import pytest
def _make_pipeline():
with patch("app.infrastructure.perception.llm_pipeline.get_llm_client") as mock_llm_fn, \
patch("app.infrastructure.perception.llm_pipeline.OpenAICompatibleEmbeddingProvider") as mock_emb_cls:
mock_client = MagicMock()
mock_client.chat.return_value = MagicMock(content='{"obligations":[{"text":"test obligation","deontic":"must","subject":"OEM","object":"system","condition":""}],"deadlines":[{"date":"2026-07-01","description":"实施截止"}],"scope":"适用于M1类车辆","penalties":"罚款","impact_level":"high"}')
mock_llm_fn.return_value = mock_client
mock_emb = MagicMock()
mock_emb.embed_texts.return_value = [[0.1] * 1024, [0.9] * 1024]
mock_emb_cls.return_value = mock_emb
from app.infrastructure.perception.llm_pipeline import LlmPipeline
return LlmPipeline(), mock_client, mock_emb
def test_extract_structure_returns_dict():
pipeline, mock_client, _ = _make_pipeline()
event = {
"id": "evt-001",
"standard_code": "GB 18384-2025",
"title": "电动汽车安全要求",
"summary": "新增 IP67 级别防护",
"source_label": "CATARC",
"tags": ["电池安全"],
}
result = pipeline.extract_structure(event)
assert isinstance(result, dict)
assert "obligations" in result
assert "impact_level" in result
def test_assess_impact_returns_list():
pipeline, mock_client, _ = _make_pipeline()
mock_client.chat.return_value = MagicMock(content='[{"doc_id":"d1","doc_name":"Safety Manual","score":0.85,"key_clauses":"§4.2","recommendation":"更新第4章"}]')
mock_retrieval = MagicMock()
chunk = MagicMock()
chunk.doc_id = "d1"
chunk.doc_title = "Safety Manual"
chunk.score = 0.85
chunk.text = "relevant text"
chunk.section_title = "§4.2"
mock_retrieval.retrieve.return_value = [chunk]
event = {
"standard_code": "GB 18384-2025",
"title": "电动汽车安全要求",
"obligations": [{"text": "OEM shall comply"}],
}
result = pipeline.assess_impact(event, mock_retrieval)
assert isinstance(result, list)
def test_compute_diff_no_change():
pipeline, _, mock_emb = _make_pipeline()
mock_emb.embed_texts.return_value = [[0.5] * 1024, [0.5] * 1024]
result = pipeline.compute_diff("paragraph one", "paragraph one")
assert isinstance(result, dict)
assert "changed_sections" in result
assert "change_summary" in result
def test_compute_diff_detects_change():
pipeline, mock_client, mock_emb = _make_pipeline()
mock_emb.embed_texts.return_value = [
[1.0] + [0.0] * 1023,
[0.0] + [1.0] + [0.0] * 1022,
]
mock_client.chat.return_value = MagicMock(content='{"change_type":"tightened","summary":"Requirement tightened"}')
result = pipeline.compute_diff("old paragraph text", "new tighter requirement text")
assert isinstance(result["changed_sections"], list)

View File

@@ -0,0 +1,98 @@
"""Unit tests for PostgresEventStore using a mocked psycopg2 pool."""
from __future__ import annotations
import json
from unittest.mock import MagicMock, patch
import pytest
# Patch psycopg2 before importing the module under test
import sys
mock_psycopg2 = MagicMock()
mock_psycopg2.extras = MagicMock()
sys.modules.setdefault("psycopg2", mock_psycopg2)
sys.modules.setdefault("psycopg2.extras", mock_psycopg2.extras)
sys.modules.setdefault("psycopg2.pool", MagicMock())
from app.infrastructure.perception.base_event_store import BaseEventStore
SAMPLE_ROW = {
"id": "pg-001",
"source": "国标委",
"source_label": "国家标准化管理委员会",
"standard_code": "GB 18384-2025",
"title": "电动汽车安全要求",
"summary": "新增要求",
"full_text_url": "https://openstd.samr.gov.cn",
"status": "enacted",
"impact_level": "high",
"published_at": "2025-11-15",
"effective_at": "2026-07-01",
"category": "电动汽车安全",
"tags": ["电池安全"],
"obligations": None,
"deadlines": None,
"scope": None,
"penalties": None,
"content_hash": "abc123",
"previous_hash": None,
"change_summary": None,
"changed_sections": None,
"affected_docs": None,
"crawled_at": "2026-06-05T10:00:00+00:00",
"processed_at": None,
"raw_storage_key": None,
}
def _make_store_with_pool(mock_pool):
    """Build a PostgresEventStore whose connection pool is *mock_pool*.

    The store module binds ``ThreadedConnectionPool`` with a ``from ... import``
    at import time, so the name has to be patched in that module's namespace.
    Patching ``psycopg2.pool`` only takes effect if the module happens to be
    imported for the first time inside the patch; every later call would then
    silently reuse the first test's pool. ``_ensure_schema`` is stubbed out so
    construction runs no SQL.
    """
    from app.infrastructure.perception import postgres_event_store as store_module

    with patch.object(store_module, "ThreadedConnectionPool", return_value=mock_pool), \
            patch.object(store_module.PostgresEventStore, "_ensure_schema"):
        return store_module.PostgresEventStore()
def _cursor_returning(rows):
cursor = MagicMock()
cursor.__enter__ = lambda s: s
cursor.__exit__ = MagicMock(return_value=False)
cursor.fetchall.return_value = rows
cursor.fetchone.return_value = rows[0] if rows else None
return cursor
def test_is_base_event_store():
mock_pool = MagicMock()
store = _make_store_with_pool(mock_pool)
assert isinstance(store, BaseEventStore)
def test_filter_returns_list():
mock_pool = MagicMock()
conn = MagicMock()
conn.__enter__ = lambda s: s
conn.__exit__ = MagicMock(return_value=False)
cursor = _cursor_returning([SAMPLE_ROW])
conn.cursor.return_value = cursor
mock_pool.getconn.return_value = conn
store = _make_store_with_pool(mock_pool)
result = store.filter(limit=10)
assert isinstance(result, list)
def test_stats_returns_correct_keys():
mock_pool = MagicMock()
conn = MagicMock()
conn.__enter__ = lambda s: s
conn.__exit__ = MagicMock(return_value=False)
cursor = MagicMock()
cursor.__enter__ = lambda s: s
cursor.__exit__ = MagicMock(return_value=False)
cursor.fetchone.return_value = {"count": 5}
conn.cursor.return_value = cursor
mock_pool.getconn.return_value = conn
store = _make_store_with_pool(mock_pool)
stats = store.stats()
for key in ("total", "high_impact", "medium_impact", "recent_90d"):
assert key in stats

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,328 @@
# Regulatory Signals Intelligence Enhancement — Design Spec
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Replace the 20-item hardcoded MockEventStore with real regulatory data from Chinese and international sources, add LLM-driven structured extraction, impact assessment, and semantic change diff — all accessible through a manual-trigger crawl in the frontend.
**Architecture:** Crawler Service (httpx + BeautifulSoup) → PostgreSQL EventStore → LLM Pipeline (extract → assess → diff) → existing PerceptionService interface. New code follows `api → application → domain ports → infrastructure` layering; no new files in `services/*` or `workflows/*`; `shared/bootstrap.py` is the composition root.
**Tech Stack:** httpx, BeautifulSoup4 (with lxml), the existing embedding service (text-embedding-v3, used for the semantic diff — see §3.4), existing LLM factory (deepseek/qwen), existing KnowledgeRetrievalService (RAG), PostgreSQL (already available), existing SSE infrastructure.
---
## 1. Data Sources
| Source | URL | Method | Coverage |
|--------|-----|--------|----------|
| CATARC 汽车标准 | `https://www.catarc.org.cn/bzzxd/qcbz/index.html` | httpx + BeautifulSoup (static pages) | 国家/行业汽车标准列表 |
| 国标委强制性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=1&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 强制性国家标准,按"车"过滤 |
| 国标委推荐性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=2&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 推荐性国家标准,按"车"过滤 |
| EUR-Lex | RSS + CELLAR REST API | pyeurlex / httpx | EU AI Act, automotive directives |
| UN R155/R156 | CELLAR REST API (CELEX lookup) | httpx | UN-ECE cybersecurity/OTA regulations |
Crawl is **manual-trigger only** — no cron/Celery Beat. Admin clicks "刷新数据源" in the frontend UI.
---
## 2. Database Schema
### New table: `regulation_events`
```sql
CREATE TABLE IF NOT EXISTS regulation_events (
id TEXT PRIMARY KEY, -- sha256(source + standard_code)[:12]
source TEXT NOT NULL, -- 'CATARC' | '国标委' | 'EUR-Lex' | 'UN-ECE'
source_label TEXT, -- Human-readable source label
standard_code TEXT NOT NULL, -- e.g. "GB 18384-2025", "EU/2024/1689"
title TEXT NOT NULL,
summary TEXT, -- Crawled abstract or first paragraph
full_text_url TEXT, -- Original page URL
status TEXT, -- 'enacted' | 'draft' | 'consultation'
impact_level TEXT, -- 'high' | 'medium' | 'low' (LLM-assigned)
published_at DATE,
effective_at DATE,
category TEXT,
tags TEXT[],
-- LLM structured extraction
obligations JSONB, -- [{text, deontic, subject, object, condition}]
deadlines JSONB, -- [{date, description}]
scope TEXT, -- Applicability scope summary
penalties TEXT, -- Penalty / consequence summary
-- Change tracking
content_hash TEXT, -- SHA256 of crawled full text
previous_hash TEXT, -- Hash from prior crawl (NULL on first crawl)
change_summary TEXT, -- LLM-generated description of changes
changed_sections JSONB, -- [{old_text, new_text, change_type}] where cosine<0.85
-- Impact assessment
affected_docs JSONB, -- [{doc_id, doc_name, score, key_clauses, recommendation}]
-- Metadata
crawled_at TIMESTAMPTZ DEFAULT now(),
processed_at TIMESTAMPTZ,
raw_storage_key TEXT -- MinIO path for raw HTML/PDF (optional)
);
CREATE INDEX IF NOT EXISTS regulation_events_source_date
ON regulation_events (source, published_at DESC);
CREATE INDEX IF NOT EXISTS regulation_events_impact_date
ON regulation_events (impact_level, published_at DESC);
CREATE INDEX IF NOT EXISTS regulation_events_tags
ON regulation_events USING gin(tags);
```
---
## 3. Backend Architecture
### 3.1 File Map
**New files (infrastructure layer):**
- `backend/app/infrastructure/perception/crawlers/catarc_crawler.py` — CATARC scraper
- `backend/app/infrastructure/perception/crawlers/guobiao_crawler.py` — 国标委 JSON API crawler
- `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py` — EUR-Lex RSS + CELLAR
- `backend/app/infrastructure/perception/crawlers/base.py` — Abstract base class
- `backend/app/infrastructure/perception/postgres_event_store.py` — PostgresEventStore (replaces MockEventStore)
- `backend/app/infrastructure/perception/llm_pipeline.py` — Extract / assess / diff pipeline
**New files (application layer):**
- `backend/app/application/perception/crawl_service.py` — Orchestrates crawlers + LLM pipeline, exposes `run_crawl(sources)` + progress generator
**Modified files:**
- `backend/app/api/routes/perception.py` — Add `POST /crawl`, `GET /crawl/status` (SSE), `POST /events/{id}/process`, `GET /events/{id}/diff`
- `backend/app/shared/bootstrap.py` — Wire `PostgresEventStore` + `CrawlService` + `LlmPipeline` when `DOCUMENT_REPOSITORY_BACKEND=postgres`; fallback to `MockEventStore` when `json`
- `backend/app/config/settings.py` — Add `perception_crawl_timeout_seconds`, `perception_max_events_per_source`, `perception_diff_similarity_threshold` (matching the `PERCEPTION_*` entries added to `.env`)
**Unchanged files:**
- `backend/app/application/perception/services.py` — `PerceptionService` interface unchanged; only the `_store` implementation is swapped
- `backend/app/infrastructure/perception/mock_event_store.py` — Kept for `json` backend mode
### 3.2 Domain Port (Abstract Interface)
```python
# backend/app/infrastructure/perception/base_event_store.py
from abc import ABC, abstractmethod
class BaseEventStore(ABC):
@abstractmethod
def all(self) -> list[dict]: ...
@abstractmethod
def get(self, event_id: str) -> dict | None: ...
@abstractmethod
def filter(self, source=None, impact_level=None, limit=50) -> list[dict]: ...
@abstractmethod
def stats(self) -> dict: ...
@abstractmethod
def upsert(self, event: dict) -> None: ... # new — needed for crawl writes
@abstractmethod
def get_by_standard_code(self, code: str) -> dict | None: ... # for change detection
```
`MockEventStore` and `PostgresEventStore` both implement this interface.
### 3.3 Crawler Base Contract
```python
# backend/app/infrastructure/perception/crawlers/base.py
from abc import ABC, abstractmethod
from dataclasses import dataclass
@dataclass
class RawEvent:
source: str
source_label: str
standard_code: str
title: str
summary: str
full_text_url: str
status: str # 'enacted' | 'draft' | 'consultation'
published_at: str # YYYY-MM-DD string
effective_at: str | None
category: str
tags: list[str]
raw_text: str # full crawled text for hashing + LLM
class BaseCrawler(ABC):
@abstractmethod
def fetch(self, limit: int = 50) -> list[RawEvent]: ...
```
### 3.4 LLM Pipeline
```python
# backend/app/infrastructure/perception/llm_pipeline.py
class LlmPipeline:
"""Runs three sequential LLM steps on a regulation event."""
def extract_structure(self, event: dict) -> dict:
"""Step 1: Extract obligations, deadlines, scope, penalties, impact_level.
Returns dict with keys: obligations, deadlines, scope, penalties, impact_level.
Uses JSON-mode or structured prompt; model retries once on parse failure.
"""
def assess_impact(self, event: dict, retrieval_service) -> list[dict]:
"""Step 2: RAG-based impact on existing knowledge base documents.
Query = standard_code + title + first obligation texts.
Returns list of {doc_id, doc_name, score, key_clauses, recommendation}.
"""
def compute_diff(self, old_text: str, new_text: str) -> dict:
"""Step 3: Semantic diff between old and new regulation text.
Splits both texts by paragraph. Calls existing EmbeddingService (text-embedding-v3
via EMBEDDING_BASE_URL) to embed each paragraph, then computes cosine similarity.
Changed paragraphs (cosine < 0.85) sent to LLM for change_type classification:
'tightened' | 'relaxed' | 'added' | 'removed'
Returns {changed_sections: [...], change_summary: str}.
Only called when content_hash differs from previous_hash.
"""
```
### 3.5 CrawlService
```python
# backend/app/application/perception/crawl_service.py
from collections.abc import Generator  # needed by run_crawl's return annotation


class CrawlService:
    """Drives a manual crawl across the configured crawlers."""

    def __init__(self, crawlers, event_store, llm_pipeline, retrieval_service): ...

    def run_crawl(self, sources: list[str] | None = None) -> Generator[dict, None, None]:
        """Manual-trigger crawl. Yields progress SSE dicts:

            {event: 'progress', data: {source, fetched, new, updated, stage}}
            {event: 'done', data: {total_new, total_updated, duration_ms}}
            {event: 'error', data: {source, message}}

        Args:
            sources: Optional list of source names to crawl. ``None``
                presumably means "all sources" — confirm against the
                implementation.

        For each crawler:
            1. fetch() RawEvents
            2. hash check vs stored event → skip if unchanged
            3. upsert raw event to DB
            4. run LLM pipeline (extract → assess → diff)
            5. upsert enriched event to DB
            6. yield progress
        """
```
---
## 4. API Endpoints
### Existing (unchanged interface, new store backend)
- `GET /api/v1/perception/stats`
- `GET /api/v1/perception/events`
- `GET /api/v1/perception/events/{id}`
- `POST /api/v1/perception/events/{id}/analyze` (streaming)
### New endpoints
```
POST /api/v1/perception/crawl
Body: { sources?: ["CATARC", "国标委", "EUR-Lex", "UN-ECE"] }
Response: text/event-stream (SSE)
Auth: requires current_user (admin/legal role)
Streams progress events until done or error.
POST /api/v1/perception/events/{id}/process
Trigger LLM pipeline for a single already-crawled event.
Response: { status: "ok", processed_at: "..." }
Auth: requires current_user
GET /api/v1/perception/events/{id}/diff
Returns: { changed_sections: [...], change_summary: str, previous_hash: str }
Returns 404 if no diff available (first crawl or no change detected).
```
---
## 5. Frontend Changes
### 5.1 New: Crawl Control Bar (top of PerceptionPage)
Above the stats-bar, add a `<CrawlBar>` component:
- "刷新数据源" button — triggers `POST /crawl` (all sources)
- Inline progress display: shows SSE progress events as a mini status line
- e.g. "CATARC: 抓取中… | 国标委: 12 条新增 | EUR-Lex: 等待中"
- On completion: shows "更新完成 — 新增 N 条,更新 M 条"
- Disabled while crawl is in progress (prevents double-trigger)
### 5.2 Signal Card Enhancement
Existing cards get two new indicators:
- **NEW badge** — shown when `crawled_at` is within last 24h (green dot)
- **CHANGED badge** — shown when `previous_hash != content_hash` and `change_summary` exists
### 5.3 Right Panel — Structured Tab
Right detail panel adds a tab bar: **概览 | 义务条款 | 影响评估 | 变更对比**
**义务条款 tab:**
- Table: 义务描述 | 主体 | 对象 | 截止日期
- Tags for deontic type: 强制 / 禁止 / 允许
- Shows `obligations[]` + `deadlines[]` from DB
**影响评估 tab:**
- Replaces hardcoded MOCK_DOCS with real `affected_docs[]` from DB
- Each row: document name, similarity score (%), key clause excerpt, LLM recommendation
- "Run fresh assessment" button → triggers `POST /events/{id}/process`
**变更对比 tab:**
- Only visible when `change_summary` is non-null
- Top: `change_summary` text (LLM prose)
- Below: diff table with old/new paragraph pairs, change_type badge per row
- Hidden (tab disabled) on first-crawl events with no prior version
### 5.4 Existing behavior preserved
- `analyze` streaming (AI analysis) unchanged
- Search/filter (source, impact) unchanged — now hits real DB data
- Stats bar — now reflects real counts from PostgreSQL
---
## 6. Settings Additions
```python
# backend/app/config/settings.py additions
perception_crawl_timeout_seconds: int = Field(default=120, ...)
perception_max_events_per_source: int = Field(default=100, ...)
perception_diff_similarity_threshold: float = Field(default=0.85, ...)
```
```env
# .env additions
PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
PERCEPTION_MAX_EVENTS_PER_SOURCE=100
PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
```
---
## 7. Dependencies
```
# requirements.txt additions
httpx>=0.27.0 # likely already present; confirm before adding
beautifulsoup4>=4.12.0 # HTML parsing for CATARC
lxml>=5.0.0 # BeautifulSoup parser backend
# sentence-transformers NOT added — diff uses existing text-embedding-v3 API (EMBEDDING_BASE_URL)
```
No new infrastructure required (PostgreSQL + MinIO + Milvus already available).
---
## 8. Backward Compatibility
- `DOCUMENT_REPOSITORY_BACKEND=json` → `bootstrap.py` uses `MockEventStore` (unchanged behavior)
- `DOCUMENT_REPOSITORY_BACKEND=postgres` → uses `PostgresEventStore`
- Migration: run `CREATE TABLE` SQL on first startup (idempotent `CREATE TABLE IF NOT EXISTS`)
- Existing 20 mock events are not seeded to PostgreSQL; PostgreSQL starts empty until first crawl
---
## 9. Out of Scope (this phase)
- Automatic/scheduled crawling (Celery Beat) — manual trigger only
- Playwright-based JS-rendered pages — all target sites work with httpx
- Knowledge Graph (Neo4j / LightRAG) — future phase
- Email/Slack webhook notifications — future phase
- User-facing diff history (versioning beyond one prior snapshot) — future phase

View File

@@ -1,12 +1,14 @@
import './styles/globals.css'; import './styles/globals.css';
import { ThemeProvider, AuthProvider } from './contexts'; import { ThemeProvider, AuthProvider, PageStateProvider } from './contexts';
import { AppRouter } from './router/AppRouter'; import { AppRouter } from './router/AppRouter';
function App() { function App() {
return ( return (
<ThemeProvider> <ThemeProvider>
<AuthProvider> <AuthProvider>
<PageStateProvider>
<AppRouter /> <AppRouter />
</PageStateProvider>
</AuthProvider> </AuthProvider>
</ThemeProvider> </ThemeProvider>
); );

View File

@@ -0,0 +1,211 @@
/**
* PageStateContext — preserves page-level session state across route changes.
*
* When React Router unmounts a page component, all its useState values are lost.
* This context lives above the router and holds the state that must survive
* navigation so users can switch modules and return without losing their work.
*
* Covered pages:
* - RagChat: message history, citation rail, sessionId, input draft
* - Compliance: analysis result (sources, findings, conclusion, meta)
* - Perception: selected signal, filter state, AI analysis output
*/
import React, { createContext, useContext, useState, useCallback, useRef } from 'react';
// ── RagChat types ─────────────────────────────────────────────────────────────
/** One chat message stored in the RagChat page state (`RagChatState.messages`). */
export interface RagMessage {
id: string;
// Author of the message.
role: 'user' | 'assistant';
text: string;
// Optional numeric citation references; presumably they match `RagCitation.index` — TODO confirm.
citationRefs?: number[];
}
/** One citation entry stored in the RagChat page state (`RagChatState.citations`). */
export interface RagCitation {
index: number;
// Scale is not visible in this file (0–1 vs. percent) — TODO confirm with the producer.
score: number;
name: string;
clause: string;
snippet: string;
// Optional; presumably the id of the cited document — TODO confirm.
docId?: string;
}
/** RagChat page state kept in PageStateContext: message history, citations, session id and input draft. */
export interface RagChatState {
messages: RagMessage[];
citations: RagCitation[];
// null in the initial state (see RAG_INIT).
sessionId: string | null;
inputDraft: string;
}
/**
 * Initial RagChat state used as the `useState` seed in PageStateProvider:
 * a single assistant greeting, no citations, no session id, empty draft.
 */
const RAG_INIT: RagChatState = {
messages: [
{
id: 'init',
role: 'assistant',
text: 'Hello! I can answer questions about your indexed regulations and compliance documents. Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.',
},
],
citations: [],
sessionId: null,
inputDraft: '',
};
// ── Compliance types ──────────────────────────────────────────────────────────
/** One retrieved source entry of a compliance analysis (elements of `ComplianceState.sources`). */
export interface ComplianceSourceEvent {
standard: string;
clause: string;
// Scale is not visible in this file — TODO confirm with the backend stream.
score: number;
status: string;
full_content: string;
}
/** One finding of a compliance analysis (elements of `ComplianceState.findings`). */
export interface ComplianceFindingEvent {
title: string;
desc: string;
// NOTE(review): useComplianceAnalysis appears to fall back to 'info' when the
// stream omits `status`, which is outside this union — confirm whether 'info'
// should be added here or the fallback changed.
status: 'ok' | 'warn' | 'risk';
clause_ref?: string;
}
/** One label/value entry in `ComplianceDonePayload.actions`. */
export interface ComplianceActionItem {
label: string;
value: string;
// Optional flag; presumably marks the entry as risky for display — TODO confirm.
risk?: boolean;
}
/** Final result of a compliance analysis, stored in `ComplianceState.done`. */
export interface ComplianceDonePayload {
conclusion: string;
actions: ComplianceActionItem[];
// Range/scale is not visible in this file — TODO confirm.
risk_score: number;
highlight_terms: string[];
para_text: string;
}
/** Caller-supplied metadata describing one compliance analysis run (stored in `ComplianceState.meta`). */
export interface ComplianceMeta {
title: string;
// Kind of input the analysis was started from.
sourceType: 'text' | 'doc' | 'upload';
// ISO timestamp
startedAt: string;
}
/** Lifecycle of a compliance analysis run; the initial state (COMPLIANCE_INIT) is 'idle'. */
export type ComplianceStatus = 'idle' | 'streaming' | 'done' | 'error';
/** Compliance page state kept in PageStateContext so the analysis result survives route changes. */
export interface ComplianceState {
status: ComplianceStatus;
// Human-readable label and machine key of the current stage; both are '' in the initial state.
stageLabel: string;
stageKey: string;
// null until a run supplies its metadata.
meta: ComplianceMeta | null;
sources: ComplianceSourceEvent[];
findings: ComplianceFindingEvent[];
// null in the initial state; holds the final payload otherwise.
done: ComplianceDonePayload | null;
errorText: string;
}
/**
 * Initial (idle, empty) compliance state. Used as the `useState` seed in
 * PageStateProvider and restored by `resetCompliance`.
 */
const COMPLIANCE_INIT: ComplianceState = {
status: 'idle',
stageLabel: '',
stageKey: '',
meta: null,
sources: [],
findings: [],
done: null,
errorText: '',
};
// ── Perception types ──────────────────────────────────────────────────────────
/** List-item shape of one regulation signal on the Perception page (elements of `PerceptionPageState.signals`). */
export interface PerceptionSignal {
id: string;
source: string;
standard: string;
// Display status used for styling; not the raw backend status string.
status: 'ok' | 'warn' | 'risk' | 'info';
title: string;
summary: string;
// Date as a string; exact format is not established in this file — TODO confirm.
date: string;
tags: string[];
impact: 'High' | 'Medium' | 'Low';
}
/** Perception page state kept in PageStateContext so it survives route changes. */
export interface PerceptionPageState {
signals: PerceptionSignal[];
searchQuery: string;
// Filter values; the initial value for both is 'All' (see PERCEPTION_INIT).
sourceFilter: string;
impactFilter: string;
// Id of the selected signal, or null when nothing is selected.
selectedId: string | null;
// Accumulated text of the AI analysis output.
aiOutput: string;
// Active tab of the detail panel.
detailTab: 'overview' | 'obligations' | 'assessment' | 'diff';
// Free-text crawl status line; '' when there is nothing to show.
crawlStatus: string;
}
/**
 * Initial Perception page state used as the `useState` seed in PageStateProvider:
 * no signals loaded, no search text, both filters on 'All', nothing selected,
 * overview tab active.
 */
const PERCEPTION_INIT: PerceptionPageState = {
signals: [],
searchQuery: '',
sourceFilter: 'All',
impactFilter: 'All',
selectedId: null,
aiOutput: '',
detailTab: 'overview',
crawlStatus: '',
};
// ── Context value ─────────────────────────────────────────────────────────────
/**
 * Shape of the value exposed by PageStateContext: per-page state, its setter,
 * and mutable refs that hold AbortControllers / flags across route changes.
 */
interface PageStateContextValue {
// RagChat
ragState: RagChatState;
setRagState: React.Dispatch<React.SetStateAction<RagChatState>>;
// Boolean ref, initially false; presumably flags an in-flight RagChat stream — TODO confirm with RagChat page.
ragStreamingRef: React.MutableRefObject<boolean>;
ragAbortRef: React.MutableRefObject<AbortController | null>;
// Compliance
complianceState: ComplianceState;
setComplianceState: React.Dispatch<React.SetStateAction<ComplianceState>>;
complianceAbortRef: React.MutableRefObject<AbortController | null>;
// Aborts the controller in complianceAbortRef (if any) and restores COMPLIANCE_INIT.
resetCompliance: () => void;
// Perception
perceptionState: PerceptionPageState;
setPerceptionState: React.Dispatch<React.SetStateAction<PerceptionPageState>>;
// Controller slot for the Perception AI-analysis request.
perceptionAbortRef: React.MutableRefObject<AbortController | null>;
// NOTE(review): intended for the crawl request, but the crawl fetch visible in
// PerceptionPage does not appear to attach a signal from this ref — confirm.
perceptionCrawlAbortRef: React.MutableRefObject<AbortController | null>;
}
// Default is null so usePageState can detect use outside PageStateProvider and throw.
const PageStateContext = createContext<PageStateContextValue | null>(null);
// ── Provider ──────────────────────────────────────────────────────────────────
/**
 * Provides the cross-route page state (RagChat, Compliance, Perception) to
 * everything rendered below it.
 *
 * State lives here rather than in the page components so that it is not lost
 * when a page unmounts on navigation.
 *
 * @param children React subtree that may call `usePageState()`.
 */
export function PageStateProvider({ children }: { children: React.ReactNode }) {
  // RagChat
  const [ragState, setRagState] = useState<RagChatState>(RAG_INIT);
  const ragStreamingRef = useRef(false);
  const ragAbortRef = useRef<AbortController | null>(null);

  // Compliance
  const [complianceState, setComplianceState] = useState<ComplianceState>(COMPLIANCE_INIT);
  const complianceAbortRef = useRef<AbortController | null>(null);
  // Abort whatever controller is currently stored, then restore the initial state.
  const resetCompliance = useCallback(() => {
    complianceAbortRef.current?.abort();
    setComplianceState(COMPLIANCE_INIT);
  }, []);

  // Perception
  const [perceptionState, setPerceptionState] = useState<PerceptionPageState>(PERCEPTION_INIT);
  const perceptionAbortRef = useRef<AbortController | null>(null);
  const perceptionCrawlAbortRef = useRef<AbortController | null>(null);

  // Memoize the context value so consumers only see a new object when one of
  // the three state slices changes. The useState setters and useRef objects
  // keep the same identity for the lifetime of the provider, so only the state
  // values (and the memoized resetCompliance) need to be dependencies.
  const value = React.useMemo<PageStateContextValue>(
    () => ({
      ragState, setRagState, ragStreamingRef, ragAbortRef,
      complianceState, setComplianceState, complianceAbortRef, resetCompliance,
      perceptionState, setPerceptionState, perceptionAbortRef, perceptionCrawlAbortRef,
    }),
    [ragState, complianceState, perceptionState, resetCompliance],
  );

  return (
    <PageStateContext.Provider value={value}>
      {children}
    </PageStateContext.Provider>
  );
}
// ── Hook ──────────────────────────────────────────────────────────────────────
/**
 * Returns the shared page-state context value.
 *
 * @throws Error when called from a component that is not rendered inside
 *   `PageStateProvider` (the context then still holds its null default).
 */
export function usePageState() {
  const pageState = useContext(PageStateContext);
  if (pageState) {
    return pageState;
  }
  throw new Error('usePageState must be used inside PageStateProvider');
}

View File

@@ -1,3 +1,18 @@
export { ThemeProvider, useTheme } from './ThemeContext'; export { ThemeProvider, useTheme } from './ThemeContext';
export { AuthProvider, useAuth } from './AuthContext'; export { AuthProvider, useAuth } from './AuthContext';
export type { AuthUser } from './AuthContext'; export type { AuthUser } from './AuthContext';
export { PageStateProvider, usePageState } from './PageStateContext';
export type {
RagChatState,
RagMessage,
RagCitation,
ComplianceState,
ComplianceStatus,
ComplianceSourceEvent,
ComplianceFindingEvent,
ComplianceDonePayload,
ComplianceMeta,
ComplianceActionItem,
PerceptionPageState,
PerceptionSignal,
} from './PageStateContext';

View File

@@ -1,4 +1,25 @@
import { useState, useCallback, useRef } from 'react'; /**
* useComplianceAnalysis — compliance analysis state wired to PageStateContext.
*
* State is stored in the global context so it persists when the user navigates
* to another module and returns. The `run` and `reset` actions are identical
* to the previous hook API so CompliancePage needs no structural changes.
*/
import { useCallback } from 'react';
import { usePageState } from '../../contexts';
import type {
ComplianceMeta,
ComplianceState,
ComplianceSourceEvent,
ComplianceFindingEvent,
ComplianceDonePayload,
} from '../../contexts';
export type { ComplianceMeta, ComplianceState, ComplianceSourceEvent as SourceEvent, ComplianceFindingEvent as FindingEvent, ComplianceDonePayload as DonePayload };
export type { ComplianceActionItem as ActionItem } from '../../contexts';
export type AnalysisStatus = import('../../contexts').ComplianceStatus;
export type AnalysisMeta = ComplianceMeta;
const TOKEN_KEY = 'auth_token'; const TOKEN_KEY = 'auth_token';
function authHeader(): Record<string, string> { function authHeader(): Record<string, string> {
@@ -6,55 +27,7 @@ function authHeader(): Record<string, string> {
return t ? { Authorization: `Bearer ${t}` } : {}; return t ? { Authorization: `Bearer ${t}` } : {};
} }
export type AnalysisStatus = 'idle' | 'streaming' | 'done' | 'error'; const INITIAL_STATE: ComplianceState = {
export interface SourceEvent {
standard: string;
clause: string;
score: number;
status: string;
full_content: string;
}
export interface FindingEvent {
title: string;
desc: string;
status: 'ok' | 'warn' | 'risk';
clause_ref?: string;
}
export interface ActionItem {
label: string;
value: string;
risk?: boolean;
}
export interface DonePayload {
conclusion: string;
actions: ActionItem[];
risk_score: number;
highlight_terms: string[];
para_text: string;
}
export interface AnalysisMeta {
title: string;
sourceType: 'text' | 'doc' | 'upload';
startedAt: string; // ISO timestamp
}
export interface AnalysisState {
status: AnalysisStatus;
stageLabel: string;
stageKey: string;
meta: AnalysisMeta | null;
sources: SourceEvent[];
findings: FindingEvent[];
done: DonePayload | null;
errorText: string;
}
const INITIAL_STATE: AnalysisState = {
status: 'idle', status: 'idle',
stageLabel: '', stageLabel: '',
stageKey: '', stageKey: '',
@@ -66,18 +39,12 @@ const INITIAL_STATE: AnalysisState = {
}; };
export function useComplianceAnalysis() { export function useComplianceAnalysis() {
const [state, setState] = useState<AnalysisState>(INITIAL_STATE); const { complianceState: state, setComplianceState: setState, complianceAbortRef, resetCompliance: reset } = usePageState();
const abortRef = useRef<AbortController | null>(null);
const reset = useCallback(() => { const run = useCallback(async (formData: FormData, meta: ComplianceMeta) => {
abortRef.current?.abort(); complianceAbortRef.current?.abort();
setState(INITIAL_STATE);
}, []);
const run = useCallback(async (formData: FormData, meta: AnalysisMeta) => {
abortRef.current?.abort();
const ctrl = new AbortController(); const ctrl = new AbortController();
abortRef.current = ctrl; complianceAbortRef.current = ctrl;
setState({ ...INITIAL_STATE, status: 'streaming', stageLabel: 'Starting…', meta }); setState({ ...INITIAL_STATE, status: 'streaming', stageLabel: 'Starting…', meta });
@@ -124,7 +91,7 @@ export function useComplianceAnalysis() {
if (j.type === 'stage') { if (j.type === 'stage') {
setState(s => ({ ...s, stageLabel: j.label ?? '', stageKey: j.stage ?? '' })); setState(s => ({ ...s, stageLabel: j.label ?? '', stageKey: j.stage ?? '' }));
} else if (j.type === 'source') { } else if (j.type === 'source') {
const src: SourceEvent = { const src: ComplianceSourceEvent = {
standard: j.standard ?? '', standard: j.standard ?? '',
clause: j.clause ?? '', clause: j.clause ?? '',
score: j.score ?? 0, score: j.score ?? 0,
@@ -133,7 +100,7 @@ export function useComplianceAnalysis() {
}; };
setState(s => ({ ...s, sources: [...s.sources, src] })); setState(s => ({ ...s, sources: [...s.sources, src] }));
} else if (j.type === 'finding') { } else if (j.type === 'finding') {
const finding: FindingEvent = { const finding: ComplianceFindingEvent = {
title: j.title ?? '', title: j.title ?? '',
desc: j.desc ?? '', desc: j.desc ?? '',
status: j.status ?? 'info', status: j.status ?? 'info',
@@ -141,7 +108,7 @@ export function useComplianceAnalysis() {
}; };
setState(s => ({ ...s, findings: [...s.findings, finding] })); setState(s => ({ ...s, findings: [...s.findings, finding] }));
} else if (j.type === 'done') { } else if (j.type === 'done') {
const payload: DonePayload = { const payload: ComplianceDonePayload = {
conclusion: j.conclusion ?? '', conclusion: j.conclusion ?? '',
actions: j.actions ?? [], actions: j.actions ?? [],
risk_score: j.risk_score ?? 0, risk_score: j.risk_score ?? 0,
@@ -162,7 +129,7 @@ export function useComplianceAnalysis() {
if (e instanceof Error && e.name === 'AbortError') return; if (e instanceof Error && e.name === 'AbortError') return;
setState(s => ({ ...s, status: 'error', errorText: String(e) })); setState(s => ({ ...s, status: 'error', errorText: String(e) }));
} }
}, []); }, [setState, complianceAbortRef]);
return { state, run, reset }; return { state, run, reset };
} }

View File

@@ -1,6 +1,8 @@
import { useState, useEffect, useRef } from 'react'; import { useState, useEffect, useRef } from 'react';
import { Topbar } from '../../components/layout/Topbar'; import { Topbar } from '../../components/layout/Topbar';
import { RefreshCw, Play, Square, ExternalLink } from 'lucide-react'; import { RefreshCw, Play, Square, ExternalLink } from 'lucide-react';
import { usePageState } from '../../contexts';
import type { PerceptionSignal } from '../../contexts';
const TOKEN_KEY = 'auth_token'; const TOKEN_KEY = 'auth_token';
function authHeader(): Record<string, string> { function authHeader(): Record<string, string> {
@@ -8,18 +10,6 @@ function authHeader(): Record<string, string> {
return t ? { Authorization: `Bearer ${t}` } : {}; return t ? { Authorization: `Bearer ${t}` } : {};
} }
interface Signal {
id: string;
source: string;
standard: string;
status: 'ok' | 'warn' | 'risk' | 'info';
title: string;
summary: string;
date: string;
tags: string[];
impact: 'High' | 'Medium' | 'Low';
}
interface Stats { interface Stats {
total: number; total: number;
high_impact: number; high_impact: number;
@@ -27,29 +17,17 @@ interface Stats {
last_90_days: number; last_90_days: number;
} }
interface DocResult {
score: number;
name: string;
clause: string;
snippet: string;
}
const SOURCES = ['All', 'MIIT', 'UN-ECE', 'ISO', 'GB Comm.', 'EUR-Lex', 'IATF']; const SOURCES = ['All', 'MIIT', 'UN-ECE', 'ISO', 'GB Comm.', 'EUR-Lex', 'IATF'];
const IMPACTS = ['All', 'High', 'Medium', 'Low']; const IMPACTS = ['All', 'High', 'Medium', 'Low'];
// Backend /api/v1/perception/stats returns: // Backend event → Signal
// { total, high_impact, medium_impact, last_90_days } — field names match, ✓ function mapEvent(e: Record<string, unknown>): PerceptionSignal {
// Backend /api/v1/perception/events returns:
// { events: [{ id, title, summary, source, standard, impact_level, published_at, tags, status }] }
// Map backend event fields → frontend Signal shape
function mapEvent(e: Record<string, unknown>): Signal {
const impact = String(e.impact_level ?? '').toLowerCase(); const impact = String(e.impact_level ?? '').toLowerCase();
const backendStatus = String(e.status ?? '').toLowerCase(); const backendStatus = String(e.status ?? '').toLowerCase();
return { return {
id: String(e.id ?? e.event_id ?? ''), id: String(e.id ?? e.event_id ?? ''),
source: String(e.source ?? ''), source: String(e.source ?? ''),
standard: String(e.standard ?? e.regulation_id ?? ''), standard: String(e.standard ?? e.standard_code ?? e.regulation_id ?? ''),
status: backendStatus === 'high' || backendStatus === 'urgent' ? 'risk' status: backendStatus === 'high' || backendStatus === 'urgent' ? 'risk'
: backendStatus === 'medium' || backendStatus === 'draft' ? 'warn' : backendStatus === 'medium' || backendStatus === 'draft' ? 'warn'
: backendStatus === 'low' || backendStatus === 'final' ? 'ok' : backendStatus === 'low' || backendStatus === 'final' ? 'ok'
@@ -62,50 +40,40 @@ function mapEvent(e: Record<string, unknown>): Signal {
}; };
} }
const MOCK_SIGNALS: Signal[] = [ const MOCK_SIGNALS: PerceptionSignal[] = [
{ {
id: '1', source: 'EUR-Lex', standard: 'EU/2024/1689', status: 'risk', id: '1', source: 'EUR-Lex', standard: 'EU/2024/1689', status: 'risk',
title: 'EU AI Act — High-risk AI in vehicles', title: 'EU AI Act — High-risk AI in vehicles',
summary: 'Article 9 mandates risk management systems for automotive AI classifying as high-risk under Annex III point 3.', summary: 'Article 9 mandates risk management systems for automotive AI classifying as high-risk under Annex III point 3.',
date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High' date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High',
}, },
{ {
id: '2', source: 'MIIT', standard: 'Draft-2025-08', status: 'warn', id: '2', source: 'MIIT', standard: 'Draft-2025-08', status: 'warn',
title: 'MIIT Draft — in-vehicle AI training data', title: 'MIIT Draft — in-vehicle AI training data',
summary: 'Draft regulation requires OEM data provenance documentation and OTA audit trails for AI systems.', summary: 'Draft regulation requires OEM data provenance documentation and OTA audit trails for AI systems.',
date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High' date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High',
}, },
{ {
id: '3', source: 'ISO', standard: 'ISO/SAE 21434:2021/Amd1', status: 'info', id: '3', source: 'ISO', standard: 'ISO/SAE 21434:2021/Amd1', status: 'info',
title: 'ISO/SAE 21434 Amendment 1', title: 'ISO/SAE 21434 Amendment 1',
summary: 'Amendment clarifies CSMS scope for software-only updates and vulnerability disclosure timelines.', summary: 'Amendment clarifies CSMS scope for software-only updates and vulnerability disclosure timelines.',
date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium' date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium',
}, },
{
id: '4', source: 'UN-ECE', standard: 'UNECE WP.29 R155', status: 'ok',
title: 'UNECE R155 Corrigendum',
summary: 'Editorial corrections to cybersecurity management system requirements. No substantive changes.',
date: '2025-09-12', tags: ['type-approval', 'UNECE'], impact: 'Low'
},
];
const MOCK_DOCS: DocResult[] = [
{ score: 94, name: 'Vehicle AI Safety Manual v3.2', clause: '§4.2.1', snippet: 'The risk management process shall identify and evaluate risks arising from AI system decisions in safety-critical scenarios...' },
{ score: 87, name: 'ADAS System Requirements', clause: '§7.1', snippet: 'Automated driving functions must document training data lineage and model performance envelopes prior to deployment.' },
{ score: 71, name: 'Type Approval Documentation', clause: 'Annex B', snippet: 'Cybersecurity management system certification requires third-party audit of AI decision audit logs retention policy.' },
]; ];
export function PerceptionPage() { export function PerceptionPage() {
const [stats, setStats] = useState<Stats | null>(null); // Persistent state lives in PageStateContext — survives route changes
const [signals, setSignals] = useState<Signal[]>(MOCK_SIGNALS); const { perceptionState, setPerceptionState, perceptionAbortRef, perceptionCrawlAbortRef } = usePageState();
const [searchQuery, setSearchQuery] = useState(''); const { signals, searchQuery, sourceFilter, impactFilter, selectedId, aiOutput, detailTab, crawlStatus } = perceptionState;
const [sourceFilter, setSourceFilter] = useState('All');
const [impactFilter, setImpactFilter] = useState('All');
const [selected, setSelected] = useState<Signal | null>(null);
const [streaming, setStreaming] = useState(false);
const [aiOutput, setAiOutput] = useState('');
const abortRef = useRef<AbortController | null>(null);
// Stats and selectedFull are lightweight to re-fetch on mount
const [stats, setStats] = useState<Stats | null>(null);
const [streaming, setStreaming] = useState(false);
const [crawling, setCrawling] = useState(false);
// Full event detail — re-fetched when selected changes or page mounts with a selection
const [selectedFull, setSelectedFull] = useState<Record<string, unknown> | null>(null);
// Re-fetch stats every time the page mounts
useEffect(() => { useEffect(() => {
fetch('/api/v1/perception/stats', { headers: authHeader() }) fetch('/api/v1/perception/stats', { headers: authHeader() })
.then(r => r.json()) .then(r => r.json())
@@ -113,16 +81,36 @@ export function PerceptionPage() {
.catch(() => setStats({ total: 47, high_impact: 7, medium_impact: 18, last_90_days: 14 })); .catch(() => setStats({ total: 47, high_impact: 7, medium_impact: 18, last_90_days: 14 }));
}, []); }, []);
// Fetch signal list on first mount only (if empty), otherwise preserve context state
useEffect(() => { useEffect(() => {
if (signals.length > 0) return; // already loaded
fetch('/api/v1/perception/events?limit=100', { headers: authHeader() }) fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
.then(r => r.json()) .then(r => r.json())
.then(d => { .then(d => {
if (Array.isArray(d?.events) && d.events.length > 0) { if (Array.isArray(d?.events) && d.events.length > 0) {
setSignals(d.events.map(mapEvent)); setPerceptionState(s => ({ ...s, signals: d.events.map(mapEvent) }));
} else {
setPerceptionState(s => ({ ...s, signals: MOCK_SIGNALS }));
} }
}) })
.catch(() => { /* keep mock data on error */ }); .catch(() => {
}, []); setPerceptionState(s => ({ ...s, signals: s.signals.length > 0 ? s.signals : MOCK_SIGNALS }));
});
}, []); // eslint-disable-line react-hooks/exhaustive-deps
// Re-fetch full event detail when navigating back with a selected signal
useEffect(() => {
if (selectedId) {
fetch(`/api/v1/perception/events/${selectedId}`, { headers: authHeader() })
.then(r => r.ok ? r.json() : null)
.then(d => { if (d) setSelectedFull(d); })
.catch(() => {});
} else {
setSelectedFull(null);
}
}, [selectedId]);
const selected = signals.find(s => s.id === selectedId) ?? null;
const filtered = signals.filter(s => { const filtered = signals.filter(s => {
if (sourceFilter !== 'All' && s.source !== sourceFilter) return false; if (sourceFilter !== 'All' && s.source !== sourceFilter) return false;
@@ -137,13 +125,20 @@ export function PerceptionPage() {
function runAnalysis() { function runAnalysis() {
if (!selected) return; if (!selected) return;
setStreaming(true); setStreaming(true);
setAiOutput(''); setPerceptionState(s => ({ ...s, aiOutput: '' }));
const ctrl = new AbortController(); const ctrl = new AbortController();
abortRef.current = ctrl; perceptionAbortRef.current = ctrl;
// Backend: POST /api/v1/perception/events/{id}/analyze → SSE stream fetch(`/api/v1/perception/events/${selected.id}/analyze`, {
fetch(`/api/v1/perception/events/${selected.id}/analyze`, { method: 'POST', headers: authHeader(), signal: ctrl.signal }) method: 'POST',
headers: authHeader(),
signal: ctrl.signal,
})
.then(async res => { .then(async res => {
if (!res.body) { setAiOutput('No stream available.'); setStreaming(false); return; } if (!res.body) {
setPerceptionState(s => ({ ...s, aiOutput: 'No stream available.' }));
setStreaming(false);
return;
}
const reader = res.body.getReader(); const reader = res.body.getReader();
const dec = new TextDecoder(); const dec = new TextDecoder();
let buf = ''; let buf = '';
@@ -160,30 +155,99 @@ export function PerceptionPage() {
if (!raw || raw === '[DONE]') continue; if (!raw || raw === '[DONE]') continue;
try { try {
const j = JSON.parse(raw); const j = JSON.parse(raw);
if (j.text) setAiOutput(p => p + j.text); if (j.text) setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j.text }));
else if (typeof j === 'string') setAiOutput(p => p + j); else if (typeof j === 'string') setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j }));
} catch { } catch {
setAiOutput(p => p + raw); setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + raw }));
} }
} }
} }
setStreaming(false); setStreaming(false);
}) })
.catch(e => { .catch(e => {
if (e.name !== 'AbortError') setAiOutput('Analysis failed. Check API connection.'); if (e.name !== 'AbortError') setPerceptionState(s => ({ ...s, aiOutput: 'Analysis failed. Check API connection.' }));
setStreaming(false); setStreaming(false);
}); });
} }
function stopAnalysis() { function stopAnalysis() {
abortRef.current?.abort(); perceptionAbortRef.current?.abort();
setStreaming(false); setStreaming(false);
} }
function selectSignal(sig: Signal) { async function runCrawl() {
setSelected(sig); setCrawling(true);
setAiOutput(''); setPerceptionState(s => ({ ...s, crawlStatus: '正在连接数据源...' }));
try {
const res = await fetch('/api/v1/perception/crawl', {
method: 'POST',
headers: { 'Content-Type': 'application/json', ...authHeader() },
body: JSON.stringify({}),
});
if (!res.body) {
setPerceptionState(s => ({ ...s, crawlStatus: 'No stream' }));
setCrawling(false);
return;
}
const reader = res.body.getReader();
const dec = new TextDecoder();
let buf = '';
while (true) {
const { done, value } = await reader.read();
if (done) break;
buf += dec.decode(value);
const parts = buf.split('\n\n');
buf = parts.pop() ?? '';
for (const block of parts) {
const eventLine = block.split('\n').find(l => l.startsWith('event: '));
const dataLine = block.split('\n').find(l => l.startsWith('data: '));
const evtName = eventLine?.slice(7).trim();
const raw = dataLine?.slice(6).trim();
if (!raw) continue;
try {
const d = JSON.parse(raw);
if (evtName === 'progress') {
setPerceptionState(s => ({
...s,
crawlStatus: `${d.source}: ${d.stage === 'fetching' ? '抓取中...' : d.stage === 'processing' ? `处理 ${d.fetched} 条...` : `完成 +${d.new}`}`,
}));
} else if (evtName === 'done') {
setPerceptionState(s => ({ ...s, crawlStatus: `更新完成 — 新增 ${d.total_new} 条,更新 ${d.total_updated}` }));
fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
.then(r => r.json())
.then(d2 => {
if (Array.isArray(d2?.events)) {
setPerceptionState(s => ({ ...s, signals: d2.events.map(mapEvent) }));
}
});
} else if (evtName === 'error') {
setPerceptionState(s => ({
...s,
crawlStatus: `错误: ${typeof d === 'string' ? d : d.message}`,
}));
}
} catch { /* ignore */ }
}
}
} catch (e: unknown) {
setPerceptionState(s => ({
...s,
crawlStatus: `连接失败: ${e instanceof Error ? e.message : String(e)}`,
}));
}
setCrawling(false);
}
function selectSignal(sig: PerceptionSignal) {
setPerceptionState(s => ({
...s,
selectedId: sig.id,
aiOutput: '',
detailTab: 'overview',
}));
setSelectedFull(null);
setStreaming(false); setStreaming(false);
perceptionAbortRef.current?.abort();
} }
return ( return (
@@ -197,10 +261,18 @@ export function PerceptionPage() {
<input <input
placeholder="Search signals..." placeholder="Search signals..."
value={searchQuery} value={searchQuery}
onChange={e => setSearchQuery(e.target.value)} onChange={e => setPerceptionState(s => ({ ...s, searchQuery: e.target.value }))}
/> />
</div> </div>
<button className="btn sm"><RefreshCw size={13} />Refresh</button> <button className="btn sm primary" onClick={runCrawl} disabled={crawling}>
<RefreshCw size={13} className={crawling ? 'spin' : ''} />
{crawling ? '抓取中...' : '刷新数据源'}
</button>
{crawlStatus && (
<span style={{ fontSize: 12, color: 'var(--text-secondary)', marginLeft: 8 }}>
{crawlStatus}
</span>
)}
</> </>
} }
/> />
@@ -227,13 +299,25 @@ export function PerceptionPage() {
<div className="filter-bar"> <div className="filter-bar">
<div className="chip-group"> <div className="chip-group">
{SOURCES.map(s => ( {SOURCES.map(s => (
<button key={s} className={`chip${sourceFilter === s ? ' active' : ''}`} onClick={() => setSourceFilter(s)}>{s}</button> <button
key={s}
className={`chip${sourceFilter === s ? ' active' : ''}`}
onClick={() => setPerceptionState(st => ({ ...st, sourceFilter: s }))}
>
{s}
</button>
))} ))}
</div> </div>
<div className="filter-sep" /> <div className="filter-sep" />
<div className="chip-group"> <div className="chip-group">
{IMPACTS.map(i => ( {IMPACTS.map(i => (
<button key={i} className={`chip${impactFilter === i ? ' active' : ''}`} onClick={() => setImpactFilter(i)}>{i}</button> <button
key={i}
className={`chip${impactFilter === i ? ' active' : ''}`}
onClick={() => setPerceptionState(st => ({ ...st, impactFilter: i }))}
>
{i}
</button>
))} ))}
</div> </div>
</div> </div>
@@ -243,7 +327,7 @@ export function PerceptionPage() {
{filtered.map(sig => ( {filtered.map(sig => (
<div <div
key={sig.id} key={sig.id}
className={`ev-card${selected?.id === sig.id ? ' selected' : ''}`} className={`ev-card${selectedId === sig.id ? ' selected' : ''}`}
onClick={() => selectSignal(sig)} onClick={() => selectSignal(sig)}
> >
<div className="ev-top"> <div className="ev-top">
@@ -277,8 +361,11 @@ export function PerceptionPage() {
<span className="source-tag">{selected.source}</span> <span className="source-tag">{selected.source}</span>
<span className="ev-std">{selected.standard}</span> <span className="ev-std">{selected.standard}</span>
<span className={`status ${selected.status}`}> <span className={`status ${selected.status}`}>
{selected.status === 'risk' ? 'Urgent' : 'Published'} {selected.status === 'risk' ? 'Urgent' : selected.status === 'warn' ? 'Draft' : 'Published'}
</span> </span>
{selectedFull?.change_summary && (
<span className="status warn" style={{ marginLeft: 'auto' }}>CHANGED</span>
)}
</div> </div>
<div className="detail-title">{selected.title}</div> <div className="detail-title">{selected.title}</div>
<p className="detail-summary">{selected.summary}</p> <p className="detail-summary">{selected.summary}</p>
@@ -287,22 +374,159 @@ export function PerceptionPage() {
? <button className="btn sm primary" onClick={runAnalysis}><Play size={12} />Run impact analysis</button> ? <button className="btn sm primary" onClick={runAnalysis}><Play size={12} />Run impact analysis</button>
: <button className="btn sm" onClick={stopAnalysis}><Square size={12} />Stop</button> : <button className="btn sm" onClick={stopAnalysis}><Square size={12} />Stop</button>
} }
<button className="btn sm"><ExternalLink size={12} />Source</button> {selected && (
<a
href={(selectedFull?.full_text_url as string) || '#'}
target="_blank"
rel="noopener noreferrer"
className="btn sm"
>
<ExternalLink size={12} />Source
</a>
)}
</div> </div>
</div> </div>
<div className="card docs-card"> <div className="detail-tabs">
<div className="card-header">Affected documents</div> {(['overview', 'obligations', 'assessment', 'diff'] as const).map(tab => (
{MOCK_DOCS.map(d => ( <button
<div key={d.name} className="doc-row"> key={tab}
<span className="doc-score">{d.score}%</span> className={`detail-tab${detailTab === tab ? ' active' : ''}${tab === 'diff' && !selectedFull?.change_summary ? ' disabled' : ''}`}
<div> onClick={() => {
<div className="doc-name">{d.name} <span className="doc-clause">{d.clause}</span></div> if (tab !== 'diff' || selectedFull?.change_summary) {
<div className="doc-snippet">{d.snippet}</div> setPerceptionState(s => ({ ...s, detailTab: tab }));
}
}}
>
{tab === 'overview' ? '概览' : tab === 'obligations' ? '义务条款' : tab === 'assessment' ? '影响评估' : '变更对比'}
</button>
))}
</div> </div>
{detailTab === 'overview' && (
<div className="card">
<div className="card-header">Scope &amp; Summary</div>
<p className="detail-summary" style={{ marginTop: 8 }}>
{(selectedFull?.scope as string) || selected.summary}
</p>
{selectedFull?.penalties && (
<p style={{ fontSize: 13, color: 'var(--danger)', marginTop: 6 }}>
{selectedFull.penalties as string}
</p>
)}
</div>
)}
{detailTab === 'obligations' && (
<div className="card">
<div className="card-header"></div>
{(() => {
const obs = (selectedFull?.obligations as Array<Record<string, string>>) || [];
const deadlines = (selectedFull?.deadlines as Array<Record<string, string>>) || [];
return obs.length === 0 && deadlines.length === 0 ? (
<p className="detail-summary" style={{ marginTop: 8 }}>"Run impact analysis"</p>
) : (
<>
{obs.length > 0 && (
<table style={{ width: '100%', fontSize: 13, borderCollapse: 'collapse', marginTop: 8 }}>
<thead>
<tr style={{ borderBottom: '1px solid var(--border)' }}>
<th style={{ textAlign: 'left', padding: '4px 8px' }}></th>
<th style={{ textAlign: 'left', padding: '4px 8px', width: 80 }}></th>
<th style={{ textAlign: 'left', padding: '4px 8px', width: 60 }}></th>
</tr>
</thead>
<tbody>
{obs.map((ob, i) => (
<tr key={i} style={{ borderBottom: '1px solid var(--border-faint)' }}>
<td style={{ padding: '6px 8px' }}>{ob.text}</td>
<td style={{ padding: '6px 8px', color: 'var(--text-secondary)' }}>{ob.subject}</td>
<td style={{ padding: '6px 8px' }}>
<span className={`status ${ob.deontic === 'must' || ob.deontic === 'shall' ? 'risk' : ob.deontic === 'prohibited' ? 'risk' : 'info'}`}>
{ob.deontic}
</span>
</td>
</tr>
))}
</tbody>
</table>
)}
{deadlines.length > 0 && (
<div style={{ marginTop: 12 }}>
<div className="card-header"></div>
{deadlines.map((d, i) => (
<div key={i} style={{ fontSize: 13, padding: '4px 0', display: 'flex', gap: 12 }}>
<span style={{ fontWeight: 600, color: 'var(--danger)' }}>{d.date || '待定'}</span>
<span style={{ color: 'var(--text-secondary)' }}>{d.description}</span>
</div> </div>
))} ))}
</div> </div>
)}
</>
);
})()}
</div>
)}
{detailTab === 'assessment' && (
<div className="card docs-card">
<div className="card-header">Affected documents</div>
{(() => {
const docs = (selectedFull?.affected_docs as Array<Record<string, unknown>>);
const displayDocs = docs && docs.length > 0 ? docs : [];
return displayDocs.length === 0
? <p className="detail-summary" style={{ marginTop: 8 }}>No affected documents found.</p>
: displayDocs.map((d, i) => (
<div key={i} className="doc-row">
<span className="doc-score">{Math.round(Number(d.score ?? 0) * 100)}%</span>
<div>
<div className="doc-name">
{String(d.doc_name || '')}
<span className="doc-clause">{String(d.key_clauses || d.clause || '')}</span>
</div>
{d.snippet && <div className="doc-snippet">{String(d.snippet)}</div>}
{d.recommendation && (
<div style={{ fontSize: 12, color: 'var(--accent)', marginTop: 2 }}> {String(d.recommendation)}</div>
)}
</div>
</div>
));
})()}
</div>
)}
{detailTab === 'diff' && selectedFull?.change_summary && (
<div className="card">
<div className="card-header"></div>
<p style={{ fontSize: 13, color: 'var(--text-secondary)', marginTop: 8 }}>
{selectedFull.change_summary as string}
</p>
{(() => {
const sections = (selectedFull.changed_sections as Array<Record<string, unknown>>) || [];
return sections.map((s, i) => (
<div key={i} style={{ marginTop: 12, borderTop: '1px solid var(--border)', paddingTop: 10 }}>
<div style={{ display: 'flex', gap: 8, marginBottom: 6 }}>
<span className={`status ${s.change_type === 'tightened' || s.change_type === 'added' ? 'risk' : s.change_type === 'removed' ? 'warn' : 'info'}`}>
{String(s.change_type)}
</span>
<span style={{ fontSize: 12, color: 'var(--text-secondary)' }}>cosine: {String(s.similarity)}</span>
</div>
<div style={{ display: 'grid', gridTemplateColumns: '1fr 1fr', gap: 8, fontSize: 12 }}>
<div style={{ background: 'var(--danger-bg)', padding: 8, borderRadius: 4 }}>
<div style={{ fontWeight: 600, marginBottom: 4 }}></div>
{String(s.old_text || '')}
</div>
<div style={{ background: 'var(--success-bg)', padding: 8, borderRadius: 4 }}>
<div style={{ fontWeight: 600, marginBottom: 4 }}></div>
{String(s.new_text || '')}
</div>
</div>
{s.summary && <p style={{ fontSize: 12, marginTop: 6, color: 'var(--text-secondary)' }}>{String(s.summary)}</p>}
</div>
));
})()}
</div>
)}
{(aiOutput || streaming) && ( {(aiOutput || streaming) && (
<div className="card ai-card"> <div className="card ai-card">

View File

@@ -1,6 +1,8 @@
import { useState, useRef, useEffect, useCallback } from 'react'; import { useRef, useEffect, useCallback, useState } from 'react';
import { Topbar } from '../../components/layout/Topbar'; import { Topbar } from '../../components/layout/Topbar';
import { Send, Download } from 'lucide-react'; import { Send, Download } from 'lucide-react';
import { usePageState } from '../../contexts';
import type { RagCitation } from '../../contexts';
const TOKEN_KEY = 'auth_token'; const TOKEN_KEY = 'auth_token';
function authHeader(): Record<string, string> { function authHeader(): Record<string, string> {
@@ -8,26 +10,8 @@ function authHeader(): Record<string, string> {
return t ? { Authorization: `Bearer ${t}` } : {}; return t ? { Authorization: `Bearer ${t}` } : {};
} }
// One chat message, authored by either the user or the assistant.
interface Message {
id: string;
role: 'user' | 'assistant';
text: string;
// Citation indices mentioned in this assistant message (1-based, matching the citations array).
citationRefs?: number[];
}
// A retrieved source document in the shape the citation list displays.
interface Citation {
index: number; // 1-based, matches [N] markers in text
score: number; // 0–100 display percentage
name: string; // doc_name
clause: string; // section_title or clause
snippet: string; // preview text
docId?: string;
}
// Map a raw source doc from the backend "retrieved" event to our Citation shape. // Map a raw source doc from the backend "retrieved" event to our Citation shape.
// Backend fields: { id, score(0-1), preview, doc_name, clause, doc_id } function mapSource(s: Record<string, unknown>, idx: number): RagCitation {
function mapSource(s: Record<string, unknown>, idx: number): Citation {
const rawScore = typeof s.score === 'number' ? s.score : 0; const rawScore = typeof s.score === 'number' ? s.score : 0;
const displayScore = rawScore <= 1 ? Math.round(rawScore * 100) : Math.round(rawScore); const displayScore = rawScore <= 1 ? Math.round(rawScore * 100) : Math.round(rawScore);
return { return {
@@ -73,25 +57,21 @@ const MOCK_QUICK = [
]; ];
export function RagChatPage() { export function RagChatPage() {
const [messages, setMessages] = useState<Message[]>([ // All persistent state lives in PageStateContext — survives route changes
{ const { ragState, setRagState, ragStreamingRef, ragAbortRef } = usePageState();
id: 'init', role: 'assistant', const { messages, citations, sessionId, inputDraft } = ragState;
text: 'Hello! I can answer questions about your indexed regulations and compliance documents. Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.',
} // Local-only UI state: highlighted citation and streaming indicator
]); // These are fine to reset on navigation since they're transient UI feedback
const [quickPrompts, setQuickPrompts] = useState<string[]>(MOCK_QUICK);
const [input, setInput] = useState('');
const [streaming, setStreaming] = useState(false);
const [citations, setCitations] = useState<Citation[]>([]);
const [highlightedCit, setHighlightedCit] = useState<number | null>(null); const [highlightedCit, setHighlightedCit] = useState<number | null>(null);
const [sessionId, setSessionId] = useState<string | null>(null); const [streaming, setStreaming] = useState(ragStreamingRef.current);
const [quickPrompts, setQuickPrompts] = useState<string[]>(MOCK_QUICK);
const bottomRef = useRef<HTMLDivElement>(null); const bottomRef = useRef<HTMLDivElement>(null);
const citRailRef = useRef<HTMLDivElement>(null); const citRailRef = useRef<HTMLDivElement>(null);
const citItemRefs = useRef<Record<number, HTMLDivElement | null>>({}); const citItemRefs = useRef<Record<number, HTMLDivElement | null>>({});
const abortRef = useRef<AbortController | null>(null);
// Fetch quick questions from backend on mount // Fetch quick questions from backend on mount (only once per session)
useEffect(() => { useEffect(() => {
fetch('/api/v1/rag/quick-questions', { headers: authHeader() }) fetch('/api/v1/rag/quick-questions', { headers: authHeader() })
.then(r => r.json()) .then(r => r.json())
@@ -115,26 +95,33 @@ export function RagChatPage() {
if (el) { if (el) {
el.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); el.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
} }
// Clear highlight after 3s
setTimeout(() => setHighlightedCit(h => h === n ? null : h), 3000); setTimeout(() => setHighlightedCit(h => h === n ? null : h), 3000);
}, []); }, []);
async function send(text?: string) { async function send(text?: string) {
const q = (text ?? input).trim(); const q = (text ?? inputDraft).trim();
if (!q || streaming) return; if (!q || ragStreamingRef.current) return;
setInput(''); setRagState(s => ({ ...s, inputDraft: '' }));
const userMsg: Message = { id: Date.now().toString(), role: 'user', text: q };
setMessages(m => [...m, userMsg]);
const userMsgId = Date.now().toString();
const assistantId = (Date.now() + 1).toString(); const assistantId = (Date.now() + 1).toString();
setMessages(m => [...m, { id: assistantId, role: 'assistant', text: '' }]);
setRagState(s => ({
...s,
messages: [
...s.messages,
{ id: userMsgId, role: 'user', text: q },
{ id: assistantId, role: 'assistant', text: '' },
],
citations: [],
}));
ragStreamingRef.current = true;
setStreaming(true); setStreaming(true);
setCitations([]);
setHighlightedCit(null); setHighlightedCit(null);
const ctrl = new AbortController(); const ctrl = new AbortController();
abortRef.current = ctrl; ragAbortRef.current = ctrl;
try { try {
const body: Record<string, unknown> = { query: q, top_k: 5 }; const body: Record<string, unknown> = { query: q, top_k: 5 };
@@ -151,14 +138,13 @@ export function RagChatPage() {
const reader = res.body.getReader(); const reader = res.body.getReader();
const dec = new TextDecoder(); const dec = new TextDecoder();
let buffer = ''; let buffer = '';
const newCitations: Citation[] = []; const newCitations: RagCitation[] = [];
while (true) { while (true) {
const { done, value } = await reader.read(); const { done, value } = await reader.read();
if (done) break; if (done) break;
buffer += dec.decode(value, { stream: true }); buffer += dec.decode(value, { stream: true });
// SSE blocks separated by double newline
const blocks = buffer.split('\n\n'); const blocks = buffer.split('\n\n');
buffer = blocks.pop() ?? ''; buffer = blocks.pop() ?? '';
@@ -171,56 +157,62 @@ export function RagChatPage() {
const j = JSON.parse(raw); const j = JSON.parse(raw);
if (j.type === 'session') { if (j.type === 'session') {
// Backend assigned a session_id — persist for next request if (j.session_id) setRagState(s => ({ ...s, sessionId: j.session_id }));
if (j.session_id) setSessionId(j.session_id);
} else if (j.type === 'retrieved' && Array.isArray(j.docs)) { } else if (j.type === 'retrieved' && Array.isArray(j.docs)) {
// Sources arrive before the answer starts
const mapped = j.docs.map((d: Record<string, unknown>, i: number) => mapSource(d, i + 1)); const mapped = j.docs.map((d: Record<string, unknown>, i: number) => mapSource(d, i + 1));
newCitations.push(...mapped); newCitations.push(...mapped);
setCitations([...mapped]); setRagState(s => ({ ...s, citations: [...mapped] }));
} else if (j.type === 'chunk' && j.text) { } else if (j.type === 'chunk' && j.text) {
setMessages(m => m.map(msg => setRagState(s => ({
...s,
messages: s.messages.map(msg =>
msg.id === assistantId msg.id === assistantId
? { ...msg, text: msg.text + (j.text as string) } ? { ...msg, text: msg.text + (j.text as string) }
: msg : msg
)); ),
}));
} else if (j.type === 'status') {
// Status message (e.g. "找到N条相关法规…") — could show in UI if desired
// For now we ignore it to keep the bubble clean
} else if (j.type === 'done') { } else if (j.type === 'done') {
// Extract which citation numbers appear in the final answer setRagState(s => ({
setMessages(m => m.map(msg => { ...s,
messages: s.messages.map(msg => {
if (msg.id !== assistantId) return msg; if (msg.id !== assistantId) return msg;
const refs = [...new Set( const refs = [...new Set(
[...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10)) [...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10))
)].filter(n => n >= 1 && n <= newCitations.length); )].filter(n => n >= 1 && n <= newCitations.length);
return { ...msg, citationRefs: refs }; return { ...msg, citationRefs: refs };
}),
})); }));
break; break;
} else if (j.type === 'error') { } else if (j.type === 'error') {
setMessages(m => m.map(msg => setRagState(s => ({
...s,
messages: s.messages.map(msg =>
msg.id === assistantId msg.id === assistantId
? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` } ? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` }
: msg : msg
)); ),
}));
} }
} catch { /* malformed JSON chunk, skip */ } } catch { /* malformed JSON chunk, skip */ }
} }
} }
} catch (e: unknown) { } catch (e: unknown) {
if (e instanceof Error && e.name !== 'AbortError') { if (e instanceof Error && e.name !== 'AbortError') {
setMessages(m => m.map(msg => setRagState(s => ({
...s,
messages: s.messages.map(msg =>
msg.id === assistantId msg.id === assistantId
? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' } ? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' }
: msg : msg
)); ),
}));
} }
} finally { } finally {
ragStreamingRef.current = false;
setStreaming(false); setStreaming(false);
} }
} }
@@ -291,15 +283,15 @@ export function RagChatPage() {
<textarea <textarea
className="composer-input" className="composer-input"
placeholder="Ask about your regulations…" placeholder="Ask about your regulations…"
value={input} value={inputDraft}
onChange={e => setInput(e.target.value)} onChange={e => setRagState(s => ({ ...s, inputDraft: e.target.value }))}
onKeyDown={e => { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); } }} onKeyDown={e => { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); } }}
rows={2} rows={2}
/> />
<button <button
className="btn primary" className="btn primary"
onClick={() => send()} onClick={() => send()}
disabled={!input.trim() || streaming} disabled={!inputDraft.trim() || streaming}
> >
<Send size={14} /> <Send size={14} />
</button> </button>

View File

@@ -1108,3 +1108,33 @@ mark.comp-highlight {
transition: color 0.15s; transition: color 0.15s;
} }
.logout-btn:hover { color: var(--danger); } .logout-btn:hover { color: var(--danger); }
/* ── Detail Tabs (Perception) ──────────────────── */
/* Horizontal strip that holds the detail-pane tab buttons. Its bottom border
   is the baseline under the row; no bottom padding is added, so each tab's own
   2px underline sits directly above that line. */
.detail-tabs {
display: flex;
gap: 2px;
margin: 8px 0 0;
border-bottom: 1px solid var(--border);
padding-bottom: 0;
}
/* A single tab button with the native button background and border removed.
   The transparent 2px bottom border reserves the space that the active state
   later colors in. */
.detail-tab {
background: none;
border: none;
border-bottom: 2px solid transparent;
padding: 6px 14px;
font-size: 13px;
color: var(--text-secondary);
cursor: pointer;
transition: color 0.15s, border-color 0.15s;
}
/* Hover switches the label from the secondary to the primary text color. */
.detail-tab:hover { color: var(--text); }
/* The active tab uses the accent color for both its label and its underline.
   It is declared after the hover rule with equal specificity, so an active tab
   keeps the accent color while hovered. */
.detail-tab.active {
color: var(--accent);
border-bottom-color: var(--accent);
font-weight: 600;
}
/* A tab that cannot be opened yet is dimmed and shows a not-allowed cursor. */
.detail-tab.disabled {
  opacity: 0.35;
  cursor: not-allowed;
}
/* Without this override the generic .detail-tab:hover rule recolors a disabled
   tab, which makes it look clickable even though clicking it does nothing.
   The extra class gives this selector the higher specificity. */
.detail-tab.disabled:hover {
  color: var(--text-secondary);
}
/* Continuous one-second linear rotation, applied to the refresh icon while a
   crawl is running. NOTE(review): relies on a `spin` @keyframes rule that is
   not visible in this hunk; confirm it is defined elsewhere in the stylesheet. */
.spin { animation: spin 1s linear infinite; }

View File

@@ -24,6 +24,8 @@ dependencies = [
"loguru>=0.7.0", "loguru>=0.7.0",
"tenacity>=8.2.0", "tenacity>=8.2.0",
"httpx>=0.24.0", "httpx>=0.24.0",
"beautifulsoup4>=4.12.0",
"lxml>=5.0.0",
"alibabacloud-docmind-api20220711>=1.0.6", "alibabacloud-docmind-api20220711>=1.0.6",
"alibabacloud-tea-openapi>=0.3.11", "alibabacloud-tea-openapi>=0.3.11",
"alibabacloud-tea-util>=0.3.13", "alibabacloud-tea-util>=0.3.13",