fix some things

2026-06-08 11:16:28 +08:00
parent 9fea9c6a53
commit e7963b267e
34 changed files with 5195 additions and 246 deletions
--- a/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
+++ b/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
@@ -0,0 +1,92 @@
+"""Crawlers for the 国标委 (SAMR) standard information platform."""
+
+from __future__ import annotations
+
+import httpx
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from ._utils import extract_tags, parse_date
+
+_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
+_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
+
+
+def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
+    """Request one page of the standards listing and return its rows.
+
+    Issues a GET against ``_BASE_URL`` with ``std_type`` as ``p.p1``, the
+    fixed ``p.p2`` value "车", a descending sort on ``circulation_date``,
+    and ``page`` / ``page_size`` as ``p.p6`` / ``p.p7``.
+
+    Returns:
+        The value under the ``"rows"`` key of the JSON response, or an empty
+        list when that value is missing or falsy. Any exception raised while
+        requesting or decoding is logged as a warning and also yields an
+        empty list, so callers never see an error from this function.
+    """
+    query = {
+        "p.p1": std_type,
+        "p.p2": "车",
+        "p.p90": "circulation_date",
+        "p.p91": "desc",
+        "p.p6": page,
+        "p.p7": page_size,
+    }
+    try:
+        response = httpx.get(_BASE_URL, params=query, headers=_HEADERS, timeout=30)
+        response.raise_for_status()
+        payload = response.json()
+        rows = payload.get("rows", [])
+    except Exception as exc:
+        # Best-effort crawl: report the failure and let the caller stop paging.
+        logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
+        return []
+    # A present-but-null "rows" value is normalised to an empty list.
+    return rows or []
+
+
+def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
+    """Convert one listing row into a ``RawEvent``.
+
+    Args:
+        row: A single entry from the listing's ``"rows"`` array. The keys
+            read are ``std_code``, ``std_name``, ``release_date``,
+            ``implement_date``, ``std_status``, ``std_type`` and ``id``.
+        source_label: Label stored on the event as ``source_label``.
+
+    Returns:
+        A ``RawEvent`` whose ``status`` is ``"consultation"`` when the status
+        text contains "征求意见", ``"draft"`` when it contains "报批" or
+        "草案", and ``"enacted"`` otherwise. ``effective_at`` is ``None``
+        when the row carries no implementation date.
+
+    Fields that are missing, ``None`` (JSON ``null``) or empty fall back to
+    the same defaults, so a sparse row never raises here.
+    """
+    # ``dict.get(key, default)`` only applies the default when the key is
+    # absent; a JSON null arrives as None and must be coalesced explicitly.
+    standard_code = row.get("std_code") or ""
+    title = row.get("std_name") or standard_code
+    published_at = parse_date(row.get("release_date") or "")
+    effective_at_raw = row.get("implement_date") or ""
+    effective_at = parse_date(effective_at_raw) if effective_at_raw else None
+
+    # Coerce to str so the substring checks below cannot raise TypeError on
+    # a null or non-string status value.
+    status_text = str(row.get("std_status") or "")
+    if "征求意见" in status_text:
+        status = "consultation"
+    elif "报批" in status_text or "草案" in status_text:
+        status = "draft"
+    else:
+        status = "enacted"
+
+    # Avoid rendering a null id as the literal text "None" in the URL.
+    row_id = row.get("id")
+    if row_id is None:
+        row_id = ""
+
+    return RawEvent(
+        source="国标委",
+        source_label=source_label,
+        standard_code=standard_code,
+        title=title,
+        summary=title,
+        full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row_id}",
+        status=status,
+        published_at=published_at,
+        effective_at=effective_at,
+        category=row.get("std_type") or "国家标准",
+        tags=extract_tags(standard_code, title),
+        raw_text=f"{standard_code} {title}",
+    )
+
+
+class GuobiaoMandatoryCrawler(BaseCrawler):
+    """Crawler for vehicle-related mandatory (强制性) national standards."""
+
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        """Collect up to ``limit`` events by paging through the listing.
+
+        Pages of 20 rows are requested with ``std_type=1`` starting at page
+        1. Paging stops once ``limit`` events are gathered, when a page comes
+        back empty, or after ``max(10, limit)`` pages.
+        """
+        collected: list[RawEvent] = []
+        page_cap = max(10, limit)
+        for page_no in range(1, page_cap + 1):
+            if len(collected) >= limit:
+                break
+            batch = _fetch_page(std_type=1, page=page_no, page_size=20)
+            if not batch:
+                # Empty page: either the listing is exhausted or the fetch failed.
+                break
+            collected += [_row_to_raw_event(item, "国标委·强制性") for item in batch]
+        # The last page may overshoot, so trim to the requested size.
+        return collected[:limit]
+
+
+class GuobiaoRecommendedCrawler(BaseCrawler):
+    """Crawler for vehicle-related recommended (推荐性) national standards."""
+
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        """Collect up to ``limit`` events by paging through the listing.
+
+        Pages of 20 rows are requested with ``std_type=2`` starting at page
+        1. Paging stops once ``limit`` events are gathered, when a page comes
+        back empty, or after ``max(10, limit)`` pages.
+        """
+        collected: list[RawEvent] = []
+        page_cap = max(10, limit)
+        for page_no in range(1, page_cap + 1):
+            if len(collected) >= limit:
+                break
+            batch = _fetch_page(std_type=2, page=page_no, page_size=20)
+            if not batch:
+                # Empty page: either the listing is exhausted or the fetch failed.
+                break
+            collected += [_row_to_raw_event(item, "国标委·推荐性") for item in batch]
+        # The last page may overshoot, so trim to the requested size.
+        return collected[:limit]