"""Crawlers for the 国标委 (SAMR) standard information platform.""" from __future__ import annotations import httpx from loguru import logger from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent from ._utils import extract_tags, parse_date _BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type" _HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"} def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]: params = { "p.p1": std_type, "p.p2": "车", "p.p90": "circulation_date", "p.p91": "desc", "p.p6": page, "p.p7": page_size, } try: resp = httpx.get(_BASE_URL, params=params, headers=_HEADERS, timeout=30) resp.raise_for_status() data = resp.json() return data.get("rows", []) or [] except Exception as exc: logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc) return [] def _row_to_raw_event(row: dict, source_label: str) -> RawEvent: standard_code = row.get("std_code", "") title = row.get("std_name", standard_code) published_at = parse_date(row.get("release_date", "")) effective_at_raw = row.get("implement_date", "") effective_at = parse_date(effective_at_raw) if effective_at_raw else None status_text = row.get("std_status", "") if "征求意见" in status_text: status = "consultation" elif "报批" in status_text or "草案" in status_text: status = "draft" else: status = "enacted" return RawEvent( source="国标委", source_label=source_label, standard_code=standard_code, title=title, summary=title, full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row.get('id', '')}", status=status, published_at=published_at, effective_at=effective_at, category=row.get("std_type", "国家标准"), tags=extract_tags(standard_code, title), raw_text=f"{standard_code} {title}", ) class GuobiaoMandatoryCrawler(BaseCrawler): """Fetch mandatory national standards (强制性) related to vehicles.""" def fetch(self, limit: int = 50) -> list[RawEvent]: events: list[RawEvent] = [] page = 1 max_pages = max(10, limit) while len(events) < limit and page <= max_pages: rows = _fetch_page(std_type=1, page=page, page_size=20) if not rows: break events.extend(_row_to_raw_event(r, "国标委·强制性") for r in rows) page += 1 return events[:limit] class GuobiaoRecommendedCrawler(BaseCrawler): """Fetch recommended national standards (推荐性) related to vehicles.""" def fetch(self, limit: int = 50) -> list[RawEvent]: events: list[RawEvent] = [] page = 1 max_pages = max(10, limit) while len(events) < limit and page <= max_pages: rows = _fetch_page(std_type=2, page=page, page_size=20) if not rows: break events.extend(_row_to_raw_event(r, "国标委·推荐性") for r in rows) page += 1 return events[:limit]