93 lines
3.2 KiB
Python
93 lines
3.2 KiB
Python
|
|
"""Crawlers for the 国标委 (SAMR) standard information platform."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
from loguru import logger
|
||
|
|
|
||
|
|
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||
|
|
from ._utils import extract_tags, parse_date
|
||
|
|
|
||
|
|
# Listing endpoint requested by _fetch_page; the query parameters are added there.
_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
|
||
|
|
# HTTP headers sent with every listing request made by _fetch_page.
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
|
||
|
|
|
||
|
|
|
||
|
|
def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
    """Request one page of the standards listing and return its rows.

    Args:
        std_type: Sent as the ``p.p1`` query parameter. The crawlers in this
            module pass 1 for mandatory and 2 for recommended standards.
        page: Sent as the ``p.p6`` query parameter.
        page_size: Sent as the ``p.p7`` query parameter.

    Returns:
        The value stored under ``rows`` in the JSON response, or an empty
        list when that value is missing or falsy, or when anything in the
        request/decoding path raises (the failure is logged as a warning).
    """
    # NOTE(review): "p.p2" looks like a keyword filter and "p.p90"/"p.p91"
    # like sort field/direction — confirm against the platform's API.
    query = {
        "p.p1": std_type,
        "p.p2": "车",
        "p.p90": "circulation_date",
        "p.p91": "desc",
        "p.p6": page,
        "p.p7": page_size,
    }
    try:
        response = httpx.get(_BASE_URL, params=query, headers=_HEADERS, timeout=30)
        response.raise_for_status()
        payload = response.json()
        rows = payload.get("rows") or []
    except Exception as exc:
        # Best effort: a failed page is reported and treated as "no rows".
        logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
        rows = []
    return rows
|
||
|
|
|
||
|
|
|
||
|
|
def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
    """Convert one listing row into a ``RawEvent``.

    JSON ``null`` values are treated the same as missing keys. ``dict.get``
    only applies its default when the key is absent, so a null field would
    otherwise reach the substring checks below as ``None`` and raise
    ``TypeError``.

    Args:
        row: One element of the ``rows`` list returned by ``_fetch_page``.
        source_label: Label stored on the event as ``source_label``.

    Returns:
        A ``RawEvent`` whose ``status`` is ``"consultation"``, ``"draft"`` or
        ``"enacted"`` depending on the row's ``std_status`` text.
    """
    standard_code = row.get("std_code") or ""
    # Fall back to the code when the name is missing, null or empty.
    title = row.get("std_name") or standard_code
    published_at = parse_date(row.get("release_date") or "")
    effective_at_raw = row.get("implement_date")
    effective_at = parse_date(effective_at_raw) if effective_at_raw else None

    # str() keeps the substring checks safe even for a non-string status.
    status_text = str(row.get("std_status") or "")
    if "征求意见" in status_text:
        status = "consultation"
    elif "报批" in status_text or "草案" in status_text:
        status = "draft"
    else:
        status = "enacted"

    # Avoid rendering a null id as the literal text "None" in the URL.
    row_id = row.get("id")
    if row_id is None:
        row_id = ""

    return RawEvent(
        source="国标委",
        source_label=source_label,
        standard_code=standard_code,
        title=title,
        summary=title,
        full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row_id}",
        status=status,
        published_at=published_at,
        effective_at=effective_at,
        category=row.get("std_type") or "国家标准",
        tags=extract_tags(standard_code, title),
        raw_text=f"{standard_code} {title}",
    )
|
||
|
|
|
||
|
|
|
||
|
|
class GuobiaoMandatoryCrawler(BaseCrawler):
    """Crawler for vehicle-related mandatory (强制性) national standards."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Collect up to ``limit`` events by paging through the listing.

        Pages of 20 rows are requested with ``std_type=1`` until enough
        events are gathered, a page comes back empty, or ``max(10, limit)``
        pages have been requested.
        """
        collected: list[RawEvent] = []
        page_cap = max(10, limit)
        page_no = 1
        while True:
            if len(collected) >= limit or page_no > page_cap:
                break
            batch = _fetch_page(std_type=1, page=page_no, page_size=20)
            if not batch:
                # An empty page means no more data, or a failed request.
                break
            collected += [_row_to_raw_event(item, "国标委·强制性") for item in batch]
            page_no += 1
        return collected[:limit]
|
||
|
|
|
||
|
|
|
||
|
|
class GuobiaoRecommendedCrawler(BaseCrawler):
    """Crawler for vehicle-related recommended (推荐性) national standards."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Collect up to ``limit`` events by paging through the listing.

        Pages of 20 rows are requested with ``std_type=2`` until enough
        events are gathered, a page comes back empty, or ``max(10, limit)``
        pages have been requested.
        """
        collected: list[RawEvent] = []
        page_cap = max(10, limit)
        page_no = 1
        while True:
            if len(collected) >= limit or page_no > page_cap:
                break
            batch = _fetch_page(std_type=2, page=page_no, page_size=20)
            if not batch:
                # An empty page means no more data, or a failed request.
                break
            collected += [_row_to_raw_event(item, "国标委·推荐性") for item in batch]
            page_no += 1
        return collected[:limit]
|