AIRegulation-DocAnalysis/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py

"""Crawlers for the 国标委 (SAMR) standard information platform."""

from __future__ import annotations

import httpx
from loguru import logger

from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from ._utils import extract_tags, parse_date

_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}


def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
    """Request one page of the standards listing and return its rows.

    Issues a GET against ``_BASE_URL`` with a fixed search keyword and
    sort order, then reads the ``rows`` entry of the JSON response body.

    Args:
        std_type: Value sent as ``p.p1`` (callers in this module pass 1
            for the mandatory crawler and 2 for the recommended one).
        page: Value sent as ``p.p6``; callers start counting at 1.
        page_size: Value sent as ``p.p7``.

    Returns:
        The ``rows`` value from the response, or an empty list when it is
        missing or falsy, or when the request / decoding fails for any
        reason.
    """
    # NOTE(review): p.p90 / p.p91 look like "sort field" / "sort direction"
    # judging by their values; confirm against the remote API.
    query = {
        "p.p1": std_type,
        "p.p2": "车",
        "p.p90": "circulation_date",
        "p.p91": "desc",
        "p.p6": page,
        "p.p7": page_size,
    }
    try:
        response = httpx.get(_BASE_URL, params=query, headers=_HEADERS, timeout=30)
        response.raise_for_status()
        rows = response.json().get("rows", [])
    except Exception as exc:
        # Best effort: a failed page is logged and reported as "no rows"
        # so the caller simply stops paginating.
        logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
        return []
    return rows or []


def _row_value(row: dict, key: str, default: str = "") -> str:
    """Return ``row[key]``, using *default* when the key is absent or null.

    ``dict.get(key, default)`` only falls back when the key is missing; a
    JSON ``null`` decodes to ``None`` and would be returned as-is. This
    helper treats both cases the same. Non-null values are returned
    unchanged (no string coercion is performed).
    """
    value = row.get(key)
    return default if value is None else value


def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
    """Convert one listing row into a ``RawEvent``.

    Fields that are missing *or* null in ``row`` fall back to the same
    defaults, so a null ``std_status`` no longer raises ``TypeError`` in the
    status checks and a null ``std_name`` / ``id`` no longer leaks the text
    ``"None"`` into the title, raw text or detail URL.

    Args:
        row: A single entry of the ``rows`` list returned by the listing
            endpoint. Keys read: ``std_code``, ``std_name``,
            ``release_date``, ``implement_date``, ``std_status``, ``id``,
            ``std_type``.
        source_label: Label stored verbatim on the event.

    Returns:
        A ``RawEvent`` whose ``status`` is ``"consultation"``, ``"draft"``
        or ``"enacted"`` depending on the row's status text.
    """
    standard_code = _row_value(row, "std_code")
    # The title falls back to the standard code when no name is provided.
    title = _row_value(row, "std_name", standard_code)
    published_at = parse_date(_row_value(row, "release_date"))
    effective_at_raw = _row_value(row, "implement_date")
    # An empty implementation date means "not known"; it is not parsed.
    effective_at = parse_date(effective_at_raw) if effective_at_raw else None

    status_text = _row_value(row, "std_status")
    if "征求意见" in status_text:
        # Status mentions "soliciting comments".
        status = "consultation"
    elif "报批" in status_text or "草案" in status_text:
        # Status mentions "submitted for approval" or "draft".
        status = "draft"
    else:
        # Anything else, including an empty status, counts as enacted.
        status = "enacted"

    return RawEvent(
        source="国标委",
        source_label=source_label,
        standard_code=standard_code,
        title=title,
        summary=title,
        full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={_row_value(row, 'id')}",
        status=status,
        published_at=published_at,
        effective_at=effective_at,
        category=_row_value(row, "std_type", "国家标准"),
        tags=extract_tags(standard_code, title),
        raw_text=f"{standard_code} {title}",
    )


class GuobiaoMandatoryCrawler(BaseCrawler):
    """Crawler for vehicle-related mandatory national standards."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Collect up to *limit* events by paging through the listing.

        Pages of 20 rows are requested with ``std_type=1`` starting at
        page 1. Paging stops once enough events were gathered, the page
        cap is exceeded, or a page comes back empty (which also happens
        when the underlying request fails).

        Args:
            limit: Maximum number of events to return.

        Returns:
            At most *limit* events, in the order the rows were received.
        """
        collected: list[RawEvent] = []
        page_cap = max(10, limit)
        page_no = 1
        while True:
            if len(collected) >= limit or page_no > page_cap:
                break
            batch = _fetch_page(std_type=1, page=page_no, page_size=20)
            if not batch:
                break
            collected += [_row_to_raw_event(item, "国标委·强制性") for item in batch]
            page_no += 1
        # The last page may overshoot the requested amount; trim it.
        return collected[:limit]


class GuobiaoRecommendedCrawler(BaseCrawler):
    """Crawler for vehicle-related recommended national standards."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Collect up to *limit* events by paging through the listing.

        Pages of 20 rows are requested with ``std_type=2`` starting at
        page 1. Paging stops once enough events were gathered, the page
        cap is exceeded, or a page comes back empty (which also happens
        when the underlying request fails).

        Args:
            limit: Maximum number of events to return.

        Returns:
            At most *limit* events, in the order the rows were received.
        """
        collected: list[RawEvent] = []
        page_cap = max(10, limit)
        page_no = 1
        while True:
            if len(collected) >= limit or page_no > page_cap:
                break
            batch = _fetch_page(std_type=2, page=page_no, page_size=20)
            if not batch:
                break
            collected += [_row_to_raw_event(item, "国标委·推荐性") for item in batch]
            page_no += 1
        # The last page may overshoot the requested amount; trim it.
        return collected[:limit]
fix somethings 2026-06-08 11:16:28 +08:00			`"""Crawlers for the 国标委 (SAMR) standard information platform."""`

			`from __future__ import annotations`

			`import httpx`
			`from loguru import logger`

			`from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent`
			`from ._utils import extract_tags, parse_date`

			`_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"`
			`_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}`


			`def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:`
			`params = {`
			`"p.p1": std_type,`
			`"p.p2": "车",`
			`"p.p90": "circulation_date",`
			`"p.p91": "desc",`
			`"p.p6": page,`
			`"p.p7": page_size,`
			`}`
			`try:`
			`resp = httpx.get(_BASE_URL, params=params, headers=_HEADERS, timeout=30)`
			`resp.raise_for_status()`
			`data = resp.json()`
			`return data.get("rows", []) or []`
			`except Exception as exc:`
			`logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)`
			`return []`


			`def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:`
			`standard_code = row.get("std_code", "")`
			`title = row.get("std_name", standard_code)`
			`published_at = parse_date(row.get("release_date", ""))`
			`effective_at_raw = row.get("implement_date", "")`
			`effective_at = parse_date(effective_at_raw) if effective_at_raw else None`
			`status_text = row.get("std_status", "")`
			`if "征求意见" in status_text:`
			`status = "consultation"`
			`elif "报批" in status_text or "草案" in status_text:`
			`status = "draft"`
			`else:`
			`status = "enacted"`
			`return RawEvent(`
			`source="国标委",`
			`source_label=source_label,`
			`standard_code=standard_code,`
			`title=title,`
			`summary=title,`
			`full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row.get('id', '')}",`
			`status=status,`
			`published_at=published_at,`
			`effective_at=effective_at,`
			`category=row.get("std_type", "国家标准"),`
			`tags=extract_tags(standard_code, title),`
			`raw_text=f"{standard_code} {title}",`
			`)`


			`class GuobiaoMandatoryCrawler(BaseCrawler):`
			`"""Fetch mandatory national standards (强制性) related to vehicles."""`

			`def fetch(self, limit: int = 50) -> list[RawEvent]:`
			`events: list[RawEvent] = []`
			`page = 1`
			`max_pages = max(10, limit)`
			`while len(events) < limit and page <= max_pages:`
			`rows = _fetch_page(std_type=1, page=page, page_size=20)`
			`if not rows:`
			`break`
			`events.extend(_row_to_raw_event(r, "国标委·强制性") for r in rows)`
			`page += 1`
			`return events[:limit]`


			`class GuobiaoRecommendedCrawler(BaseCrawler):`
			`"""Fetch recommended national standards (推荐性) related to vehicles."""`

			`def fetch(self, limit: int = 50) -> list[RawEvent]:`
			`events: list[RawEvent] = []`
			`page = 1`
			`max_pages = max(10, limit)`
			`while len(events) < limit and page <= max_pages:`
			`rows = _fetch_page(std_type=2, page=page, page_size=20)`
			`if not rows:`
			`break`
			`events.extend(_row_to_raw_event(r, "国标委·推荐性") for r in rows)`
			`page += 1`
			`return events[:limit]`