"""Crawler for CATARC automotive standard catalogue.""" from __future__ import annotations from urllib.parse import urljoin import httpx from bs4 import BeautifulSoup from loguru import logger from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent from ._utils import extract_tags, parse_date _BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html" _HOST = "https://www.catarc.org.cn" _STATUS_MAP = { "现行": "enacted", "即将实施": "enacted", "废止": "enacted", "征求意见": "consultation", "报批": "draft", } class CatarcCrawler(BaseCrawler): """Scrape the CATARC automotive standard list page.""" def fetch(self, limit: int = 50) -> list[RawEvent]: events: list[RawEvent] = [] page = 1 max_pages = max(10, limit) while len(events) < limit and page <= max_pages: url = f"{_BASE_URL}?page={page}" try: resp = httpx.get(url, timeout=30, follow_redirects=True) resp.raise_for_status() except Exception as exc: logger.warning("CATARC fetch failed page={} err={}", page, exc) break soup = BeautifulSoup(resp.text, "lxml") rows = soup.select("table tr") if not rows: break batch: list[RawEvent] = [] for row in rows: cells = row.find_all("td") if len(cells) < 3: continue link = cells[0].find("a") standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True) title = cells[1].get_text(strip=True) if len(cells) > 1 else standard_code date_text = cells[2].get_text(strip=True) if len(cells) > 2 else "" published_at = parse_date(date_text) status_text = cells[3].get_text(strip=True) if len(cells) > 3 else "" status = _STATUS_MAP.get(status_text, "enacted") detail_url = urljoin(_HOST, link["href"]) if link and link.get("href") else url raw_text = f"{standard_code} {title}" batch.append(RawEvent( source="CATARC", source_label="全国汽车标准化技术委员会", standard_code=standard_code, title=title, summary=title, full_text_url=detail_url, status=status, published_at=published_at, effective_at=None, category="汽车标准", tags=extract_tags(standard_code, title), raw_text=raw_text, )) if not batch: break events.extend(batch) page += 1 return events[:limit]