84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
|
|
"""Crawler for CATARC automotive standard catalogue."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from urllib.parse import urljoin
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
from bs4 import BeautifulSoup
|
||
|
|
from loguru import logger
|
||
|
|
|
||
|
|
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||
|
|
from ._utils import extract_tags, parse_date
|
||
|
|
|
||
|
|
# Listing page of the CATARC automotive standard catalogue. The crawler
# requests it with a ``?page=N`` query string appended.
_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"

# Site origin (scheme + host) of the CATARC website.
_HOST = "https://www.catarc.org.cn"

# Status label as printed in the listing table -> internal status value.
# Labels that are not listed here are treated as "enacted" by the crawler.
# NOTE(review): "废止" (abolished) is mapped to "enacted" as well; confirm
# whether the downstream status vocabulary has a dedicated value for
# withdrawn standards before changing it.
_STATUS_MAP: dict[str, str] = {
    "现行": "enacted",  # currently in force
    "即将实施": "enacted",  # published, about to take effect
    "废止": "enacted",  # abolished
    "征求意见": "consultation",  # open for public comments
    "报批": "draft",  # submitted for approval
}
|
||
|
|
|
||
|
|
|
||
|
|
class CatarcCrawler(BaseCrawler):
    """Scrape the CATARC automotive standard list page.

    The listing is an HTML table. Every data row is expected to carry, in
    order: the standard code (optionally wrapped in a link to the detail
    page), the title, the publication date and, when present, a status
    label that is translated through ``_STATUS_MAP``.
    """

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Collect up to ``limit`` standards from the paginated listing.

        Pages are requested one after another (``?page=1``, ``?page=2``, ...)
        until enough events were gathered, the page cap is reached, a request
        fails, or a page contributes no row that has not been seen before.

        Args:
            limit: Maximum number of events to return.

        Returns:
            At most ``limit`` events in listing order. The result may be
            shorter, or empty, when the site is unreachable or the listing
            is exhausted; request failures are logged, not raised.
        """
        events: list[RawEvent] = []
        # Identity of every row collected so far. If the server ignores the
        # ``page`` parameter it keeps returning the same rows; tracking them
        # prevents duplicates and lets the loop stop early.
        seen: set[tuple[str, str, str]] = set()
        page = 1
        # Hard cap on requests so a misbehaving listing cannot loop forever.
        max_pages = max(10, limit)

        while len(events) < limit and page <= max_pages:
            url = f"{_BASE_URL}?page={page}"
            try:
                resp = httpx.get(url, timeout=30, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                # Best-effort crawler: keep what was gathered so far and stop.
                logger.warning("CATARC fetch failed page={} err={}", page, exc)
                break

            # Relative hrefs are relative to the document that was actually
            # served (after redirects), not to the site root.
            page_url = str(resp.url)
            soup = BeautifulSoup(resp.text, "lxml")

            batch: list[RawEvent] = []
            for row in soup.select("table tr"):
                cells = row.find_all("td")
                # Header rows (<th>) and spacer rows have fewer than 3 <td>.
                if len(cells) < 3:
                    continue

                link = cells[0].find("a")
                href = link.get("href") if link else None
                code_source = link if link else cells[0]
                standard_code = code_source.get_text(strip=True)
                title = cells[1].get_text(strip=True)
                published_at = parse_date(cells[2].get_text(strip=True))
                # The status column is optional; unknown labels default to
                # "enacted".
                status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
                status = _STATUS_MAP.get(status_text, "enacted")

                row_key = (standard_code, title, href or "")
                if row_key in seen:
                    continue
                seen.add(row_key)

                # Rows without a link fall back to the listing page itself.
                detail_url = urljoin(page_url, href) if href else url
                batch.append(RawEvent(
                    source="CATARC",
                    source_label="全国汽车标准化技术委员会",
                    standard_code=standard_code,
                    title=title,
                    summary=title,
                    full_text_url=detail_url,
                    status=status,
                    published_at=published_at,
                    effective_at=None,
                    category="汽车标准",
                    tags=extract_tags(standard_code, title),
                    raw_text=f"{standard_code} {title}",
                ))

            # No table, no data rows, or nothing new: the listing is exhausted.
            if not batch:
                break
            events.extend(batch)
            page += 1

        return events[:limit]
|
||
|
|
|
||
|
|
|