Fix some things
This commit is contained in:
@@ -0,0 +1,83 @@
|
||||
"""Crawler for CATARC automotive standard catalogue."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||
from ._utils import extract_tags, parse_date
|
||||
|
||||
_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
|
||||
_HOST = "https://www.catarc.org.cn"
|
||||
|
||||
_STATUS_MAP = {
|
||||
"现行": "enacted",
|
||||
"即将实施": "enacted",
|
||||
"废止": "enacted",
|
||||
"征求意见": "consultation",
|
||||
"报批": "draft",
|
||||
}
|
||||
|
||||
|
||||
class CatarcCrawler(BaseCrawler):
    """Scrape the CATARC automotive standard list page.

    Every table row of the listing that has at least three ``<td>`` cells is
    turned into one ``RawEvent``; the cells are read as standard code, title,
    publication date and (optionally) status, in that order.
    """

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to *limit* standards from the CATARC listing.

        Pages are requested as ``{_BASE_URL}?page=N`` starting at ``N = 1``.
        Crawling stops as soon as one of the following holds:

        * *limit* events have been collected,
        * a request fails (a warning is logged and the events gathered so
          far are still returned),
        * a page contains no table rows,
        * a page contributes no event that has not been seen before,
        * ``max(10, limit)`` pages have been requested.

        Args:
            limit: Maximum number of events to return.

        Returns:
            At most *limit* events in listing order, without duplicates.
        """
        events: list[RawEvent] = []
        # Keys of the rows collected so far. If the server ignores the
        # ``page`` query parameter it serves the same rows again and again;
        # without this set the result would fill up with duplicates and the
        # loop would keep fetching identical pages until ``max_pages``.
        seen: set[tuple[str, str, str]] = set()
        max_pages = max(10, limit)
        page = 1

        while len(events) < limit and page <= max_pages:
            url = f"{_BASE_URL}?page={page}"
            try:
                resp = httpx.get(url, timeout=30, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                # Best effort: any failure ends the crawl but keeps what has
                # already been collected.
                logger.warning("CATARC fetch failed page={} err={}", page, exc)
                break

            rows = BeautifulSoup(resp.text, "lxml").select("table tr")
            if not rows:
                break

            # Links on the page are relative to the document that was
            # actually served (redirects are followed), not to the site root.
            document_url = str(resp.url)

            fresh: list[RawEvent] = []
            for row in rows:
                parsed = self._parse_row(row, document_url, url)
                if parsed is None:
                    continue
                key, event = parsed
                if key in seen:
                    continue
                seen.add(key)
                fresh.append(event)

            # Nothing new on this page: either it held no data rows or it
            # repeats an earlier page, so further requests are pointless.
            if not fresh:
                break
            events.extend(fresh)
            page += 1

        return events[:limit]

    @staticmethod
    def _parse_row(
        row,
        document_url: str,
        fallback_url: str,
    ) -> tuple[tuple[str, str, str], RawEvent] | None:
        """Convert one ``<tr>`` into ``(dedup_key, event)``.

        Args:
            row: The table row element to parse.
            document_url: URL of the page the row came from; relative links
                are resolved against it.
            fallback_url: URL stored as ``full_text_url`` when the row has
                no usable link.

        Returns:
            ``None`` for rows with fewer than three ``<td>`` cells (for
            example header rows), otherwise the de-duplication key and the
            event built from the row.
        """
        cells = row.find_all("td")
        if len(cells) < 3:
            return None

        link = cells[0].find("a")
        code_node = link if link else cells[0]
        standard_code = code_node.get_text(strip=True)
        title = cells[1].get_text(strip=True)
        published_at = parse_date(cells[2].get_text(strip=True))

        # The status column is optional; unknown or missing labels are
        # treated as "enacted".
        status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
        status = _STATUS_MAP.get(status_text, "enacted")

        href = link.get("href") if link else None
        detail_url = urljoin(document_url, href) if href else fallback_url

        event = RawEvent(
            source="CATARC",
            source_label="全国汽车标准化技术委员会",
            standard_code=standard_code,
            title=title,
            summary=title,
            full_text_url=detail_url,
            status=status,
            published_at=published_at,
            effective_at=None,
            category="汽车标准",
            tags=extract_tags(standard_code, title),
            raw_text=f"{standard_code} {title}",
        )
        # The raw href (not the resolved URL) is part of the key so that
        # link-less rows, whose fallback URL contains the page number, are
        # still recognised as duplicates across pages.
        return (standard_code, title, href or ""), event
|
||||
|
||||
|
||||
Reference in New Issue
Block a user