AIRegulation-DocAnalysis/backend/app/infrastructure/perception/crawlers/catarc_crawler.py

"""Crawler for CATARC automotive standard catalogue."""

from __future__ import annotations

from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup
from loguru import logger

from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from ._utils import extract_tags, parse_date

# Listing page of the standard catalogue; the crawler appends ``?page=N`` to it.
_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
# Site root used to resolve the detail links found in the listing rows.
_HOST = "https://www.catarc.org.cn"

# Translates the status text read from a listing row into the status string
# stored on the event. Text that is not listed here falls back to "enacted"
# at the lookup site.
# NOTE(review): "废止" (abolished) is reported as "enacted" as well — confirm
# that this is intended.
_STATUS_MAP = {
    "现行": "enacted",  # in force
    "即将实施": "enacted",  # about to take effect
    "废止": "enacted",  # abolished
    "征求意见": "consultation",  # open for comments
    "报批": "draft",  # submitted for approval
}


class CatarcCrawler(BaseCrawler):
    """Scrape the CATARC automotive standard list page.

    The listing is requested page by page (``_BASE_URL`` plus ``?page=N``)
    and every table row with at least three ``<td>`` cells is turned into
    one ``RawEvent``.
    """

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return up to *limit* events scraped from the listing pages.

        Crawling is best-effort and stops quietly at the first failed
        request, at the first page that contributes no new row, or after
        ``max(10, limit)`` pages.

        Args:
            limit: Maximum number of events to return.

        Returns:
            The collected events in page order, at most *limit* of them.
            Rows with the same code, title and date text are returned once.
        """
        events: list[RawEvent] = []
        # Rows already emitted. The ``?page=N`` query is assumed to select a
        # page; should the server ignore it and serve the same listing again,
        # this keeps the result free of duplicates and ends the loop instead
        # of re-fetching the same page until ``limit`` is reached.
        seen: set[tuple[str, str, str]] = set()
        max_pages = max(10, limit)
        page = 1
        while len(events) < limit and page <= max_pages:
            url = f"{_BASE_URL}?page={page}"
            try:
                resp = httpx.get(url, timeout=30, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                # Deliberately broad: a failed page ends the crawl but the
                # events gathered so far are still returned.
                logger.warning("CATARC fetch failed page={} err={}", page, exc)
                break

            soup = BeautifulSoup(resp.text, "lxml")
            fresh: list[RawEvent] = []
            for row in soup.select("table tr"):
                parsed = self._row_to_event(row, url)
                if parsed is None:
                    continue
                key, event = parsed
                if key in seen:
                    continue
                seen.add(key)
                fresh.append(event)

            # No table, no data rows, or nothing but repeats: no progress.
            if not fresh:
                break
            events.extend(fresh)
            page += 1

        return events[:limit]

    @staticmethod
    def _row_to_event(row, page_url: str) -> tuple[tuple[str, str, str], RawEvent] | None:
        """Convert one ``<tr>`` element into ``(dedup_key, RawEvent)``.

        Args:
            row: A ``<tr>`` element from the parsed listing page.
            page_url: URL of the page the row came from; used as the
                event link when the row carries no link of its own.

        Returns:
            ``None`` for rows with fewer than three ``<td>`` cells (for
            example header rows); otherwise the de-duplication key
            ``(standard_code, title, date_text)`` and the event.
        """
        cells = row.find_all("td")
        if len(cells) < 3:
            return None

        link = cells[0].find("a")
        code_source = link if link else cells[0]
        standard_code = code_source.get_text(strip=True)
        title = cells[1].get_text(strip=True)
        date_text = cells[2].get_text(strip=True)
        # The status column is optional; unknown or missing text is
        # reported as "enacted".
        status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
        if link and link.get("href"):
            detail_url = urljoin(_HOST, link["href"])
        else:
            detail_url = page_url

        event = RawEvent(
            source="CATARC",
            source_label="全国汽车标准化技术委员会",
            standard_code=standard_code,
            title=title,
            summary=title,
            full_text_url=detail_url,
            status=_STATUS_MAP.get(status_text, "enacted"),
            published_at=parse_date(date_text),
            effective_at=None,
            category="汽车标准",
            tags=extract_tags(standard_code, title),
            raw_text=f"{standard_code} {title}",
        )
        return (standard_code, title, date_text), event
fix somethings 2026-06-08 11:16:28 +08:00			`"""Crawler for CATARC automotive standard catalogue."""`

			`from __future__ import annotations`

			`from urllib.parse import urljoin`

			`import httpx`
			`from bs4 import BeautifulSoup`
			`from loguru import logger`

			`from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent`
			`from ._utils import extract_tags, parse_date`

			`_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"`
			`_HOST = "https://www.catarc.org.cn"`

			`_STATUS_MAP = {`
			`"现行": "enacted",`
			`"即将实施": "enacted",`
			`"废止": "enacted",`
			`"征求意见": "consultation",`
			`"报批": "draft",`
			`}`


			`class CatarcCrawler(BaseCrawler):`
			`"""Scrape the CATARC automotive standard list page."""`

			`def fetch(self, limit: int = 50) -> list[RawEvent]:`
			`events: list[RawEvent] = []`
			`page = 1`
			`max_pages = max(10, limit)`
			`while len(events) < limit and page <= max_pages:`
			`url = f"{_BASE_URL}?page={page}"`
			`try:`
			`resp = httpx.get(url, timeout=30, follow_redirects=True)`
			`resp.raise_for_status()`
			`except Exception as exc:`
			`logger.warning("CATARC fetch failed page={} err={}", page, exc)`
			`break`

			`soup = BeautifulSoup(resp.text, "lxml")`
			`rows = soup.select("table tr")`
			`if not rows:`
			`break`

			`batch: list[RawEvent] = []`
			`for row in rows:`
			`cells = row.find_all("td")`
			`if len(cells) < 3:`
			`continue`
			`link = cells[0].find("a")`
			`standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True)`
			`title = cells[1].get_text(strip=True) if len(cells) > 1 else standard_code`
			`date_text = cells[2].get_text(strip=True) if len(cells) > 2 else ""`
			`published_at = parse_date(date_text)`
			`status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""`
			`status = _STATUS_MAP.get(status_text, "enacted")`
			`detail_url = urljoin(_HOST, link["href"]) if link and link.get("href") else url`
			`raw_text = f"{standard_code} {title}"`
			`batch.append(RawEvent(`
			`source="CATARC",`
			`source_label="全国汽车标准化技术委员会",`
			`standard_code=standard_code,`
			`title=title,`
			`summary=title,`
			`full_text_url=detail_url,`
			`status=status,`
			`published_at=published_at,`
			`effective_at=None,`
			`category="汽车标准",`
			`tags=extract_tags(standard_code, title),`
			`raw_text=raw_text,`
			`))`

			`if not batch:`
			`break`
			`events.extend(batch)`
			`page += 1`

			`return events[:limit]`