Fix some things
This commit is contained in:
43
backend/app/infrastructure/perception/crawlers/_utils.py
Normal file
43
backend/app/infrastructure/perception/crawlers/_utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Shared utility functions for crawlers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import date
|
||||
|
||||
|
||||
def parse_date(text: str) -> str:
    """Return a ``YYYY-MM-DD`` date found in *text*, or today's date.

    The date may appear anywhere inside *text*. Recognised shapes are
    ``YYYY-MM-DD``, ``YYYY/MM/DD`` and ``YYYY.MM.DD`` (month and day may be
    one or two digits) plus the Chinese ``YYYY年M月D日`` form, where the
    trailing ``日`` is optional. Patterns are tried in that order; a match
    that is not a real calendar date (e.g. month 13) is skipped.

    Args:
        text: Free-form text that may contain a date. ``None`` is tolerated
            and treated like an empty string.

    Returns:
        The ISO-formatted date, or ``date.today().isoformat()`` when nothing
        parseable is found.
    """
    today = date.today().isoformat()
    if text is None:
        # Upstream payloads can carry null date fields; fall back like "".
        return today
    text = str(text).strip()
    if not text:
        return today

    patterns = (
        r"(\d{4})[/.-](\d{1,2})[/.-](\d{1,2})",
        r"(\d{4})年(\d{1,2})月(\d{1,2})日?",
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match is None:
            continue
        year, month, day = (int(part) for part in match.groups())
        try:
            return date(year, month, day).isoformat()
        except ValueError:
            # Looks like a date but is not a valid one; try the next pattern.
            continue
    return today
|
||||
|
||||
|
||||
def extract_tags(standard_code: str, title: str) -> list[str]:
    """Derive simple keyword tags from a standard code and its title.

    Codes containing ``GB`` (case-insensitive) are tagged ``国家标准`` plus
    one nature tag: ``推荐性`` for ``/T`` codes, ``指导性`` for ``/Z`` codes,
    and ``强制性`` otherwise. Each topic keyword found in *title* is then
    appended in a fixed order.

    Args:
        standard_code: Standard identifier such as ``"GB/T 18384-2020"``.
            ``None`` is treated as an empty string.
        title: Standard title searched for topic keywords. ``None`` is
            treated as an empty string.

    Returns:
        At most five tags, code-derived tags first.
    """
    code_upper = (standard_code or "").upper()
    title = title or ""

    tags: list[str] = []
    if "GB" in code_upper:
        tags.append("国家标准")
        if "/T" in code_upper:
            tags.append("推荐性")
        elif "/Z" in code_upper:
            # GB/Z denotes a guidance technical document, not a mandatory standard.
            tags.append("指导性")
        else:
            tags.append("强制性")

    keywords = ("电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全")
    tags.extend(kw for kw in keywords if kw in title)
    return tags[:5]
|
||||
32
backend/app/infrastructure/perception/crawlers/base.py
Normal file
32
backend/app/infrastructure/perception/crawlers/base.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Shared contracts for regulatory source crawlers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class RawEvent:
    """One regulatory event as produced by a crawler, prior to enrichment.

    Field order is part of the constructor interface; the last two fields
    are optional and default to an empty list / empty string.
    """

    # Where the event came from: short source key and its display label.
    source: str
    source_label: str

    # What the event is about.
    standard_code: str
    title: str
    summary: str
    full_text_url: str

    # Lifecycle information.
    status: str  # one of 'enacted', 'draft', 'consultation'
    published_at: str  # date string in YYYY-MM-DD form
    effective_at: str | None

    # Classification.
    category: str
    tags: list[str] = field(default_factory=list)

    # Complete crawled text, kept for hashing and for the LLM.
    raw_text: str = ""
|
||||
|
||||
|
||||
class BaseCrawler(ABC):
    """Interface that every regulatory source crawler implements."""

    @abstractmethod
    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to `limit` recent events from the data source.

        Args:
            limit: Maximum number of events to return.

        Returns:
            The fetched events as ``RawEvent`` objects.
        """
|
||||
@@ -0,0 +1,83 @@
|
||||
"""Crawler for CATARC automotive standard catalogue."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||
from ._utils import extract_tags, parse_date
|
||||
|
||||
# Standard list page; the crawler appends "?page=N" to this URL.
_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
# Site origin used as the base when turning row hrefs into absolute URLs.
_HOST = "https://www.catarc.org.cn"

# Maps the status label shown in the list table to a RawEvent.status value.
# Labels not listed here are treated as "enacted" by the crawler.
_STATUS_MAP = {
    "现行": "enacted",  # currently in force
    "即将实施": "enacted",  # about to take effect
    # NOTE(review): abolished standards are also reported as "enacted",
    # presumably because RawEvent.status has no dedicated value — confirm.
    "废止": "enacted",  # abolished
    "征求意见": "consultation",  # soliciting comments
    "报批": "draft",  # submitted for approval
}
|
||||
|
||||
|
||||
class CatarcCrawler(BaseCrawler):
    """Scrape the CATARC automotive standard list page."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Collect up to `limit` events by paging through the list table.

        Pages are requested as ``{_BASE_URL}?page=N`` starting at 1. Crawling
        stops when enough events were gathered, a request fails, a page
        yields no rows that were not already seen, or ``max(10, limit)``
        pages were visited.

        Args:
            limit: Maximum number of events to return.

        Returns:
            Parsed events in page order, without repeated
            ``(standard_code, title)`` pairs.
        """
        events: list[RawEvent] = []
        # Guards against a server that ignores "?page=" and keeps returning
        # the same rows: repeated rows are dropped and end the crawl.
        seen: set[tuple[str, str]] = set()
        max_pages = max(10, limit)
        page = 1

        # One client for the whole crawl so connections are reused.
        with httpx.Client(timeout=30, follow_redirects=True) as client:
            while len(events) < limit and page <= max_pages:
                url = f"{_BASE_URL}?page={page}"
                try:
                    resp = client.get(url)
                    resp.raise_for_status()
                except httpx.HTTPError as exc:
                    logger.warning("CATARC fetch failed page={} err={}", page, exc)
                    break

                soup = BeautifulSoup(resp.text, "lxml")
                fresh: list[RawEvent] = []
                for row in soup.select("table tr"):
                    # Resolve links against the final (post-redirect) page URL.
                    event = self._parse_row(row, str(resp.url), url)
                    if event is None:
                        continue
                    key = (event.standard_code, event.title)
                    if key in seen:
                        continue
                    seen.add(key)
                    fresh.append(event)

                if not fresh:
                    break
                events.extend(fresh)
                page += 1

        return events[:limit]

    @staticmethod
    def _parse_row(row, base_url: str, fallback_url: str) -> RawEvent | None:
        """Convert one table row into a RawEvent, or None for non-data rows.

        Cells are read positionally: code (optionally wrapped in a link),
        title, date, and an optional fourth status cell. Rows with fewer
        than three ``<td>`` cells are skipped.
        """
        cells = row.find_all("td")
        if len(cells) < 3:
            return None

        link = cells[0].find("a")
        code_holder = link if link is not None else cells[0]
        standard_code = code_holder.get_text(strip=True)
        title = cells[1].get_text(strip=True)
        published_at = parse_date(cells[2].get_text(strip=True))
        status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""

        href = link.get("href") if link is not None else None
        # Relative hrefs must be resolved against the page they appear on,
        # not the host root; rows without a link point back at the list page.
        detail_url = urljoin(base_url, href) if href else fallback_url

        return RawEvent(
            source="CATARC",
            source_label="全国汽车标准化技术委员会",
            standard_code=standard_code,
            title=title,
            summary=title,
            full_text_url=detail_url,
            status=_STATUS_MAP.get(status_text, "enacted"),
            published_at=published_at,
            effective_at=None,
            category="汽车标准",
            tags=extract_tags(standard_code, title),
            raw_text=f"{standard_code} {title}",
        )
|
||||
|
||||
|
||||
117
backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
Normal file
117
backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||
from ._utils import parse_date
|
||||
|
||||
# RSS feeds polled by EurlexCrawler, in order.
_EURLEX_RSS_URLS = [
    "https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
]

# An item is kept only if its title or description contains one of these
# keywords; matching is by case-insensitive substring.
_AUTOMOTIVE_KEYWORDS = [
    "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
    "AI Act", "artificial intelligence", "cybersecurity", "software update",
    "R155", "R156", "汽车", "车辆",
]

# Lower-cased once at import time so each relevance check only lowers the text.
_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]
|
||||
|
||||
|
||||
def _is_automotive_relevant(title: str, description: str) -> bool:
    """Tell whether a feed item mentions any automotive keyword.

    The title and description are joined with a space, lower-cased, and
    searched for each entry of ``_AUTOMOTIVE_KEYWORDS_LOWER`` as a substring.
    """
    haystack = " ".join((title, description)).lower()
    for keyword in _AUTOMOTIVE_KEYWORDS_LOWER:
        if keyword in haystack:
            return True
    return False
|
||||
|
||||
|
||||
def _extract_celex(url: str) -> str:
|
||||
m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
|
||||
return m.group(1) if m else ""
|
||||
|
||||
|
||||
def _parse_rss_date(rfc2822: str) -> str:
|
||||
try:
|
||||
dt = parsedate_to_datetime(rfc2822)
|
||||
return dt.date().isoformat()
|
||||
except Exception:
|
||||
return parse_date(rfc2822)
|
||||
|
||||
|
||||
class EurlexCrawler(BaseCrawler):
    """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Collect up to `limit` relevant items from the configured feeds.

        Each URL in ``_EURLEX_RSS_URLS`` is downloaded and parsed as XML.
        A feed that cannot be fetched is logged and skipped. Items that do
        not pass ``_is_automotive_relevant`` are ignored.

        Args:
            limit: Maximum number of events to return.

        Returns:
            Events in feed order.
        """
        events: list[RawEvent] = []
        # One client for all feeds so connections are reused.
        with httpx.Client(timeout=30, follow_redirects=True) as client:
            for rss_url in _EURLEX_RSS_URLS:
                if len(events) >= limit:
                    break
                try:
                    resp = client.get(rss_url)
                    resp.raise_for_status()
                except httpx.HTTPError as exc:
                    logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc)
                    continue

                soup = BeautifulSoup(resp.content, "lxml-xml")
                for item in soup.find_all("item"):
                    if len(events) >= limit:
                        break
                    event = self._item_to_event(item)
                    if event is not None:
                        events.append(event)

        return events[:limit]

    @staticmethod
    def _child_text(item, name: str) -> str:
        """Return the stripped text of the first `name` child, or ``""``."""
        tag = item.find(name)
        return tag.get_text(strip=True) if tag is not None else ""

    @classmethod
    def _item_to_event(cls, item) -> RawEvent | None:
        """Build a RawEvent from one ``<item>``, or None if it is irrelevant."""
        title = cls._child_text(item, "title")
        description = cls._child_text(item, "description")
        if not _is_automotive_relevant(title, description):
            return None

        link = cls._child_text(item, "link")
        # Prefer the CELEX id; fall back to a title prefix when the link has none.
        standard_code = _extract_celex(link) or title[:60]
        # Always go through the parser: a missing pubDate then falls back to
        # today's date instead of "", keeping published_at a YYYY-MM-DD
        # string like every other crawler produces.
        published_at = _parse_rss_date(cls._child_text(item, "pubDate"))

        return RawEvent(
            source="EUR-Lex",
            source_label="欧盟官方公报",
            standard_code=standard_code,
            title=title,
            summary=description[:500],
            full_text_url=link,
            status="enacted",
            published_at=published_at,
            effective_at=None,
            category="EU法规",
            tags=_extract_eurlex_tags(title, description),
            raw_text=f"{title}\n{description}",
        )
|
||||
|
||||
|
||||
def _extract_eurlex_tags(title: str, description: str) -> list[str]:
|
||||
combined = title + " " + description
|
||||
tag_map = {
|
||||
"AI Act": "EU AI Act",
|
||||
"artificial intelligence": "EU AI Act",
|
||||
"R155": "UN R155",
|
||||
"R156": "UN R156",
|
||||
"cybersecurity": "网络安全",
|
||||
"emission": "排放",
|
||||
"autonomous": "自动驾驶",
|
||||
"ADAS": "ADAS",
|
||||
}
|
||||
combined_lower = combined.lower()
|
||||
tags = []
|
||||
for kw, tag in tag_map.items():
|
||||
if kw.lower() in combined_lower:
|
||||
tags.append(tag)
|
||||
return tags[:5]
|
||||
@@ -0,0 +1,92 @@
|
||||
"""Crawlers for the 国标委 (SAMR) standard information platform."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import httpx
|
||||
from loguru import logger
|
||||
|
||||
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||
from ._utils import extract_tags, parse_date
|
||||
|
||||
# List endpoint queried by _fetch_page.
_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
# Sent with every request so the crawler identifies itself.
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
|
||||
|
||||
|
||||
def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
    """Fetch one page of standards matching "车" from the list endpoint.

    Results are requested sorted by ``circulation_date`` descending.

    Args:
        std_type: Value sent as ``p.p1``; callers pass 1 for the mandatory
            and 2 for the recommended crawler.
        page: Page number, sent as ``p.p6``.
        page_size: Rows per page, sent as ``p.p7``.

    Returns:
        The dict entries of the response's ``rows`` list. An empty list is
        returned, after logging a warning, when the request fails, the body
        is not JSON, or ``rows`` is not a list.
    """
    params = {
        "p.p1": std_type,
        "p.p2": "车",
        "p.p90": "circulation_date",
        "p.p91": "desc",
        "p.p6": page,
        "p.p7": page_size,
    }
    try:
        resp = httpx.get(_BASE_URL, params=params, headers=_HEADERS, timeout=30)
        resp.raise_for_status()
        data = resp.json()
    except (httpx.HTTPError, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
        return []

    rows = data.get("rows") if isinstance(data, dict) else None
    if rows is None and isinstance(data, dict):
        # A missing or null "rows" means an empty page.
        return []
    if not isinstance(rows, list):
        logger.warning(
            "国标委 fetch failed type={} page={} err={}",
            std_type, page, "unexpected payload shape",
        )
        return []
    # Callers read rows with dict.get, so drop anything that is not a dict.
    return [row for row in rows if isinstance(row, dict)]
|
||||
|
||||
|
||||
def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
    """Convert one row of the list response into a RawEvent.

    Missing and null fields are both treated as empty strings, so a sparse
    row never raises. The title falls back to the standard code, the
    category to ``国家标准``, and ``effective_at`` is ``None`` when the row
    has no implementation date.

    Args:
        row: One entry of the response's ``rows`` list.
        source_label: Display label stored on the event.

    Returns:
        The populated event. Status is ``consultation`` when the status text
        contains 征求意见, ``draft`` when it contains 报批 or 草案, and
        ``enacted`` otherwise.
    """

    def _text(key: str) -> str:
        # JSON null arrives as None even when the key is present.
        value = row.get(key)
        return "" if value is None else str(value)

    standard_code = _text("std_code")
    title = _text("std_name") or standard_code
    published_at = parse_date(_text("release_date"))
    effective_at_raw = _text("implement_date")
    effective_at = parse_date(effective_at_raw) if effective_at_raw else None

    status_text = _text("std_status")
    if "征求意见" in status_text:
        status = "consultation"
    elif "报批" in status_text or "草案" in status_text:
        status = "draft"
    else:
        status = "enacted"

    return RawEvent(
        source="国标委",
        source_label=source_label,
        standard_code=standard_code,
        title=title,
        summary=title,
        full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={_text('id')}",
        status=status,
        published_at=published_at,
        effective_at=effective_at,
        category=_text("std_type") or "国家标准",
        tags=extract_tags(standard_code, title),
        raw_text=f"{standard_code} {title}",
    )
|
||||
|
||||
|
||||
def _fetch_standards(std_type: int, source_label: str, limit: int) -> list[RawEvent]:
    """Page through ``_fetch_page`` and convert rows until `limit` is reached.

    Pages of 20 rows are requested starting at page 1. Paging stops when
    enough events were collected, a page comes back empty, or
    ``max(10, limit)`` pages were requested.
    """
    events: list[RawEvent] = []
    max_pages = max(10, limit)
    page = 1
    while len(events) < limit and page <= max_pages:
        rows = _fetch_page(std_type=std_type, page=page, page_size=20)
        if not rows:
            break
        events.extend(_row_to_raw_event(row, source_label) for row in rows)
        page += 1
    return events[:limit]


class GuobiaoMandatoryCrawler(BaseCrawler):
    """Fetch mandatory national standards (强制性) related to vehicles."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return up to `limit` mandatory-standard events (list type 1)."""
        return _fetch_standards(1, "国标委·强制性", limit)


class GuobiaoRecommendedCrawler(BaseCrawler):
    """Fetch recommended national standards (推荐性) related to vehicles."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return up to `limit` recommended-standard events (list type 2)."""
        return _fetch_standards(2, "国标委·推荐性", limit)
|
||||
Reference in New Issue
Block a user