118 lines
3.9 KiB
Python
118 lines
3.9 KiB
Python
"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from email.utils import parsedate_to_datetime
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from loguru import logger
|
|
|
|
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
|
from ._utils import parse_date
|
|
|
|
# RSS feed URLs the crawler iterates over, in order.
_EURLEX_RSS_URLS = ["https://eur-lex.europa.eu/rss-feed/OJ-L.rss"]

# Terms that mark a feed item as relevant when they occur in its title or
# description.
_AUTOMOTIVE_KEYWORDS = [
    "vehicle",
    "automotive",
    "motor",
    "tyre",
    "emission",
    "ADAS",
    "autonomous",
    "AI Act",
    "artificial intelligence",
    "cybersecurity",
    "software update",
    "R155",
    "R156",
    "汽车",
    "车辆",
]

# Lower-cased copy built once at import time so matching can be
# case-insensitive without re-lowering the keywords on every call.
_AUTOMOTIVE_KEYWORDS_LOWER = list(map(str.lower, _AUTOMOTIVE_KEYWORDS))
|
|
|
|
|
|
def _is_automotive_relevant(title: str, description: str) -> bool:
    """Return True when any configured keyword occurs in the title or description.

    The two strings are joined with a single space and lower-cased, then each
    entry of ``_AUTOMOTIVE_KEYWORDS_LOWER`` is tested as a plain substring.
    """
    haystack = " ".join((title, description)).lower()
    for keyword in _AUTOMOTIVE_KEYWORDS_LOWER:
        if keyword in haystack:
            return True
    return False
|
|
|
|
|
|
def _extract_celex(url: str) -> str:
    """Return the CELEX identifier embedded in *url*, or ``""`` if none is found.

    The identifier is the run of digits and upper-case letters that follows
    ``CELEX:`` or ``CELEX/``. The URL is percent-decoded first so that links
    carrying an encoded separator (``CELEX%3A32024R1689``) are recognised too.

    Args:
        url: Link taken from a feed item; may be empty.

    Returns:
        The matched identifier, or an empty string when the URL has none.
    """
    # Local import: keeps this helper self-contained without touching the
    # module's import block.
    from urllib.parse import unquote

    # Decode before matching: query strings may encode ":" as "%3A" and "/"
    # as "%2F", which the pattern would otherwise miss.
    match = re.search(r"CELEX[:/]([0-9A-Z]+)", unquote(url))
    return match.group(1) if match else ""
|
|
|
|
|
|
def _parse_rss_date(rfc2822: str) -> str:
    """Convert an RFC 2822 date string to an ISO ``YYYY-MM-DD`` date string.

    The calendar date is taken from the parsed datetime as written, with no
    timezone conversion. If parsing or formatting raises, the input is handed
    to ``parse_date`` and its result is returned instead.
    """
    try:
        iso_day = parsedate_to_datetime(rfc2822).date().isoformat()
    except Exception:
        # Best-effort fallback for strings the RFC 2822 parser rejects.
        return parse_date(rfc2822)
    return iso_day
|
|
|
|
|
|
class EurlexCrawler(BaseCrawler):
    """Crawler that turns keyword-matching EUR-Lex RSS items into ``RawEvent`` records."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Download every configured feed and collect the relevant items.

        A feed whose download fails (or that answers with an HTTP error
        status) is logged as a warning and skipped. Items that do not pass
        ``_is_automotive_relevant`` are ignored.

        Args:
            limit: Maximum number of events to collect.

        Returns:
            The collected events in feed order, capped at ``limit``.
        """

        def text_of(node, tag_name: str) -> str:
            # Stripped text of the first matching tag under node; "" if absent.
            found = node.find(tag_name)
            return found.get_text(strip=True) if found else ""

        collected: list[RawEvent] = []
        for feed_url in _EURLEX_RSS_URLS:
            if len(collected) >= limit:
                break

            try:
                response = httpx.get(feed_url, timeout=30, follow_redirects=True)
                response.raise_for_status()
            except Exception as exc:
                logger.warning("EUR-Lex RSS fetch failed url={} err={}", feed_url, exc)
                continue

            feed = BeautifulSoup(response.content, "lxml-xml")
            for entry in feed.find_all("item"):
                if len(collected) >= limit:
                    break

                title = text_of(entry, "title")
                description = text_of(entry, "description")
                link = text_of(entry, "link")
                pub_date = text_of(entry, "pubDate")

                if not _is_automotive_relevant(title, description):
                    continue

                celex = _extract_celex(link)
                event = RawEvent(
                    source="EUR-Lex",
                    source_label="欧盟官方公报",
                    # Fall back to a truncated title when the link has no CELEX id.
                    standard_code=celex or title[:60],
                    title=title,
                    summary=description[:500],
                    full_text_url=link,
                    status="enacted",
                    published_at=_parse_rss_date(pub_date) if pub_date else "",
                    effective_at=None,
                    category="EU法规",
                    tags=_extract_eurlex_tags(title, description),
                    raw_text=f"{title}\n{description}",
                )
                collected.append(event)

        return collected[:limit]
|
|
|
|
|
|
def _extract_eurlex_tags(title: str, description: str) -> list[str]:
    """Derive up to five distinct topic tags from an item's title and description.

    Each keyword in the mapping below is matched case-insensitively as a
    substring of ``title + " " + description``. Several keywords can map to
    the same tag (both "AI Act" and "artificial intelligence" yield
    "EU AI Act"), so the result is de-duplicated; the first occurrence wins
    and mapping order is preserved.

    Args:
        title: Item title.
        description: Item description.

    Returns:
        A list of at most five unique tags; empty when nothing matches.
    """
    tag_map = {
        "AI Act": "EU AI Act",
        "artificial intelligence": "EU AI Act",
        "R155": "UN R155",
        "R156": "UN R156",
        "cybersecurity": "网络安全",
        "emission": "排放",
        "autonomous": "自动驾驶",
        "ADAS": "ADAS",
    }
    haystack = " ".join((title, description)).lower()
    # dict.fromkeys drops repeated tags while keeping first-seen order, so a
    # duplicate can no longer occupy one of the five slots.
    unique_tags = dict.fromkeys(
        tag for keyword, tag in tag_map.items() if keyword.lower() in haystack
    )
    return list(unique_tags)[:5]
|