"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations.""" from __future__ import annotations import re from email.utils import parsedate_to_datetime import httpx from bs4 import BeautifulSoup from loguru import logger from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent from ._utils import parse_date _EURLEX_RSS_URLS = [ "https://eur-lex.europa.eu/rss-feed/OJ-L.rss", ] _AUTOMOTIVE_KEYWORDS = [ "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous", "AI Act", "artificial intelligence", "cybersecurity", "software update", "R155", "R156", "汽车", "车辆", ] _AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS] def _is_automotive_relevant(title: str, description: str) -> bool: combined = (title + " " + description).lower() return any(kw in combined for kw in _AUTOMOTIVE_KEYWORDS_LOWER) def _extract_celex(url: str) -> str: m = re.search(r"CELEX[:/]([0-9A-Z]+)", url) return m.group(1) if m else "" def _parse_rss_date(rfc2822: str) -> str: try: dt = parsedate_to_datetime(rfc2822) return dt.date().isoformat() except Exception: return parse_date(rfc2822) class EurlexCrawler(BaseCrawler): """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds.""" def fetch(self, limit: int = 50) -> list[RawEvent]: events: list[RawEvent] = [] for rss_url in _EURLEX_RSS_URLS: if len(events) >= limit: break try: resp = httpx.get(rss_url, timeout=30, follow_redirects=True) resp.raise_for_status() except Exception as exc: logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc) continue soup = BeautifulSoup(resp.content, "lxml-xml") for item in soup.find_all("item"): if len(events) >= limit: break title_tag = item.find("title") title = title_tag.get_text(strip=True) if title_tag else "" desc_tag = item.find("description") description = desc_tag.get_text(strip=True) if desc_tag else "" link_tag = item.find("link") link = link_tag.get_text(strip=True) if link_tag else "" pub_date_tag = item.find("pubDate") pub_date = pub_date_tag.get_text(strip=True) if pub_date_tag else "" if not _is_automotive_relevant(title, description): continue celex = _extract_celex(link) standard_code = celex if celex else title[:60] published_at = _parse_rss_date(pub_date) if pub_date else "" events.append(RawEvent( source="EUR-Lex", source_label="欧盟官方公报", standard_code=standard_code, title=title, summary=description[:500], full_text_url=link, status="enacted", published_at=published_at, effective_at=None, category="EU法规", tags=_extract_eurlex_tags(title, description), raw_text=f"{title}\n{description}", )) return events[:limit] def _extract_eurlex_tags(title: str, description: str) -> list[str]: combined = title + " " + description tag_map = { "AI Act": "EU AI Act", "artificial intelligence": "EU AI Act", "R155": "UN R155", "R156": "UN R156", "cybersecurity": "网络安全", "emission": "排放", "autonomous": "自动驾驶", "ADAS": "ADAS", } combined_lower = combined.lower() tags = [] for kw, tag in tag_map.items(): if kw.lower() in combined_lower: tags.append(tag) return tags[:5]