fix somethings

This commit is contained in:
2026-06-08 11:16:28 +08:00
parent 9fea9c6a53
commit e7963b267e
34 changed files with 5195 additions and 246 deletions

View File

@@ -0,0 +1,117 @@
"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
from __future__ import annotations
import re
from email.utils import parsedate_to_datetime
import httpx
from bs4 import BeautifulSoup
from loguru import logger
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from ._utils import parse_date
_EURLEX_RSS_URLS = [
"https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
]
_AUTOMOTIVE_KEYWORDS = [
"vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
"AI Act", "artificial intelligence", "cybersecurity", "software update",
"R155", "R156", "汽车", "车辆",
]
_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]
def _is_automotive_relevant(title: str, description: str) -> bool:
combined = (title + " " + description).lower()
return any(kw in combined for kw in _AUTOMOTIVE_KEYWORDS_LOWER)
def _extract_celex(url: str) -> str:
m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
return m.group(1) if m else ""
def _parse_rss_date(rfc2822: str) -> str:
try:
dt = parsedate_to_datetime(rfc2822)
return dt.date().isoformat()
except Exception:
return parse_date(rfc2822)
class EurlexCrawler(BaseCrawler):
"""Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds."""
def fetch(self, limit: int = 50) -> list[RawEvent]:
events: list[RawEvent] = []
for rss_url in _EURLEX_RSS_URLS:
if len(events) >= limit:
break
try:
resp = httpx.get(rss_url, timeout=30, follow_redirects=True)
resp.raise_for_status()
except Exception as exc:
logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc)
continue
soup = BeautifulSoup(resp.content, "lxml-xml")
for item in soup.find_all("item"):
if len(events) >= limit:
break
title_tag = item.find("title")
title = title_tag.get_text(strip=True) if title_tag else ""
desc_tag = item.find("description")
description = desc_tag.get_text(strip=True) if desc_tag else ""
link_tag = item.find("link")
link = link_tag.get_text(strip=True) if link_tag else ""
pub_date_tag = item.find("pubDate")
pub_date = pub_date_tag.get_text(strip=True) if pub_date_tag else ""
if not _is_automotive_relevant(title, description):
continue
celex = _extract_celex(link)
standard_code = celex if celex else title[:60]
published_at = _parse_rss_date(pub_date) if pub_date else ""
events.append(RawEvent(
source="EUR-Lex",
source_label="欧盟官方公报",
standard_code=standard_code,
title=title,
summary=description[:500],
full_text_url=link,
status="enacted",
published_at=published_at,
effective_at=None,
category="EU法规",
tags=_extract_eurlex_tags(title, description),
raw_text=f"{title}\n{description}",
))
return events[:limit]
def _extract_eurlex_tags(title: str, description: str) -> list[str]:
combined = title + " " + description
tag_map = {
"AI Act": "EU AI Act",
"artificial intelligence": "EU AI Act",
"R155": "UN R155",
"R156": "UN R156",
"cybersecurity": "网络安全",
"emission": "排放",
"autonomous": "自动驾驶",
"ADAS": "ADAS",
}
combined_lower = combined.lower()
tags = []
for kw, tag in tag_map.items():
if kw.lower() in combined_lower:
tags.append(tag)
return tags[:5]