Fix some things
This commit is contained in:
117
backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
Normal file
117
backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||
from ._utils import parse_date
|
||||
|
||||
_EURLEX_RSS_URLS = [
|
||||
"https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
|
||||
]
|
||||
|
||||
_AUTOMOTIVE_KEYWORDS = [
|
||||
"vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
|
||||
"AI Act", "artificial intelligence", "cybersecurity", "software update",
|
||||
"R155", "R156", "汽车", "车辆",
|
||||
]
|
||||
|
||||
|
||||
_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]
|
||||
|
||||
|
||||
def _is_automotive_relevant(title: str, description: str) -> bool:
    """Return True if the item text mentions any tracked keyword.

    The title and description are joined with a single space, lower-cased,
    and tested for each entry of ``_AUTOMOTIVE_KEYWORDS_LOWER`` as a plain
    substring (no word-boundary handling).
    """
    haystack = " ".join((title, description)).lower()
    for keyword in _AUTOMOTIVE_KEYWORDS_LOWER:
        if keyword in haystack:
            return True
    return False
|
||||
|
||||
|
||||
def _extract_celex(url: str) -> str:
|
||||
m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
|
||||
return m.group(1) if m else ""
|
||||
|
||||
|
||||
def _parse_rss_date(rfc2822: str) -> str:
|
||||
try:
|
||||
dt = parsedate_to_datetime(rfc2822)
|
||||
return dt.date().isoformat()
|
||||
except Exception:
|
||||
return parse_date(rfc2822)
|
||||
|
||||
|
||||
class EurlexCrawler(BaseCrawler):
    """Crawler that turns relevant EUR-Lex RSS items into ``RawEvent`` objects."""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Download each configured feed and collect keyword-matching items.

        Feeds are read in the order given by ``_EURLEX_RSS_URLS``. A feed that
        cannot be downloaded (network error or HTTP error status) is logged
        as a warning and skipped. Collection stops as soon as ``limit``
        events have been gathered.

        Args:
            limit: Maximum number of events to return.

        Returns:
            At most ``limit`` events, in feed order.
        """

        def child_text(node, tag_name: str) -> str:
            # Stripped text of the first matching child, or "" when absent.
            child = node.find(tag_name)
            return child.get_text(strip=True) if child else ""

        collected: list[RawEvent] = []

        for feed_url in _EURLEX_RSS_URLS:
            if len(collected) >= limit:
                break

            try:
                response = httpx.get(feed_url, timeout=30, follow_redirects=True)
                response.raise_for_status()
            except Exception as exc:
                logger.warning("EUR-Lex RSS fetch failed url={} err={}", feed_url, exc)
                continue

            feed = BeautifulSoup(response.content, "lxml-xml")
            for entry in feed.find_all("item"):
                if len(collected) >= limit:
                    break

                title = child_text(entry, "title")
                description = child_text(entry, "description")
                link = child_text(entry, "link")
                pub_date = child_text(entry, "pubDate")

                if not _is_automotive_relevant(title, description):
                    continue

                # Prefer the CELEX id as the code; otherwise fall back to a
                # truncated title so the field is never derived from nothing.
                standard_code = _extract_celex(link) or title[:60]
                published_at = _parse_rss_date(pub_date) if pub_date else ""

                event = RawEvent(
                    source="EUR-Lex",
                    source_label="欧盟官方公报",
                    standard_code=standard_code,
                    title=title,
                    summary=description[:500],
                    full_text_url=link,
                    status="enacted",
                    published_at=published_at,
                    effective_at=None,
                    category="EU法规",
                    tags=_extract_eurlex_tags(title, description),
                    raw_text=f"{title}\n{description}",
                )
                collected.append(event)

        return collected[:limit]
|
||||
|
||||
|
||||
def _extract_eurlex_tags(title: str, description: str) -> list[str]:
|
||||
combined = title + " " + description
|
||||
tag_map = {
|
||||
"AI Act": "EU AI Act",
|
||||
"artificial intelligence": "EU AI Act",
|
||||
"R155": "UN R155",
|
||||
"R156": "UN R156",
|
||||
"cybersecurity": "网络安全",
|
||||
"emission": "排放",
|
||||
"autonomous": "自动驾驶",
|
||||
"ADAS": "ADAS",
|
||||
}
|
||||
combined_lower = combined.lower()
|
||||
tags = []
|
||||
for kw, tag in tag_map.items():
|
||||
if kw.lower() in combined_lower:
|
||||
tags.append(tag)
|
||||
return tags[:5]
|
||||
Reference in New Issue
Block a user