Fix some things

2026-06-08 11:16:28 +08:00
parent 9fea9c6a53
commit e7963b267e
34 changed files with 5195 additions and 246 deletions
--- a/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
+++ b/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
@@ -0,0 +1,117 @@
+"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
+
+from __future__ import annotations
+
+import re
+from email.utils import parsedate_to_datetime
+
+import httpx
+from bs4 import BeautifulSoup
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from ._utils import parse_date
+
+# RSS feed URLs that EurlexCrawler.fetch downloads, in order.
+_EURLEX_RSS_URLS = ["https://eur-lex.europa.eu/rss-feed/OJ-L.rss"]
+
+# Terms that mark a feed item as relevant. Matching is a case-insensitive
+# substring test against the item's title and description.
+_AUTOMOTIVE_KEYWORDS = [
+    "vehicle",
+    "automotive",
+    "motor",
+    "tyre",
+    "emission",
+    "ADAS",
+    "autonomous",
+    "AI Act",
+    "artificial intelligence",
+    "cybersecurity",
+    "software update",
+    "R155",
+    "R156",
+    "汽车",
+    "车辆",
+]
+
+
+# Lower-cased once at import time so the per-item check does not redo it.
+_AUTOMOTIVE_KEYWORDS_LOWER = list(map(str.lower, _AUTOMOTIVE_KEYWORDS))
+
+
+def _is_automotive_relevant(title: str, description: str) -> bool:
+    """Report whether the item text contains any tracked keyword.
+
+    The title and description are joined with a single space, lower-cased,
+    and checked by plain substring containment against every entry of
+    ``_AUTOMOTIVE_KEYWORDS_LOWER``.
+    """
+    haystack = " ".join((title, description)).lower()
+    for keyword in _AUTOMOTIVE_KEYWORDS_LOWER:
+        if keyword in haystack:
+            return True
+    return False
+
+
+def _extract_celex(url: str) -> str:
+    """Pull the CELEX identifier out of a EUR-Lex link.
+
+    Args:
+        url: Link text taken from a feed item; may be empty.
+
+    Returns:
+        The run of digits and upper-case letters that follows ``CELEX`` and
+        its separator, or ``""`` when the link carries no such marker.
+    """
+    # The separator may be a literal ":" or "/", or its percent-encoded form
+    # ("%3A" / "%2F"), as in "?uri=CELEX%3A32024R1689".
+    match = re.search(r"CELEX(?:[:/]|%3[Aa]|%2[Ff])([0-9A-Z]+)", url)
+    return match.group(1) if match else ""
+
+
+def _parse_rss_date(rfc2822: str) -> str:
+    """Convert an RSS ``pubDate`` value to an ISO ``YYYY-MM-DD`` string.
+
+    The value is first parsed as an RFC 2822 date; the calendar date is taken
+    as written, without converting the timezone. If that fails for any
+    reason, the raw string is passed to the project's ``parse_date`` helper
+    and its result is returned unchanged.
+    """
+    try:
+        return parsedate_to_datetime(rfc2822).date().isoformat()
+    except Exception:
+        # Best-effort fallback for non-RFC-2822 inputs.
+        return parse_date(rfc2822)
+
+
+class EurlexCrawler(BaseCrawler):
+    """Crawler that turns keyword-matching EUR-Lex RSS items into RawEvents."""
+
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        """Download each configured feed and collect the relevant items.
+
+        Args:
+            limit: Maximum number of events to return across all feeds.
+
+        Returns:
+            At most ``limit`` events, in the order the items appear in the
+            feeds. A feed whose HTTP request fails is logged as a warning
+            and skipped.
+        """
+
+        def text_of(node, tag_name: str) -> str:
+            # Stripped text of the first matching descendant, "" if absent.
+            found = node.find(tag_name)
+            return found.get_text(strip=True) if found else ""
+
+        collected: list[RawEvent] = []
+        for feed_url in _EURLEX_RSS_URLS:
+            if len(collected) >= limit:
+                break
+            try:
+                response = httpx.get(feed_url, timeout=30, follow_redirects=True)
+                response.raise_for_status()
+            except Exception as exc:
+                logger.warning("EUR-Lex RSS fetch failed url={} err={}", feed_url, exc)
+                continue
+
+            feed = BeautifulSoup(response.content, "lxml-xml")
+            for entry in feed.find_all("item"):
+                if len(collected) >= limit:
+                    break
+
+                title = text_of(entry, "title")
+                description = text_of(entry, "description")
+                if not _is_automotive_relevant(title, description):
+                    continue
+
+                link = text_of(entry, "link")
+                pub_date = text_of(entry, "pubDate")
+
+                # Prefer the CELEX id from the link; otherwise fall back to
+                # the first 60 characters of the title.
+                standard_code = _extract_celex(link) or title[:60]
+                published_at = _parse_rss_date(pub_date) if pub_date else ""
+
+                collected.append(
+                    RawEvent(
+                        source="EUR-Lex",
+                        source_label="欧盟官方公报",
+                        standard_code=standard_code,
+                        title=title,
+                        summary=description[:500],
+                        full_text_url=link,
+                        status="enacted",
+                        published_at=published_at,
+                        effective_at=None,
+                        category="EU法规",
+                        tags=_extract_eurlex_tags(title, description),
+                        raw_text=f"{title}\n{description}",
+                    )
+                )
+
+        return collected[:limit]
+
+
+def _extract_eurlex_tags(title: str, description: str) -> list[str]:
+    """Derive topic tags from an item's title and description.
+
+    Each keyword is matched as a case-insensitive substring of the title and
+    description joined by a space. Several keywords can map to the same tag,
+    so the result is de-duplicated before it is truncated.
+
+    Args:
+        title: Item title.
+        description: Item description.
+
+    Returns:
+        Up to five distinct tags, ordered by the first keyword that produced
+        each one; an empty list when nothing matches.
+    """
+    keyword_to_tag = {
+        "AI Act": "EU AI Act",
+        "artificial intelligence": "EU AI Act",
+        "R155": "UN R155",
+        "R156": "UN R156",
+        "cybersecurity": "网络安全",
+        "emission": "排放",
+        "autonomous": "自动驾驶",
+        "ADAS": "ADAS",
+    }
+    haystack = " ".join((title, description)).lower()
+    # dict.fromkeys drops repeated tags while keeping first-seen order, so a
+    # text that mentions both "AI Act" and "artificial intelligence" yields
+    # a single "EU AI Act" instead of using up two of the five slots.
+    distinct = dict.fromkeys(
+        tag for keyword, tag in keyword_to_tag.items() if keyword.lower() in haystack
+    )
+    return list(distinct)[:5]