# AIRegulation-DocAnalysis/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py

"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""

from __future__ import annotations

import re
from email.utils import parsedate_to_datetime

import httpx
from bs4 import BeautifulSoup
from loguru import logger

from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from ._utils import parse_date

_EURLEX_RSS_URLS = [
    "https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
]

_AUTOMOTIVE_KEYWORDS = [
    "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
    "AI Act", "artificial intelligence", "cybersecurity", "software update",
    "R155", "R156", "汽车", "车辆",
]


_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]


def _is_automotive_relevant(title: str, description: str) -> bool:
    """Return True when the title or description mentions a tracked keyword.

    The two fields are joined with a single space and lower-cased, then
    checked for any entry of ``_AUTOMOTIVE_KEYWORDS_LOWER`` as a plain
    substring (no word-boundary handling).
    """
    haystack = " ".join((title, description)).lower()
    for keyword in _AUTOMOTIVE_KEYWORDS_LOWER:
        if keyword in haystack:
            return True
    return False


def _extract_celex(url: str) -> str:
    m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
    return m.group(1) if m else ""


def _parse_rss_date(rfc2822: str) -> str:
    try:
        dt = parsedate_to_datetime(rfc2822)
        return dt.date().isoformat()
    except Exception:
        return parse_date(rfc2822)


class EurlexCrawler(BaseCrawler):
    """Crawler that turns keyword-matching EUR-Lex RSS items into RawEvents."""

    @staticmethod
    def _child_text(item, tag_name: str) -> str:
        """Return the stripped text of ``item``'s first ``tag_name`` child, or ""."""
        node = item.find(tag_name)
        return node.get_text(strip=True) if node else ""

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Download each configured feed and collect up to ``limit`` events.

        A feed whose request fails (transport error or non-success status) is
        logged as a warning and skipped. Items that do not pass
        ``_is_automotive_relevant`` are dropped.

        Args:
            limit: Maximum number of events to return.

        Returns:
            The collected events, in feed order, at most ``limit`` long.
        """
        collected: list[RawEvent] = []

        for feed_url in _EURLEX_RSS_URLS:
            if len(collected) >= limit:
                break

            try:
                response = httpx.get(feed_url, timeout=30, follow_redirects=True)
                response.raise_for_status()
            except Exception as exc:
                logger.warning("EUR-Lex RSS fetch failed url={} err={}", feed_url, exc)
                continue

            feed = BeautifulSoup(response.content, "lxml-xml")
            for entry in feed.find_all("item"):
                if len(collected) >= limit:
                    break

                title, description, link, pub_date = (
                    self._child_text(entry, name)
                    for name in ("title", "description", "link", "pubDate")
                )

                if not _is_automotive_relevant(title, description):
                    continue

                # Prefer the CELEX id from the link; otherwise fall back to a
                # truncated title so the event still has a code.
                standard_code = _extract_celex(link) or title[:60]
                published_at = _parse_rss_date(pub_date) if pub_date else ""

                event = RawEvent(
                    source="EUR-Lex",
                    source_label="欧盟官方公报",
                    standard_code=standard_code,
                    title=title,
                    summary=description[:500],
                    full_text_url=link,
                    status="enacted",
                    published_at=published_at,
                    effective_at=None,
                    category="EU法规",
                    tags=_extract_eurlex_tags(title, description),
                    raw_text=f"{title}\n{description}",
                )
                collected.append(event)

        return collected[:limit]


def _extract_eurlex_tags(title: str, description: str) -> list[str]:
    combined = title + " " + description
    tag_map = {
        "AI Act": "EU AI Act",
        "artificial intelligence": "EU AI Act",
        "R155": "UN R155",
        "R156": "UN R156",
        "cybersecurity": "网络安全",
        "emission": "排放",
        "autonomous": "自动驾驶",
        "ADAS": "ADAS",
    }
    combined_lower = combined.lower()
    tags = []
    for kw, tag in tag_map.items():
        if kw.lower() in combined_lower:
            tags.append(tag)
    return tags[:5]