"""Unit tests for crawlers — mock httpx responses."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler
def test_raw_event_fields():
    """Build a RawEvent from keyword fields and read two of them back."""
    field_values = {
        "source": "TEST",
        "source_label": "Test",
        "standard_code": "TST-001",
        "title": "Test",
        "summary": "Summary",
        "full_text_url": "https://example.com",
        "status": "enacted",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": ["a"],
        "raw_text": "full text here",
    }
    # Unpacking the mapping is the same call as spelling out each keyword.
    event = RawEvent(**field_values)

    assert event.source == "TEST"
    assert event.tags == ["a"]
# HTML body served by the mocked response in test_catarc_crawler_parses_html.
# NOTE(review): the fixture holds only a newline here, yet that test expects a
# "GB 18384-2025" standard code to be parsed from it — confirm the markup was
# not lost and restore it if so.
CATARC_HTML = """
"""
def test_catarc_crawler_parses_html():
    """CatarcCrawler.fetch returns RawEvent objects for the mocked HTML page.

    ``httpx.get`` is patched to hand back a canned response whose ``text`` is
    ``CATARC_HTML``.
    """
    from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler

    # Attributes passed to the MagicMock constructor are set on the mock.
    response = MagicMock(
        status_code=200,
        text=CATARC_HTML,
        raise_for_status=MagicMock(),
    )

    with patch("httpx.get", return_value=response):
        events = CatarcCrawler().fetch(limit=10)

    assert isinstance(events, list)
    assert len(events) >= 1
    for event in events:
        assert isinstance(event, RawEvent)
    assert "GB 18384-2025" in [event.standard_code for event in events]
# Canned JSON payload returned by the mocked response in
# test_guobiao_crawler_parses_json: one standard record under "rows".
GUOBIAO_JSON = dict(
    rows=[
        dict(
            std_code="GB 18384-2025",
            std_name="电动汽车安全要求",
            release_date="2025-11-15",
            implement_date="2026-07-01",
            std_status="现行",
            std_type="强制性",
        ),
    ],
)
def test_guobiao_crawler_parses_json():
    """GuobiaoMandatoryCrawler.fetch maps the mocked JSON rows to events.

    ``httpx.get`` is patched to hand back a canned response whose ``json()``
    returns ``GUOBIAO_JSON``.
    """
    from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler

    # Attributes passed to the MagicMock constructor are set on the mock.
    response = MagicMock(status_code=200, raise_for_status=MagicMock())
    response.json.return_value = GUOBIAO_JSON

    with patch("httpx.get", return_value=response):
        events = GuobiaoMandatoryCrawler().fetch(limit=10)

    assert len(events) >= 1
    first = events[0]
    assert first.source == "国标委"
    assert first.standard_code == "GB 18384-2025"
# Feed body used by the mocked response in test_eurlex_crawler_parses_rss.
# NOTE(review): this text contains no XML/RSS tags — confirm the feed markup
# was not stripped from the fixture and restore it if so.
EURLEX_RSS = """
EUR-Lex
-
Regulation (EU) 2024/1689 — AI Act
https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689
The EU Artificial Intelligence Act enters into force.
Fri, 12 Jul 2024 00:00:00 GMT
"""
def test_eurlex_crawler_parses_rss():
    """EurlexCrawler.fetch yields at least one event from the mocked feed.

    ``httpx.get`` is patched to return a canned response carrying
    ``EURLEX_RSS`` as both its ``text`` and its ``content``.
    """
    from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler

    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = EURLEX_RSS
    # httpx exposes Response.content as bytes (and .text as str); mirror that
    # so the crawler sees the same types it would get from a real response.
    mock_resp.content = EURLEX_RSS.encode("utf-8")
    mock_resp.raise_for_status = MagicMock()

    with patch("httpx.get", return_value=mock_resp):
        crawler = EurlexCrawler()
        events = crawler.fetch(limit=5)

    assert isinstance(events, list)
    assert len(events) >= 1
    assert events[0].source == "EUR-Lex"