Files
AIRegulation-DocAnalysis/backend/tests/perception/test_crawlers.py

128 lines
3.4 KiB
Python
Raw Normal View History

2026-06-08 11:16:28 +08:00
"""Unit tests for crawlers — mock httpx responses."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler
def test_raw_event_fields():
    """Build a RawEvent from keyword arguments and check two fields read back unchanged."""
    field_values = dict(
        source="TEST",
        source_label="Test",
        standard_code="TST-001",
        title="Test",
        summary="Summary",
        full_text_url="https://example.com",
        status="enacted",
        published_at="2026-01-01",
        effective_at=None,
        category="test",
        tags=["a"],
        raw_text="full text here",
    )
    event = RawEvent(**field_values)

    # Spot-check one scalar field and the list-valued field.
    assert event.source == "TEST"
    assert event.tags == ["a"]
# Canned HTML page served to the CATARC crawler test through the mocked
# response's ``text``. It holds one table with two rows; each row has four
# cells: a link whose text is the standard code, a Chinese title, a date and a
# status label (column meanings inferred from the sample values).
CATARC_HTML = """
<html><body>
<table>
<tr>
<td><a href="/std/detail/123">GB 18384-2025</a></td>
<td>电动汽车安全要求</td>
<td>2025-11-15</td>
<td>现行</td>
</tr>
<tr>
<td><a href="/std/detail/456">GB/T 40429-2026</a></td>
<td>汽车驾驶自动化分级</td>
<td>2026-02-01</td>
<td>即将实施</td>
</tr>
</table>
</body></html>
"""
def test_catarc_crawler_parses_html():
    """CatarcCrawler.fetch should turn the canned HTML into a list of RawEvent objects."""
    from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler

    # Stand-in for the response object returned by the patched httpx.get.
    fake_response = MagicMock(
        status_code=200,
        text=CATARC_HTML,
        raise_for_status=MagicMock(),
    )

    with patch("httpx.get", return_value=fake_response):
        events = CatarcCrawler().fetch(limit=10)

    assert isinstance(events, list)
    assert len(events) >= 1
    assert all(isinstance(item, RawEvent) for item in events)

    # The first table row's standard code must be among the parsed events.
    parsed_codes = tuple(item.standard_code for item in events)
    assert "GB 18384-2025" in parsed_codes
# Canned JSON body handed back by the mocked response's ``.json()`` in the
# Guobiao crawler test: a single standard record under the "rows" key.
GUOBIAO_JSON = {
    "rows": [
        {
            "std_code": "GB 18384-2025",
            "std_name": "电动汽车安全要求",
            "release_date": "2025-11-15",
            "implement_date": "2026-07-01",
            "std_status": "现行",
            "std_type": "强制性",
        },
    ],
}
def test_guobiao_crawler_parses_json():
    """GuobiaoMandatoryCrawler.fetch should map the canned JSON row onto an event."""
    from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler

    # Stand-in for the response object returned by the patched httpx.get.
    fake_response = MagicMock(status_code=200, raise_for_status=MagicMock())
    fake_response.json.return_value = GUOBIAO_JSON

    with patch("httpx.get", return_value=fake_response):
        events = GuobiaoMandatoryCrawler().fetch(limit=10)

    assert len(events) >= 1

    # Only the first event is inspected.
    first_event = events[0]
    assert first_event.source == "国标委"
    assert first_event.standard_code == "GB 18384-2025"
# Canned RSS 2.0 feed for the EUR-Lex crawler test: one <item> carrying a
# title, link, description and pubDate. The XML declaration starts right after
# the opening quotes, so the payload has no leading whitespace.
EURLEX_RSS = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>EUR-Lex</title>
<item>
<title>Regulation (EU) 2024/1689 AI Act</title>
<link>https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689</link>
<description>The EU Artificial Intelligence Act enters into force.</description>
<pubDate>Fri, 12 Jul 2024 00:00:00 GMT</pubDate>
</item>
</channel>
</rss>"""
def test_eurlex_crawler_parses_rss():
    """Check that EurlexCrawler.fetch returns EUR-Lex events for a canned RSS feed.

    ``httpx.get`` is patched to hand back a mock response built from
    ``EURLEX_RSS``; the test then checks that a non-empty list comes back and
    that its first event is attributed to the "EUR-Lex" source.
    """
    from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler

    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = EURLEX_RSS
    # A real httpx response exposes ``content`` as bytes and ``text`` as str.
    # Mirror that so the crawler sees realistic types whichever attribute it
    # reads; a str with an encoding declaration is rejected by some XML parsers.
    mock_resp.content = EURLEX_RSS.encode("utf-8")
    mock_resp.raise_for_status = MagicMock()

    with patch("httpx.get", return_value=mock_resp):
        crawler = EurlexCrawler()
        events = crawler.fetch(limit=5)

    assert isinstance(events, list)
    assert len(events) >= 1
    assert events[0].source == "EUR-Lex"