128 lines
3.4 KiB
Python
128 lines
3.4 KiB
Python
"""Unit tests for crawlers — mock httpx responses."""
|
|
from __future__ import annotations
|
|
from unittest.mock import MagicMock, patch
|
|
import pytest
|
|
|
|
from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler
|
|
|
|
|
|
def test_raw_event_fields():
    """RawEvent exposes the values it was constructed with.

    Every field is passed by keyword; the test then spot-checks one scalar
    attribute (``source``) and one list attribute (``tags``).
    """
    field_values = {
        "source": "TEST",
        "source_label": "Test",
        "standard_code": "TST-001",
        "title": "Test",
        "summary": "Summary",
        "full_text_url": "https://example.com",
        "status": "enacted",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": ["a"],
        "raw_text": "full text here",
    }
    event = RawEvent(**field_values)

    assert event.source == "TEST"
    assert event.tags == ["a"]
|
|
|
|
|
|
# Canned HTML page served to the CATARC crawler test: a single table with two
# rows, each holding a linked standard code, a title, a date and a status cell.
CATARC_HTML = """
<html><body>
<table>
<tr>
<td><a href="/std/detail/123">GB 18384-2025</a></td>
<td>电动汽车安全要求</td>
<td>2025-11-15</td>
<td>现行</td>
</tr>
<tr>
<td><a href="/std/detail/456">GB/T 40429-2026</a></td>
<td>汽车驾驶自动化分级</td>
<td>2026-02-01</td>
<td>即将实施</td>
</tr>
</table>
</body></html>
"""
|
|
|
|
|
|
def test_catarc_crawler_parses_html():
    """CatarcCrawler.fetch turns the canned CATARC table into RawEvents.

    ``httpx.get`` is patched to return a mock response whose ``text`` is
    ``CATARC_HTML``. The result must be a non-empty list of ``RawEvent``
    objects that includes the standard code from the first table row.
    """
    # Imported lazily so a problem in this crawler module only fails this test.
    from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler

    fake_response = MagicMock(
        status_code=200,
        text=CATARC_HTML,
        raise_for_status=MagicMock(),
    )

    with patch("httpx.get", return_value=fake_response):
        events = CatarcCrawler().fetch(limit=10)

    assert isinstance(events, list)
    assert len(events) >= 1
    for event in events:
        assert isinstance(event, RawEvent)
    assert any(event.standard_code == "GB 18384-2025" for event in events)
|
|
|
|
|
|
# Canned JSON payload returned by the mocked response in the Guobiao crawler
# test: a "rows" list containing one standard record.
GUOBIAO_JSON = {
    "rows": [
        {
            "std_code": "GB 18384-2025",
            "std_name": "电动汽车安全要求",
            "release_date": "2025-11-15",
            "implement_date": "2026-07-01",
            "std_status": "现行",
            "std_type": "强制性",
        },
    ],
}
|
|
|
|
|
|
def test_guobiao_crawler_parses_json():
    """GuobiaoMandatoryCrawler.fetch maps the canned JSON row to an event.

    ``httpx.get`` is patched to return a mock response whose ``json()``
    yields ``GUOBIAO_JSON``. The first event must carry the expected source
    label and the standard code from the fixture row.
    """
    # Imported lazily so a problem in this crawler module only fails this test.
    from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler

    fake_response = MagicMock(status_code=200, raise_for_status=MagicMock())
    fake_response.json.return_value = GUOBIAO_JSON

    with patch("httpx.get", return_value=fake_response):
        events = GuobiaoMandatoryCrawler().fetch(limit=10)

    assert len(events) >= 1
    first_event = events[0]
    assert first_event.source == "国标委"
    assert first_event.standard_code == "GB 18384-2025"
|
|
|
|
|
|
# Canned RSS 2.0 feed for the EUR-Lex crawler test: one channel with a single
# item. The XML declaration must stay at the very start of the string.
EURLEX_RSS = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>EUR-Lex</title>
<item>
<title>Regulation (EU) 2024/1689 — AI Act</title>
<link>https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689</link>
<description>The EU Artificial Intelligence Act enters into force.</description>
<pubDate>Fri, 12 Jul 2024 00:00:00 GMT</pubDate>
</item>
</channel>
</rss>"""
|
|
|
|
|
|
def test_eurlex_crawler_parses_rss():
    """EurlexCrawler.fetch turns the canned RSS feed into events.

    ``httpx.get`` is patched to return a mock response carrying
    ``EURLEX_RSS`` both as ``text`` (str) and as ``content`` (bytes), so the
    test works whichever attribute the crawler reads. The result must be a
    non-empty list whose first event has the "EUR-Lex" source label.
    """
    # Imported lazily so a problem in this crawler module only fails this test.
    from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler

    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = EURLEX_RSS
    # A real httpx response exposes ``content`` as bytes, not str. Mirror that
    # here so byte-oriented code paths (decoding, XML parsers that reject str
    # input carrying an encoding declaration) see what they get in production.
    mock_resp.content = EURLEX_RSS.encode("utf-8")
    mock_resp.raise_for_status = MagicMock()

    with patch("httpx.get", return_value=mock_resp):
        crawler = EurlexCrawler()
        events = crawler.fetch(limit=5)

    assert isinstance(events, list)
    assert len(events) >= 1
    assert events[0].source == "EUR-Lex"
|