33 lines
878 B
Python
33 lines
878 B
Python
"""Shared contracts for regulatory source crawlers."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
@dataclass
class RawEvent:
    """Raw regulatory event returned by a crawler before enrichment.

    Fields without a default are required and must be supplied in the
    order declared below when constructing positionally. ``tags`` and
    ``raw_text`` are optional.
    """

    # Identifier and display label of the originating source
    # (presumably machine key vs. human-readable name; confirm with callers).
    source: str
    source_label: str
    standard_code: str
    title: str
    summary: str
    full_text_url: str
    status: str  # 'enacted' | 'draft' | 'consultation'
    published_at: str  # YYYY-MM-DD string
    # May be None; NOTE(review): presumably the same YYYY-MM-DD format as
    # published_at when present; confirm against the crawlers.
    effective_at: str | None
    category: str
    # default_factory gives every instance its own list rather than a
    # single list shared across instances.
    tags: list[str] = field(default_factory=list)
    raw_text: str = ""  # full crawled text for hashing + LLM
|
|
|
|
|
|
class BaseCrawler(ABC):
    """Abstract regulatory source crawler.

    Concrete crawlers subclass this and implement :meth:`fetch`; the class
    cannot be instantiated until that method is overridden.
    """

    @abstractmethod
    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to `limit` recent events from the data source.

        Args:
            limit: Maximum number of events to return (defaults to 50).

        Returns:
            A list of ``RawEvent`` records.
        """
|