44 lines
1.3 KiB
Python
44 lines
1.3 KiB
Python
|
|
"""Shared utility functions for crawlers."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import re
|
||
|
|
from datetime import date
|
||
|
|
|
||
|
|
|
||
|
|
def parse_date(text: str) -> str:
    """Return a YYYY-MM-DD date found in *text*, or today's date as a fallback.

    Two notations are recognised and tried in this order:

    * numeric with ``/``, ``-`` or ``.`` separators, e.g. ``2023/1/5``,
      ``2023-01-05`` or ``2023.01.05``;
    * Chinese, e.g. ``2023年1月5日`` (the trailing ``日`` is optional).

    The date may be embedded in longer text. Only the first match of each
    notation is considered; if it is not a real calendar date (for example
    month 13), the next notation is tried.

    Args:
        text: Raw text that may contain a date. ``None`` is treated like an
            empty string.

    Returns:
        The date as an ISO ``YYYY-MM-DD`` string, or today's date when the
        text is empty or holds no valid date.
    """
    # Treat None like empty text so a missing value yields the fallback
    # instead of raising AttributeError on .strip().
    text = (text or "").strip()
    if not text:
        return date.today().isoformat()

    # Order matters: the numeric notation wins over the Chinese one when
    # both appear in the same text.
    patterns = (
        r"(\d{4})[/.-](\d{1,2})[/.-](\d{1,2})",
        r"(\d{4})年(\d{1,2})月(\d{1,2})日?",
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match is None:
            continue
        year, month, day = (int(part) for part in match.groups())
        try:
            return date(year, month, day).isoformat()
        except ValueError:
            # Digits matched but do not form a real date; try the next notation.
            continue

    return date.today().isoformat()
|
||
|
|
|
||
|
|
|
||
|
|
def extract_tags(standard_code: str, title: str) -> list[str]:
    """Build at most five keyword tags from a standard's code and title.

    Tags are produced in a fixed order: "国家标准" when the upper-cased code
    contains ``GB``; then "推荐性" when it contains ``/T``, otherwise
    "强制性"; then every topic keyword that occurs in *title*, in the order
    the keywords are listed below. The result is cut off after five tags.

    Args:
        standard_code: Standard identifier; matched case-insensitively.
        title: Standard title searched for topic keywords (case-sensitive
            substring match).

    Returns:
        A list of up to five tag strings.
    """
    normalized = standard_code.upper()

    labels: list[str] = ["国家标准"] if "GB" in normalized else []
    # NOTE(review): the reviewed copy of this file had lost its indentation;
    # the "/T" check is taken to apply to every code, not only GB ones — confirm.
    labels.append("推荐性" if "/T" in normalized else "强制性")

    topic_terms = ("电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全")
    labels.extend(term for term in topic_terms if term in title)

    return labels[:5]
|