Fix some things

This commit is contained in:
2026-06-08 11:16:28 +08:00
parent 9fea9c6a53
commit e7963b267e
34 changed files with 5195 additions and 246 deletions

View File

@@ -0,0 +1,43 @@
"""Shared utility functions for crawlers."""
from __future__ import annotations
import re
from datetime import date
# Patterns are tried in order; the first one that yields a real calendar date wins.
_DATE_PATTERNS = (
    # Numeric dates separated by "/", "-" or ".": 2024-01-05, 2024/1/5, 2024.01.05
    re.compile(r"(\d{4})[/.-](\d{1,2})[/.-](\d{1,2})"),
    # Chinese-style dates: 2024年1月5日 (the trailing 日 is optional)
    re.compile(r"(\d{4})年(\d{1,2})月(\d{1,2})日?"),
)


def parse_date(text: str) -> str:
    """Return the first date found in *text* as ``YYYY-MM-DD``.

    Recognised formats are numeric dates separated by ``/``, ``-`` or ``.``
    and Chinese ``年/月/日`` dates. The date may be embedded in surrounding
    text. Numeric formats take priority over the Chinese format.

    Args:
        text: Raw text scraped from a page. ``None`` or an empty/blank
            string is tolerated and treated as "no date present".

    Returns:
        The ISO-formatted date, or today's date when no valid date can be
        extracted (no match, or the matched digits are not a real date).
    """
    # Crawlers often hand over None for a missing element; treat it as empty
    # instead of raising AttributeError on .strip().
    text = (text or "").strip()

    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue
        year, month, day = map(int, match.groups())
        try:
            return date(year, month, day).isoformat()
        except ValueError:
            # Digits matched but do not form a real date (e.g. month 13);
            # fall through to the next pattern.
            continue

    return date.today().isoformat()
def extract_tags(standard_code: str, title: str) -> list[str]:
    """Build a short list of tags from a standard's code and its title.

    The code is compared case-insensitively. Tags are emitted in a fixed
    order: the national-standard tag (when the code contains ``GB``), then
    the recommended/mandatory tag, then every topic keyword that occurs in
    *title*, in the order the keywords are listed below.

    Args:
        standard_code: Standard identifier, e.g. ``"GB/T 18384-2020"``.
        title: Title of the standard, searched for topic keywords.

    Returns:
        At most five tags.
    """
    normalized = standard_code.upper()

    found = ["国家标准"] if "GB" in normalized else []
    # NOTE(review): the recommended/mandatory tag is applied to every code,
    # whether or not it contains "GB" -- confirm this is intended.
    found.append("推荐性" if "/T" in normalized else "强制性")

    topic_words = ("电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全")
    found.extend(word for word in topic_words if word in title)

    # Cap the result so a keyword-heavy title cannot produce a long tag list.
    return found[:5]