Fix some things
This commit is contained in:
43
backend/app/infrastructure/perception/crawlers/_utils.py
Normal file
43
backend/app/infrastructure/perception/crawlers/_utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Shared utility functions for crawlers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import date
|
||||
|
||||
|
||||
def parse_date(text: str) -> str:
    """Return the first valid date found in *text* as ``YYYY-MM-DD``.

    Recognised formats, tried in this order of priority:

    1. Numeric dates separated by ``/``, ``-`` or ``.``
       (``2023-05-01``, ``2023/5/1``, ``2023.05.01``).
    2. CJK dates such as ``2023年5月1日``; the trailing ``日`` and any
       whitespace around ``年``/``月`` are optional.

    Every candidate of a format is checked, so an impossible date such as
    ``2023-13-40`` does not hide a later valid one in the same string.

    Args:
        text: Raw text that may contain a date. ``None`` or any other
            non-string value is treated like an empty string.

    Returns:
        The ISO-formatted date, or today's local date (``date.today()``)
        when no valid date can be extracted.
    """
    # Best-effort parser: anything unusable falls back to today's date
    # instead of raising.
    if not isinstance(text, str):
        return date.today().isoformat()

    text = text.strip()
    if not text:
        return date.today().isoformat()

    # Order matters: numeric dates take precedence over CJK dates.
    patterns = (
        r"(\d{4})[/.-](\d{1,2})[/.-](\d{1,2})",
        r"(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})日?",
    )
    for pattern in patterns:
        for match in re.finditer(pattern, text):
            year, month, day = (int(part) for part in match.groups())
            try:
                return date(year, month, day).isoformat()
            except ValueError:
                # Digits matched but do not form a real calendar date
                # (e.g. month 13); keep looking at later candidates.
                continue

    return date.today().isoformat()
|
||||
|
||||
|
||||
def extract_tags(standard_code: str, title: str) -> list[str]:
    """Derive simple keyword tags from a standard code and its title.

    Tags are produced in a fixed order:

    1. If the code contains ``GB`` (case-insensitive): ``"国家标准"``
       followed by exactly one of ``"推荐性"`` (code contains ``/T``),
       ``"指导性"`` (code contains ``/Z``) or ``"强制性"`` (neither).
    2. Every keyword from the built-in list that occurs in *title*, in
       list order. Keywords are plain substring matches, so a title
       containing ``网络安全`` also yields ``安全``.

    Args:
        standard_code: Standard identifier such as ``"GB/T 18384-2020"``.
            ``None`` or an empty value yields no code-based tags.
        title: Title of the standard. ``None`` or an empty value yields
            no keyword tags.

    Returns:
        At most five tags; later keyword tags are dropped first.
    """
    # Crawled fields may be missing; treat None like an empty string.
    code_upper = (standard_code or "").upper()
    title = title or ""

    tags: list[str] = []
    if "GB" in code_upper:
        tags.append("国家标准")
        if "/T" in code_upper:
            tags.append("推荐性")
        elif "/Z" in code_upper:
            # GB/Z codes are guidance documents, not mandatory standards.
            tags.append("指导性")
        else:
            tags.append("强制性")

    keywords = (
        "电动",
        "安全",
        "自动驾驶",
        "充电",
        "智能网联",
        "碰撞",
        "排放",
        "网络安全",
    )
    tags.extend(kw for kw in keywords if kw in title)

    # Cap the result at five tags.
    return tags[:5]
|
||||
Reference in New Issue
Block a user