first commit
This commit is contained in:
389
catarc_scraper.py
Normal file
389
catarc_scraper.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
全国汽车标准化技术委员会 - 汽车标准数据采集脚本
|
||||
数据来源: https://www.catarc.org.cn/bzzxd/qcbz/index.html
|
||||
API: POST https://www.catarc.org.cn/prod-api/api/customform/getPageList
|
||||
|
||||
功能:
|
||||
1. 全量采集汽车标准数据 (标准编号、标准名称、英文名称、代替标准、发布日期、实施日期、标准状态)
|
||||
2. 支持关键词搜索
|
||||
3. 支持断点续采
|
||||
4. 导出为格式化的 Excel 文件
|
||||
5. 支持增量更新 (已有数据自动跳过)
|
||||
|
||||
用法:
|
||||
python catarc_scraper.py # 全量采集
|
||||
python catarc_scraper.py --search "制动" # 搜索包含"制动"的标准
|
||||
python catarc_scraper.py --resume # 断点续采
|
||||
python catarc_scraper.py --page-size 100 # 自定义每页大小
|
||||
python catarc_scraper.py --output result # 自定义输出文件名
|
||||
"""
|
||||
|
||||
import sys
|
||||
import io
|
||||
import os
|
||||
|
||||
def utf8_console_stream(stream):
    """Return *stream* set up to emit UTF-8, replacing unencodable characters.

    Streams that offer ``reconfigure()`` are switched in place, which keeps
    their existing buffering behaviour (the progress lines printed with
    ``end="\\r"`` rely on it). Older streams that only expose ``.buffer``
    are re-wrapped with line buffering enabled. Anything else -- including
    ``None``, which is what ``sys.stdout`` is when no console is attached --
    is returned untouched instead of raising ``AttributeError``.
    """
    if stream is None:
        return stream

    reconfigure = getattr(stream, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8", errors="replace")
        return stream

    raw = getattr(stream, "buffer", None)
    if raw is None:
        return stream
    return io.TextIOWrapper(
        raw, encoding="utf-8", errors="replace", line_buffering=True
    )


# Fix garbled Chinese output on the Windows console.
if sys.platform == "win32":
    sys.stdout = utf8_console_stream(sys.stdout)
    sys.stderr = utf8_console_stream(sys.stderr)
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ─── Configuration ──────────────────────────────────────
# Endpoint that serves the paged standards list, and the form it queries.
API_URL = "https://www.catarc.org.cn/prod-api/api/customform/getPageList"
FORM_ID = "615560029638725"

DEFAULT_PAGE_SIZE = 50  # records requested per page unless overridden
MAX_RETRIES = 5  # attempts per page before giving up
RETRY_DELAY = 3  # seconds; multiplied by the attempt number between retries
REQUEST_TIMEOUT = 30  # seconds
CACHE_FILE = ".catarc_cache.json"  # checkpoint file used by --resume

# Request headers sent with every API call.
HEADERS = {
    "Content-Type": "application/json",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    ),
    "Accept": "application/json, text/plain, */*",
    "Origin": "https://www.catarc.org.cn",
    "Referer": "https://www.catarc.org.cn/bzzxd/qcbz/index.html",
}

# Field mapping: API field name -> Chinese column title.
# The insertion order is the column order of the exported sheet.
FIELD_MAP = {
    "STD_CODE": "标准编号",
    "C_NAME": "标准名称",
    "E_NAME": "英文名称",
    "REVISE_STD_CODES": "代替标准",
    "ISSUE_TIME": "发布日期",
    "ACT_TIME": "实施日期",
    "STD_STATUS": "标准状态",
}

# Excel column widths, keyed by column title (same order as FIELD_MAP).
COLUMN_WIDTHS = dict(zip(FIELD_MAP.values(), (20, 55, 70, 20, 14, 14, 12)))
|
||||
|
||||
|
||||
def timestamp_to_date(ts_ms):
    """Convert a millisecond Unix timestamp to a ``YYYY-MM-DD`` string.

    Args:
        ts_ms: Timestamp in milliseconds, as an int or a numeric string.

    Returns:
        ``""`` for a falsy input (``None``, ``0``, ``""``); the formatted
        date when conversion succeeds; otherwise ``str(ts_ms)`` so that an
        unexpected value is still visible in the export instead of aborting
        the whole run.
    """
    if not ts_ms:
        return ""
    try:
        # fromtimestamp() without a tz argument converts to the local time
        # zone of the machine running the script.
        # NOTE(review): if the API encodes dates as midnight China time, a
        # machine in a zone west of UTC+8 would show the previous day --
        # confirm.
        return datetime.fromtimestamp(int(ts_ms) / 1000).strftime("%Y-%m-%d")
    except (TypeError, ValueError, OverflowError, OSError):
        # TypeError: value int() cannot accept (e.g. a list).
        # OverflowError: int(inf), or a timestamp outside the platform range.
        return str(ts_ms)
|
||||
|
||||
|
||||
def fetch_page(session, page_num, page_size, keyword=""):
    """Request a single page of results from the API.

    Transport failures, HTTP error statuses and undecodable response bodies
    are retried up to ``MAX_RETRIES`` times, sleeping ``RETRY_DELAY * attempt``
    seconds between attempts. A well-formed response whose ``code`` is not
    200 is reported and NOT retried.

    Args:
        session: Object exposing ``post()`` (a ``requests.Session``).
        page_num: Page number sent as ``pageNum``.
        page_size: Page size sent as ``pageSize``.
        keyword: Search keyword sent as ``key``; empty string for no filter.

    Returns:
        The ``data`` member of the response on success, otherwise ``None``.
    """
    payload = {
        "formId": FORM_ID,
        "pageSize": page_size,
        "pageNum": page_num,
        "key": keyword,
    }

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.post(
                API_URL, json=payload, headers=HEADERS, timeout=REQUEST_TIMEOUT
            )
            resp.raise_for_status()
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on Requests versions whose
            # json() raises the plain decoder error rather than a
            # RequestException subclass.
            print(f" [!] 第 {page_num} 页请求失败 (第 {attempt}/{MAX_RETRIES} 次): {e}")
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY * attempt)
            continue

        # Guard against a JSON payload that is not an object at all.
        if isinstance(data, dict) and data.get("code") == 200:
            return data.get("data")

        msg = data.get("msg", "未知错误") if isinstance(data, dict) else "未知错误"
        print(f" [!] API 返回错误: {msg}")
        return None

    return None
|
||||
|
||||
|
||||
def normalize_record(record):
    """Turn one raw API record into a dict keyed by the Chinese column titles.

    Text fields are stripped of surrounding whitespace (a missing or falsy
    value becomes ``""``); the two timestamp fields are converted with
    ``timestamp_to_date``.
    """

    def text(field):
        # Falsy values (None, "") collapse to an empty string before stripping.
        raw = record.get(field) or ""
        return raw.strip()

    normalized = {}
    normalized["标准编号"] = text("STD_CODE")
    normalized["标准名称"] = text("C_NAME")
    normalized["英文名称"] = text("E_NAME")
    normalized["代替标准"] = text("REVISE_STD_CODES")
    normalized["发布日期"] = timestamp_to_date(record.get("ISSUE_TIME"))
    normalized["实施日期"] = timestamp_to_date(record.get("ACT_TIME"))
    normalized["标准状态"] = text("STD_STATUS")
    return normalized
|
||||
|
||||
|
||||
def load_cache():
    """Load the checkpoint cache used for resuming and de-duplication.

    Returns:
        The cached dict, with any missing standard key filled in from the
        defaults. A fresh default cache
        (``{"records": [], "last_page": 0, "keyword": "", "total": 0}``)
        is returned when the file is absent, unreadable, not valid
        UTF-8/JSON, or does not have the expected shape.
    """
    default = {"records": [], "last_page": 0, "keyword": "", "total": 0}

    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: JSONDecodeError and
        # UnicodeDecodeError (a truncated or corrupted cache file).
        return default

    # Callers index the result directly, so reject payloads they cannot use.
    if not isinstance(loaded, dict):
        return default
    if not isinstance(loaded.get("records", []), list):
        return default
    if not isinstance(loaded.get("last_page", 0), int):
        return default

    return {**default, **loaded}
|
||||
|
||||
|
||||
def save_cache(cache):
    """Persist the checkpoint cache to ``CACHE_FILE`` as UTF-8 JSON.

    The data is written to a sibling temporary file first and then moved
    over the real cache with ``os.replace``. An interruption during the
    write therefore leaves the previous checkpoint intact instead of a
    truncated file, which matters because the cache exists precisely to
    survive interrupted runs.

    Args:
        cache: JSON-serializable dict describing the scrape progress.
    """
    tmp_path = CACHE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False)
    os.replace(tmp_path, CACHE_FILE)
|
||||
|
||||
|
||||
def scrape_all(page_size, keyword="", resume=False):
    """Collect every result page for *keyword* and return the normalized records.

    After each successfully fetched page the progress (records so far, last
    page number, keyword, page size, total) is written to the cache so that
    a later run with ``resume=True`` can continue from the following page.

    Args:
        page_size: Number of records requested per page.
        keyword: Search keyword; empty string collects everything.
        resume: When true, continue from the cached progress if the cache
            was produced by the same keyword and page size.

    Returns:
        List of normalized record dicts. The list is partial when a page
        could not be fetched, and contains only the cached records (or
        nothing) when the very first request fails.
    """
    all_records = []
    start_page = 1

    # Resume from a checkpoint.
    if resume:
        cache = load_cache()
        # A page number only identifies the same records when the page size
        # is unchanged; resuming with another size would skip or duplicate
        # records. Caches written before the size was recorded are accepted.
        same_query = (
            cache.get("keyword") == keyword
            and cache.get("page_size", page_size) == page_size
        )
        if cache.get("records") and same_query:
            all_records = cache["records"]
            start_page = cache.get("last_page", 0) + 1
            print(f" [*] 从缓存恢复: 已有 {len(all_records)} 条, 从第 {start_page} 页继续")
        else:
            print(" [*] 缓存不匹配, 从头开始采集")

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as session:
        # The first request also tells us the total count and page count.
        first_data = fetch_page(session, start_page, page_size, keyword)
        if not first_data:
            print(" [✗] 无法获取数据, 请检查网络连接")
            return all_records

        total = first_data.get("total") or 0
        total_pages = first_data.get("pages") or 0
        all_records.extend(
            normalize_record(r) for r in (first_data.get("list") or [])
        )

        print(f" 总计: {total} 条标准, 共 {total_pages} 页, 每页 {page_size} 条")
        print(f" 已采集: {len(all_records)}/{total}", end="\r")

        # Checkpoint. The dict keeps a reference to all_records, so later
        # saves automatically include the newly appended records.
        cache = {
            "records": all_records,
            "last_page": start_page,
            "keyword": keyword,
            "page_size": page_size,
            "total": total,
        }
        save_cache(cache)

        # Fetch the remaining pages one by one.
        for page_num in range(start_page + 1, total_pages + 1):
            data = fetch_page(session, page_num, page_size, keyword)
            if data is None:
                print(f"\n [!] 第 {page_num} 页采集失败, 保存进度并退出")
                save_cache(cache)
                break

            all_records.extend(
                normalize_record(r) for r in (data.get("list") or [])
            )
            cache["last_page"] = page_num
            save_cache(cache)

            print(f" 已采集: {len(all_records)}/{total} (第 {page_num}/{total_pages} 页)", end="\r")

            # Polite delay so the server is not put under pressure.
            time.sleep(0.3)

    print()
    return all_records
|
||||
|
||||
|
||||
def _build_stat_rows(records):
    """Build the rows of the statistics sheet.

    Returns:
        ``(rows, title_rows)`` where *rows* is a list of row value lists
        (an empty list is a blank row) and *title_rows* is the set of
        1-based row numbers holding section titles, which are rendered bold.
    """
    status_count = {}
    year_count = {}
    for r in records:
        # Normalized records carry "" rather than a missing key, so fall
        # back on any falsy value, not only on absence.
        status = r.get("标准状态") or "未知"
        status_count[status] = status_count.get(status, 0) + 1
        year = (r.get("发布日期") or "")[:4] or "未知"
        year_count[year] = year_count.get(year, 0) + 1

    rows = [
        ["采集时间", datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
        ["数据来源", "全国汽车标准化技术委员会 (catarc.org.cn)"],
        ["标准总数", len(records)],
        [],  # blank row
    ]
    title_rows = set()

    # Status distribution, most frequent first.
    title_rows.add(len(rows) + 1)
    rows.append(["标准状态分布"])
    rows.append(["状态", "数量"])
    for status, count in sorted(status_count.items(), key=lambda x: -x[1]):
        rows.append([status, count])

    # Distribution by issue year, newest first.
    rows.append([])
    title_rows.add(len(rows) + 1)
    rows.append(["按发布年份分布"])
    rows.append(["年份", "数量"])
    for year, count in sorted(year_count.items(), reverse=True):
        rows.append([year, count])

    return rows, title_rows


def export_to_excel(records, output_path):
    """Write *records* to a formatted Excel workbook at *output_path*.

    The workbook has two sheets: the data sheet (styled header row, banded
    rows, fixed column widths, frozen header, auto-filter) and a statistics
    sheet with the counts per status and per issue year.

    Args:
        records: List of normalized record dicts keyed by the column titles
            in ``FIELD_MAP``.
        output_path: Destination ``.xlsx`` path.

    Returns:
        *output_path*, unchanged.
    """
    from openpyxl import Workbook
    from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
    from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
    from openpyxl.utils import get_column_letter

    def clean(value):
        # openpyxl refuses strings containing control characters; scraped
        # text may contain them, so strip them rather than fail the export.
        if isinstance(value, str):
            return ILLEGAL_CHARACTERS_RE.sub("", value)
        return value

    wb = Workbook()
    ws = wb.active
    ws.title = "汽车标准"

    # ── Header row ──
    headers = list(FIELD_MAP.values())
    header_font = Font(name="微软雅黑", bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    edge = Side(style="thin", color="B4C6E7")
    thin_border = Border(left=edge, right=edge, top=edge, bottom=edge)

    for col_idx, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_idx, value=header)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        cell.border = thin_border

    # ── Data rows ──
    data_font = Font(name="微软雅黑", size=10)
    data_align = Alignment(vertical="center", wrap_text=True)
    even_fill = PatternFill(start_color="D6E4F0", end_color="D6E4F0", fill_type="solid")

    for row_idx, record in enumerate(records, 2):
        banded = row_idx % 2 == 0
        for col_idx, header in enumerate(headers, 1):
            cell = ws.cell(row=row_idx, column=col_idx, value=clean(record.get(header, "")))
            cell.font = data_font
            cell.alignment = data_align
            cell.border = thin_border
            if banded:
                cell.fill = even_fill

    # ── Column widths ──
    for col_idx, header in enumerate(headers, 1):
        ws.column_dimensions[get_column_letter(col_idx)].width = COLUMN_WIDTHS.get(header, 15)

    # ── Freeze the header row and enable the auto-filter ──
    ws.freeze_panes = "A2"
    ws.auto_filter.ref = f"A1:{get_column_letter(len(headers))}{len(records) + 1}"

    # ── Statistics sheet ──
    ws_stat = wb.create_sheet("统计信息")
    stat_rows, title_rows = _build_stat_rows(records)
    title_font = Font(name="微软雅黑", bold=True, size=11)
    body_font = Font(name="微软雅黑", size=10)

    for row_idx, row in enumerate(stat_rows, 1):
        # Title positions are recorded while the rows are built, so both
        # section titles are bold regardless of how many statuses exist.
        font = title_font if row_idx in title_rows else body_font
        for col_idx, val in enumerate(row, 1):
            ws_stat.cell(row=row_idx, column=col_idx, value=clean(val)).font = font

    ws_stat.column_dimensions["A"].width = 20
    ws_stat.column_dimensions["B"].width = 50

    # ── Save ──
    wb.save(output_path)
    return output_path
|
||||
|
||||
|
||||
def _filename_safe(text):
    """Replace characters that are not allowed in file names with underscores."""
    return text.translate(str.maketrans({ch: "_" for ch in '\\/:*?"<>|'}))


def _dedupe_records(records):
    """Return *records* without duplicates, keeping the first occurrence.

    Records are identified by their standard code. Records with an empty
    code are only treated as duplicates when every field matches, so that
    distinct code-less entries are not collapsed into one.
    """
    seen = set()
    unique = []
    for r in records:
        key = r.get("标准编号") or tuple(sorted(r.items()))
        if key not in seen:
            seen.add(key)
            unique.append(r)
    return unique


def main():
    """Command-line entry point: parse arguments, scrape, de-duplicate, export.

    Exits with status 1 when no record could be fetched.
    """
    parser = argparse.ArgumentParser(
        description="全国汽车标准化技术委员会 - 汽车标准数据采集工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
python catarc_scraper.py # 全量采集
python catarc_scraper.py --search "制动" # 搜索"制动"相关标准
python catarc_scraper.py --search "GB 7258" # 按标准编号搜索
python catarc_scraper.py --resume # 断点续采
python catarc_scraper.py --page-size 100 # 每页100条, 减少请求次数
python catarc_scraper.py --output 我的标准 # 自定义输出文件名
""",
    )
    parser.add_argument("--search", "-s", default="", help="搜索关键词 (标准编号或标准名称)")
    parser.add_argument("--resume", "-r", action="store_true", help="从上次中断处继续采集")
    parser.add_argument("--page-size", "-p", type=int, default=DEFAULT_PAGE_SIZE, help="每页条数 (默认50, 最大建议100)")
    parser.add_argument("--output", "-o", default=None, help="输出文件名 (不含扩展名, 默认自动生成)")

    args = parser.parse_args()
    if args.page_size <= 0:
        parser.error("--page-size 必须为正整数")

    # Output file name.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if args.output:
        output_name = args.output
    elif args.search:
        # Keywords such as "GB/T" contain characters that are path
        # separators or otherwise invalid in file names; left as-is the
        # export would fail only after the whole scrape had finished.
        output_name = f"汽车标准_搜索_{_filename_safe(args.search)}_{stamp}"
    else:
        output_name = f"汽车标准全量_{stamp}"
    output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{output_name}.xlsx")

    print("=" * 60)
    print(" 全国汽车标准化技术委员会 - 汽车标准数据采集工具")
    print("=" * 60)
    print(" 数据来源: catarc.org.cn")
    if args.search:
        print(f" 搜索关键词: {args.search}")
    print(f" 每页条数: {args.page_size}")
    print(f" 输出文件: {output_path}")
    print("-" * 60)

    # Scrape.
    start_time = time.time()
    records = scrape_all(args.page_size, args.search, args.resume)
    elapsed = time.time() - start_time

    if not records:
        print("\n 未获取到任何数据")
        sys.exit(1)

    # De-duplicate (by standard code).
    unique_records = _dedupe_records(records)
    dup_count = len(records) - len(unique_records)

    print("-" * 60)
    print(f" 采集完成! 用时 {elapsed:.1f} 秒")
    print(f" 获取 {len(records)} 条, 去重后 {len(unique_records)} 条", end="")
    if dup_count:
        print(f" (移除 {dup_count} 条重复)")
    else:
        print()

    # Export to Excel.
    print(" 正在生成 Excel 文件...")
    export_to_excel(unique_records, output_path)
    file_size = os.path.getsize(output_path) / 1024

    print(f"\n {'=' * 50}")
    print(f" 导出完成: {output_path}")
    print(f" 文件大小: {file_size:.1f} KB")
    print(f" 标准总数: {len(unique_records)}")
    print(f" {'=' * 50}")

    # Clean up the cache -- but only once the scrape is complete. Deleting
    # it after a partial run would make --resume impossible, and keeping it
    # after a finished --resume run would make the next --resume start past
    # the last page. Completeness is judged against the total recorded in
    # the cache by the scraper.
    try:
        expected_total = int(load_cache().get("total") or 0)
    except (TypeError, ValueError):
        expected_total = 0

    if len(records) >= expected_total:
        if os.path.exists(CACHE_FILE):
            os.remove(CACHE_FILE)
    else:
        print(f" 采集未完成 ({len(records)}/{expected_total}), 已保留缓存, 可使用 --resume 继续")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user