""" 全国汽车标准化技术委员会 - 汽车标准数据采集脚本 数据来源: https://www.catarc.org.cn/bzzxd/qcbz/index.html API: POST https://www.catarc.org.cn/prod-api/api/customform/getPageList 功能: 1. 全量采集汽车标准数据 (标准编号、标准名称、英文名称、代替标准、发布日期、实施日期、标准状态) 2. 支持关键词搜索 3. 支持断点续采 4. 导出为格式化的 Excel 文件 5. 支持增量更新 (已有数据自动跳过) 用法: python catarc_scraper.py # 全量采集 python catarc_scraper.py --search "制动" # 搜索包含"制动"的标准 python catarc_scraper.py --resume # 断点续采 python catarc_scraper.py --page-size 100 # 自定义每页大小 python catarc_scraper.py --output result # 自定义输出文件名 """ import sys import io import os # 修复 Windows 控制台中文输出问题 if sys.platform == "win32": sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace") import requests import json import time import argparse import sys from datetime import datetime from pathlib import Path # ─── 配置 ─────────────────────────────────────────────── API_URL = "https://www.catarc.org.cn/prod-api/api/customform/getPageList" FORM_ID = "615560029638725" DEFAULT_PAGE_SIZE = 50 MAX_RETRIES = 5 RETRY_DELAY = 3 # 秒 REQUEST_TIMEOUT = 30 # 秒 CACHE_FILE = ".catarc_cache.json" HEADERS = { "Content-Type": "application/json", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", "Accept": "application/json, text/plain, */*", "Origin": "https://www.catarc.org.cn", "Referer": "https://www.catarc.org.cn/bzzxd/qcbz/index.html", } # 字段映射: API字段 → 中文显示名 FIELD_MAP = { "STD_CODE": "标准编号", "C_NAME": "标准名称", "E_NAME": "英文名称", "REVISE_STD_CODES": "代替标准", "ISSUE_TIME": "发布日期", "ACT_TIME": "实施日期", "STD_STATUS": "标准状态", } # Excel 列宽 COLUMN_WIDTHS = { "标准编号": 20, "标准名称": 55, "英文名称": 70, "代替标准": 20, "发布日期": 14, "实施日期": 14, "标准状态": 12, } def timestamp_to_date(ts_ms): """将毫秒级时间戳转换为 YYYY-MM-DD 字符串""" if not ts_ms: return "" try: return datetime.fromtimestamp(int(ts_ms) / 1000).strftime("%Y-%m-%d") except (ValueError, OSError): return str(ts_ms) def fetch_page(session, page_num, page_size, keyword=""): """请求单页数据""" payload = { "formId": FORM_ID, "pageSize": page_size, "pageNum": page_num, "key": keyword, } for attempt in range(1, MAX_RETRIES + 1): try: resp = session.post(API_URL, json=payload, headers=HEADERS, timeout=REQUEST_TIMEOUT) resp.raise_for_status() data = resp.json() if data.get("code") == 200: return data["data"] print(f" [!] API 返回错误: {data.get('msg', '未知错误')}") return None except requests.exceptions.RequestException as e: print(f" [!] 第 {page_num} 页请求失败 (第 {attempt}/{MAX_RETRIES} 次): {e}") if attempt < MAX_RETRIES: time.sleep(RETRY_DELAY * attempt) return None def normalize_record(record): """标准化单条记录: 转换时间戳, 清理空白""" return { "标准编号": (record.get("STD_CODE") or "").strip(), "标准名称": (record.get("C_NAME") or "").strip(), "英文名称": (record.get("E_NAME") or "").strip(), "代替标准": (record.get("REVISE_STD_CODES") or "").strip(), "发布日期": timestamp_to_date(record.get("ISSUE_TIME")), "实施日期": timestamp_to_date(record.get("ACT_TIME")), "标准状态": (record.get("STD_STATUS") or "").strip(), } def load_cache(): """加载缓存 (用于断点续采和去重)""" if os.path.exists(CACHE_FILE): try: with open(CACHE_FILE, "r", encoding="utf-8") as f: return json.load(f) except (json.JSONDecodeError, IOError): pass return {"records": [], "last_page": 0, "keyword": "", "total": 0} def save_cache(cache): """保存缓存""" with open(CACHE_FILE, "w", encoding="utf-8") as f: json.dump(cache, f, ensure_ascii=False) def scrape_all(page_size, keyword="", resume=False): """采集所有数据""" session = requests.Session() all_records = [] start_page = 1 total = 0 # 断点续采 if resume: cache = load_cache() if cache["records"] and cache["keyword"] == keyword: all_records = cache["records"] start_page = cache["last_page"] + 1 total = cache["total"] print(f" [*] 从缓存恢复: 已有 {len(all_records)} 条, 从第 {start_page} 页继续") else: print(" [*] 缓存不匹配, 从头开始采集") # 第一次请求, 获取总页数 first_data = fetch_page(session, start_page, page_size, keyword) if not first_data: print(" [✗] 无法获取数据, 请检查网络连接") return all_records total = first_data.get("total", 0) total_pages = first_data.get("pages", 0) records = [normalize_record(r) for r in (first_data.get("list") or [])] all_records.extend(records) print(f" 总计: {total} 条标准, 共 {total_pages} 页, 每页 {page_size} 条") print(f" 已采集: {len(all_records)}/{total}", end="\r") # 更新缓存 cache = { "records": all_records, "last_page": start_page, "keyword": keyword, "total": total, } save_cache(cache) # 逐页采集 for page_num in range(start_page + 1, total_pages + 1): data = fetch_page(session, page_num, page_size, keyword) if data is None: print(f"\n [!] 第 {page_num} 页采集失败, 保存进度并退出") save_cache(cache) break records = [normalize_record(r) for r in (data.get("list") or [])] all_records.extend(records) # 更新缓存 cache["records"] = all_records cache["last_page"] = page_num save_cache(cache) print(f" 已采集: {len(all_records)}/{total} (第 {page_num}/{total_pages} 页)", end="\r") # 礼貌延迟, 避免给服务器造成压力 time.sleep(0.3) print() return all_records def export_to_excel(records, output_path): """将记录导出为格式化的 Excel 文件""" from openpyxl import Workbook from openpyxl.styles import Font, Alignment, PatternFill, Border, Side from openpyxl.utils import get_column_letter wb = Workbook() ws = wb.active ws.title = "汽车标准" # ── 标题行 ── headers = list(FIELD_MAP.values()) header_font = Font(name="微软雅黑", bold=True, color="FFFFFF", size=11) header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid") header_align = Alignment(horizontal="center", vertical="center", wrap_text=True) thin_border = Border( left=Side(style="thin", color="B4C6E7"), right=Side(style="thin", color="B4C6E7"), top=Side(style="thin", color="B4C6E7"), bottom=Side(style="thin", color="B4C6E7"), ) for col_idx, header in enumerate(headers, 1): cell = ws.cell(row=1, column=col_idx, value=header) cell.font = header_font cell.fill = header_fill cell.alignment = header_align cell.border = thin_border # ── 数据行 ── data_font = Font(name="微软雅黑", size=10) data_align = Alignment(vertical="center", wrap_text=True) even_fill = PatternFill(start_color="D6E4F0", end_color="D6E4F0", fill_type="solid") for row_idx, record in enumerate(records, 2): for col_idx, key in enumerate(FIELD_MAP.keys()): cn_key = FIELD_MAP[key] cell = ws.cell(row=row_idx, column=col_idx + 1, value=record.get(cn_key, "")) cell.font = data_font cell.alignment = data_align cell.border = thin_border if row_idx % 2 == 0: cell.fill = even_fill # ── 列宽 ── for col_idx, header in enumerate(headers, 1): ws.column_dimensions[get_column_letter(col_idx)].width = COLUMN_WIDTHS.get(header, 15) # ── 冻结首行 & 自动筛选 ── ws.freeze_panes = "A2" ws.auto_filter.ref = f"A1:{get_column_letter(len(headers))}{len(records) + 1}" # ── 添加统计信息 Sheet ── ws_stat = wb.create_sheet("统计信息") status_count = {} year_count = {} for r in records: s = r.get("标准状态", "未知") status_count[s] = status_count.get(s, 0) + 1 issue = r.get("发布日期", "") year = issue[:4] if issue else "未知" year_count[year] = year_count.get(year, 0) + 1 stat_data = [ ["采集时间", datetime.now().strftime("%Y-%m-%d %H:%M:%S")], ["数据来源", "全国汽车标准化技术委员会 (catarc.org.cn)"], ["标准总数", len(records)], [], # 空行 ["标准状态分布"], ["状态", "数量"], ] for s, c in sorted(status_count.items(), key=lambda x: -x[1]): stat_data.append([s, c]) stat_data.append([]) stat_data.append(["按发布年份分布"]) stat_data.append(["年份", "数量"]) for y, c in sorted(year_count.items(), reverse=True): stat_data.append([y, c]) for row_idx, row in enumerate(stat_data, 1): for col_idx, val in enumerate(row, 1): cell = ws_stat.cell(row=row_idx, column=col_idx, value=val) if row_idx in (5, stat_data.index([]) + 5 + len(status_count) + 2 + 1): cell.font = Font(name="微软雅黑", bold=True, size=11) else: cell.font = Font(name="微软雅黑", size=10) ws_stat.column_dimensions["A"].width = 20 ws_stat.column_dimensions["B"].width = 50 # ── 保存 ── wb.save(output_path) return output_path def main(): parser = argparse.ArgumentParser( description="全国汽车标准化技术委员会 - 汽车标准数据采集工具", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例: python catarc_scraper.py # 全量采集 python catarc_scraper.py --search "制动" # 搜索"制动"相关标准 python catarc_scraper.py --search "GB 7258" # 按标准编号搜索 python catarc_scraper.py --resume # 断点续采 python catarc_scraper.py --page-size 100 # 每页100条, 减少请求次数 python catarc_scraper.py --output 我的标准 # 自定义输出文件名 """, ) parser.add_argument("--search", "-s", default="", help="搜索关键词 (标准编号或标准名称)") parser.add_argument("--resume", "-r", action="store_true", help="从上次中断处继续采集") parser.add_argument("--page-size", "-p", type=int, default=DEFAULT_PAGE_SIZE, help="每页条数 (默认50, 最大建议100)") parser.add_argument("--output", "-o", default=None, help="输出文件名 (不含扩展名, 默认自动生成)") args = parser.parse_args() # 输出文件名 if args.output: output_name = args.output elif args.search: output_name = f"汽车标准_搜索_{args.search}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" else: output_name = f"汽车标准全量_{datetime.now().strftime('%Y%m%d_%H%M%S')}" output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{output_name}.xlsx") print("=" * 60) print(" 全国汽车标准化技术委员会 - 汽车标准数据采集工具") print("=" * 60) print(f" 数据来源: catarc.org.cn") if args.search: print(f" 搜索关键词: {args.search}") print(f" 每页条数: {args.page_size}") print(f" 输出文件: {output_path}") print("-" * 60) # 开始采集 start_time = time.time() records = scrape_all(args.page_size, args.search, args.resume) elapsed = time.time() - start_time if not records: print("\n 未获取到任何数据") sys.exit(1) # 去重 (按标准编号) seen = set() unique_records = [] for r in records: code = r["标准编号"] if code not in seen: seen.add(code) unique_records.append(r) dup_count = len(records) - len(unique_records) print("-" * 60) print(f" 采集完成! 用时 {elapsed:.1f} 秒") print(f" 获取 {len(records)} 条, 去重后 {len(unique_records)} 条", end="") if dup_count: print(f" (移除 {dup_count} 条重复)") else: print() # 导出 Excel print(f" 正在生成 Excel 文件...") export_to_excel(unique_records, output_path) file_size = os.path.getsize(output_path) / 1024 print(f"\n {'=' * 50}") print(f" 导出完成: {output_path}") print(f" 文件大小: {file_size:.1f} KB") print(f" 标准总数: {len(unique_records)}") print(f" {'=' * 50}") # 清理缓存 if not args.resume and os.path.exists(CACHE_FILE): os.remove(CACHE_FILE) if __name__ == "__main__": main()