""" 全国标准信息公共服务平台 - 行业标准(汽车)数据采集脚本 数据来源: https://std.samr.gov.cn/hb/hbQuery?initnode=QC%20%E6%B1%BD%E8%BD%A6 API: GET https://std.samr.gov.cn/hb/search/hbPage 功能: 1. 全量采集 QC 汽车行业标准数据 (共约 990 条) 2. 采集字段: 标准号、标准名称、发布日期、实施日期、所属行业、标准状态、 标准性质、标准类别、归口单位、发布文号、CCS分类、ICS分类、制修定、备案号 等 3. 支持关键词搜索 4. 支持断点续采 5. 导出为格式化的 Excel 文件 (含统计 Sheet) 用法: python samr_qc_scraper.py # 全量采集 QC 汽车行业标准 python samr_qc_scraper.py --search "制动" # 搜索关键词 python samr_qc_scraper.py --industry "QC 汽车" # 指定行业 (默认 QC 汽车) python samr_qc_scraper.py --resume # 断点续采 python samr_qc_scraper.py --page-size 50 # 每页50条 python samr_qc_scraper.py --output 自定义名称 # 自定义输出文件名 """ import sys import io import os import json import time import argparse from datetime import datetime import requests from openpyxl import Workbook from openpyxl.styles import Font, Alignment, PatternFill, Border, Side from openpyxl.utils import get_column_letter # ─── Windows 控制台中文输出修复 ───────────────────────── if sys.platform == "win32": sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace") # ─── 配置 ─────────────────────────────────────────────── API_URL = "https://std.samr.gov.cn/hb/search/hbPage" DETAIL_URL = "https://std.samr.gov.cn/hb/search/stdHBDetailed?id={id}" SOURCE_PAGE = "https://std.samr.gov.cn/hb/hbQuery?initnode=QC%20%E6%B1%BD%E8%BD%A6" DEFAULT_PAGE_SIZE = 50 MAX_RETRIES = 5 RETRY_DELAY = 3 REQUEST_TIMEOUT = 30 CACHE_FILE = ".samr_qc_cache.json" HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", "Accept": "application/json, text/plain, */*", "Referer": "https://std.samr.gov.cn/hb/hbQuery?initnode=QC%20%E6%B1%BD%E8%BD%A6", "X-Requested-With": "XMLHttpRequest", } # 字段映射: API 字段 → 中文显示名 FIELD_MAP = { "C_STD_CODE": "标准号", "C_NAME": "标准名称", "ISSUE_DATE": "发布日期", "ACT_DATE": "实施日期", "TRADE_DEPT": "所属行业", "STATE": "标准状态", "STD_NATURE": "标准性质", "STD_CATEGORY": "标准类别", "CHARGE_DEPT": "归口单位", "NOTICE_NO": "发布文号", "CCS": "CCS分类", "ICS": "ICS分类", "ICS_NAME1_1": "ICS分类名称", "STD_ZXD": "制修定", "RECORD_NO": "备案号", "STD_LEVEL": "标准层级", "STD_DOMAIN": "标准领域", "id": "标准ID", } # Excel 输出列定义 (列名, 列宽) OUTPUT_COLUMNS = [ ("标准号", 20), ("标准名称", 52), ("发布日期", 13), ("实施日期", 13), ("所属行业", 12), ("标准状态", 12), ("标准性质", 10), ("标准类别", 12), ("归口单位", 20), ("发布文号", 18), ("CCS分类", 10), ("ICS分类", 10), ("ICS分类名称", 20), ("制修定", 8), ("备案号", 16), ("标准层级", 10), ("标准领域", 10), ("详情链接", 55), ] # ─── 网络请求 ─────────────────────────────────────────── def fetch_page(session, page_num, page_size, industry="QC 汽车"): """请求单页数据""" params = { "op": industry, "ISSUE_DATE": "", "pageNumber": page_num, "pageSize": page_size, } for attempt in range(1, MAX_RETRIES + 1): try: resp = session.get(API_URL, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT) resp.raise_for_status() data = json.loads(resp.content.decode("utf-8")) return data except Exception as e: print(f" [!] 第 {page_num} 页请求失败 (第 {attempt}/{MAX_RETRIES} 次): {e}") if attempt < MAX_RETRIES: time.sleep(RETRY_DELAY * attempt) return None def normalize_record(record): """标准化单条记录""" result = {} for api_key, cn_name in FIELD_MAP.items(): val = record.get(api_key, "") if isinstance(val, str): val = val.strip() result[cn_name] = val if val else "" # 补充详情链接 std_id = record.get("id", "") result["详情链接"] = DETAIL_URL.format(id=std_id) if std_id else "" # 清理 ICS 分类名称 (格式: "43_道路车辆工程" → "道路车辆工程") ics_name = result.get("ICS分类名称", "") if "_" in ics_name: result["ICS分类名称"] = ics_name.split("_")[-1] or ics_name return result # ─── 缓存 ─────────────────────────────────────────────── def load_cache(): if os.path.exists(CACHE_FILE): try: with open(CACHE_FILE, "r", encoding="utf-8") as f: return json.load(f) except (json.JSONDecodeError, IOError): pass return {"records": [], "last_page": 0, "industry": "", "total": 0} def save_cache(cache): with open(CACHE_FILE, "w", encoding="utf-8") as f: json.dump(cache, f, ensure_ascii=False) # ─── 主采集流程 ───────────────────────────────────────── def scrape_all(page_size, industry="QC 汽车", search_keyword="", resume=False): """采集所有数据""" session = requests.Session() all_records = [] start_page = 1 total = 0 if resume: cache = load_cache() if cache["records"] and cache["industry"] == industry: all_records = cache["records"] start_page = cache["last_page"] + 1 total = cache["total"] print(f" [*] 从缓存恢复: 已有 {len(all_records)} 条, 从第 {start_page} 页继续") else: print(" [*] 缓存不匹配, 从头开始采集") # 首次请求, 获取总数 first_data = fetch_page(session, start_page, page_size, industry) if not first_data: print(" [!] 无法获取数据, 请检查网络") return all_records total = first_data.get("total", 0) rows = first_data.get("rows") or [] total_pages = (total + page_size - 1) // page_size records = [normalize_record(r) for r in rows] all_records.extend(records) print(f" 总计: {total} 条标准, 共 {total_pages} 页, 每页 {page_size} 条") print(f" 已采集: {len(all_records)}/{total}", end="\r") cache = {"records": all_records, "last_page": start_page, "industry": industry, "total": total} save_cache(cache) for page_num in range(start_page + 1, total_pages + 1): data = fetch_page(session, page_num, page_size, industry) if data is None: print(f"\n [!] 第 {page_num} 页失败, 保存进度退出") save_cache(cache) break rows = data.get("rows") or [] records = [normalize_record(r) for r in rows] all_records.extend(records) cache["records"] = all_records cache["last_page"] = page_num save_cache(cache) print(f" 已采集: {len(all_records)}/{total} (第 {page_num}/{total_pages} 页)", end="\r") time.sleep(0.3) print() return all_records # ─── Excel 导出 ───────────────────────────────────────── def export_to_excel(records, output_path): wb = Workbook() ws = wb.active ws.title = "汽车行业标准" # ── 样式定义 ── hdr_font = Font(name="微软雅黑", bold=True, color="FFFFFF", size=11) hdr_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid") hdr_align = Alignment(horizontal="center", vertical="center", wrap_text=True) dat_font = Font(name="微软雅黑", size=10) dat_align = Alignment(vertical="center", wrap_text=True) even_fill = PatternFill(start_color="D6E4F0", end_color="D6E4F0", fill_type="solid") border = Border( left=Side(style="thin", color="B4C6E7"), right=Side(style="thin", color="B4C6E7"), top=Side(style="thin", color="B4C6E7"), bottom=Side(style="thin", color="B4C6E7"), ) # ── 标题行 ── col_names = [c[0] for c in OUTPUT_COLUMNS] for ci, name in enumerate(col_names, 1): cell = ws.cell(row=1, column=ci, value=name) cell.font = hdr_font cell.fill = hdr_fill cell.alignment = hdr_align cell.border = border # ── 数据行 ── for ri, rec in enumerate(records, 2): for ci, (col_name, _) in enumerate(OUTPUT_COLUMNS, 1): val = rec.get(col_name, "") cell = ws.cell(row=ri, column=ci, value=val) cell.font = dat_font cell.alignment = dat_align cell.border = border if ri % 2 == 0: cell.fill = even_fill # ── 列宽 ── for ci, (_, w) in enumerate(OUTPUT_COLUMNS, 1): ws.column_dimensions[get_column_letter(ci)].width = w # ── 冻结 & 筛选 ── ws.freeze_panes = "A2" ws.auto_filter.ref = f"A1:{get_column_letter(len(col_names))}{len(records) + 1}" # ── 统计信息 Sheet ── ws_stat = wb.create_sheet("统计信息") status_count = {} nature_count = {} category_count = {} zxd_count = {} year_count = {} for r in records: for field, target in [ ("标准状态", status_count), ("标准性质", nature_count), ("标准类别", category_count), ("制修定", zxd_count), ]: v = r.get(field, "未知") or "未知" target[v] = target.get(v, 0) + 1 issue = r.get("发布日期", "") or "" year = issue[:4] if len(issue) >= 4 else "未知" year_count[year] = year_count.get(year, 0) + 1 stat_rows = [ ("采集时间", datetime.now().strftime("%Y-%m-%d %H:%M:%S")), ("数据来源", "全国标准信息公共服务平台 (std.samr.gov.cn)"), ("所属行业", "QC 汽车"), ("标准总数", len(records)), ("", ""), ("── 标准状态分布 ──", ""), ("状态", "数量"), ] for k, v in sorted(status_count.items(), key=lambda x: -x[1]): stat_rows.append((k, v)) stat_rows += [("", ""), ("── 标准性质分布 ──", ""), ("性质", "数量")] for k, v in sorted(nature_count.items(), key=lambda x: -x[1]): stat_rows.append((k, v)) stat_rows += [("", ""), ("── 标准类别分布 ──", ""), ("类别", "数量")] for k, v in sorted(category_count.items(), key=lambda x: -x[1]): stat_rows.append((k, v)) stat_rows += [("", ""), ("── 制修定分布 ──", ""), ("类型", "数量")] for k, v in sorted(zxd_count.items(), key=lambda x: -x[1]): stat_rows.append((k, v)) stat_rows += [("", ""), ("── 按发布年份分布 ──", ""), ("年份", "数量")] for y, c in sorted(year_count.items(), reverse=True): stat_rows.append((y, c)) for ri, (a, b) in enumerate(stat_rows, 1): ca = ws_stat.cell(row=ri, column=1, value=a) cb = ws_stat.cell(row=ri, column=2, value=b) if a.startswith("──"): ca.font = Font(name="微软雅黑", bold=True, size=11) else: ca.font = Font(name="微软雅黑", size=10) cb.font = Font(name="微软雅黑", size=10) ws_stat.column_dimensions["A"].width = 28 ws_stat.column_dimensions["B"].width = 50 wb.save(output_path) # ─── 入口 ─────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser( description="全国标准信息公共服务平台 — 行业标准(汽车)数据采集工具", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例: python samr_qc_scraper.py # 全量采集 python samr_qc_scraper.py --resume # 断点续采 python samr_qc_scraper.py --page-size 50 # 每页50条 python samr_qc_scraper.py --output QC汽车标准 # 自定义文件名 """, ) parser.add_argument("--industry", "-i", default="QC 汽车", help="行业筛选 (默认: QC 汽车)") parser.add_argument("--resume", "-r", action="store_true", help="断点续采") parser.add_argument("--page-size", "-p", type=int, default=DEFAULT_PAGE_SIZE, help="每页条数 (默认50)") parser.add_argument("--output", "-o", default=None, help="输出文件名 (不含扩展名)") args = parser.parse_args() if args.output: output_name = args.output else: output_name = f"行业标准_QC汽车_{datetime.now().strftime('%Y%m%d_%H%M%S')}" output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{output_name}.xlsx") print("=" * 60) print(" 全国标准信息公共服务平台 — 行业标准数据采集工具") print("=" * 60) print(f" 数据来源: std.samr.gov.cn") print(f" 所属行业: {args.industry}") print(f" 每页条数: {args.page_size}") print(f" 输出文件: {output_path}") print("-" * 60) start_time = time.time() records = scrape_all(args.page_size, args.industry, resume=args.resume) elapsed = time.time() - start_time if not records: print(" 未获取到任何数据") sys.exit(1) # 去重 seen = set() unique = [] for r in records: code = r.get("标准号", "") if code not in seen: seen.add(code) unique.append(r) dup = len(records) - len(unique) print("-" * 60) print(f" 采集完成! 用时 {elapsed:.1f} 秒") print(f" 获取 {len(records)} 条, 去重后 {len(unique)} 条", end="") print(f" (移除 {dup} 条重复)" if dup else "") print(" 生成 Excel 文件...") export_to_excel(unique, output_path) size = os.path.getsize(output_path) / 1024 print(f"\n {'=' * 50}") print(f" 导出完成: {output_path}") print(f" 文件大小: {size:.1f} KB") print(f" 标准总数: {len(unique)}") print(f" {'=' * 50}") if not args.resume and os.path.exists(CACHE_FILE): os.remove(CACHE_FILE) if __name__ == "__main__": main()