"""
National Public Service Platform for Standards Information - industry
standard (automotive) data collection script.

Data source: https://std.samr.gov.cn/hb/hbQuery?initnode=QC%20%E6%B1%BD%E8%BD%A6
API: GET https://std.samr.gov.cn/hb/search/hbPage

Features:
  1. Collect the full set of "QC 汽车" (automotive) industry standards
     (about 990 records).
  2. Collected fields: standard number, standard name, issue date,
     implementation date, industry, status, nature, category, responsible
     body, announcement number, CCS class, ICS class, drafting/revision
     type, filing number, and more.
  3. Keyword search (NOTE: advertised here, but the command-line parser in
     main() does not define a --search option yet).
  4. Resumable collection (progress is cached between runs).
  5. Export to a formatted Excel file (including a statistics sheet).

Usage:
  python samr_qc_scraper.py                      # collect all QC automotive standards
  python samr_qc_scraper.py --search "制动"       # keyword search (see NOTE above)
  python samr_qc_scraper.py --industry "QC 汽车"  # choose the industry (default: QC 汽车)
  python samr_qc_scraper.py --resume             # resume an interrupted run
  python samr_qc_scraper.py --page-size 50       # 50 records per page
  python samr_qc_scraper.py --output 自定义名称    # custom output file name
"""

import argparse
import io
import json
import os
import sys
import time
from collections import Counter
from datetime import datetime

import requests
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils import get_column_letter

# ─── Windows 控制台中文输出修复 ─────────────────────────
|
||
|
|
if sys.platform == "win32":
|
||
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
||
|
|
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")
|
||
|
|
|
||
|
|
# ─── Configuration ──────────────────────────────────────
API_URL = "https://std.samr.gov.cn/hb/search/hbPage"
DETAIL_URL = "https://std.samr.gov.cn/hb/search/stdHBDetailed?id={id}"
SOURCE_PAGE = "https://std.samr.gov.cn/hb/hbQuery?initnode=QC%20%E6%B1%BD%E8%BD%A6"
DEFAULT_PAGE_SIZE = 50
MAX_RETRIES = 5  # attempts per page before giving up
RETRY_DELAY = 3  # seconds; multiplied by the attempt number between retries
REQUEST_TIMEOUT = 30  # seconds, passed to requests as the timeout
CACHE_FILE = ".samr_qc_cache.json"

# Headers sent with every API request. The Referer is the listing page the
# data comes from, so it reuses SOURCE_PAGE instead of repeating the URL.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "Referer": SOURCE_PAGE,
    "X-Requested-With": "XMLHttpRequest",
}

# Field mapping: API field name -> Chinese display name used as the record key
FIELD_MAP = {
    "C_STD_CODE": "标准号",
    "C_NAME": "标准名称",
    "ISSUE_DATE": "发布日期",
    "ACT_DATE": "实施日期",
    "TRADE_DEPT": "所属行业",
    "STATE": "标准状态",
    "STD_NATURE": "标准性质",
    "STD_CATEGORY": "标准类别",
    "CHARGE_DEPT": "归口单位",
    "NOTICE_NO": "发布文号",
    "CCS": "CCS分类",
    "ICS": "ICS分类",
    "ICS_NAME1_1": "ICS分类名称",
    "STD_ZXD": "制修定",
    "RECORD_NO": "备案号",
    "STD_LEVEL": "标准层级",
    "STD_DOMAIN": "标准领域",
    "id": "标准ID",
}

# Excel output columns, in sheet order: (column name, column width)
OUTPUT_COLUMNS = [
    ("标准号", 20),
    ("标准名称", 52),
    ("发布日期", 13),
    ("实施日期", 13),
    ("所属行业", 12),
    ("标准状态", 12),
    ("标准性质", 10),
    ("标准类别", 12),
    ("归口单位", 20),
    ("发布文号", 18),
    ("CCS分类", 10),
    ("ICS分类", 10),
    ("ICS分类名称", 20),
    ("制修定", 8),
    ("备案号", 16),
    ("标准层级", 10),
    ("标准领域", 10),
    ("详情链接", 55),
]


# ─── Network requests ───────────────────────────────────
def fetch_page(session, page_num, page_size, industry="QC 汽车"):
    """Request one page of results from API_URL, retrying on failure.

    Args:
        session: object exposing a requests-style ``get()`` method.
        page_num: page number, sent as the ``pageNumber`` query parameter.
        page_size: records per page, sent as ``pageSize``.
        industry: value sent as the ``op`` query parameter.

    Returns:
        The decoded JSON object (a dict), or None once MAX_RETRIES attempts
        have all failed.
    """
    params = {
        "op": industry,
        "ISSUE_DATE": "",
        "pageNumber": page_num,
        "pageSize": page_size,
    }
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(API_URL, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            # Decode the raw bytes as UTF-8 explicitly instead of relying on
            # whatever charset the response declares.
            data = json.loads(resp.content.decode("utf-8"))
            # Callers use dict methods on the result; treat any other JSON
            # value (null, list, string) as a failed attempt and retry.
            if not isinstance(data, dict):
                raise ValueError(f"unexpected payload type: {type(data).__name__}")
            return data
        except Exception as e:  # deliberate retry boundary: log, back off, retry
            print(f" [!] 第 {page_num} 页请求失败 (第 {attempt}/{MAX_RETRIES} 次): {e}")
            if attempt < MAX_RETRIES:
                # Linear back-off: wait longer after each failed attempt.
                time.sleep(RETRY_DELAY * attempt)
    return None


def normalize_record(record):
    """Convert one raw API record into a dict keyed by display name.

    Every key in FIELD_MAP is copied under its display name. String values
    are stripped, and any falsy value (missing, None, empty, 0) becomes "".
    A "详情链接" entry is added from DETAIL_URL when the record has an id.

    Args:
        record: dict for a single standard, as returned by the API.

    Returns:
        A new dict; the input record is not modified.
    """
    result = {}
    for api_key, cn_name in FIELD_MAP.items():
        val = record.get(api_key, "")
        if isinstance(val, str):
            val = val.strip()
        result[cn_name] = val if val else ""

    # Add the detail-page link
    std_id = record.get("id", "")
    result["详情链接"] = DETAIL_URL.format(id=std_id) if std_id else ""

    # Clean the ICS class name ("43_道路车辆工程" -> "道路车辆工程"). Only
    # strings can carry the prefix, so other value types are left untouched.
    ics_name = result.get("ICS分类名称", "")
    if isinstance(ics_name, str) and "_" in ics_name:
        # Fall back to the full value when nothing follows the underscore.
        result["ICS分类名称"] = ics_name.split("_")[-1] or ics_name
    return result


# ─── Cache ──────────────────────────────────────────────
def load_cache():
    """Load the resume cache from CACHE_FILE.

    Returns:
        A dict that always contains "records", "last_page", "industry" and
        "total". Defaults are used when the file is missing, unreadable,
        not valid JSON, or does not have the expected shape.
    """
    empty = {"records": [], "last_page": 0, "industry": "", "total": 0}
    if not os.path.exists(CACHE_FILE):
        return empty
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            cache = json.load(f)
    except (ValueError, OSError):
        # ValueError covers both malformed JSON and undecodable bytes.
        return empty
    # A file that parses but has the wrong shape would crash callers that
    # index into it, so treat it the same as a missing cache.
    if not isinstance(cache, dict):
        return empty
    if not isinstance(cache.get("records", []), list):
        return empty
    if not isinstance(cache.get("last_page", 0), int):
        return empty
    # Fill in any keys the file lacks while keeping extra ones it has.
    return {**empty, **cache}


def save_cache(cache):
    """Write *cache* to CACHE_FILE as JSON.

    The data is written to a temporary sibling file first and then moved
    over CACHE_FILE with os.replace, so an interruption during the write
    cannot leave a truncated cache behind.

    Args:
        cache: JSON-serialisable dict describing the progress so far.
    """
    tmp_path = CACHE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False)
    os.replace(tmp_path, CACHE_FILE)


# ─── Main collection flow ───────────────────────────────
def scrape_all(page_size, industry="QC 汽车", search_keyword="", resume=False):
    """Collect every page of results and return the normalized records.

    Progress is written to the cache after each successful page. When a
    page fails after all retries, collection stops and the records gathered
    so far are returned.

    Args:
        page_size: records per page; must be at least 1.
        industry: industry filter passed to fetch_page.
        search_keyword: accepted for interface compatibility; it is not
            sent to the API by this function.
        resume: when True, continue from the cached progress if the cache
            belongs to the same industry and page size.

    Returns:
        List of normalized record dicts (possibly partial, possibly empty).

    Raises:
        ValueError: if page_size is smaller than 1.
    """
    if page_size < 1:
        raise ValueError(f"page_size must be >= 1, got {page_size!r}")

    all_records = []
    start_page = 1

    if resume:
        cache = load_cache()
        # Cached page numbers only line up when the page size is unchanged.
        # Caches that do not record a page size are assumed to match.
        resumable = (
            bool(cache.get("records"))
            and cache.get("industry") == industry
            and cache.get("page_size", page_size) == page_size
        )
        if resumable:
            all_records = cache["records"]
            start_page = cache.get("last_page", 0) + 1
            print(f" [*] 从缓存恢复: 已有 {len(all_records)} 条, 从第 {start_page} 页继续")
        else:
            print(" [*] 缓存不匹配, 从头开始采集")

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as session:
        # First request: also tells us the total number of records.
        first_data = fetch_page(session, start_page, page_size, industry)
        if not first_data:
            print(" [!] 无法获取数据, 请检查网络")
            return all_records

        total = first_data.get("total", 0)
        rows = first_data.get("rows") or []
        total_pages = (total + page_size - 1) // page_size  # ceiling division

        all_records.extend(normalize_record(r) for r in rows)

        print(f" 总计: {total} 条标准, 共 {total_pages} 页, 每页 {page_size} 条")
        print(f" 已采集: {len(all_records)}/{total}", end="\r")

        # cache["records"] is the same list object as all_records, so later
        # pages appended to all_records are included in every save.
        cache = {
            "records": all_records,
            "last_page": start_page,
            "industry": industry,
            "total": total,
            "page_size": page_size,
        }
        save_cache(cache)

        for page_num in range(start_page + 1, total_pages + 1):
            data = fetch_page(session, page_num, page_size, industry)
            if data is None:
                print(f"\n [!] 第 {page_num} 页失败, 保存进度退出")
                save_cache(cache)
                break

            rows = data.get("rows") or []
            all_records.extend(normalize_record(r) for r in rows)

            cache["last_page"] = page_num
            save_cache(cache)

            print(f" 已采集: {len(all_records)}/{total} (第 {page_num}/{total_pages} 页)", end="\r")
            time.sleep(0.3)  # short pause between consecutive page requests

    print()
    return all_records


# ─── Excel export ───────────────────────────────────────
def export_to_excel(records, output_path, industry="QC 汽车"):
    """Write *records* to a styled workbook with a statistics sheet.

    The first sheet holds one row per record using OUTPUT_COLUMNS. The
    second sheet lists run metadata and counts by status, nature, category,
    drafting/revision type and issue year.

    Args:
        records: list of dicts as produced by normalize_record.
        output_path: destination path of the .xlsx file.
        industry: industry label shown on the statistics sheet.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "汽车行业标准"

    # ── Styles ──
    hdr_font = Font(name="微软雅黑", bold=True, color="FFFFFF", size=11)
    hdr_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    hdr_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    dat_font = Font(name="微软雅黑", size=10)
    dat_align = Alignment(vertical="center", wrap_text=True)
    even_fill = PatternFill(start_color="D6E4F0", end_color="D6E4F0", fill_type="solid")
    edge = Side(style="thin", color="B4C6E7")
    border = Border(left=edge, right=edge, top=edge, bottom=edge)

    # ── Header row ──
    col_names = [c[0] for c in OUTPUT_COLUMNS]
    for ci, name in enumerate(col_names, 1):
        cell = ws.cell(row=1, column=ci, value=name)
        cell.font = hdr_font
        cell.fill = hdr_fill
        cell.alignment = hdr_align
        cell.border = border

    # ── Data rows (start at row 2; even-numbered rows are shaded) ──
    for ri, rec in enumerate(records, 2):
        shaded = ri % 2 == 0
        for ci, col_name in enumerate(col_names, 1):
            cell = ws.cell(row=ri, column=ci, value=rec.get(col_name, ""))
            cell.font = dat_font
            cell.alignment = dat_align
            cell.border = border
            if shaded:
                cell.fill = even_fill

    # ── Column widths ──
    for ci, (_, width) in enumerate(OUTPUT_COLUMNS, 1):
        ws.column_dimensions[get_column_letter(ci)].width = width

    # ── Freeze header & enable filter ──
    ws.freeze_panes = "A2"
    ws.auto_filter.ref = f"A1:{get_column_letter(len(col_names))}{len(records) + 1}"

    # ── Statistics sheet ──
    ws_stat = wb.create_sheet("统计信息")

    # (section title, label column header, record field that is counted)
    sections = [
        ("── 标准状态分布 ──", "状态", "标准状态"),
        ("── 标准性质分布 ──", "性质", "标准性质"),
        ("── 标准类别分布 ──", "类别", "标准类别"),
        ("── 制修定分布 ──", "类型", "制修定"),
    ]
    counters = {field: Counter() for _, _, field in sections}
    year_count = Counter()

    for rec in records:
        for field, counter in counters.items():
            # str() keeps every label a string even if the API sent a number.
            counter[str(rec.get(field) or "未知")] += 1
        issue = str(rec.get("发布日期") or "")
        # The year is taken as the first four characters of the issue date.
        year_count[issue[:4] if len(issue) >= 4 else "未知"] += 1

    stat_rows = [
        ("采集时间", datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
        ("数据来源", "全国标准信息公共服务平台 (std.samr.gov.cn)"),
        ("所属行业", industry),
        ("标准总数", len(records)),
    ]
    for title, label, field in sections:
        stat_rows += [("", ""), (title, ""), (label, "数量")]
        # most_common() lists values from most to least frequent.
        stat_rows.extend(counters[field].most_common())

    stat_rows += [("", ""), ("── 按发布年份分布 ──", ""), ("年份", "数量")]
    stat_rows.extend(sorted(year_count.items(), reverse=True))

    title_font = Font(name="微软雅黑", bold=True, size=11)
    body_font = Font(name="微软雅黑", size=10)
    for ri, (label, value) in enumerate(stat_rows, 1):
        label_cell = ws_stat.cell(row=ri, column=1, value=label)
        value_cell = ws_stat.cell(row=ri, column=2, value=value)
        # Section titles are the rows whose label starts with the rule glyphs.
        label_cell.font = title_font if label.startswith("──") else body_font
        value_cell.font = body_font

    ws_stat.column_dimensions["A"].width = 28
    ws_stat.column_dimensions["B"].width = 50

    wb.save(output_path)


# ─── Entry point ────────────────────────────────────────
def _dedupe_records(records):
    """Return *records* without repeats, keeping first occurrences in order.

    Records are matched on "标准号", or on "标准ID" when the number is
    empty. Records with neither value cannot be compared and are all kept.
    """
    seen = set()
    unique = []
    for rec in records:
        code = rec.get("标准号", "")
        std_id = rec.get("标准ID", "")
        if code:
            key = ("code", code)
        elif std_id:
            key = ("id", std_id)
        else:
            unique.append(rec)
            continue
        if key not in seen:
            seen.add(key)
            unique.append(rec)
    return unique


def _scrape_finished(page_size, industry):
    """Return True when the cache shows the last page for *industry* was fetched."""
    cache = load_cache()
    if not isinstance(cache, dict) or cache.get("industry") != industry:
        return False
    total = cache.get("total") or 0
    last_page = cache.get("last_page") or 0
    total_pages = (total + page_size - 1) // page_size  # ceiling division
    return last_page >= total_pages


def main():
    """Parse the command line, run the collection and export the workbook.

    Exits with status 1 when no records were collected. The resume cache
    is deleted only after a complete run; after a partial run it is kept
    so a later ``--resume`` can continue from it.
    """
    parser = argparse.ArgumentParser(
        description="全国标准信息公共服务平台 — 行业标准(汽车)数据采集工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python samr_qc_scraper.py                     # 全量采集
  python samr_qc_scraper.py --resume            # 断点续采
  python samr_qc_scraper.py --page-size 50      # 每页50条
  python samr_qc_scraper.py --output QC汽车标准  # 自定义文件名
""",
    )
    parser.add_argument("--industry", "-i", default="QC 汽车", help="行业筛选 (默认: QC 汽车)")
    parser.add_argument("--resume", "-r", action="store_true", help="断点续采")
    parser.add_argument("--page-size", "-p", type=int, default=DEFAULT_PAGE_SIZE, help="每页条数 (默认50)")
    parser.add_argument("--output", "-o", default=None, help="输出文件名 (不含扩展名)")

    args = parser.parse_args()

    # A page size below 1 cannot be turned into a page count.
    if args.page_size < 1:
        parser.error("--page-size 必须为正整数")

    if args.output:
        output_name = args.output
    else:
        output_name = f"行业标准_QC汽车_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    # The workbook is written next to this script, not into the working directory.
    output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{output_name}.xlsx")

    print("=" * 60)
    print(" 全国标准信息公共服务平台 — 行业标准数据采集工具")
    print("=" * 60)
    print(" 数据来源: std.samr.gov.cn")
    print(f" 所属行业: {args.industry}")
    print(f" 每页条数: {args.page_size}")
    print(f" 输出文件: {output_path}")
    print("-" * 60)

    start_time = time.time()
    records = scrape_all(args.page_size, args.industry, resume=args.resume)
    elapsed = time.time() - start_time

    if not records:
        print(" 未获取到任何数据")
        sys.exit(1)

    # Remove duplicates
    unique = _dedupe_records(records)
    dup = len(records) - len(unique)

    print("-" * 60)
    print(f" 采集完成! 用时 {elapsed:.1f} 秒")
    print(f" 获取 {len(records)} 条, 去重后 {len(unique)} 条", end="")
    print(f" (移除 {dup} 条重复)" if dup else "")

    print(" 生成 Excel 文件...")
    export_to_excel(unique, output_path)
    size = os.path.getsize(output_path) / 1024

    print(f"\n {'=' * 50}")
    print(f" 导出完成: {output_path}")
    print(f" 文件大小: {size:.1f} KB")
    print(f" 标准总数: {len(unique)}")
    print(f" {'=' * 50}")

    # Drop the cache only once every page has been fetched. After a partial
    # run it is the saved progress that --resume needs.
    if _scrape_finished(args.page_size, args.industry):
        if os.path.exists(CACHE_FILE):
            os.remove(CACHE_FILE)
    else:
        print(" [!] 采集未完成, 已保留缓存, 可使用 --resume 继续")


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()