"""Scraper for ISO standards published on the CATARC standardization site.

Data source: https://www.catarc.org.cn/gjbzh/isoiec/iso/gzdt/fbbz/index.html
Method: parses server-rendered HTML pages (no API involved).

Features:
    1. Discovers every sub-page and collects the ISO standard tables on them.
    2. Copes with table column layouts that differ between periods (old and
       new formats are handled by the same code).
    3. Collected fields: serial number, organization, document number,
       English title, Chinese title, replaced standard, sub-committee.
    4. Adds derived fields: publication batch, publication date, source page.
    5. Supports resuming an interrupted run.
    6. Exports a formatted Excel workbook (including a statistics sheet).

Usage:
    python catarc_iso_scraper.py                  # full run
    python catarc_iso_scraper.py --resume         # resume an interrupted run
    python catarc_iso_scraper.py --output result  # custom output file name
    python catarc_iso_scraper.py --delay 1.0      # custom request interval (s)
"""

import sys
import io
import os
import re
import json
import time
import argparse
from datetime import datetime
from html.parser import HTMLParser
from urllib.parse import urljoin

import requests
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils import get_column_letter

# ─── Fix for Chinese output on the Windows console ─────────
if sys.platform == "win32":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

# ─── Configuration ──────────────────────────────────────
BASE_URL = "https://www.catarc.org.cn"
LIST_PAGE_PATTERN = "/gjbzh/isoiec/iso/gzdt/fbbz/index{page}.html"
CACHE_FILE = ".catarc_iso_cache.json"
MAX_RETRIES = 3
DEFAULT_DELAY = 0.5  # seconds
REQUEST_TIMEOUT = 30

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}

# Known header keywords -> canonical field name
HEADER_KEYWORDS = {
    "序号": "序号",
    "所属机构": "所属机构",
    "文件号": "文件号",
    "标准号": "文件号",
    "英文名称": "英文名称",
    "中文名称": "中文名称",
    "所属分技术委员会": "所属分技术委员会",
    "分技术委员会": "所属分技术委员会",
    "代替": "代替标准",
    "代替标准": "代替标准",
}

# Excel column definitions as (field name, column width), in output order
OUTPUT_COLUMNS = [
    ("序号", 8),
    ("所属机构", 28),
    ("文件号", 25),
    ("英文名称", 60),
    ("中文名称", 50),
    ("所属分技术委员会", 18),
    ("代替标准", 22),
    ("发布批次", 22),
    ("发布日期", 14),
    ("来源页面", 55),
]


# ─── HTML parser ────────────────────────────────────────
class TableParser(HTMLParser):
    """Collect the cell text of every top-level HTML table fed to the parser.

    After ``feed()`` (and ``close()``), ``tables`` holds one entry per
    top-level ``<table>``: a list of rows, each row a list of
    whitespace-normalized cell strings. Text inside nested tables is folded
    into the enclosing cell, and ``<script>``/``<style>`` content is ignored.

    Omitted ``</td>``/``</th>``/``</tr>`` end tags are tolerated: an open
    cell or row is closed implicitly when the next cell, the next row or
    the end of the table is reached.
    """

    # Tags rendered as a space inside a cell, so that "A<br>B" yields "A B"
    # rather than "AB".
    _BREAK_TAGS = frozenset({"br", "p", "div", "li"})
    _CELL_TAGS = ("td", "th")

    def __init__(self):
        # convert_charrefs=True is the library default; it is spelled out
        # because handle_data() relies on receiving already-decoded text.
        super().__init__(convert_charrefs=True)
        self.tables = []  # list of tables -> list of rows -> list of str
        self._current_table = []
        self._current_row = []
        self._current_cell = []
        self._in_table = 0  # nesting depth of <table> elements
        self._in_td = False  # inside a cell of a top-level table
        self._skip = False  # inside <script>/<style>

    def _close_cell(self):
        """Finish the open cell, if any, and append its text to the row."""
        if not self._in_td:
            return
        self._in_td = False
        # split()/join collapses every whitespace run (including the
        # spaces inserted for break tags) into a single space.
        self._current_row.append(" ".join("".join(self._current_cell).split()))
        self._current_cell = []

    def _close_row(self):
        """Finish the open row, if any, and append it to the table."""
        self._close_cell()
        if self._current_row:
            self._current_table.append(self._current_row)
        self._current_row = []

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "table":
            self._in_table += 1
            if self._in_table == 1:
                self._current_table = []
                self._current_row = []
                self._current_cell = []
                self._in_td = False
            return
        if tag in ("script", "style"):
            self._skip = True
            return
        if self._in_table == 1:
            if tag == "tr":
                # A new row implicitly ends the previous one.
                self._close_row()
                return
            if tag in self._CELL_TAGS:
                # A new cell implicitly ends the previous one.
                self._close_cell()
                self._in_td = True
                self._current_cell = []
                return
        # Break tags, and the row/cell tags of a nested table, separate
        # pieces of text inside the enclosing cell.
        if self._in_td and (
            tag in self._BREAK_TAGS or tag == "tr" or tag in self._CELL_TAGS
        ):
            self._current_cell.append(" ")

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "table":
            if self._in_table == 1:
                self._close_row()
                if self._current_table:
                    self.tables.append(self._current_table)
                self._current_table = []
            self._in_table = max(0, self._in_table - 1)
        elif tag in ("script", "style"):
            self._skip = False
        elif self._in_table == 1 and tag == "tr":
            self._close_row()
        elif self._in_table == 1 and tag in self._CELL_TAGS:
            self._close_cell()
        elif self._in_td and tag in self._BREAK_TAGS:
            self._current_cell.append(" ")

    def handle_data(self, data):
        if self._skip:
            return
        # Text that sits directly in a table or row, outside any cell, is
        # deliberately ignored.
        if self._in_td:
            self._current_cell.append(data)

    def handle_entityref(self, name):
        # With convert_charrefs=True the base class decodes entities itself
        # and does not call this hook; it is kept so the class still copes
        # if that flag is switched off on an instance.
        char_map = {"nbsp": " ", "amp": "&", "lt": "<", "gt": ">", "mdash": "—", "ndash": "–"}
        if self._in_td:
            self._current_cell.append(char_map.get(name, f"&{name};"))
def parse_html_tables(html_text):
    """Parse HTML text and return the data of all tables found in it.

    Args:
        html_text: HTML source as a string. ``None`` or an empty string
            yields an empty list.

    Returns:
        The ``tables`` list built by ``TableParser``. If parsing aborts with
        an exception, a warning is printed and the tables completed up to
        that point are returned (best effort).
    """
    parser = TableParser()
    if not html_text:
        return parser.tables
    try:
        parser.feed(html_text)
        # close() forces the parser to process text it is still buffering,
        # as if the input were followed by end-of-file.
        parser.close()
    except Exception as exc:
        # Best effort: a malformed page must not abort the whole run, but
        # the failure should be visible rather than silently swallowed.
        print(f" [!] HTML 解析异常, 仅保留已解析的表格: {exc}")
    return parser.tables
def normalize_headers(headers, keywords=None):
    """Map raw table header texts to canonical field names.

    Each header is searched for the known keywords. When several keywords
    occur in the same header, the one that starts earliest wins and ties go
    to the longest keyword. This makes the result independent of the
    mapping's insertion order: a compound header such as "代替标准号"
    resolves through "代替标准" instead of through the embedded "标准号".

    Args:
        headers: Iterable of header cell strings.
        keywords: Optional mapping of keyword -> canonical field name.
            Defaults to the module-level ``HEADER_KEYWORDS``.

    Returns:
        A list with one entry per header: the canonical field name, or the
        stripped header text when no keyword matches.
    """
    if keywords is None:
        keywords = HEADER_KEYWORDS

    result = []
    for raw in headers:
        best_rank = None
        best_field = None
        for keyword, field in keywords.items():
            pos = raw.find(keyword)
            if pos < 0:
                continue
            # Lower rank is better: earliest start, then longest keyword.
            rank = (pos, -len(keyword))
            if best_rank is None or rank < best_rank:
                best_rank, best_field = rank, field
        result.append(best_field if best_field else raw.strip())
    return result
def extract_records_from_table(table):
    """Convert a parsed table into a list of record dicts.

    The first row is treated as the header row and passed through
    ``normalize_headers``. Each following row becomes a dict keyed by the
    normalized header names; cells beyond the last header are stored under
    ``col_<index>``. Rows with fewer than two cells, rows whose first cell
    is empty, and rows that repeat the header are skipped.

    Args:
        table: List of rows, each row a list of cell strings.

    Returns:
        A ``(records, headers)`` tuple; both are empty lists when the table
        has fewer than two rows.
    """
    if len(table) < 2:
        return [], []

    # The first row is the header row.
    header_row, *body_rows = table
    headers = normalize_headers(header_row)
    header_count = len(headers)
    repeated_header_marks = ("序号", "所属机构", "文件号")

    records = []
    for cells in body_rows:
        if len(cells) < 2:
            continue

        # Skip blank rows and header rows repeated inside the table body.
        lead = cells[0].strip()
        if lead == "" or lead in repeated_header_marks:
            continue

        # zip() stops at the shorter sequence, pairing each header with its cell.
        record = dict(zip(headers, cells))
        for extra_idx in range(header_count, len(cells)):
            record[f"col_{extra_idx}"] = cells[extra_idx]
        records.append(record)

    return records, headers
# ─── 网络请求 ───────────────────────────────────────────
def fetch_page(session, url):
    """Fetch a page and return its body decoded as UTF-8 text.

    The request is attempted up to ``MAX_RETRIES`` times. After a failed
    attempt that is not the last one, the function sleeps ``2 * attempt``
    seconds before retrying.

    Args:
        session: Object providing a requests-style ``get(url, headers=...,
            timeout=...)`` method (``main`` passes a ``requests.Session``).
        url: Absolute URL to fetch.

    Returns:
        The decoded HTML text, or ``None`` if every attempt failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f" [!] 请求失败 (第 {attempt}/{MAX_RETRIES} 次): {e}")
            if attempt < MAX_RETRIES:
                time.sleep(2 * attempt)
            continue
        # Decode explicitly as UTF-8 instead of relying on requests'
        # encoding detection. errors="replace" keeps a stray invalid byte
        # from raising UnicodeDecodeError, which the handler above would
        # not catch and which would abort the whole run.
        return resp.content.decode("utf-8", errors="replace")
    return None
# ─── 列表页解析 ─────────────────────────────────────────
def discover_subpages(session):
"""从列表页发现所有子页面, 返回 [(url, title, date), ...]"""
subpages = []
page_num = 1
while True:
if page_num == 1:
page_suffix = ""
else:
page_suffix = f"_{page_num}"
url = BASE_URL + LIST_PAGE_PATTERN.format(page=page_suffix)
html = fetch_page(session, url)
if not html:
break
# 提取子页面链接: |
# NOTE(review): the commented line below is the surviving tail of the
# detail-page scraper (it ends with ``return records``). The beginning of that
# function and the end of discover_subpages() are missing from this file --
# text between angle brackets appears to have been stripped. The fragment is
# preserved verbatim; restore the complete functions from version control.
# | ]*>(.*?) | ', row, re.S) if not cells or len(cells) < 2: continue cells_clean = [re.sub(r'<[^>]+>', ' ', c).strip() for c in cells] record = {} for i, val in enumerate(cells_clean): if i < len(headers): record[headers[i]] = val record["发布批次"] = title record["发布日期"] = date records.append(record) return records


# ─── Cache management ───────────────────────────────────
def load_cache(path=None):
    """Load the resume checkpoint.

    Args:
        path: Checkpoint file to read. Defaults to ``CACHE_FILE``.

    Returns:
        A dict with at least the keys ``records`` (list), ``done_urls``
        (list) and ``total_pages`` (int). A missing, unreadable or
        malformed file yields the empty default state; a key that is
        missing or has the wrong type is replaced by its default, so
        callers can index the result without checking.
    """
    defaults = {"records": [], "done_urls": [], "total_pages": 0}
    cache_path = CACHE_FILE if path is None else path
    if not os.path.exists(cache_path):
        return defaults
    try:
        with open(cache_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return defaults
    if not isinstance(loaded, dict):
        return defaults
    for key, fallback in defaults.items():
        if not isinstance(loaded.get(key), type(fallback)):
            loaded[key] = fallback
    return loaded


def save_cache(cache, path=None):
    """Persist the resume checkpoint as JSON.

    The data is written to a temporary sibling file and then moved over the
    target with ``os.replace``, so an interruption in the middle of a write
    cannot leave a truncated checkpoint behind.

    Args:
        cache: JSON-serializable checkpoint dict.
        path: Checkpoint file to write. Defaults to ``CACHE_FILE``.
    """
    cache_path = CACHE_FILE if path is None else path
    tmp_path = f"{cache_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False)
        os.replace(tmp_path, cache_path)
    finally:
        # Only still present if the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# ─── Excel export ───────────────────────────────────────
def export_to_excel(records, output_path):
    """Write the records to a formatted Excel workbook.

    The first sheet holds one row per record with the columns listed in
    ``OUTPUT_COLUMNS``; the second sheet holds summary statistics
    (counts per organization, year, batch and sub-committee).

    Args:
        records: List of record dicts keyed by the ``OUTPUT_COLUMNS`` names.
        output_path: Destination path of the ``.xlsx`` file.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "ISO发布标准"

    # ── Header row ──
    header_font = Font(name="微软雅黑", bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    thin_border = Border(
        left=Side(style="thin", color="B4C6E7"),
        right=Side(style="thin", color="B4C6E7"),
        top=Side(style="thin", color="B4C6E7"),
        bottom=Side(style="thin", color="B4C6E7"),
    )

    col_names = [col[0] for col in OUTPUT_COLUMNS]
    for col_idx, name in enumerate(col_names, 1):
        cell = ws.cell(row=1, column=col_idx, value=name)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        cell.border = thin_border

    # ── Data rows ──
    data_font = Font(name="微软雅黑", size=10)
    data_align = Alignment(vertical="center", wrap_text=True)
    even_fill = PatternFill(start_color="D6E4F0", end_color="D6E4F0", fill_type="solid")

    for row_idx, record in enumerate(records, 2):
        for col_idx, (col_name, _) in enumerate(OUTPUT_COLUMNS, 1):
            val = record.get(col_name, "")
            if col_name == "序号":
                # Store the serial number as an integer when it parses as one.
                try:
                    val = int(val)
                except (ValueError, TypeError):
                    pass
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.font = data_font
            cell.alignment = data_align
            cell.border = thin_border
            if row_idx % 2 == 0:
                cell.fill = even_fill

    # ── Column widths ──
    for col_idx, (name, width) in enumerate(OUTPUT_COLUMNS, 1):
        ws.column_dimensions[get_column_letter(col_idx)].width = width

    # ── Freeze header & enable filter ──
    ws.freeze_panes = "A2"
    ws.auto_filter.ref = f"A1:{get_column_letter(len(col_names))}{len(records) + 1}"

    # ── Statistics sheet ──
    ws_stat = wb.create_sheet("统计信息")

    # Frequency tables; missing or empty values are counted under "未知",
    # except for the sub-committee, where they are left out.
    org_count = {}
    batch_count = {}
    year_count = {}
    tc_count = {}
    for r in records:
        org = r.get("所属机构", "未知") or "未知"
        org_count[org] = org_count.get(org, 0) + 1
        batch = r.get("发布批次", "未知") or "未知"
        batch_count[batch] = batch_count.get(batch, 0) + 1
        pub_date = r.get("发布日期", "") or ""
        # The year is taken as the first four characters of the date text.
        year = pub_date[:4] if pub_date else "未知"
        year_count[year] = year_count.get(year, 0) + 1
        tc = r.get("所属分技术委员会", "") or ""
        if tc:
            tc_count[tc] = tc_count.get(tc, 0) + 1

    stat_rows = [
        ("采集时间", datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
        ("数据来源", "全国汽车标准化技术委员会 — ISO发布标准"),
        ("来源网址", "https://www.catarc.org.cn/gjbzh/isoiec/iso/gzdt/fbbz/index.html"),
        ("标准总数", len(records)),
        ("子页面数", len(batch_count)),
        ("", ""),
        ("── 所属机构分布 (Top 30) ──", ""),
        ("所属机构", "标准数量"),
    ]
    for org, cnt in sorted(org_count.items(), key=lambda x: -x[1])[:30]:
        stat_rows.append((org, cnt))

    stat_rows.append(("", ""))
    stat_rows.append(("── 按发布年份分布 ──", ""))
    stat_rows.append(("年份", "标准数量"))
    for y, cnt in sorted(year_count.items(), reverse=True):
        stat_rows.append((y, cnt))

    stat_rows.append(("", ""))
    stat_rows.append(("── 按发布批次分布 ──", ""))
    stat_rows.append(("批次", "标准数量"))
    for b, cnt in sorted(batch_count.items(), key=lambda x: -x[1]):
        stat_rows.append((b, cnt))

    if tc_count:
        stat_rows.append(("", ""))
        stat_rows.append(("── 分技术委员会分布 (Top 20) ──", ""))
        stat_rows.append(("分技术委员会", "标准数量"))
        for tc, cnt in sorted(tc_count.items(), key=lambda x: -x[1])[:20]:
            stat_rows.append((tc, cnt))

    for row_idx, (a, b) in enumerate(stat_rows, 1):
        cell_a = ws_stat.cell(row=row_idx, column=1, value=a)
        cell_b = ws_stat.cell(row=row_idx, column=2, value=b)
        # Section titles (rows starting with "──") are shown in bold.
        if a.startswith("──"):
            cell_a.font = Font(name="微软雅黑", bold=True, size=11)
        else:
            cell_a.font = Font(name="微软雅黑", size=10)
        cell_b.font = Font(name="微软雅黑", size=10)

    ws_stat.column_dimensions["A"].width = 50
    ws_stat.column_dimensions["B"].width = 20

    wb.save(output_path)


# ─── Main flow ──────────────────────────────────────────
def main():
    """Command-line entry point: discover sub-pages, scrape them, export.

    Progress is checkpointed after every sub-page via ``save_cache``. With
    ``--resume`` the checkpoint is loaded first and sub-pages already
    listed in it are skipped. The process exits with status 1 when no
    sub-page is discovered or no record is collected. The checkpoint file
    is removed after a successful run that was not started with
    ``--resume``.
    """
    parser = argparse.ArgumentParser(
        description="全国汽车标准化技术委员会 — ISO发布标准数据采集工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=""" 示例: python catarc_iso_scraper.py # 全量采集 python catarc_iso_scraper.py --resume # 断点续采 python catarc_iso_scraper.py --delay 1.0 # 每次请求间隔1秒 python catarc_iso_scraper.py --output ISO标准 # 自定义输出文件名 """,
    )
    parser.add_argument("--resume", "-r", action="store_true", help="从上次中断处继续采集")
    parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY, help="请求间隔秒数 (默认0.5)")
    parser.add_argument("--output", "-o", default=None, help="输出文件名 (不含扩展名)")
    args = parser.parse_args()

    if args.output:
        output_name = args.output
    else:
        output_name = f"ISO发布标准_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    # The workbook is written next to this script.
    output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{output_name}.xlsx")

    print("=" * 60)
    print(" 全国汽车标准化技术委员会 — ISO发布标准数据采集工具")
    print("=" * 60)
    print(f" 数据来源: catarc.org.cn/gjbzh/isoiec/iso/gzdt/fbbz/")
    print(f" 请求间隔: {args.delay}s")
    print(f" 输出文件: {output_path}")
    print("-" * 60)

    session = requests.Session()
    all_records = []
    done_urls = set()

    # Resume from the checkpoint
    if args.resume:
        cache = load_cache()
        if cache["records"]:
            all_records = cache["records"]
            done_urls = set(cache["done_urls"])
            print(f" [*] 从缓存恢复: 已有 {len(all_records)} 条, {len(done_urls)} 个子页面已完成")

    # Step 1: discover the sub-pages
    print(" [1/3] 发现子页面...")
    subpages = discover_subpages(session)
    print(f" 共发现 {len(subpages)} 个子页面")
    if not subpages:
        print(" [!] 未发现任何子页面, 请检查网络")
        sys.exit(1)

    # Step 2: scrape the sub-pages one by one
    print(f" [2/3] 采集标准数据...")
    start_time = time.time()
    for idx, (url, title, date) in enumerate(subpages, 1):
        if url in done_urls:
            continue
        records = scrape_detail_page(session, url, title, date)
        all_records.extend(records)
        done_urls.add(url)
        print(f" [{idx}/{len(subpages)}] {title} — 获取 {len(records)} 条标准", end="\r")

        # Save progress after every page so an interrupted run can resume.
        cache = {"records": all_records, "done_urls": list(done_urls), "total_pages": len(subpages)}
        save_cache(cache)
        time.sleep(args.delay)

    elapsed = time.time() - start_time
    print()
    print(f" 采集完成! 用时 {elapsed:.1f} 秒, 共获取 {len(all_records)} 条标准")
    if not all_records:
        print(" [!] 未获取到任何数据")
        sys.exit(1)

    # Step 3: export to Excel
    print(f" [3/3] 生成 Excel 文件...")
    export_to_excel(all_records, output_path)
    file_size = os.path.getsize(output_path) / 1024
    print()
    print(f" {'=' * 50}")
    print(f" 导出完成: {output_path}")
    print(f" 文件大小: {file_size:.1f} KB")
    print(f" 标准总数: {len(all_records)}")
    print(f" 子页面数: {len(subpages)}")
    print(f" {'=' * 50}")

    # Remove the checkpoint
    if not args.resume and os.path.exists(CACHE_FILE):
        os.remove(CACHE_FILE)


if __name__ == "__main__":
    main()