416 lines
15 KiB
Python
416 lines
15 KiB
Python
|
|
"""
National Standards Full-Text Disclosure System (国家标准全文公开系统):
batch PDF downloader for recommended national standards (GB/T).

Data source: https://openstd.samr.gov.cn/bzgk/std/std_list_type (p.p1=2, recommended national standards)
Download endpoint: http://c.gb688.cn/bzgk/gb/viewGb

Features:
  1. Search recommended national standards by keyword
     (e.g. "车" matches all vehicle-related standards)
  2. Recognise the captcha automatically (ddddocr) and download the full-text PDF
  3. Filter by status: 现行 (in force) / 即将实施 (upcoming) / 废止 (withdrawn)
  4. File naming: "<standard number> <standard title>.pdf"; characters that are
     not allowed in file names are replaced by spaces, so "GB/T 1234-2024" is
     stored as "GB T 1234-2024 xxx技术要求.pdf"
  5. Resumable: files that were already downloaded are skipped
  6. Export the standards' metadata to Excel

Usage:
  python openstd_gb_t_downloader.py                          # download standards matching "车"
  python openstd_gb_t_downloader.py --keyword "制动"         # search with another keyword
  python openstd_gb_t_downloader.py --status "现行"          # only standards currently in force
  python openstd_gb_t_downloader.py --page-size 50           # 50 rows per list page
  python openstd_gb_t_downloader.py --output-dir ./GB_T_Doc  # custom download directory
  python openstd_gb_t_downloader.py --no-download            # collect metadata only, no PDFs

Dependencies:
  pip install requests ddddocr openpyxl
"""
|
||
|
|
|
||
|
|
import argparse
import io
import json
import os
import re
import sys
import time
from datetime import datetime
from html import unescape

import requests
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils import get_column_letter
|
||
|
|
|
||
|
|
# ─── Windows console: force UTF-8 output ────────────────
# The script prints Chinese text; on Windows the streams are switched to UTF-8
# with errors="replace" so unencodable characters never raise.
# reconfigure() (Python 3.7+) changes the encoding of the existing stream
# object in place. Re-wrapping the binary buffer is kept only as a fallback for
# stream objects without reconfigure(); a stream that has neither attribute
# (or is None) is left untouched instead of raising AttributeError.
if sys.platform == "win32":
    for _stream_name in ("stdout", "stderr"):
        _stream = getattr(sys, _stream_name, None)
        if hasattr(_stream, "reconfigure"):
            _stream.reconfigure(encoding="utf-8", errors="replace")
        elif hasattr(_stream, "buffer"):
            setattr(
                sys,
                _stream_name,
                io.TextIOWrapper(_stream.buffer, encoding="utf-8", errors="replace"),
            )
    del _stream_name, _stream
|
||
|
|
|
||
|
|
# ─── Configuration ──────────────────────────────────────
# Endpoints. The "{hcno}" / "{ts}" placeholders are filled with str.format()
# by download_pdf().
LIST_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
DOWNLOAD_INIT_URL = "http://c.gb688.cn/bzgk/gb/showGb?type=download&hcno={hcno}"
CAPTCHA_URL = "http://c.gb688.cn/bzgk/gb/gc?_{ts}"
VERIFY_URL = "http://c.gb688.cn/bzgk/gb/verifyCode"
PDF_URL = "http://c.gb688.cn/bzgk/gb/viewGb?hcno={hcno}"

# Recommended national standards: p.p1=2
STD_TYPE_P1 = "2"
DEFAULT_KEYWORD = "车"

# Maximum captcha fetch/verify rounds within one download session.
MAX_CAPTCHA_RETRIES = 8
# Passed as the `timeout=` argument of the list, captcha and verify requests.
REQUEST_TIMEOUT = 30
# Opened as given by load_cache()/save_cache(), i.e. relative to the current
# working directory.
CACHE_FILE = ".openstd_gb_t_cache.json"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
}

# Output column definitions: (header text, column width). The header text is
# also the key export_to_excel() looks up in each record dict.
OUTPUT_COLUMNS = [
    ("标准号", 22),
    ("标准名称", 50),
    ("标准状态", 10),
    ("发布日期", 14),
    ("实施日期", 14),
    ("是否采标", 10),
    ("hcno", 35),
    ("文件名", 60),
    ("下载状态", 10),
]
|
||
|
|
|
||
|
|
# ─── 列表页解析 ─────────────────────────────────────────
|
||
|
|
def fetch_list_page(session, keyword, page_num, page_size):
    """Request one page of the standards list and return its HTML.

    Args:
        session: ``requests.Session`` used to send the GET request.
        keyword: Search keyword, sent as the ``p.p2`` query parameter.
        page_num: 1-based page number.
        page_size: Number of rows requested per page.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are replaced),
        or ``None`` when all three attempts fail.
    """
    max_attempts = 3
    params = {
        "p.p1": STD_TYPE_P1,  # recommended national standards
        "p.p2": keyword,
        "p.p90": "circulation_date",
        "p.p91": "desc",
        # Sent for every page, including the first: collect_all_standards()
        # derives the page count from page_size, so page 1 has to be requested
        # with the same page size as the pages that follow it.
        # NOTE(review): assumes the server accepts page=1 explicitly; confirm.
        "page": page_num,
        "pageSize": page_size,
    }

    for attempt in range(1, max_attempts + 1):
        try:
            resp = session.get(LIST_URL, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f" [!] 列表页请求失败 (第 {attempt} 次): {e}")
            # Pause before the next attempt; nothing follows the last one.
            if attempt < max_attempts:
                time.sleep(2)
        else:
            # Decode leniently: a stray invalid byte should not discard a page
            # that was fetched successfully.
            return resp.content.decode("utf-8", errors="replace")
    return None
|
||
|
|
|
||
|
|
|
||
|
|
_TAG_RE = re.compile(r"<[^>]+>")
_HCNO_RE = re.compile(r"showInfo\('([A-F0-9]{32})'\)")


def _cell_text(fragment):
    """Return the visible text of an HTML fragment.

    Tags are removed first, then entities are decoded (so an escaped ``&lt;``
    is never mistaken for markup), and runs of whitespace are collapsed to a
    single space.
    """
    text = unescape(_TAG_RE.sub("", fragment))
    return re.sub(r"\s+", " ", text).strip()


def parse_list_page(html):
    """Parse a list-page HTML document.

    Args:
        html: HTML text of one list page.

    Returns:
        A ``(standards, total)`` tuple. ``standards`` is a list of dicts with
        the keys 标准号, 标准名称, 标准状态, 发布日期, 实施日期, 是否采标 and
        hcno, one per data row. ``total`` is the sum of the three status
        counters found on the page (现行 / 即将实施 / 废止), or 0 when the
        counters are not present.
    """
    total = 0
    counts = re.search(r'现行\((\d+)\).*?即将实施\((\d+)\).*?废止\((\d+)\)', html, re.S)
    if counts:
        total = sum(int(n) for n in counts.groups())

    standards = []
    for row in re.findall(r'<tr[^>]*>(.*?)</tr>', html, re.S):
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row, re.S)
        # Data rows have at least six cells and start with a numeric index;
        # anything else (header rows, layout rows) is skipped.
        if len(cells) < 6 or not _cell_text(cells[0]).isdigit():
            continue

        # The hcno normally sits in the standard-number cell; fall back to the
        # whole row so a link placed on another cell is still picked up.
        hcno_m = _HCNO_RE.search(cells[1]) or _HCNO_RE.search(row)

        issue_date = _cell_text(cells[5])
        # The implementation-date column is optional.
        act_date = _cell_text(cells[6]) if len(cells) > 6 else ""

        standards.append({
            "标准号": _cell_text(cells[1]),
            "标准名称": _cell_text(cells[3]),
            "标准状态": _cell_text(cells[4]),
            # Keep only the first 10 characters (the date part).
            "发布日期": issue_date[:10],
            "实施日期": act_date[:10],
            "是否采标": _cell_text(cells[2]),
            "hcno": hcno_m.group(1) if hcno_m else "",
        })

    return standards, total
|
||
|
|
|
||
|
|
|
||
|
|
def collect_all_standards(keyword, page_size, status_filter=""):
    """Collect the standards list across all result pages.

    Args:
        keyword: Search keyword passed to fetch_list_page().
        page_size: Rows per page; must be at least 1.
        status_filter: When non-empty, only records whose 标准状态 contains
            this text are kept.

    Returns:
        A list of record dicts as produced by parse_list_page(). The list is
        empty when the first page cannot be fetched, and shorter than the
        reported total when a later page fails (a warning is printed).

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        html = fetch_list_page(session, keyword, 1, page_size)
        if not html:
            return []

        all_standards, total = parse_list_page(html)

        # Ceiling division; a page without status counters counts as one page.
        total_pages = (total + page_size - 1) // page_size if total > 0 else 1
        print(f" 总计: {total} 条标准, {total_pages} 页")

        for page_num in range(2, total_pages + 1):
            html = fetch_list_page(session, keyword, page_num, page_size)
            standards = parse_list_page(html)[0] if html else []
            if not standards:
                # Stop here, but say so: the result is shorter than `total`.
                print(f"\n [!] 第 {page_num} 页未获取到数据, 列表可能不完整")
                break
            all_standards.extend(standards)
            print(f" 已采集: {len(all_standards)}/{total} (第 {page_num}/{total_pages} 页)", end="\r")
            time.sleep(0.3)

    # Finish the "\r"-terminated progress line.
    print()

    if status_filter:
        all_standards = [s for s in all_standards if status_filter in s.get("标准状态", "")]
        print(f" 筛选 [{status_filter}]: {len(all_standards)} 条")

    return all_standards
|
||
|
|
|
||
|
|
|
||
|
|
# ─── PDF 下载 ───────────────────────────────────────────
|
||
|
|
_OCR_ENGINE = None


def _get_ocr():
    """Return the shared ddddocr engine, creating it on first use."""
    global _OCR_ENGINE
    if _OCR_ENGINE is None:
        # Imported lazily: ddddocr is only needed once a download is attempted.
        import ddddocr
        _OCR_ENGINE = ddddocr.DdddOcr(show_ad=False)
    return _OCR_ENGINE


def _solve_captcha(session, ocr):
    """Fetch and submit captchas until one is accepted; return True on success."""
    for _ in range(MAX_CAPTCHA_RETRIES):
        r = session.get(CAPTCHA_URL.format(ts=int(time.time() * 1000)), timeout=REQUEST_TIMEOUT)
        # A very small body is treated as "no image"; wait and request another.
        if len(r.content) < 100:
            time.sleep(1)
            continue

        code = ocr.classification(r.content)
        vr = session.post(VERIFY_URL, data={"verifyCode": code}, timeout=REQUEST_TIMEOUT)
        if vr.text.strip() == "success":
            return True
    return False


def _save_atomically(path, data):
    """Write *data* to *path* through a temporary ``.part`` file.

    The final name only appears once the content is fully written, so an
    interrupted run cannot leave a truncated file under the real name.
    """
    tmp_path = f"{path}.part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(data)
        os.replace(tmp_path, path)
    except OSError:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def download_pdf(hcno, save_path, max_retries=3):
    """Download one standard's PDF, passing the captcha automatically.

    Each attempt opens a fresh session, visits the download page, solves the
    captcha and then requests the PDF.

    Args:
        hcno: Identifier of the standard, substituted into the endpoint URLs.
        save_path: Destination path of the PDF file.
        max_retries: Number of full attempts (new session each time).

    Returns:
        ``(True, size_in_bytes)`` when the PDF was saved;
        ``(False, -1)`` when the captcha passed but the response is not a PDF;
        ``(False, 0)`` when every attempt failed (captcha never accepted, or
        an error occurred).
    """
    # One engine is reused for all files instead of building one per call.
    ocr = _get_ocr()

    for retry in range(max_retries):
        last_attempt = retry == max_retries - 1
        try:
            with requests.Session() as s:
                s.headers.update(HEADERS)
                s.get(DOWNLOAD_INIT_URL.format(hcno=hcno), timeout=REQUEST_TIMEOUT)

                if not _solve_captcha(s, ocr):
                    if not last_attempt:
                        print(f"验证码失败,重试({retry+1})")
                    # Never request the PDF without a passed captcha, not
                    # even on the final attempt.
                    continue

                dr = s.get(PDF_URL.format(hcno=hcno), timeout=60)
                content = dr.content
                # Besides the size floor, require the PDF signature near the
                # start so an HTML error page is not saved as a .pdf file.
                if len(content) <= 1000 or b"%PDF" not in content[:1024]:
                    return False, -1

                _save_atomically(save_path, content)
                return True, len(content)

        # Deliberately broad: this is a best-effort retry loop, and both the
        # network calls and the OCR step may raise.
        except Exception as e:
            print(f" [!] 下载异常: {e}, 重试 ({retry+1}/{max_retries})")
            if not last_attempt:
                time.sleep(2)

    return False, 0
|
||
|
|
|
||
|
|
|
||
|
|
def sanitize_filename(name):
    """Replace characters that are not allowed in file names with spaces.

    Covers ``\\ / : * ? " < > |`` and the ASCII control characters
    (0x00-0x1F, e.g. newline and tab). Each offending character becomes one
    space; leading and trailing whitespace is then stripped.

    Args:
        name: Proposed file name (not a path; separators are replaced too).

    Returns:
        The cleaned file name.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', ' ', name).strip()
|
||
|
|
|
||
|
|
|
||
|
|
# ─── Excel 导出 ─────────────────────────────────────────
|
||
|
|
def export_to_excel(records, output_path):
    """Write the records to a styled single-sheet Excel workbook.

    The sheet gets one header row built from OUTPUT_COLUMNS, one row per
    record (missing keys become empty cells), fixed column widths, a frozen
    header row and an auto-filter over the whole table.

    Args:
        records: Sequence of record dicts keyed by the OUTPUT_COLUMNS names.
        output_path: Path of the .xlsx file to write.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "推荐性国家标准清单"

    # Header style (green marks "recommended" standards).
    header_font = Font(name="微软雅黑", bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="375623", end_color="375623", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)

    # Data-row style; rows with an even sheet row number get a light fill.
    data_font = Font(name="微软雅黑", size=10)
    data_align = Alignment(vertical="center", wrap_text=True)
    stripe_fill = PatternFill(start_color="E2EFDA", end_color="E2EFDA", fill_type="solid")

    thin_edge = Side(style="thin", color="A9D08E")
    grid = Border(left=thin_edge, right=thin_edge, top=thin_edge, bottom=thin_edge)

    # Header cells and column widths in one pass over the column definitions.
    for col_idx, (title, width) in enumerate(OUTPUT_COLUMNS, 1):
        header = sheet.cell(row=1, column=col_idx, value=title)
        header.font = header_font
        header.fill = header_fill
        header.alignment = header_align
        header.border = grid
        sheet.column_dimensions[get_column_letter(col_idx)].width = width

    # Data rows start on sheet row 2.
    for row_idx, record in enumerate(records, 2):
        striped = row_idx % 2 == 0
        for col_idx, (key, _width) in enumerate(OUTPUT_COLUMNS, 1):
            data_cell = sheet.cell(row=row_idx, column=col_idx, value=record.get(key, ""))
            data_cell.font = data_font
            data_cell.alignment = data_align
            data_cell.border = grid
            if striped:
                data_cell.fill = stripe_fill

    last_column = get_column_letter(len(OUTPUT_COLUMNS))
    last_row = len(records) + 1
    sheet.freeze_panes = "A2"
    sheet.auto_filter.ref = f"A1:{last_column}{last_row}"

    workbook.save(output_path)
|
||
|
|
|
||
|
|
|
||
|
|
# ─── 缓存 ───────────────────────────────────────────────
|
||
|
|
def load_cache():
    """Load the download cache from CACHE_FILE.

    Returns:
        The cached dict, or a fresh ``{"downloaded_hcnos": [], "records": []}``
        when the file is missing, unreadable, not valid UTF-8 / JSON, or does
        not contain a JSON object.
    """
    try:
        # Open directly instead of checking existence first: this avoids the
        # check-then-open race and handles a missing file the same way.
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            cache = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        cache = None

    # Callers use cache.get(...), so anything but a dict counts as corrupt.
    if isinstance(cache, dict):
        return cache
    return {"downloaded_hcnos": [], "records": []}
|
||
|
|
|
||
|
|
|
||
|
|
def save_cache(cache):
    """Persist the download cache to CACHE_FILE as UTF-8 JSON.

    The data is written to a temporary file that then replaces CACHE_FILE, so
    an interrupted write cannot leave a truncated cache behind.

    Args:
        cache: JSON-serialisable dict to store.
    """
    tmp_path = f"{CACHE_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False)
    os.replace(tmp_path, CACHE_FILE)
|
||
|
|
|
||
|
|
|
||
|
|
# ─── 主流程 ─────────────────────────────────────────────
|
||
|
|
def _build_arg_parser():
    """Create the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="国家标准全文公开系统 — 推荐性国家标准 PDF 批量下载工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "示例:\n"
            'python openstd_gb_t_downloader.py # 下载"车"相关推荐性国家标准\n'
            'python openstd_gb_t_downloader.py --keyword "制动" # 搜索关键词\n'
            'python openstd_gb_t_downloader.py --status "现行" # 只下载现行标准\n'
            "python openstd_gb_t_downloader.py --no-download # 仅采集元数据, 不下载PDF\n"
        ),
    )
    parser.add_argument("--keyword", "-k", default=DEFAULT_KEYWORD, help="搜索关键词 (默认: 车)")
    parser.add_argument("--status", "-s", default="", help="状态筛选: 现行/即将实施/废止 (默认: 全部)")
    parser.add_argument("--page-size", "-p", type=int, default=50, help="每页条数 (默认50)")
    parser.add_argument("--output-dir", "-o", default="GB_T_Doc", help="PDF下载目录 (默认: GB_T_Doc)")
    parser.add_argument("--no-download", action="store_true", help="仅采集元数据, 不下载PDF")
    parser.add_argument("--max-count", "-n", type=int, default=0, help="最大下载数量 (0=全部)")
    return parser


def _download_all(standards, output_dir):
    """Download the PDF of every record into *output_dir*.

    Sets 下载状态 and 文件名 on each record in place. A record is skipped when
    its hcno is already in the cache or its target file exists; the cache is
    rewritten after every successful download.

    Returns:
        ``(success_count, skip_count, fail_count)``.
    """
    cache = load_cache()
    downloaded_hcnos = set(cache.get("downloaded_hcnos", []))
    success_count = skip_count = fail_count = 0
    total = len(standards)

    for idx, std in enumerate(standards, 1):
        hcno = std.get("hcno", "")
        code = std.get("标准号", "")
        name = std.get("标准名称", "")
        filename = sanitize_filename(f"{code} {name}.pdf")
        filepath = os.path.join(output_dir, filename)

        if hcno in downloaded_hcnos or os.path.exists(filepath):
            std["下载状态"] = "已存在"
            std["文件名"] = filename
            skip_count += 1
            continue

        print(f" [{idx}/{total}] {code} {name[:30]}...", end=" ")

        # Without an hcno there is nothing to request, so skip the whole
        # captcha/download flow and record the failure directly.
        if not hcno:
            std["下载状态"] = "失败"
            std["文件名"] = ""
            fail_count += 1
            print("FAILED")
            continue

        ok, size = download_pdf(hcno, filepath)

        if ok:
            std["下载状态"] = "成功"
            std["文件名"] = filename
            downloaded_hcnos.add(hcno)
            success_count += 1
            print(f"OK ({size/1024:.0f} KB)")
            # The cache only changes on success; sorting keeps the file stable.
            cache["downloaded_hcnos"] = sorted(downloaded_hcnos)
            save_cache(cache)
        elif size == -1:
            # download_pdf() signals "response was not a PDF" with size -1.
            std["下载状态"] = "无PDF"
            std["文件名"] = ""
            fail_count += 1
            print("NO PDF")
        else:
            std["下载状态"] = "失败"
            std["文件名"] = ""
            fail_count += 1
            print("FAILED")

        # Pause between download attempts.
        time.sleep(1)

    return success_count, skip_count, fail_count


def main():
    """Command-line entry point.

    Parses the arguments, collects the standards list, optionally downloads
    the PDFs, exports the metadata to Excel and prints a summary. Exits with
    status 1 when no standards are found.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()
    if args.page_size < 1:
        # A zero or negative page size cannot be used to compute a page count.
        parser.error("--page-size 必须为正整数")

    os.makedirs(args.output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # The keyword is user input; sanitise it so characters such as "/" cannot
    # break the Excel file name.
    excel_name = sanitize_filename(f"推荐性国家标准清单_{args.keyword}_{timestamp}.xlsx")
    excel_path = os.path.join(args.output_dir, excel_name)

    print("=" * 60)
    print(" 国家标准全文公开系统 — 推荐性国家标准 PDF 批量下载工具")
    print("=" * 60)
    print(f" 关键词: {args.keyword}")
    print(" 类型: 推荐性国家标准 (GB/T)")
    print(f" 状态: {args.status or '全部'}")
    print(f" 下载目录: {args.output_dir}/")
    print(f" 下载PDF: {'否' if args.no_download else '是'}")
    print("-" * 60)

    # Step 1: collect the standards list
    print(" [1/2] 采集标准列表...")
    standards = collect_all_standards(args.keyword, args.page_size, args.status)

    if not standards:
        print(" 未找到任何标准")
        sys.exit(1)

    print(f" 共 {len(standards)} 条标准")

    if args.max_count > 0:
        standards = standards[:args.max_count]
        print(f" 限制下载前 {args.max_count} 条")

    # Step 2: download the PDFs
    if not args.no_download:
        print("\n [2/2] 下载 PDF 文件...")
        success_count, skip_count, fail_count = _download_all(standards, args.output_dir)
        print(f"\n 下载完成: 成功 {success_count}, 跳过 {skip_count}, 无PDF/失败 {fail_count}")
    else:
        for std in standards:
            std["下载状态"] = "跳过"
            std["文件名"] = ""
        print("\n [2/2] 跳过下载 (--no-download)")

    export_to_excel(standards, excel_path)
    print(f"\n 元数据已导出: {excel_path}")

    print(f"\n {'=' * 50}")
    print(f" 总计: {len(standards)} 条推荐性国家标准")
    print(f" Excel: {excel_path}")
    if not args.no_download:
        print(f" PDF目录: {args.output_dir}/")
        pdfs = [f for f in os.listdir(args.output_dir) if f.endswith('.pdf')]
        print(f" PDF文件数: {len(pdfs)}")
    print(f" {'=' * 50}")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|