440 lines
13 KiB
Python
440 lines
13 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import time
|
|||
|
|
import argparse
|
|||
|
|
import threading
|
|||
|
|
import subprocess
|
|||
|
|
import shutil
|
|||
|
|
import logging
|
|||
|
|
from datetime import datetime
|
|||
|
|
import queue
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
from prometheus_client import start_http_server, Counter, Gauge, Summary, Histogram
|
|||
|
|
from flask import Flask, request, jsonify
|
|||
|
|
|
|||
|
|
# —— Constants & paths —— #
# Every working directory is resolved against the directory the process was
# started from.
BASE = os.getcwd()
INPUT_ROOT = os.path.join(BASE, "input")    # downloads, one sub-directory per batch
OUTPUT_ROOT = os.path.join(BASE, "output")  # decoder output, one sub-directory per batch
EMPTY_DIR = os.path.join(BASE, "empty")     # source directory for the `rsync --delete` cleanup
LOG_DIR = os.path.join(BASE, "logs")

# Destination argument handed to `coscmd upload`.
COS_BUCKET = "mb_raw_rosbag_decode_dirs"

DOCKER_IMAGE = "artifact.swfcn.i.mercedes-benz.com/swfcn_docker/perception-3d/mmtbag_decoder:v6.6"

# Argument list of the decoder container. Every element is passed through
# str.format(in_dir=..., out_dir=...) before the command is executed.
DOCKER_CMD_TEMPLATE = [
    "docker", "run", "--rm",
    "-v", "{in_dir}:/input",
    "-v", "{out_dir}:/output",
    DOCKER_IMAGE,
    "bash", "-c",
    "source /opt/ros/noetic/setup.bash && "
    "/opt/perception-3d/scripts/tools/"
    "mmt_bag_decoder_scripts/decoded-bag.sh /input /output 3 1",
]

BATCH_SIZE = 50       # paths per batch (default of --batch-size, fixed for /notify)
MAX_LOCAL = 100       # downloads pause while this many files sit under INPUT_ROOT
MAX_RETRIES = 3       # attempts per command in with_retry()
RETRY_DELAY_S = 2     # pause between attempts, in seconds
METRICS_PORT = 8000   # port of the Prometheus metrics HTTP server

# Queue item that tells a worker to forward the shutdown signal and stop.
SENTINEL = (None, None)
|
|||
|
|
|
|||
|
|
# —— Logging setup —— #
# The "pipeline" logger writes everything from INFO upwards to pipeline.log
# and additionally copies ERROR-and-above records to error_tasks.log.
os.makedirs(LOG_DIR, exist_ok=True)

fmt = logging.Formatter("%(asctime)s %(levelname)s %(message)s")

h_info = logging.FileHandler(os.path.join(LOG_DIR, "pipeline.log"), encoding="utf-8")
h_info.setFormatter(fmt)

h_err = logging.FileHandler(os.path.join(LOG_DIR, "error_tasks.log"), encoding="utf-8")
h_err.setFormatter(fmt)
h_err.setLevel(logging.ERROR)

logger = logging.getLogger("pipeline")
logger.setLevel(logging.INFO)
logger.addHandler(h_info)
logger.addHandler(h_err)
|
|||
|
|
|
|||
|
|
# —— Prometheus metrics —— #
# Per-stage counters: attempts, failures and retries.
DL_TOTAL = Counter("pipeline_download_total", "下载尝试总数")
DL_FAIL = Counter("pipeline_download_failures", "下载失败总数")
DL_RETRY = Counter("pipeline_download_retries", "下载重试总数")

PR_TOTAL = Counter("pipeline_process_total", "处理尝试总数")
PR_FAIL = Counter("pipeline_process_failures", "处理失败总数")
PR_RETRY = Counter("pipeline_process_retries", "处理重试总数")

UP_TOTAL = Counter("pipeline_upload_total", "上传尝试总数")
UP_FAIL = Counter("pipeline_upload_failures", "上传失败总数")
UP_RETRY = Counter("pipeline_upload_retries", "上传重试总数")

# Wall-clock duration of one batch in each stage (used as decorators below).
DL_DUR = Summary("pipeline_download_duration_seconds", "单批下载耗时秒")
PR_DUR = Summary("pipeline_process_duration_seconds", "单批处理耗时秒")
UP_DUR = Summary("pipeline_upload_duration_seconds", "单批上传耗时秒")

# Distributions and per-batch figures.
BATCH_SIZE_HIST = Histogram(
    "pipeline_batch_size",
    "单批任务中文件数量分布",
    buckets=[1, 10, 20, 50, 100, 200, 500],
)
FILE_DL_DUR = Histogram("pipeline_file_download_duration_seconds", "单文件下载耗时分布")
BATCH_OUT_FILES = Gauge("pipeline_batch_output_file_count", "单批处理后输出文件数")

# Queue depths and local disk usage, refreshed periodically.
Q_BATCH = Gauge("pipeline_queue_batches", "待下载批次数")
Q_PROC = Gauge("pipeline_queue_processing", "待处理批次数")
Q_UP = Gauge("pipeline_queue_uploading", "待上传批次数")
LOCAL_FILES = Gauge("pipeline_local_file_count", "本地 input 文件总数")
|
|||
|
|
|
|||
|
|
# —— Global queues & in-flight counter —— #
# One unbounded queue feeds each pipeline stage: download -> process -> upload.
batch_q = queue.Queue()  # (batch_id, [remote paths]) waiting to be downloaded
proc_q = queue.Queue()   # (batch_id, input directory) waiting to be decoded
up_q = queue.Queue()     # (batch_id, output directory) waiting to be uploaded

# Amount of accepted work that has not finished uploading yet; always read
# and written under inflight_lock.
inflight_lock = threading.Lock()
inflight = 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
# —— Helpers —— #
def count_local_files():
    """Return how many files exist anywhere below INPUT_ROOT."""
    total = 0
    for _root, _dirs, names in os.walk(INPUT_ROOT):
        total += len(names)
    return total
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run(cmd, timeout=None):
    """Run *cmd*, capture its combined stdout/stderr and enforce *timeout*.

    The timeout is enforced by ``Popen.communicate`` and therefore also fires
    when the command produces no output at all (previously it was only
    checked after a line of output had been read, so a silent, hung command
    was never killed).

    Args:
        cmd: Argument list executed without a shell.
        timeout: Wall-clock limit in seconds. ``None`` or ``0`` means no
            limit; a negative value (an already exhausted time budget) times
            out immediately.

    Returns:
        A tuple ``(returncode, timed_out, output)``. If the command cannot be
        started or another exception occurs, the exception is logged and
        ``(-1, False, "")`` is returned.
    """
    logger.info("CMD: %s", " ".join(cmd))
    # Keep the existing convention that a falsy timeout disables the limit.
    limit = timeout if timeout else None
    try:
        # The context manager closes the pipe and reaps the child on exit.
        with subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
        ) as proc:
            timed_out = False
            try:
                output, _ = proc.communicate(timeout=limit)
            except subprocess.TimeoutExpired:
                proc.kill()
                # Collect whatever was written before the kill.
                output, _ = proc.communicate()
                timed_out = True
            return proc.returncode, timed_out, output
    except Exception:
        logger.exception("CMD 执行异常")
        return -1, False, ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
def with_retry(tag, fn, *args):
    """Call ``fn(*args)`` until it succeeds, at most MAX_RETRIES times.

    Args:
        tag: Log label. Its prefix (``DL[``, ``PR[`` or ``UP[``) selects the
            retry counter that is incremented.
        fn: Callable returning ``(returncode, timed_out, output)``.
        *args: Positional arguments forwarded to ``fn``.

    Returns:
        True as soon as an attempt returns code 0. False when an attempt
        timed out (timeouts are never retried) or all attempts failed.
    """
    retry_counters = (("DL[", DL_RETRY), ("PR[", PR_RETRY), ("UP[", UP_RETRY))

    for attempt in range(1, MAX_RETRIES + 1):
        code, timed_out, _ = fn(*args)
        if code == 0:
            return True
        if timed_out:
            logger.error("%s 阶段超时,不再重试", tag)
            break
        if attempt == MAX_RETRIES:
            # No attempt left: do not count, announce or wait for a retry
            # that will never be made.
            break

        # Count the retry that is about to happen.
        for prefix, counter in retry_counters:
            if tag.startswith(prefix):
                counter.inc()
        logger.warning("%s 重试 %d/%d", tag, attempt, MAX_RETRIES)
        time.sleep(RETRY_DELAY_S)

    logger.error("%s 最终失败", tag)
    return False
|
|||
|
|
|
|||
|
|
|
|||
|
|
# —— Download —— #
@DL_DUR.time()
def do_download(batch_id, paths, batch_timeout):
    """Download stage: fetch all files of one batch, then queue it for decoding.

    Args:
        batch_id: Batch identifier, or None for the shutdown sentinel (which
            is only forwarded to ``proc_q``).
        paths: Remote paths to fetch with ``coscmd download``.
        batch_timeout: Time budget in seconds for the whole batch, measured
            from the start of this call.

    The batch is handed to ``proc_q`` even if some downloads failed or the
    budget ran out; failures are only reflected in DL_FAIL.
    """
    if batch_id is None:
        proc_q.put(SENTINEL)
        return

    DL_TOTAL.inc()
    start = time.time()
    in_dir = os.path.join(INPUT_ROOT, batch_id)
    os.makedirs(in_dir, exist_ok=True)

    # Back-pressure: do not add more files while too many are stored locally.
    while count_local_files() >= MAX_LOCAL:
        logger.warning("本地文件过多,暂停下载5分钟")
        time.sleep(300)

    tag = f"DL[{batch_id}]"

    def fetch(src, dest):
        # Each attempt only gets what is left of the batch budget.
        remaining = batch_timeout - (time.time() - start)
        return run(["coscmd", "-s", "download", src, dest], timeout=remaining)

    for remote in paths:
        elapsed = time.time() - start
        if elapsed > batch_timeout:
            logger.error("DL[%s] 下载阶段超时,跳过剩余", batch_id)
            DL_FAIL.inc()
            break

        dst = os.path.join(in_dir, os.path.basename(remote))
        f_start = time.time()
        succeeded = with_retry(tag, fetch, remote, dst)
        FILE_DL_DUR.observe(time.time() - f_start)
        if not succeeded:
            DL_FAIL.inc()

    proc_q.put((batch_id, in_dir))
|
|||
|
|
|
|||
|
|
|
|||
|
|
# —— Process —— #
@PR_DUR.time()
def do_process(batch_id, in_dir, batch_timeout):
    """Decode stage: run the decoder container on one batch, then queue the upload.

    Args:
        batch_id: Batch identifier, or None for the shutdown sentinel (which
            is only forwarded to ``up_q``).
        in_dir: Directory holding the downloaded files; removed afterwards.
        batch_timeout: Time budget in seconds for the whole stage, shared by
            all retry attempts.

    The batch is handed to ``up_q`` even if decoding failed; the failure is
    only reflected in PR_FAIL.
    """
    if batch_id is None:
        up_q.put(SENTINEL)
        return

    PR_TOTAL.inc()
    start = time.time()
    out_dir = os.path.join(OUTPUT_ROOT, batch_id)
    os.makedirs(out_dir, exist_ok=True)

    cmd = [c.format(in_dir=in_dir, out_dir=out_dir) for c in DOCKER_CMD_TEMPLATE]

    def run_pr(command):
        """Run the decoder once, streaming its output into the log.

        Returns ``(returncode, timed_out, "")`` as expected by with_retry().
        """
        try:
            proc = subprocess.Popen(
                command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
            )
        except OSError:
            # Same convention as run(): a command that cannot be started is a
            # normal failure, not an exception that kills the worker thread.
            logger.exception("CMD 执行异常")
            return -1, False, ""

        expired = threading.Event()

        def on_deadline():
            expired.set()
            proc.kill()

        # A timer enforces the budget even while the decoder prints nothing;
        # checking the clock only after each output line would never fire
        # for a silent, hung process.
        # NOTE(review): killing the `docker run` client may leave the
        # container itself running — confirm and stop it explicitly if so.
        remaining = max(batch_timeout - (time.time() - start), 0)
        watchdog = threading.Timer(remaining, on_deadline)
        watchdog.daemon = True
        watchdog.start()
        try:
            # The context manager closes the pipe and waits for the child.
            with proc:
                for line in proc.stdout:
                    logger.info("[PR %s] %s", batch_id, line.rstrip())
        finally:
            watchdog.cancel()
        return proc.returncode, expired.is_set(), ""

    ok = with_retry(f"PR[{batch_id}]", run_pr, cmd)
    if not ok:
        PR_FAIL.inc()

    # Publish how many files the decoder produced for this batch.
    BATCH_OUT_FILES.set(sum(len(fs) for _, _, fs in os.walk(out_dir)))

    shutil.rmtree(in_dir, ignore_errors=True)
    up_q.put((batch_id, out_dir))
|
|||
|
|
|
|||
|
|
|
|||
|
|
# —— Upload —— #
@UP_DUR.time()
def do_upload(batch_id, out_dir, batch_timeout):
    """Upload stage: push one batch's output with coscmd, then delete it locally.

    Args:
        batch_id: Batch identifier, or None for the shutdown sentinel (which
            is ignored here).
        out_dir: Directory with the decoder output.
        batch_timeout: Timeout in seconds applied to each upload attempt.

    ``inflight`` is decremented for every real batch, whether or not the
    upload succeeded. On failure the output directory is left in place.
    """
    global inflight
    if batch_id is None:
        return

    def push(directory):
        return run(
            ["coscmd", "-s", "upload", "-r", directory, COS_BUCKET],
            timeout=batch_timeout,
        )

    try:
        UP_TOTAL.inc()
        if with_retry(f"UP[{batch_id}]", push, out_dir):
            # Empty the tree by syncing it against an empty directory, then
            # remove the directory itself.
            cleanup_steps = (
                ["sudo", "rsync", "-av", "--delete", f"{EMPTY_DIR}/", f"{out_dir}/"],
                ["sudo", "rm", "-rf", out_dir],
            )
            for step in cleanup_steps:
                run(step, timeout=60)
            logger.info("UP[%s] 完成", batch_id)
        else:
            UP_FAIL.inc()
    finally:
        # Success or failure, this batch is finished.
        with inflight_lock:
            inflight -= 1
|
|||
|
|
|
|||
|
|
|
|||
|
|
# —— Worker template —— #
def worker(q, fn, timeout):
    """Consume ``(batch_id, payload)`` items from *q* until the sentinel arrives.

    Args:
        q: Queue to read from.
        fn: Stage function called as ``fn(batch_id, payload, timeout)``.
        timeout: Value passed through to ``fn`` unchanged.

    Every item is marked done even when ``fn`` raises. The exception is logged
    and the loop continues, so one bad batch can neither kill the stage
    thread nor leave ``q.unfinished_tasks`` stuck above zero. The loop ends
    after an item whose batch id is None has been handled.
    """
    while True:
        bid, data = q.get()
        try:
            fn(bid, data, timeout)
        except Exception:
            # Same logger object as the module-level "pipeline" logger.
            logging.getLogger("pipeline").exception(
                "worker %s failed on batch %s", getattr(fn, "__name__", fn), bid
            )
        finally:
            q.task_done()
        if bid is None:
            break
|
|||
|
|
|
|||
|
|
|
|||
|
|
# —— Service HTTP —— #
app = Flask(__name__)


@app.route("/ready", methods=["GET"])
def api_ready():
    """Return ``{"ready": true}`` when no accepted work is in flight."""
    with inflight_lock:
        idle = not inflight > 0
    return jsonify(ready=idle)
|
|||
|
|
|
|||
|
|
|
|||
|
|
@app.route("/notify", methods=["POST"])
def api_notify():
    """Accept a JSON list of remote paths and queue them as download batches.

    Non-string items are ignored. Paths without the
    ``mb_cuct_data_collection/`` prefix get it prepended (senders such as
    bag-checker only send the name). Paths are ordered newest first by the
    ``_YYYYMMDD-HHMMSS_`` stamp in their file name; paths without a valid
    stamp go last.

    ``inflight`` is raised by one per queued batch, matching the one
    decrement per batch in do_upload(). No shutdown sentinel is queued: the
    service workers must survive to serve later notifications, and a
    sentinel would make every stage thread exit after the first request.

    Returns:
        HTTP 400 if the body is not a JSON list, otherwise HTTP 202.
    """
    global inflight
    data = request.get_json(force=True)
    if not isinstance(data, list):
        return jsonify(error="Expect JSON list"), 400

    prefix = "mb_cuct_data_collection/"
    paths = [
        item if item.startswith(prefix) else prefix + item
        for item in data
        if isinstance(item, str)
    ]

    time_re = re.compile(r"_(\d{8})-(\d{6})_")  # matches e.g. _20230803-160828_

    def extract_ts(p: str) -> datetime:
        m = time_re.search(os.path.basename(p))
        if not m:
            return datetime.min  # unparsable names sort last
        try:
            return datetime.strptime("".join(m.groups()), "%Y%m%d%H%M%S")
        except ValueError:
            # Digits that are not a real date/time (e.g. month 13) must not
            # turn the request into an HTTP 500.
            return datetime.min

    paths.sort(key=extract_ts, reverse=True)

    batches = [paths[i : i + BATCH_SIZE] for i in range(0, len(paths), BATCH_SIZE)]

    # Account for the work before queueing it, so a fast worker cannot
    # decrement the counter first.
    with inflight_lock:
        inflight += len(batches)

    # NOTE(review): batch ids only have second resolution, so two requests in
    # the same second produce identical ids (and directories) — confirm
    # whether callers can trigger that.
    for number, blk in enumerate(batches, start=1):
        BATCH_SIZE_HIST.observe(len(blk))
        bid = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ") + f"_{number}"
        batch_q.put((bid, blk))

    return jsonify(status="accepted", batch_size=BATCH_SIZE), 202
|
|||
|
|
|
|||
|
|
|
|||
|
|
def start_metric_updater():
    """Start a daemon thread that refreshes the queue and file gauges every second."""

    def refresh_forever():
        watched = ((Q_BATCH, batch_q), (Q_PROC, proc_q), (Q_UP, up_q))
        while True:
            for gauge, source in watched:
                gauge.set(source.qsize())
            LOCAL_FILES.set(count_local_files())
            time.sleep(1)

    updater = threading.Thread(target=refresh_forever, daemon=True)
    updater.start()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# —— Entry points of the two modes —— #
def file_mode(args):
    """Process every path listed in ``args.tasks_file`` once, then exit.

    Non-empty lines of the file are split into batches of
    ``args.batch_size``, queued together with a final sentinel, and worked
    off by one thread per stage. The process exits with status 0 once all
    three queues are drained and the threads have finished.

    Raises:
        ValueError: if ``args.batch_size`` is smaller than 1.
    """
    if args.batch_size < 1:
        raise ValueError("--batch-size must be at least 1")

    # Same working directories as service_mode(); EMPTY_DIR in particular is
    # needed by the rsync cleanup step after upload.
    for d in (INPUT_ROOT, OUTPUT_ROOT, EMPTY_DIR, LOG_DIR):
        os.makedirs(d, exist_ok=True)

    # Read the task list and close the file promptly.
    with open(args.tasks_file, encoding="utf-8") as fh:
        lines = [line.strip() for line in fh if line.strip()]

    for idx in range(0, len(lines), args.batch_size):
        blk = lines[idx : idx + args.batch_size]
        BATCH_SIZE_HIST.observe(len(blk))
        bid = (
            datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
            + f"_{idx // args.batch_size + 1}"
        )
        batch_q.put((bid, blk))
    batch_q.put(SENTINEL)

    start_http_server(METRICS_PORT)
    logger.info("Metrics HTTP 启动,端口 %d", METRICS_PORT)

    threads = [
        threading.Thread(
            target=worker, args=(batch_q, do_download, args.batch_timeout)
        ),
        threading.Thread(target=worker, args=(proc_q, do_process, args.batch_timeout)),
        threading.Thread(target=worker, args=(up_q, do_upload, args.batch_timeout)),
    ]
    for t in threads:
        t.start()

    # Refresh the gauges while waiting for all queues to drain.
    while batch_q.unfinished_tasks or proc_q.unfinished_tasks or up_q.unfinished_tasks:
        Q_BATCH.set(batch_q.qsize())
        Q_PROC.set(proc_q.qsize())
        Q_UP.set(up_q.qsize())
        LOCAL_FILES.set(count_local_files())
        time.sleep(1)

    for t in threads:
        t.join()

    logger.info("文件模式处理完成,退出。")
    sys.exit(0)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def service_mode(args):
    """Run as a long-lived service: metrics server, stage workers and Flask API.

    Blocks in ``app.run`` on ``args.host``:``args.port``. The stage workers
    are daemon threads and receive ``args.batch_timeout`` as their timeout.
    """
    # Make sure the working directories exist.
    for directory in (INPUT_ROOT, OUTPUT_ROOT, EMPTY_DIR, LOG_DIR):
        os.makedirs(directory, exist_ok=True)

    # Prometheus endpoint plus the periodic gauge refresher.
    start_http_server(METRICS_PORT)
    logger.info("Metrics HTTP 启动,端口 %d", METRICS_PORT)
    start_metric_updater()

    # One background worker per pipeline stage.
    stages = ((batch_q, do_download), (proc_q, do_process), (up_q, do_upload))
    threads = [
        threading.Thread(
            target=worker, args=(source, stage, args.batch_timeout), daemon=True
        )
        for source, stage in stages
    ]
    for thread in threads:
        thread.start()

    # Serve the HTTP API.
    logger.info("Decode Service 启动 HTTP on %s:%d", args.host, args.port)
    app.run(host=args.host, port=args.port, threaded=True)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Parse the command line and dispatch to file mode or service mode."""
    parser = argparse.ArgumentParser()
    modes = parser.add_subparsers(dest="mode", required=True)

    file_parser = modes.add_parser("file", help="文件模式:--tasks-file")
    file_parser.add_argument("--tasks-file", required=True)
    file_parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
    file_parser.add_argument("--batch-timeout", type=int, default=3600)

    service_parser = modes.add_parser("service", help="服务模式:启动 HTTP ready/notify")
    service_parser.add_argument("--batch-timeout", type=int, default=3600)
    service_parser.add_argument("--host", default="0.0.0.0")
    service_parser.add_argument("--port", type=int, default=5000)

    args = parser.parse_args()
    entry_point = file_mode if args.mode == "file" else service_mode
    entry_point(args)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|