第一次
This commit is contained in:
439
fst_data_pipeline/pipelines/tencent/decoder.py
Normal file
439
fst_data_pipeline/pipelines/tencent/decoder.py
Normal file
@@ -0,0 +1,439 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
import threading
|
||||
import subprocess
|
||||
import shutil
|
||||
import logging
|
||||
from datetime import datetime
|
||||
import queue
|
||||
import re
|
||||
|
||||
from prometheus_client import start_http_server, Counter, Gauge, Summary, Histogram
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
# —— 常量 & 路径 —— #
|
||||
BASE = os.getcwd()
|
||||
INPUT_ROOT = os.path.join(BASE, "input")
|
||||
OUTPUT_ROOT = os.path.join(BASE, "output")
|
||||
EMPTY_DIR = os.path.join(BASE, "empty")
|
||||
LOG_DIR = os.path.join(BASE, "logs")
|
||||
|
||||
COS_BUCKET = "mb_raw_rosbag_decode_dirs"
|
||||
DOCKER_IMAGE = (
|
||||
"artifact.swfcn.i.mercedes-benz.com/swfcn_docker/perception-3d/mmtbag_decoder:v6.6"
|
||||
)
|
||||
DOCKER_CMD_TEMPLATE = [
|
||||
"docker",
|
||||
"run",
|
||||
"--rm",
|
||||
"-v",
|
||||
"{in_dir}:/input",
|
||||
"-v",
|
||||
"{out_dir}:/output",
|
||||
DOCKER_IMAGE,
|
||||
"bash",
|
||||
"-c",
|
||||
"source /opt/ros/noetic/setup.bash && "
|
||||
"/opt/perception-3d/scripts/tools/"
|
||||
"mmt_bag_decoder_scripts/decoded-bag.sh /input /output 3 1",
|
||||
]
|
||||
|
||||
BATCH_SIZE = 50
|
||||
MAX_LOCAL = 100
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY_S = 2
|
||||
METRICS_PORT = 8000
|
||||
|
||||
SENTINEL = (None, None)
|
||||
|
||||
# —— 日志配置 —— #
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
logger = logging.getLogger("pipeline")
|
||||
logger.setLevel(logging.INFO)
|
||||
h_info = logging.FileHandler(os.path.join(LOG_DIR, "pipeline.log"), encoding="utf-8")
|
||||
h_err = logging.FileHandler(os.path.join(LOG_DIR, "error_tasks.log"), encoding="utf-8")
|
||||
fmt = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
|
||||
h_info.setFormatter(fmt)
|
||||
h_err.setFormatter(fmt)
|
||||
h_err.setLevel(logging.ERROR)
|
||||
logger.addHandler(h_info)
|
||||
logger.addHandler(h_err)
|
||||
|
||||
# —— Prometheus 指标 —— #
|
||||
DL_TOTAL = Counter("pipeline_download_total", "下载尝试总数")
|
||||
DL_FAIL = Counter("pipeline_download_failures", "下载失败总数")
|
||||
DL_RETRY = Counter("pipeline_download_retries", "下载重试总数")
|
||||
PR_TOTAL = Counter("pipeline_process_total", "处理尝试总数")
|
||||
PR_FAIL = Counter("pipeline_process_failures", "处理失败总数")
|
||||
PR_RETRY = Counter("pipeline_process_retries", "处理重试总数")
|
||||
UP_TOTAL = Counter("pipeline_upload_total", "上传尝试总数")
|
||||
UP_FAIL = Counter("pipeline_upload_failures", "上传失败总数")
|
||||
UP_RETRY = Counter("pipeline_upload_retries", "上传重试总数")
|
||||
|
||||
DL_DUR = Summary("pipeline_download_duration_seconds", "单批下载耗时秒")
|
||||
PR_DUR = Summary("pipeline_process_duration_seconds", "单批处理耗时秒")
|
||||
UP_DUR = Summary("pipeline_upload_duration_seconds", "单批上传耗时秒")
|
||||
|
||||
BATCH_SIZE_HIST = Histogram(
|
||||
"pipeline_batch_size",
|
||||
"单批任务中文件数量分布",
|
||||
buckets=[1, 10, 20, 50, 100, 200, 500],
|
||||
)
|
||||
FILE_DL_DUR = Histogram("pipeline_file_download_duration_seconds", "单文件下载耗时分布")
|
||||
BATCH_OUT_FILES = Gauge("pipeline_batch_output_file_count", "单批处理后输出文件数")
|
||||
|
||||
Q_BATCH = Gauge("pipeline_queue_batches", "待下载批次数")
|
||||
Q_PROC = Gauge("pipeline_queue_processing", "待处理批次数")
|
||||
Q_UP = Gauge("pipeline_queue_uploading", "待上传批次数")
|
||||
LOCAL_FILES = Gauge("pipeline_local_file_count", "本地 input 文件总数")
|
||||
|
||||
# —— 全局队列 & inflight 计数 —— #
|
||||
batch_q = queue.Queue()
|
||||
proc_q = queue.Queue()
|
||||
up_q = queue.Queue()
|
||||
|
||||
inflight = 0
|
||||
inflight_lock = threading.Lock()
|
||||
|
||||
|
||||
# —— 辅助函数 —— #
|
||||
def count_local_files():
|
||||
return sum(len(files) for _, _, files in os.walk(INPUT_ROOT))
|
||||
|
||||
|
||||
def run(cmd, timeout=None):
|
||||
logger.info("CMD: %s", " ".join(cmd))
|
||||
try:
|
||||
p = subprocess.Popen(
|
||||
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
|
||||
)
|
||||
start = time.time()
|
||||
out_lines = []
|
||||
timed_out = False
|
||||
for line in p.stdout:
|
||||
out_lines.append(line)
|
||||
if timeout and (time.time() - start) > timeout:
|
||||
p.kill()
|
||||
timed_out = True
|
||||
break
|
||||
code = p.wait()
|
||||
return code, timed_out, "".join(out_lines)
|
||||
except Exception:
|
||||
logger.exception("CMD 执行异常")
|
||||
return -1, False, ""
|
||||
|
||||
|
||||
def with_retry(tag, fn, *args):
|
||||
for i in range(1, MAX_RETRIES + 1):
|
||||
code, timed_out, _ = fn(*args)
|
||||
if code == 0:
|
||||
return True
|
||||
if timed_out:
|
||||
logger.error("%s 阶段超时,不再重试", tag)
|
||||
break
|
||||
# 计数重试
|
||||
if tag.startswith("DL["):
|
||||
DL_RETRY.inc()
|
||||
if tag.startswith("PR["):
|
||||
PR_RETRY.inc()
|
||||
if tag.startswith("UP["):
|
||||
UP_RETRY.inc()
|
||||
logger.warning("%s 重试 %d/%d", tag, i, MAX_RETRIES)
|
||||
time.sleep(RETRY_DELAY_S)
|
||||
logger.error("%s 最终失败", tag)
|
||||
return False
|
||||
|
||||
|
||||
# —— 下载 —— #
|
||||
@DL_DUR.time()
|
||||
def do_download(batch_id, paths, batch_timeout):
|
||||
if batch_id is None:
|
||||
proc_q.put(SENTINEL)
|
||||
return
|
||||
|
||||
DL_TOTAL.inc()
|
||||
start = time.time()
|
||||
in_dir = os.path.join(INPUT_ROOT, batch_id)
|
||||
os.makedirs(in_dir, exist_ok=True)
|
||||
|
||||
# 限制本地文件数
|
||||
while count_local_files() >= MAX_LOCAL:
|
||||
logger.warning("本地文件过多,暂停下载5分钟")
|
||||
time.sleep(300)
|
||||
|
||||
for p in paths:
|
||||
if time.time() - start > batch_timeout:
|
||||
logger.error("DL[%s] 下载阶段超时,跳过剩余", batch_id)
|
||||
DL_FAIL.inc()
|
||||
break
|
||||
dst = os.path.join(in_dir, os.path.basename(p))
|
||||
f_start = time.time()
|
||||
ok = with_retry(
|
||||
f"DL[{batch_id}]",
|
||||
lambda s, d: run(
|
||||
["coscmd", "-s", "download", s, d],
|
||||
timeout=batch_timeout - (time.time() - start),
|
||||
),
|
||||
p,
|
||||
dst,
|
||||
)
|
||||
FILE_DL_DUR.observe(time.time() - f_start)
|
||||
if not ok:
|
||||
DL_FAIL.inc()
|
||||
|
||||
proc_q.put((batch_id, in_dir))
|
||||
|
||||
|
||||
# —— 处理 —— #
|
||||
@PR_DUR.time()
|
||||
def do_process(batch_id, in_dir, batch_timeout):
|
||||
if batch_id is None:
|
||||
up_q.put(SENTINEL)
|
||||
return
|
||||
|
||||
PR_TOTAL.inc()
|
||||
start = time.time()
|
||||
out_dir = os.path.join(OUTPUT_ROOT, batch_id)
|
||||
os.makedirs(out_dir, exist_ok=True)
|
||||
|
||||
cmd = [c.format(in_dir=in_dir, out_dir=out_dir) for c in DOCKER_CMD_TEMPLATE]
|
||||
|
||||
def run_pr(command):
|
||||
p = subprocess.Popen(
|
||||
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
|
||||
)
|
||||
for line in p.stdout:
|
||||
logger.info("[PR %s] %s", batch_id, line.rstrip())
|
||||
if time.time() - start > batch_timeout:
|
||||
p.kill()
|
||||
return p.wait(), True, ""
|
||||
return p.wait(), False, ""
|
||||
|
||||
ok = with_retry(f"PR[{batch_id}]", run_pr, cmd)
|
||||
if not ok:
|
||||
PR_FAIL.inc()
|
||||
|
||||
# 统计输出文件
|
||||
files = []
|
||||
for r, _, fs in os.walk(out_dir):
|
||||
for fn in fs:
|
||||
files.append(os.path.relpath(os.path.join(r, fn), out_dir))
|
||||
BATCH_OUT_FILES.set(len(files))
|
||||
|
||||
shutil.rmtree(in_dir, ignore_errors=True)
|
||||
up_q.put((batch_id, out_dir))
|
||||
|
||||
|
||||
# —— 上传 —— #
|
||||
@UP_DUR.time()
|
||||
def do_upload(batch_id, out_dir, batch_timeout):
|
||||
global inflight
|
||||
if batch_id is None:
|
||||
return
|
||||
|
||||
try:
|
||||
UP_TOTAL.inc()
|
||||
ok = with_retry(
|
||||
f"UP[{batch_id}]",
|
||||
lambda d: run(
|
||||
["coscmd", "-s", "upload", "-r", d, COS_BUCKET], timeout=batch_timeout
|
||||
),
|
||||
out_dir,
|
||||
)
|
||||
if not ok:
|
||||
UP_FAIL.inc()
|
||||
return
|
||||
|
||||
# 删除目录结构
|
||||
for cmd in [
|
||||
["sudo", "rsync", "-av", "--delete", f"{EMPTY_DIR}/", f"{out_dir}/"],
|
||||
["sudo", "rm", "-rf", out_dir],
|
||||
]:
|
||||
run(cmd, timeout=60)
|
||||
|
||||
logger.info("UP[%s] 完成", batch_id)
|
||||
finally:
|
||||
# 无论成功失败,任务算完成,inflight-1
|
||||
with inflight_lock:
|
||||
inflight -= 1
|
||||
|
||||
|
||||
# —— Worker 模板 —— #
|
||||
def worker(q, fn, timeout):
|
||||
while True:
|
||||
bid, data = q.get()
|
||||
fn(bid, data, timeout)
|
||||
q.task_done()
|
||||
if bid is None:
|
||||
break
|
||||
|
||||
|
||||
# —— Service HTTP —— #
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
@app.route("/ready", methods=["GET"])
|
||||
def api_ready():
|
||||
with inflight_lock:
|
||||
busy = inflight > 0
|
||||
return jsonify(ready=not busy)
|
||||
|
||||
|
||||
@app.route("/notify", methods=["POST"])
|
||||
def api_notify():
|
||||
global inflight
|
||||
data = request.get_json(force=True)
|
||||
if not isinstance(data, list):
|
||||
return jsonify(error="Expect JSON list"), 400
|
||||
|
||||
# 兼容 bag-checker,只发 name 时补前缀
|
||||
paths = []
|
||||
for item in data:
|
||||
if not isinstance(item, str):
|
||||
continue
|
||||
if item.startswith("mb_cuct_data_collection/"):
|
||||
paths.append(item)
|
||||
else:
|
||||
paths.append("mb_cuct_data_collection/" + item)
|
||||
|
||||
TIME_RE = re.compile(r"_(\d{8})-(\d{6})_") # 匹配 20230803-160828
|
||||
|
||||
def extract_ts(p: str) -> datetime:
|
||||
m = TIME_RE.search(os.path.basename(p))
|
||||
if not m:
|
||||
return datetime.min # 无法解析的放最后
|
||||
date_part, time_part = m.groups()
|
||||
ts_str = f"{date_part}{time_part}"
|
||||
return datetime.strptime(ts_str, "%Y%m%d%H%M%S")
|
||||
|
||||
paths.sort(key=extract_ts, reverse=True)
|
||||
|
||||
with inflight_lock:
|
||||
inflight += 1
|
||||
|
||||
for idx in range(0, len(paths), BATCH_SIZE):
|
||||
blk = paths[idx : idx + BATCH_SIZE]
|
||||
BATCH_SIZE_HIST.observe(len(blk))
|
||||
bid = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ") + f"_{idx // BATCH_SIZE + 1}"
|
||||
batch_q.put((bid, blk))
|
||||
batch_q.put(SENTINEL)
|
||||
|
||||
# batch_q.put((bid, paths))
|
||||
return jsonify(status="accepted", batch_size=BATCH_SIZE), 202
|
||||
|
||||
|
||||
def start_metric_updater():
|
||||
def loop():
|
||||
while True:
|
||||
Q_BATCH.set(batch_q.qsize())
|
||||
Q_PROC.set(proc_q.qsize())
|
||||
Q_UP.set(up_q.qsize())
|
||||
LOCAL_FILES.set(count_local_files())
|
||||
time.sleep(1)
|
||||
|
||||
t = threading.Thread(target=loop, daemon=True)
|
||||
t.start()
|
||||
|
||||
|
||||
# —— 两种模式的入口 —— #
|
||||
def file_mode(args):
|
||||
# 读 tasks-file,分批入队,放入 sentinel,然后启动处理
|
||||
lines = [
|
||||
line.strip() for line in open(args.tasks_file, encoding="utf-8") if line.strip()
|
||||
]
|
||||
for idx in range(0, len(lines), args.batch_size):
|
||||
blk = lines[idx : idx + args.batch_size]
|
||||
BATCH_SIZE_HIST.observe(len(blk))
|
||||
bid = (
|
||||
datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
|
||||
+ f"_{idx // args.batch_size + 1}"
|
||||
)
|
||||
batch_q.put((bid, blk))
|
||||
batch_q.put(SENTINEL)
|
||||
|
||||
start_http_server(METRICS_PORT)
|
||||
logger.info("Metrics HTTP 启动,端口 %d", METRICS_PORT)
|
||||
|
||||
threads = [
|
||||
threading.Thread(
|
||||
target=worker, args=(batch_q, do_download, args.batch_timeout)
|
||||
),
|
||||
threading.Thread(target=worker, args=(proc_q, do_process, args.batch_timeout)),
|
||||
threading.Thread(target=worker, args=(up_q, do_upload, args.batch_timeout)),
|
||||
]
|
||||
for t in threads:
|
||||
t.start()
|
||||
|
||||
# 更新指标 & 等待完成
|
||||
while batch_q.unfinished_tasks or proc_q.unfinished_tasks or up_q.unfinished_tasks:
|
||||
Q_BATCH.set(batch_q.qsize())
|
||||
Q_PROC.set(proc_q.qsize())
|
||||
Q_UP.set(up_q.qsize())
|
||||
LOCAL_FILES.set(count_local_files())
|
||||
time.sleep(1)
|
||||
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
||||
logger.info("文件模式处理完成,退出。")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def service_mode(args):
|
||||
# 确保目录存在
|
||||
for d in (INPUT_ROOT, OUTPUT_ROOT, EMPTY_DIR, LOG_DIR):
|
||||
os.makedirs(d, exist_ok=True)
|
||||
|
||||
# 启动 Prometheus 和指标更新
|
||||
start_http_server(METRICS_PORT)
|
||||
logger.info("Metrics HTTP 启动,端口 %d", METRICS_PORT)
|
||||
start_metric_updater()
|
||||
|
||||
# 启动后台 worker
|
||||
threads = [
|
||||
threading.Thread(
|
||||
target=worker, args=(batch_q, do_download, args.batch_timeout), daemon=True
|
||||
),
|
||||
threading.Thread(
|
||||
target=worker, args=(proc_q, do_process, args.batch_timeout), daemon=True
|
||||
),
|
||||
threading.Thread(
|
||||
target=worker, args=(up_q, do_upload, args.batch_timeout), daemon=True
|
||||
),
|
||||
]
|
||||
for t in threads:
|
||||
t.start()
|
||||
|
||||
# 启动 Flask
|
||||
logger.info("Decode Service 启动 HTTP on %s:%d", args.host, args.port)
|
||||
app.run(host=args.host, port=args.port, threaded=True)
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser()
|
||||
sub = p.add_subparsers(dest="mode", required=True)
|
||||
|
||||
f = sub.add_parser("file", help="文件模式:--tasks-file")
|
||||
f.add_argument("--tasks-file", required=True)
|
||||
f.add_argument("--batch-size", type=int, default=BATCH_SIZE)
|
||||
f.add_argument("--batch-timeout", type=int, default=3600)
|
||||
|
||||
s = sub.add_parser("service", help="服务模式:启动 HTTP ready/notify")
|
||||
s.add_argument("--batch-timeout", type=int, default=3600)
|
||||
s.add_argument("--host", default="0.0.0.0")
|
||||
s.add_argument("--port", type=int, default=5000)
|
||||
|
||||
args = p.parse_args()
|
||||
if args.mode == "file":
|
||||
file_mode(args)
|
||||
else:
|
||||
service_mode(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user