From bf845de31681c0ac8b14a88732b747ec597a9481 Mon Sep 17 00:00:00 2001
From: Dang Zerong
Date: Fri, 6 Mar 2026 16:00:15 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test.py | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 test.py

diff --git a/test.py b/test.py
new file mode 100644
index 0000000..a196a8e
--- /dev/null
+++ b/test.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+
+# PEP 723 metadata
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "huggingface-hub",
+#   "nltk",
+# ]
+# ///
+
+"""Download external files, NLTK data and Hugging Face repositories."""
+
+from huggingface_hub import snapshot_download
+from typing import Union
+import nltk
+import os
+import urllib.request
+import argparse
+
+
+def get_urls(use_china_mirrors=False) -> list[Union[str, list[str]]]:
+    """Return the files to download for the selected mirror set.
+
+    Args:
+        use_china_mirrors: When true, return the China-accessible mirror URLs
+            instead of the default upstream URLs.
+
+    Returns:
+        A list whose entries are either a URL string (saved under the last
+        path segment of the URL) or a ``[url, filename]`` pair that names
+        the local file explicitly.
+    """
+    if use_china_mirrors:
+        return [
+            "http://mirrors.tuna.tsinghua.edu.cn/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb",
+            "http://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_arm64.deb",
+            "https://repo.huaweicloud.com/repository/maven/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar",
+            "https://repo.huaweicloud.com/repository/maven/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar.md5",
+            "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
+            ["https://registry.npmmirror.com/-/binary/chrome-for-testing/121.0.6167.85/linux64/chrome-linux64.zip",
+             "chrome-linux64-121-0-6167-85"],
+            ["https://registry.npmmirror.com/-/binary/chrome-for-testing/121.0.6167.85/linux64/chromedriver-linux64.zip",
+             "chromedriver-linux64-121-0-6167-85"],
+        ]
+    else:
+        return [
+            "http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb",
+            "http://ports.ubuntu.com/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_arm64.deb",
+            "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar",
+            "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar.md5",
+            "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
+            ["https://storage.googleapis.com/chrome-for-testing-public/121.0.6167.85/linux64/chrome-linux64.zip",
+             "chrome-linux64-121-0-6167-85"],
+            ["https://storage.googleapis.com/chrome-for-testing-public/121.0.6167.85/linux64/chromedriver-linux64.zip",
+             "chromedriver-linux64-121-0-6167-85"],
+        ]
+
+
+# Hugging Face repositories mirrored under ./huggingface.co/<repo_id>.
+repos = [
+    "InfiniFlow/text_concat_xgb_v1.0",
+    "InfiniFlow/deepdoc",
+    "InfiniFlow/huqie",
+    "BAAI/bge-large-zh-v1.5",
+    "maidalun1020/bce-embedding-base_v1",
+]
+
+
+def download_model(repo_id):
+    """Snapshot the Hugging Face repo *repo_id* into ./huggingface.co/<repo_id>."""
+    local_dir = os.path.abspath(os.path.join("huggingface.co", repo_id))
+    os.makedirs(local_dir, exist_ok=True)
+    snapshot_download(repo_id=repo_id, local_dir=local_dir)
+
+
+def download_file(download_url, filename):
+    """Fetch *download_url* into *filename* unless that file already exists.
+
+    The data is written to ``<filename>.part`` and renamed only after the
+    transfer completes, so an interrupted run cannot leave a truncated file
+    that a later run would skip as already downloaded.
+    """
+    if os.path.exists(filename):
+        print(f"Skipping {filename}: already present")
+        return
+    print(f"Downloading {filename} from {download_url}...")
+    partial = filename + ".part"
+    try:
+        urllib.request.urlretrieve(download_url, partial)
+        os.replace(partial, filename)
+    finally:
+        # No-op after a successful rename; removes the leftover on failure.
+        if os.path.exists(partial):
+            os.remove(partial)
+
+
+def main():
+    """Parse the command line, then download files, NLTK data and models."""
+    parser = argparse.ArgumentParser(description='Download dependencies with optional China mirror support')
+    parser.add_argument('--china-mirrors', action='store_true', help='Use China-accessible mirrors for downloads')
+    args = parser.parse_args()
+
+    for url in get_urls(args.china_mirrors):
+        # An entry is either "url" or ["url", "local filename"].
+        if isinstance(url, list):
+            download_url, filename = url
+        else:
+            download_url, filename = url, url.split("/")[-1]
+        download_file(download_url, filename)
+
+    local_dir = os.path.abspath('nltk_data')
+    for data in ['wordnet', 'punkt', 'punkt_tab']:
+        print(f"Downloading nltk {data}...")
+        nltk.download(data, download_dir=local_dir)
+
+    for repo_id in repos:
+        print(f"Downloading huggingface repo {repo_id}...")
+        download_model(repo_id)
+
+
+if __name__ == "__main__":
+    main()