diff --git a/.gitignore b/.gitignore index 956cd63..fbf80b3 100644 --- a/.gitignore +++ b/.gitignore @@ -149,7 +149,7 @@ out # Nuxt.js build / generate output .nuxt dist - +ragflow_cli.egg-info # Gatsby files .cache/ # Comment in the public line in if your project uses Gatsby and not Next.js diff --git a/Dockerfile b/Dockerfile index 4524d59..4279875 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,197 @@ -# Application stage - builds on top of the base image -# First build the base image using: docker build -f Dockerfile.base -t ragflow-base:latest . -FROM ragflow-base:latest AS production +# base stage +FROM ubuntu:22.04 AS base +USER root +SHELL ["/bin/bash", "-c"] + +ARG NEED_MIRROR=0 +ARG LIGHTEN=0 +ENV LIGHTEN=${LIGHTEN} + +WORKDIR /ragflow + +# Copy models downloaded via download_deps.py +RUN mkdir -p /ragflow/rag/res/deepdoc /root/.ragflow +RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co,target=/huggingface.co \ + cp /huggingface.co/InfiniFlow/huqie/huqie.txt.trie /ragflow/rag/res/ && \ + tar --exclude='.*' -cf - \ + /huggingface.co/InfiniFlow/text_concat_xgb_v1.0 \ + /huggingface.co/InfiniFlow/deepdoc \ + | tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc +RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co,target=/huggingface.co \ + if [ "$LIGHTEN" != "1" ]; then \ + (tar -cf - \ + /huggingface.co/BAAI/bge-large-zh-v1.5 \ + /huggingface.co/maidalun1020/bce-embedding-base_v1 \ + | tar -xf - --strip-components=2 -C /root/.ragflow) \ + fi + +# https://github.com/chrismattmann/tika-python +# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache. +RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \ + cp -r /deps/nltk_data /root/ && \ + cp /deps/tika-server-standard-3.0.0.jar /deps/tika-server-standard-3.0.0.jar.md5 /ragflow/ && \ + cp /deps/cl100k_base.tiktoken /ragflow/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 + +ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard-3.0.0.jar" +ENV DEBIAN_FRONTEND=noninteractive + +# Setup apt +# Python package and implicit dependencies: +# opencv-python: libglib2.0-0 libglx-mesa0 libgl1 +# aspose-slides: pkg-config libicu-dev libgdiplus libssl1.1_1.1.1f-1ubuntu2_amd64.deb +# python-pptx: default-jdk tika-server-standard-3.0.0.jar +# selenium: libatk-bridge2.0-0 chrome-linux64-121-0-6167-85 +# Building C extensions: libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev +RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \ + if [ "$NEED_MIRROR" == "1" ]; then \ + sed -i 's|http://ports.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \ + sed -i 's|http://archive.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \ + fi; \ + rm -f /etc/apt/apt.conf.d/docker-clean && \ + echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache && \ + chmod 1777 /tmp && \ + apt update && \ + apt --no-install-recommends install -y ca-certificates && \ + apt update && \ + apt install -y libglib2.0-0 libglx-mesa0 libgl1 && \ + apt install -y pkg-config libicu-dev libgdiplus && \ + apt install -y default-jdk && \ + apt install -y libatk-bridge2.0-0 && \ + apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \ + apt install -y libjemalloc-dev && \ + apt install -y python3-pip pipx nginx unzip curl wget git vim less && \ + apt install -y ghostscript + +RUN if [ "$NEED_MIRROR" == "1" ]; then \ + pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ + pip3 config set global.trusted-host mirrors.aliyun.com; \ + mkdir -p /etc/uv && \ + echo "[[index]]" > /etc/uv/uv.toml && \ + echo 'url = "https://mirrors.aliyun.com/pypi/simple"' >> /etc/uv/uv.toml && \ + echo "default = true" >> /etc/uv/uv.toml; \ + fi; \ + pipx install uv + +ENV PYTHONDONTWRITEBYTECODE=1 DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 +ENV PATH=/root/.local/bin:$PATH + +# nodejs 12.22 on Ubuntu 22.04 is too old +RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \ + curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \ + apt purge -y nodejs npm cargo && \ + apt autoremove -y && \ + apt update && \ + apt install -y nodejs + +# A modern version of cargo is needed for the latest version of the Rust compiler. +RUN apt update && apt install -y curl build-essential \ + && if [ "$NEED_MIRROR" == "1" ]; then \ + # Use TUNA mirrors for rustup/rust dist files + export RUSTUP_DIST_SERVER="https://mirrors.tuna.tsinghua.edu.cn/rustup"; \ + export RUSTUP_UPDATE_ROOT="https://mirrors.tuna.tsinghua.edu.cn/rustup/rustup"; \ + echo "Using TUNA mirrors for Rustup."; \ + fi; \ + # Force curl to use HTTP/1.1 + curl --proto '=https' --tlsv1.2 --http1.1 -sSf https://sh.rustup.rs | bash -s -- -y --profile minimal \ + && echo 'export PATH="/root/.cargo/bin:${PATH}"' >> /root/.bashrc + +ENV PATH="/root/.cargo/bin:${PATH}" + +RUN cargo --version && rustc --version + +# Add msssql ODBC driver +# macOS ARM64 environment, install msodbcsql18. +# general x86_64 environment, install msodbcsql17. +RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \ + curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ + curl https://packages.microsoft.com/config/ubuntu/22.04/prod.list > /etc/apt/sources.list.d/mssql-release.list && \ + apt update && \ + arch="$(uname -m)"; \ + if [ "$arch" = "arm64" ] || [ "$arch" = "aarch64" ]; then \ + # ARM64 (macOS/Apple Silicon or Linux aarch64) + ACCEPT_EULA=Y apt install -y unixodbc-dev msodbcsql18; \ + else \ + # x86_64 or others + ACCEPT_EULA=Y apt install -y unixodbc-dev msodbcsql17; \ + fi || \ + { echo "Failed to install ODBC driver"; exit 1; } + + + +# Add dependencies of selenium +RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/chrome-linux64-121-0-6167-85,target=/chrome-linux64.zip \ + unzip /chrome-linux64.zip && \ + mv chrome-linux64 /opt/chrome && \ + ln -s /opt/chrome/chrome /usr/local/bin/ +RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/chromedriver-linux64-121-0-6167-85,target=/chromedriver-linux64.zip \ + unzip -j /chromedriver-linux64.zip chromedriver-linux64/chromedriver && \ + mv chromedriver /usr/local/bin/ && \ + rm -f /usr/bin/google-chrome + +# https://forum.aspose.com/t/aspose-slides-for-net-no-usable-version-of-libssl-found-with-linux-server/271344/13 +# aspose-slides on linux/arm64 is unavailable +RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \ + if [ "$(uname -m)" = "x86_64" ]; then \ + dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_amd64.deb; \ + elif [ "$(uname -m)" = "aarch64" ]; then \ + dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_arm64.deb; \ + fi + + +# builder stage +FROM base AS builder USER root WORKDIR /ragflow -# Copy application source code (these files change frequently) +# install dependencies from uv.lock file +COPY pyproject.toml uv.lock ./ + +# https://github.com/astral-sh/uv/issues/10462 +# uv records index url into uv.lock but doesn't failover among multiple indexes +RUN --mount=type=cache,id=ragflow_uv,target=/root/.cache/uv,sharing=locked \ + if [ "$NEED_MIRROR" == "1" ]; then \ + sed -i 's|pypi.org|mirrors.aliyun.com/pypi|g' uv.lock; \ + else \ + sed -i 's|mirrors.aliyun.com/pypi|pypi.org|g' uv.lock; \ + fi; \ + if [ "$LIGHTEN" == "1" ]; then \ + uv sync --python 3.10 --frozen; \ + else \ + uv sync --python 3.10 --frozen --all-extras; \ + fi + +COPY web web +COPY docs docs +RUN --mount=type=cache,id=ragflow_npm,target=/root/.npm,sharing=locked \ + cd web && npm install && npm run build + +COPY .git /ragflow/.git + +RUN version_info=$(git describe --tags --match=v* --first-parent --always); \ + if [ "$LIGHTEN" == "1" ]; then \ + version_info="$version_info slim"; \ + else \ + version_info="$version_info full"; \ + fi; \ + echo "RAGFlow version: $version_info"; \ + echo $version_info > /ragflow/VERSION + +# production stage +FROM base AS production +USER root + +WORKDIR /ragflow + +# Copy Python environment and packages +ENV VIRTUAL_ENV=/ragflow/.venv +COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" + +ENV PYTHONPATH=/ragflow/ + +COPY web web +COPY admin admin COPY api api COPY conf conf COPY deepdoc deepdoc @@ -13,14 +199,16 @@ COPY rag rag COPY agent agent COPY graphrag graphrag COPY agentic_reasoning agentic_reasoning -COPY pyproject.toml ./ +COPY pyproject.toml uv.lock ./ COPY mcp mcp COPY plugin plugin -# Copy configuration templates and entrypoint COPY docker/service_conf.yaml.template ./conf/service_conf.yaml.template COPY docker/entrypoint.sh ./ RUN chmod +x ./entrypoint*.sh -# Set the entrypoint -ENTRYPOINT ["./entrypoint.sh"] \ No newline at end of file +# Copy compiled web pages +COPY --from=builder /ragflow/web/dist /ragflow/web/dist + +COPY --from=builder /ragflow/VERSION /ragflow/VERSION +ENTRYPOINT ["./entrypoint.sh"] diff --git a/Dockerfile.base b/Dockerfile.base deleted file mode 100644 index dded3f7..0000000 --- a/Dockerfile.base +++ /dev/null @@ -1,189 +0,0 @@ -# base stage -FROM ubuntu:22.04 AS base -USER root -SHELL ["/bin/bash", "-c"] - -ARG NEED_MIRROR=0 -ARG LIGHTEN=0 -ENV LIGHTEN=${LIGHTEN} - -WORKDIR /ragflow - -# Copy models downloaded via download_deps.py -RUN mkdir -p /ragflow/rag/res/deepdoc /root/.ragflow -RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co,target=/huggingface.co \ - cp /huggingface.co/InfiniFlow/huqie/huqie.txt.trie /ragflow/rag/res/ && \ - tar --exclude='.*' -cf - \ - /huggingface.co/InfiniFlow/text_concat_xgb_v1.0 \ - /huggingface.co/InfiniFlow/deepdoc \ - | tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc -RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co,target=/huggingface.co \ - if [ "$LIGHTEN" != "1" ]; then \ - (tar -cf - \ - /huggingface.co/BAAI/bge-large-zh-v1.5 \ - /huggingface.co/maidalun1020/bce-embedding-base_v1 \ - | tar -xf - --strip-components=2 -C /root/.ragflow) \ - fi - -# https://github.com/chrismattmann/tika-python -# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache. -RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \ - cp -r /deps/nltk_data /root/ && \ - cp /deps/tika-server-standard-3.0.0.jar /deps/tika-server-standard-3.0.0.jar.md5 /ragflow/ && \ - cp /deps/cl100k_base.tiktoken /ragflow/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 - -ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard-3.0.0.jar" -ENV DEBIAN_FRONTEND=noninteractive - -# Setup apt -# Python package and implicit dependencies: -# opencv-python: libglib2.0-0 libglx-mesa0 libgl1 -# aspose-slides: pkg-config libicu-dev libgdiplus libssl1.1_1.1.1f-1ubuntu2_amd64.deb -# python-pptx: default-jdk tika-server-standard-3.0.0.jar -# selenium: libatk-bridge2.0-0 chrome-linux64-121-0-6167-85 -# Building C extensions: libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev -RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \ - if [ "$NEED_MIRROR" == "1" ]; then \ - sed -i 's|http://ports.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \ - sed -i 's|http://archive.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \ - fi; \ - rm -f /etc/apt/apt.conf.d/docker-clean && \ - echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache && \ - chmod 1777 /tmp && \ - apt update && \ - apt --no-install-recommends install -y ca-certificates && \ - apt update && \ - apt install -y libglib2.0-0 libglx-mesa0 libgl1 && \ - apt install -y pkg-config libicu-dev libgdiplus && \ - apt install -y default-jdk && \ - apt install -y libatk-bridge2.0-0 && \ - apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \ - apt install -y libjemalloc-dev && \ - apt install -y python3-pip pipx nginx unzip curl wget git vim less && \ - apt install -y ghostscript - -RUN if [ "$NEED_MIRROR" == "1" ]; then \ - pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ - pip3 config set global.trusted-host mirrors.aliyun.com; \ - mkdir -p /etc/uv && \ - echo "[[index]]" > /etc/uv/uv.toml && \ - echo 'url = "https://mirrors.aliyun.com/pypi/simple"' >> /etc/uv/uv.toml && \ - echo "default = true" >> /etc/uv/uv.toml; \ - fi; \ - pipx install uv - -ENV PYTHONDONTWRITEBYTECODE=1 DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 -ENV PATH=/root/.local/bin:$PATH - -# nodejs 12.22 on Ubuntu 22.04 is too old -RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \ - curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \ - apt purge -y nodejs npm cargo && \ - apt autoremove -y && \ - apt update && \ - apt install -y nodejs - -# A modern version of cargo is needed for the latest version of the Rust compiler. -RUN apt update && apt install -y curl build-essential \ - && if [ "$NEED_MIRROR" == "1" ]; then \ - # Use TUNA mirrors for rustup/rust dist files - export RUSTUP_DIST_SERVER="https://mirrors.tuna.tsinghua.edu.cn/rustup"; \ - export RUSTUP_UPDATE_ROOT="https://mirrors.tuna.tsinghua.edu.cn/rustup/rustup"; \ - echo "Using TUNA mirrors for Rustup."; \ - fi; \ - # Force curl to use HTTP/1.1 - curl --proto '=https' --tlsv1.2 --http1.1 -sSf https://sh.rustup.rs | bash -s -- -y --profile minimal \ - && echo 'export PATH="/root/.cargo/bin:${PATH}"' >> /root/.bashrc - -ENV PATH="/root/.cargo/bin:${PATH}" - -RUN cargo --version && rustc --version - -# Add msssql ODBC driver -# macOS ARM64 environment, install msodbcsql18. -# general x86_64 environment, install msodbcsql17. -RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \ - curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ - curl https://packages.microsoft.com/config/ubuntu/22.04/prod.list > /etc/apt/sources.list.d/mssql-release.list && \ - apt update && \ - arch="$(uname -m)"; \ - if [ "$arch" = "arm64" ] || [ "$arch" = "aarch64" ]; then \ - # ARM64 (macOS/Apple Silicon or Linux aarch64) - ACCEPT_EULA=Y apt install -y unixodbc-dev msodbcsql18; \ - else \ - # x86_64 or others - ACCEPT_EULA=Y apt install -y unixodbc-dev msodbcsql17; \ - fi || \ - { echo "Failed to install ODBC driver"; exit 1; } - -# Add dependencies of selenium -RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/chrome-linux64-121-0-6167-85,target=/chrome-linux64.zip \ - unzip /chrome-linux64.zip && \ - mv chrome-linux64 /opt/chrome && \ - ln -s /opt/chrome/chrome /usr/local/bin/ -RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/chromedriver-linux64-121-0-6167-85,target=/chromedriver-linux64.zip \ - unzip -j /chromedriver-linux64.zip chromedriver-linux64/chromedriver && \ - mv chromedriver /usr/local/bin/ && \ - rm -f /usr/bin/google-chrome - -# https://forum.aspose.com/t/aspose-slides-for-net-no-usable-version-of-libssl-found-with-linux-server/271344/13 -# aspose-slides on linux/arm64 is unavailable -RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \ - if [ "$(uname -m)" = "x86_64" ]; then \ - dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_amd64.deb; \ - elif [ "$(uname -m)" = "aarch64" ]; then \ - dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_arm64.deb; \ - fi - -# builder stage -FROM base AS builder -USER root - -WORKDIR /ragflow - -# install dependencies from uv.lock file -COPY pyproject.toml ./ -RUN uv lock --python 3.10 - -# https://github.com/astral-sh/uv/issues/10462 -# uv records index url into uv.lock but doesn't failover among multiple indexes -RUN --mount=type=cache,id=ragflow_uv,target=/root/.cache/uv,sharing=locked \ - if [ "$NEED_MIRROR" == "1" ]; then \ - sed -i 's|pypi.org|mirrors.aliyun.com/pypi|g' uv.lock; \ - else \ - sed -i 's|mirrors.aliyun.com/pypi|pypi.org|g' uv.lock; \ - fi; \ - if [ "$LIGHTEN" == "1" ]; then \ - uv sync --python 3.10 --frozen; \ - else \ - uv sync --python 3.10 --frozen --all-extras; \ - fi - -RUN --mount=type=cache,id=ragflow_npm,target=/root/.npm,sharing=locked - -COPY .git /ragflow/.git - -RUN version_info=$(git describe --tags --match=v* --first-parent --always); \ - if [ "$LIGHTEN" == "1" ]; then \ - version_info="$version_info slim"; \ - else \ - version_info="$version_info full"; \ - fi; \ - echo "RAGFlow version: $version_info"; \ - echo $version_info > /ragflow/VERSION - -# Final base image with Python environment -FROM base AS ragflow-base -USER root - -WORKDIR /ragflow - -# Copy Python environment and packages from builder -ENV VIRTUAL_ENV=/ragflow/.venv -COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} -ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" - -ENV PYTHONPATH=/ragflow/ - -# Copy version info -COPY --from=builder /ragflow/VERSION /ragflow/VERSION diff --git a/README.md b/README.md index 915000b..2730ff1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
@@ -22,7 +22,7 @@