Put huqie.txt in RAGFlow image

This commit is contained in:
Ling Qin 2025-12-02 16:51:58 +08:00
parent 27b0550876
commit cfc63d3c4a
4 changed files with 15 additions and 555632 deletions

View file

@ -10,11 +10,14 @@ WORKDIR /ragflow
# Copy models downloaded via download_deps.py # Copy models downloaded via download_deps.py
RUN mkdir -p /ragflow/rag/res/deepdoc /root/.ragflow RUN mkdir -p /ragflow/rag/res/deepdoc /root/.ragflow
RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co,target=/huggingface.co \ RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co,target=/huggingface.co \
cp /huggingface.co/InfiniFlow/huqie/huqie.txt.trie /ragflow/rag/res/ && \
tar --exclude='.*' -cf - \ tar --exclude='.*' -cf - \
/huggingface.co/InfiniFlow/text_concat_xgb_v1.0 \ /huggingface.co/InfiniFlow/text_concat_xgb_v1.0 \
/huggingface.co/InfiniFlow/deepdoc \ /huggingface.co/InfiniFlow/deepdoc \
| tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc | tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc
RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/rag,target=/rag \
mkdir -p /usr/share/infinity/resource/rag/ && \
cp /rag/huqie.txt /usr/share/infinity/resource/rag/
# https://github.com/chrismattmann/tika-python # https://github.com/chrismattmann/tika-python
# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache. # This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache.

View file

@ -8,3 +8,5 @@ COPY chromedriver-linux64-121-0-6167-85 chrome-linux64-121-0-6167-85 cl100k_base
COPY nltk_data /nltk_data COPY nltk_data /nltk_data
COPY huggingface.co /huggingface.co COPY huggingface.co /huggingface.co
COPY resource/rag/ /rag

View file

@ -5,11 +5,14 @@
# requires-python = ">=3.10" # requires-python = ">=3.10"
# dependencies = [ # dependencies = [
# "nltk", # "nltk",
# "huggingface_hub",
# "gitpython"
# ] # ]
# /// # ///
import argparse import argparse
import os import os
import git
import urllib.request import urllib.request
from typing import Union from typing import Union
@ -43,7 +46,6 @@ def get_urls(use_china_mirrors=False) -> list[Union[str, list[str]]]:
repos = [ repos = [
"InfiniFlow/text_concat_xgb_v1.0", "InfiniFlow/text_concat_xgb_v1.0",
"InfiniFlow/deepdoc", "InfiniFlow/deepdoc",
"InfiniFlow/huqie",
] ]
@ -75,3 +77,8 @@ if __name__ == "__main__":
for repo_id in repos: for repo_id in repos:
print(f"Downloading huggingface repo {repo_id}...") print(f"Downloading huggingface repo {repo_id}...")
download_model(repo_id) download_model(repo_id)
repo_url = "https://github.com/infiniflow/resource.git"
clone_dir = os.path.abspath("resource")
print(f"Cloning GitHub repo {repo_url}...")
repo = git.Repo.clone_from(repo_url, clone_dir)

File diff suppressed because it is too large Load diff