Put huqie.txt in RAGFlow image

2025-12-02 16:51:58 +08:00 · 2025-12-02 16:51:58 +08:00 · cfc63d3c4a
commit cfc63d3c4a
parent 27b0550876
4 changed files with 15 additions and 555632 deletions
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,11 +10,14 @@ WORKDIR /ragflow
 # Copy models downloaded via download_deps.py
 RUN mkdir -p /ragflow/rag/res/deepdoc /root/.ragflow
 RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co,target=/huggingface.co \
    cp /huggingface.co/InfiniFlow/huqie/huqie.txt.trie /ragflow/rag/res/ && \
    tar --exclude='.*' -cf - \
        /huggingface.co/InfiniFlow/text_concat_xgb_v1.0 \
        /huggingface.co/InfiniFlow/deepdoc \
-        | tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc 
+        | tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc
 RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/rag,target=/rag \
    mkdir -p /usr/share/infinity/resource/rag/ && \
    cp /rag/huqie.txt /usr/share/infinity/resource/rag/
 # https://github.com/chrismattmann/tika-python
 # This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache.
--- a/Dockerfile.deps
+++ b/Dockerfile.deps
@@ -8,3 +8,5 @@ COPY chromedriver-linux64-121-0-6167-85 chrome-linux64-121-0-6167-85 cl100k_base
 COPY nltk_data /nltk_data
 COPY huggingface.co /huggingface.co
 COPY resource/rag/ /rag
--- a/download_deps.py
+++ b/download_deps.py
@@ -5,11 +5,14 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #   "nltk",
 #   "huggingface_hub",
 #   "gitpython"
 # ]
 # ///
 import argparse
 import os
 import git
 import urllib.request
 from typing import Union
@@ -43,7 +46,6 @@ def get_urls(use_china_mirrors=False) -> list[Union[str, list[str]]]:
 repos = [
    "InfiniFlow/text_concat_xgb_v1.0",
    "InfiniFlow/deepdoc",
    "InfiniFlow/huqie",
 ]
@@ -75,3 +77,8 @@ if __name__ == "__main__":
    for repo_id in repos:
        print(f"Downloading huggingface repo {repo_id}...")
        download_model(repo_id)
    repo_url = "https://github.com/infiniflow/resource.git"
    clone_dir = os.path.abspath("resource")
    print(f"Cloning GitHub repo {repo_url}...")
    repo = git.Repo.clone_from(repo_url, clone_dir)
--- a/rag/res/huqie.txt
+++ b/rag/res/huqie.txt