change documents directory

This commit is contained in:
Mike Fortman 2025-11-26 12:59:40 -06:00
parent 59b88a7020
commit d6466db5a1
18 changed files with 34 additions and 17 deletions

View file

@ -81,7 +81,7 @@ services:
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
volumes: volumes:
- ./documents:/app/documents:Z - ./openrag-documents:/app/documents:Z
- ./keys:/app/keys:Z - ./keys:/app/keys:Z
- ./flows:/app/flows:U,z - ./flows:/app/flows:U,z

View file

@ -80,7 +80,7 @@ services:
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
volumes: volumes:
- ./documents:/app/documents:Z - ./openrag-documents:/app/documents:Z
- ./keys:/app/keys:Z - ./keys:/app/keys:Z
- ./flows:/app/flows:U,z - ./flows:/app/flows:U,z

View file

@ -42,7 +42,7 @@ If you are using GitHub pages for hosting, this command is a convenient way to b
## Update the OpenRAG documentation PDF ## Update the OpenRAG documentation PDF
The documentation PDF at `openrag/documents/openrag-documentation.pdf` is used by the OpenRAG application, so keep it up to date. The documentation PDF at `openrag/openrag-documents/openrag-documentation.pdf` is used by the OpenRAG application, so keep it up to date.
To update the PDF, do the following: To update the PDF, do the following:
@ -68,7 +68,7 @@ To remove these items, give the following prompt or something similar to your ID
2. Check your `.mdx` files to confirm these elements are removed. 2. Check your `.mdx` files to confirm these elements are removed.
Don't commit the changes. Don't commit the changes.
3. From `openrag/docs`, run this command to build the site with the changes, and create a PDF at `openrag/documents`. 3. From `openrag/docs`, run this command to build the site with the changes, and create a PDF at `openrag/openrag-documents`.
``` ```
npm run build:pdf npm run build:pdf

View file

@ -29,7 +29,7 @@ To configure the knowledge ingestion pipeline parameters, see [Docling Ingestion
The **Knowledge Ingest** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database. The **Knowledge Ingest** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
The default path to your local folder is mounted from the `./documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose. The default path to your local folder is mounted from the `./openrag-documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose.
To load and process a single file from the mapped location, click **Add Knowledge**, and then click <Icon name="File" aria-hidden="true"/> **File**. To load and process a single file from the mapped location, click **Add Knowledge**, and then click <Icon name="File" aria-hidden="true"/> **File**.
The file is loaded into your OpenSearch database, and appears in the Knowledge page. The file is loaded into your OpenSearch database, and appears in the Knowledge page.

View file

@ -187,7 +187,7 @@ docker compose up -d --force-recreate
Reset state by rebuilding all of your containers. Reset state by rebuilding all of your containers.
Your OpenSearch and Langflow databases will be lost. Your OpenSearch and Langflow databases will be lost.
Documents stored in the `./documents` directory will persist, since the directory is mounted as a volume in the OpenRAG backend container. Documents stored in the `./openrag-documents` directory will persist, since the directory is mounted as a volume in the OpenRAG backend container.
```bash ```bash
docker compose up --build --force-recreate --remove-orphans docker compose up --build --force-recreate --remove-orphans

View file

@ -101,7 +101,7 @@ You can click a document to view the chunks of the document as they are stored i
For this quickstart, use either the <Icon name="File" aria-hidden="true"/> **File** or <Icon name="Folder" aria-hidden="true"/> **Folder** upload options to load documents from your local machine. For this quickstart, use either the <Icon name="File" aria-hidden="true"/> **File** or <Icon name="Folder" aria-hidden="true"/> **Folder** upload options to load documents from your local machine.
**Folder** uploads an entire directory. **Folder** uploads an entire directory.
The default directory is the `/documents` subdirectory in your OpenRAG installation directory. The default directory is the `/openrag-documents` subdirectory in your OpenRAG installation directory.
For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/knowledge#oauth-ingestion). For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/knowledge#oauth-ingestion).

View file

@ -81,7 +81,7 @@ For more information, see [Ingestion](/ingestion).
| `DISABLE_INGEST_WITH_LANGFLOW` | `false` | Disable Langflow ingestion pipeline. | | `DISABLE_INGEST_WITH_LANGFLOW` | `false` | Disable Langflow ingestion pipeline. |
| `DOCLING_OCR_ENGINE` | - | OCR engine for document processing. | | `DOCLING_OCR_ENGINE` | - | OCR engine for document processing. |
| `OCR_ENABLED` | `false` | Enable OCR for image processing. | | `OCR_ENABLED` | `false` | Enable OCR for image processing. |
| `OPENRAG_DOCUMENTS_PATHS` | `./documents` | Document paths for ingestion. | | `OPENRAG_DOCUMENTS_PATHS` | `./openrag-documents` | Document paths for ingestion. |
| `PICTURE_DESCRIPTIONS_ENABLED` | `false` | Enable picture descriptions. | | `PICTURE_DESCRIPTIONS_ENABLED` | `false` | Enable picture descriptions. |
### Langflow settings ### Langflow settings

View file

@ -6,7 +6,7 @@
"docusaurus": "docusaurus", "docusaurus": "docusaurus",
"start": "docusaurus start", "start": "docusaurus start",
"build": "docusaurus build", "build": "docusaurus build",
"build:pdf": "rm -f ../documents/openrag-documentation.pdf && npm run build && npm run serve & sleep 10 && npx docusaurus-to-pdf && pkill -f 'docusaurus serve'", "build:pdf": "rm -f ../openrag-documents/openrag-documentation.pdf && npm run build && npm run serve & sleep 10 && npx docusaurus-to-pdf && pkill -f 'docusaurus serve'",
"swizzle": "docusaurus swizzle", "swizzle": "docusaurus swizzle",
"deploy": "docusaurus deploy", "deploy": "docusaurus deploy",
"clear": "docusaurus clear", "clear": "docusaurus clear",

View file

@ -1,7 +1,7 @@
{ {
"baseUrl": "http://localhost:3000", "baseUrl": "http://localhost:3000",
"entryPoint": "http://localhost:3000", "entryPoint": "http://localhost:3000",
"outputDir": "../documents/openrag-documentation.pdf", "outputDir": "../openrag-documents/openrag-documentation.pdf",
"customStyles": "table { max-width: 3500px !important; } .navbar, .footer, .breadcrumbs { display: none !important; }", "customStyles": "table { max-width: 3500px !important; } .navbar, .footer, .breadcrumbs { display: none !important; }",
"forceImages": true "forceImages": true
} }

View file

@ -2,6 +2,7 @@
from connectors.langflow_connector_service import LangflowConnectorService from connectors.langflow_connector_service import LangflowConnectorService
from connectors.service import ConnectorService from connectors.service import ConnectorService
from services.flows_service import FlowsService from services.flows_service import FlowsService
from utils.container_utils import detect_container_environment
from utils.embeddings import create_dynamic_index_body from utils.embeddings import create_dynamic_index_body
from utils.logging_config import configure_from_env, get_logger from utils.logging_config import configure_from_env, get_logger
@ -13,6 +14,7 @@ import atexit
import mimetypes import mimetypes
import multiprocessing import multiprocessing
import os import os
import shutil
import subprocess import subprocess
from functools import partial from functools import partial
@ -300,6 +302,21 @@ async def init_index_when_ready():
) )
def _get_documents_dir():
    """Return the absolute path of the default documents directory.

    When ``detect_container_environment()`` reports a container runtime, the
    directory is the fixed mount point ``/app/documents``. Otherwise it is the
    ``openrag-documents`` folder under the current working directory.
    """
    runtime = detect_container_environment()
    if not runtime:
        # No container detected: resolve relative to the process's working directory.
        local_dir = os.path.abspath(os.path.join(os.getcwd(), "openrag-documents"))
        logger.debug(f"Running locally, using local path: {local_dir}")
        return local_dir

    # Inside a container the documents volume lives at a fixed location.
    mounted_dir = os.path.abspath("/app/documents")
    logger.debug(f"Running in {runtime}, using container path: {mounted_dir}")
    return mounted_dir
async def ingest_default_documents_when_ready(services): async def ingest_default_documents_when_ready(services):
"""Scan the local documents folder and ingest files like a non-auth upload.""" """Scan the local documents folder and ingest files like a non-auth upload."""
try: try:
@ -307,7 +324,7 @@ async def ingest_default_documents_when_ready(services):
"Ingesting default documents when ready", "Ingesting default documents when ready",
disable_langflow_ingest=DISABLE_INGEST_WITH_LANGFLOW, disable_langflow_ingest=DISABLE_INGEST_WITH_LANGFLOW,
) )
base_dir = os.path.abspath(os.path.join(os.getcwd(), "documents")) base_dir = _get_documents_dir()
if not os.path.isdir(base_dir): if not os.path.isdir(base_dir):
logger.info( logger.info(
"Default documents directory not found; skipping ingestion", "Default documents directory not found; skipping ingestion",

View file

@ -455,7 +455,7 @@ def _copy_assets(resource_tree, destination: Path, allowed_suffixes: Optional[It
def copy_sample_documents(*, force: bool = False) -> None: def copy_sample_documents(*, force: bool = False) -> None:
"""Copy sample documents from package to current directory if they don't exist.""" """Copy sample documents from package to current directory if they don't exist."""
documents_dir = Path("documents") documents_dir = Path("openrag-documents")
try: try:
assets_files = files("tui._assets.documents") assets_files = files("tui._assets.documents")

View file

@ -64,7 +64,7 @@ class EnvConfig:
nudges_flow_id: str = "ebc01d31-1976-46ce-a385-b0240327226c" nudges_flow_id: str = "ebc01d31-1976-46ce-a385-b0240327226c"
# Document paths (comma-separated) # Document paths (comma-separated)
openrag_documents_paths: str = "./documents" openrag_documents_paths: str = "./openrag-documents"
# OpenSearch data path # OpenSearch data path
opensearch_data_path: str = "./opensearch-data" opensearch_data_path: str = "./opensearch-data"
@ -454,7 +454,7 @@ class EnvManager:
( (
"openrag_documents_paths", "openrag_documents_paths",
"Documents Paths", "Documents Paths",
"./documents,/path/to/more/docs", "./openrag-documents,/path/to/more/docs",
False, False,
), ),
] ]
@ -521,7 +521,7 @@ class EnvManager:
) )
if not is_valid: if not is_valid:
return ["./documents:/app/documents:Z"] # fallback return ["./openrag-documents:/app/documents:Z"] # fallback
volume_mounts = [] volume_mounts = []
for i, path in enumerate(validated_paths): for i, path in enumerate(validated_paths):

View file

@ -523,7 +523,7 @@ class ConfigScreen(Screen):
yield Label("Documents Paths") yield Label("Documents Paths")
current_value = getattr(self.env_manager.config, "openrag_documents_paths", "") current_value = getattr(self.env_manager.config, "openrag_documents_paths", "")
input_widget = Input( input_widget = Input(
placeholder="./documents,/path/to/more/docs", placeholder="./openrag-documents,/path/to/more/docs",
value=current_value, value=current_value,
validators=[DocumentsPathValidator()], validators=[DocumentsPathValidator()],
id="input-openrag_documents_paths", id="input-openrag_documents_paths",

View file

@ -29,7 +29,7 @@ async def wait_for_ready(client: httpx.AsyncClient, timeout_s: float = 30.0):
def count_files_in_documents() -> int: def count_files_in_documents() -> int:
base_dir = Path(os.getcwd()) / "documents" base_dir = Path(os.getcwd()) / "openrag-documents"
if not base_dir.is_dir(): if not base_dir.is_dir():
return 0 return 0
return sum(1 for _ in base_dir.rglob("*") if _.is_file() and _.name not in EXCLUDED_INGESTION_FILES) return sum(1 for _ in base_dir.rglob("*") if _.is_file() and _.name not in EXCLUDED_INGESTION_FILES)