diff --git a/.github/workflows/deploy-gh-pages.yml b/.github/workflows/deploy-gh-pages.yml index a665710d..eb1f737c 100644 --- a/.github/workflows/deploy-gh-pages.yml +++ b/.github/workflows/deploy-gh-pages.yml @@ -6,8 +6,7 @@ on: - main paths: - 'docs/**' - # Review gh actions docs if you want to further define triggers, paths, etc - # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on + workflow_dispatch: jobs: deploy: diff --git a/.gitignore b/.gitignore index 8bf471e7..970b5bec 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,4 @@ wheels/ *.json .DS_Store -config.yaml +config/ diff --git a/config/config.example.yaml b/config/config.example.yaml new file mode 100644 index 00000000..410025e7 --- /dev/null +++ b/config/config.example.yaml @@ -0,0 +1,15 @@ +# OpenRAG Configuration File +provider: + model_provider: "openai" # openai, anthropic, azure, etc. + api_key: "your-api-key" # or use OPENAI_API_KEY env var + +knowledge: + embedding_model: "text-embedding-3-small" + chunk_size: 1000 + chunk_overlap: 200 + ocr: true + picture_descriptions: false + +agent: + llm_model: "gpt-4o-mini" + system_prompt: "You are a helpful AI assistant..." 
\ No newline at end of file diff --git a/docker-compose-cpu.yml b/docker-compose-cpu.yml index d0de6ce9..9b0ff88b 100644 --- a/docker-compose-cpu.yml +++ b/docker-compose-cpu.yml @@ -74,6 +74,7 @@ services: - ./documents:/app/documents:Z - ./keys:/app/keys:Z - ./flows:/app/flows:Z + - ./config:/app/config:z openrag-frontend: image: phact/openrag-frontend:${OPENRAG_VERSION:-latest} diff --git a/docker-compose.yml b/docker-compose.yml index daa921ae..34a5947f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -73,6 +73,7 @@ services: - ./documents:/app/documents:Z - ./keys:/app/keys:Z - ./flows:/app/flows:z + - ./config:/app/config:z gpus: all openrag-frontend: diff --git a/docs/VERSIONING_SETUP.md b/docs/VERSIONING_SETUP.md new file mode 100644 index 00000000..17a8c8f6 --- /dev/null +++ b/docs/VERSIONING_SETUP.md @@ -0,0 +1,111 @@ +# Docusaurus versioning setup + +Docs versioning is currently **DISABLED** but configured and ready to enable. +The configuration is found in `docusaurus.config.js` with commented-out sections. + +To enable versioning, do the following: + +1. Open `docusaurus.config.js` +2. Find the versioning configuration section (around line 57) +3. Uncomment the versioning configuration: + +```javascript +docs: { + // ... other config + lastVersion: 'current', // Use 'current' to make ./docs the latest version + versions: { + current: { + label: 'Next (unreleased)', + path: 'next', + }, + }, + onlyIncludeVersions: ['current'], // Limit versions for faster builds +}, +``` + +## Create docs versions + +See the [Docusaurus docs](https://docusaurus.io/docs/versioning) for more info. + +1. Use the Docusaurus CLI command to create a version. +You can use `yarn` instead of `npm`. 
+```bash +# Create version 1.0.0 from current docs +npm run docusaurus docs:version 1.0.0 +``` + +This command will: +- Copy the full `docs/` folder contents into `versioned_docs/version-1.0.0/` +- Create a versioned sidebar file at `versioned_sidebars/version-1.0.0-sidebars.json` +- Append the new version to `versions.json` + +2. After creating a version, update the Docusaurus configuration to include multiple versions. +`lastVersion: '1.0.0'` makes the '1.0.0' release the `latest` version. +`current` is the work-in-progress docset, accessible at `/docs/next`. +To remove a version, remove it from `onlyIncludeVersions`. + +```javascript +docs: { + // ... other config + lastVersion: '1.0.0', // Make 1.0.0 the latest version + versions: { + current: { + label: 'Next (unreleased)', + path: 'next', + }, + '1.0.0': { + label: '1.0.0', + path: '1.0.0', + }, + }, + onlyIncludeVersions: ['current', '1.0.0'], // Include both versions +}, +``` + +3. Test the deployment locally. + +```bash +npm run build +npm run serve +``` + +4. To add subsequent versions, repeat the process, first running the CLI command then updating `docusaurus.config.js`. + +```bash +# Create version 2.0.0 from current docs +npm run docusaurus docs:version 2.0.0 +``` + +After creating a new version, update `docusaurus.config.js`. + +```javascript +docs: { + lastVersion: '2.0.0', // Make 2.0.0 the latest version + versions: { + current: { + label: 'Next (unreleased)', + path: 'next', + }, + '2.0.0': { + label: '2.0.0', + path: '2.0.0', + }, + '1.0.0': { + label: '1.0.0', + path: '1.0.0', + }, + }, + onlyIncludeVersions: ['current', '2.0.0', '1.0.0'], // Include all versions +}, +``` + +## Disable versioning + +1. Remove the `versions` configuration from `docusaurus.config.js`. +2. Delete the `docs/versioned_docs/` and `docs/versioned_sidebars/` directories. +3. Delete `docs/versions.json`. 
+ +## References + +- [Official Docusaurus Versioning Documentation](https://docusaurus.io/docs/versioning) +- [Docusaurus Versioning Best Practices](https://docusaurus.io/docs/versioning#recommended-practices) \ No newline at end of file diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx new file mode 100644 index 00000000..7e5afb20 --- /dev/null +++ b/docs/docs/core-components/ingestion.mdx @@ -0,0 +1,50 @@ +--- +title: Docling Ingestion +slug: /ingestion +--- + +import Icon from "@site/src/components/icon/icon"; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx'; + +OpenRAG uses [Docling](https://docling-project.github.io/docling/) for its document ingestion pipeline. +More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling-serve` process on your local machine and runs Docling ingestion through an API service. + +Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch `documents` index. + +OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images. + +## Docling ingestion settings + +These settings configure the Docling ingestion parameters. + +OpenRAG will warn you if `docling-serve` is not running. +To start or stop `docling-serve` or any other native services, in the TUI main menu, click **Start Native Services** or **Stop Native Services**. + +**Embedding model** determines which AI model is used to create vector embeddings. The default is `text-embedding-3-small`. + +**Chunk size** determines how large each text chunk is in number of characters. +Larger chunks yield more context per chunk, but may include irrelevant information. 
Smaller chunks yield more precise semantic search, but may lack context. +The default value of `1000` characters provides a good starting point that balances these considerations. + +**Chunk overlap** controls the number of characters that overlap across chunk boundaries. +Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important. +The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents. + +**OCR** enables or disables OCR processing when extracting text from images and scanned documents. +OCR is disabled by default. Leaving OCR disabled is best suited for processing text-based documents as quickly as possible with Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/). Images are ignored and not processed. + +Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance. + +If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [ocrmac](https://www.piwheels.org/project/ocrmac/) OCR engine. Other platforms use [easyocr](https://www.jaided.ai/easyocr/). + +**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. Enabling picture descriptions can slow ingestion performance. + +## Use OpenRAG default ingestion instead of Docling Serve + +If you want to use OpenRAG's built-in pipeline instead of Docling Serve, set `DISABLE_INGEST_WITH_LANGFLOW=true` in [Environment variables](/configure/configuration#ingestion-configuration). 
+ +The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API. + +For more information, see [`processors.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58). \ No newline at end of file diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx index 2ea5ef9f..852991a7 100644 --- a/docs/docs/core-components/knowledge.mdx +++ b/docs/docs/core-components/knowledge.mdx @@ -97,6 +97,10 @@ You can monitor the sync progress in the