Merge remote-tracking branch 'origin/main' into feat/COG-184-add-falkordb

This commit is contained in:
Boris Arzentar 2024-10-29 17:25:19 +01:00
commit 62022a85f3
9 changed files with 1477 additions and 1430 deletions

View file

@ -17,22 +17,21 @@ permissions:
contents: write contents: write
jobs: jobs:
get_docs_changes:
name: run docs changes
# get_docs_changes: uses: ./.github/workflows/get_docs_changes.yml
# name: run docs changes
# uses: ./.github/workflows/get_docs_changes.yml
deploy: deploy:
runs-on: ubuntu-latest runs-on: ubuntu-latest
# needs: get_docs_changes needs: get_docs_changes
if: needs.get_docs_changes.outputs.changes_outside_docs == 'false'
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@master uses: actions/checkout@master
- name: Install Poetry - name: Install Poetry
uses: snok/install-poetry@v1.3.1 uses: snok/install-poetry@v1.3.1
- name: Use output - name: Use output
run: echo "The stage is finished" run: echo "The stage is finished"
@ -41,7 +40,6 @@ jobs:
with: with:
python-version: '3.11.x' python-version: '3.11.x'
- name: Install APT packages - name: Install APT packages
run: | run: |
sudo apt-get update && sudo apt-get update &&
@ -49,9 +47,10 @@ jobs:
- name: Install via Poetry - name: Install via Poetry
run: poetry install --with dev,docs run: poetry install --with dev,docs
env: env:
GH_TOKEN: ${{ secrets.PAT_FOR_CROSS_REPOS_CICD_TRIGGERING }} GH_TOKEN: ${{ secrets.PAT_FOR_CROSS_REPOS_CICD_TRIGGERING }}
- name: Build and deploy MkDocs - name: Build and deploy MkDocs
run: poetry run mkdocs gh-deploy --force run: poetry run mkdocs gh-deploy --force
env:
DOCS_SEGMENT_KEY: ${{ secrets.DOCS_SEGMENT_KEY }}

61
.github/workflows/test_notebook.yml vendored Normal file
View file

@ -0,0 +1,61 @@
# Executes notebooks/cognee_demo.ipynb with nbconvert on pull requests that
# target main, and on manual dispatch.
name: test | notebook

on:
  pull_request:
    branches:
      - main
  workflow_dispatch:

# A newer run for the same pull request (or ref) cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  RUNTIME__LOG_LEVEL: ERROR

jobs:
  get_docs_changes:
    name: docs changes
    uses: ./.github/workflows/get_docs_changes.yml

  run_notebook_test:
    name: test
    needs: get_docs_changes
    # Runs only when get_docs_changes reports changes_outside_docs == 'true'.
    if: needs.get_docs_changes.outputs.changes_outside_docs == 'true'
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash
    steps:
      - name: Check out
        # Pinned to a release tag rather than the moving master branch.
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11.x'

      - name: Install Poetry
        uses: snok/install-poetry@v1.3.2
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true

      - name: Install dependencies
        run: |
          poetry install --no-interaction
          poetry add jupyter --no-interaction

      - name: Execute Jupyter Notebook
        env:
          ENV: 'dev'
          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
          GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
        # The executed copy is written to executed_notebook.ipynb; each cell
        # may run for up to 1200 seconds.
        run: |
          poetry run jupyter nbconvert \
            --to notebook \
            --execute notebooks/cognee_demo.ipynb \
            --output executed_notebook.ipynb \
            --ExecutePreprocessor.timeout=1200

View file

@ -200,7 +200,7 @@ Cognee supports a variety of tools and services for different operations:
## Demo ## Demo
Check out our demo notebook [here](https://github.com/topoteretes/cognee/blob/main/notebooks/cognee%20-%20Get%20Started.ipynb) Check out our demo notebook [here](https://github.com/topoteretes/cognee/blob/main/notebooks/cognee_demo.ipynb)

View file

@ -1,7 +1,9 @@
from os import path from os import path
from typing import AsyncGenerator from uuid import UUID
from typing import Optional
from typing import AsyncGenerator, List
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from sqlalchemy import text, select from sqlalchemy import text, select, MetaData, Table
from sqlalchemy.orm import joinedload from sqlalchemy.orm import joinedload
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
@ -50,11 +52,14 @@ class SQLAlchemyAdapter():
await connection.execute(text(f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} ({', '.join(fields_query_parts)});")) await connection.execute(text(f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} ({', '.join(fields_query_parts)});"))
await connection.close() await connection.close()
async def delete_table(self, table_name: str): async def delete_table(self, table_name: str, schema_name: Optional[str] = "public"):
async with self.engine.begin() as connection: async with self.engine.begin() as connection:
await connection.execute(text(f"DROP TABLE IF EXISTS {table_name} CASCADE;")) if self.engine.dialect.name == "sqlite":
# SQLite doesn't support schema namespaces and the CASCADE keyword.
await connection.close() # However, foreign key constraint can be defined with ON DELETE CASCADE during table creation.
await connection.execute(text(f"DROP TABLE IF EXISTS {table_name};"))
else:
await connection.execute(text(f"DROP TABLE IF EXISTS {schema_name}.{table_name} CASCADE;"))
async def insert_data(self, schema_name: str, table_name: str, data: list[dict]): async def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
columns = ", ".join(data[0].keys()) columns = ", ".join(data[0].keys())
@ -65,6 +70,55 @@ class SQLAlchemyAdapter():
await connection.execute(insert_query, data) await connection.execute(insert_query, data)
await connection.close() await connection.close()
async def get_schema_list(self) -> List[str]:
    """
    List the schema names present in the database.

    Only the "postgresql" dialect is queried; every other dialect yields an
    empty list. The 'pg_catalog', 'pg_toast' and 'information_schema' schemas
    are excluded from the result.
    """
    if self.engine.dialect.name != "postgresql":
        return []

    schema_query = text(
        "\n"
        "SELECT schema_name FROM information_schema.schemata\n"
        "WHERE schema_name NOT IN ('pg_catalog', 'pg_toast', 'information_schema');\n"
    )
    async with self.engine.begin() as connection:
        rows = await connection.execute(schema_query)
        # Each row holds a single column: the schema name.
        return [row[0] for row in rows.fetchall()]
async def delete_data_by_id(self, table_name: str, data_id: UUID, schema_name: Optional[str] = "public"):
    """
    Remove the rows whose id column equals data_id from the given table.

    The table is resolved through get_table and must expose an id column.
    """
    async with self.get_async_session() as session:
        target_table = await self.get_table(table_name, schema_name)
        delete_statement = target_table.delete().where(target_table.c.id == data_id)
        await session.execute(delete_statement)
        await session.commit()
async def get_table(self, table_name: str, schema_name: Optional[str] = "public") -> Table:
    """
    Reflect the database and return the Table object for the requested table.

    On SQLite the table is looked up by its bare name in Base.metadata and
    schema_name is not used. On other dialects the given schema is reflected
    into a fresh MetaData and the table is looked up as
    "<schema_name>.<table_name>".

    Raises:
        ValueError: if the table is not among the reflected tables.
    """
    async with self.engine.begin() as connection:
        if self.engine.dialect.name == "sqlite":
            # Reflection adds the database's tables to the shared Base.metadata.
            await connection.run_sync(Base.metadata.reflect)
            reflected_tables = Base.metadata.tables
            lookup_key = table_name
        else:
            schema_metadata = MetaData()
            await connection.run_sync(schema_metadata.reflect, schema=schema_name)
            reflected_tables = schema_metadata.tables
            # Tables reflected with a schema are keyed by their qualified name.
            lookup_key = f"{schema_name}.{table_name}"

        if lookup_key not in reflected_tables:
            raise ValueError(f"Table '{lookup_key}' not found.")
        return reflected_tables[lookup_key]
async def get_data(self, table_name: str, filters: dict = None): async def get_data(self, table_name: str, filters: dict = None):
async with self.engine.begin() as connection: async with self.engine.begin() as connection:
query = f"SELECT * FROM {table_name}" query = f"SELECT * FROM {table_name}"
@ -119,12 +173,17 @@ class SQLAlchemyAdapter():
self.db_path = None self.db_path = None
else: else:
async with self.engine.begin() as connection: async with self.engine.begin() as connection:
# Load the schema information into the MetaData object schema_list = await self.get_schema_list()
await connection.run_sync(Base.metadata.reflect) # Create a MetaData instance to load table information
for table in Base.metadata.sorted_tables: metadata = MetaData()
drop_table_query = text(f"DROP TABLE IF EXISTS {table.name} CASCADE") # Drop all tables from all schemas
await connection.execute(drop_table_query) for schema_name in schema_list:
# Load the schema information into the MetaData object
await connection.run_sync(metadata.reflect, schema=schema_name)
for table in metadata.sorted_tables:
drop_table_query = text(f"DROP TABLE IF EXISTS {schema_name}.{table.name} CASCADE")
await connection.execute(drop_table_query)
metadata.clear()
except Exception as e: except Exception as e:
print(f"Error deleting database: {e}") print(f"Error deleting database: {e}")

View file

@ -0,0 +1,15 @@
<script>
// Segment write key; the placeholder is a template expression that reads config.extra.analytics.key.
var segmentKey = "{{ config.extra.analytics.key }}"
/* Wait for page to load and application to mount */
document.addEventListener("DOMContentLoaded", function() {
try {
// Minified Segment loader snippet (SNIPPET_VERSION 5.2.0). It sets up window.analytics as a call queue,
// logs an error if the snippet was already invoked, and defines analytics.load, which inserts the
// analytics.min.js script tag from cdn.segment.com for the given key.
!function(){var i="analytics",analytics=window[i]=window[i]||[];if(!analytics.initialize)if(analytics.invoked)window.console&&console.error&&console.error("Segment snippet included twice.");else{analytics.invoked=!0;analytics.methods=["trackSubmit","trackClick","trackLink","trackForm","pageview","identify","reset","group","track","ready","alias","debug","page","screen","once","off","on","addSourceMiddleware","addIntegrationMiddleware","setAnonymousId","addDestinationMiddleware","register"];analytics.factory=function(e){return function(){if(window[i].initialized)return window[i][e].apply(window[i],arguments);var n=Array.prototype.slice.call(arguments);if(["track","screen","alias","group","page","identify"].indexOf(e)>-1){var c=document.querySelector("link[rel='canonical']");n.push({__t:"bpc",c:c&&c.getAttribute("href")||void 0,p:location.pathname,u:location.href,s:location.search,t:document.title,r:document.referrer})}n.unshift(e);analytics.push(n);return analytics}};for(var n=0;n<analytics.methods.length;n++){var key=analytics.methods[n];analytics[key]=analytics.factory(key)}analytics.load=function(key,n){var t=document.createElement("script");t.type="text/javascript";t.async=!0;t.setAttribute("data-global-segment-analytics-key",i);t.src="https://cdn.segment.com/analytics.js/v1/" + key + "/analytics.min.js";var r=document.getElementsByTagName("script")[0];r.parentNode.insertBefore(t,r);analytics._loadOptions=n};analytics._writeKey=segmentKey;;analytics.SNIPPET_VERSION="5.2.0";
// Still inside the loader's else-branch: start loading analytics.js for segmentKey, then queue a page call.
analytics.load(segmentKey);
analytics.page();
}}();
} catch (error) {
// Any exception thrown while setting up the loader is logged to the console here.
console.error("Failed to load Segment analytics", error);
}
});
</script>

View file

@ -63,6 +63,11 @@ theme:
code: Roboto Mono code: Roboto Mono
custom_dir: docs/overrides custom_dir: docs/overrides
extra:
analytics:
provider: segment
key: !ENV DOCS_SEGMENT_KEY
extra_css: extra_css:
- stylesheets/extra.css - stylesheets/extra.css
@ -120,25 +125,24 @@ nav:
- Add data: 'data_ingestion.md' - Add data: 'data_ingestion.md'
- Create LLM enriched data store: 'templates.md' - Create LLM enriched data store: 'templates.md'
- Explore data: 'search.md' - Explore data: 'search.md'
# - SDK:
# - Overview: 'sdk_overview.md'
- Configuration: 'configuration.md' - Configuration: 'configuration.md'
- What is cognee: - What is cognee:
- Introduction: 'conceptual_overview.md' - Introduction: 'conceptual_overview.md'
- API reference: 'api_reference.md' - API reference: 'api_reference.md'
- Blog:
- "blog/index.md"
plugins: plugins:
- mkdocs-jupyter:
ignore_h1_titles: true
execute: false
- social
- search: - search:
separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])'
- minify: - minify:
minify_html: true minify_html: true
minify_js: true
minify_css: true
htmlmin_opts:
remove_comments: true
cache_safe: true
- mkdocstrings: - mkdocstrings:
handlers: handlers:
python: python:
@ -146,13 +150,3 @@ plugins:
members_order: alphabetical members_order: alphabetical
allow_inspection: true allow_inspection: true
show_bases: true show_bases: true
- blog:
enabled: !ENV CI
blog_dir: "blog"
blog_toc: true
post_dir: blog/posts
post_date_format: yyyy/MM/dd
post_url_format: "{date}/{slug}"
authors_file: "{blog}/.authors.yml"

View file

@ -537,10 +537,14 @@
"import os\n", "import os\n",
"\n", "\n",
"# # Setting environment variables\n", "# # Setting environment variables\n",
"os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n", "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
"os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n", " os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
"\n", "\n",
"os.environ[\"LLM_API_KEY\"] = \"\"\n", "if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
"\n",
"if \"LLM_API_KEY\" not in os.environ:\n",
" os.environ[\"LLM_API_KEY\"] = \"\"\n",
"\n", "\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n", "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n",
"# Not needed if using networkx\n", "# Not needed if using networkx\n",
@ -577,6 +581,7 @@
"\n", "\n",
"import cognee\n", "import cognee\n",
"\n", "\n",
"await cognee.prune.prune_data()\n",
"await cognee.prune.prune_system(metadata=True)" "await cognee.prune.prune_system(metadata=True)"
] ]
}, },
@ -639,7 +644,8 @@
" chunks_into_graph, \\\n", " chunks_into_graph, \\\n",
" source_documents_to_chunks, \\\n", " source_documents_to_chunks, \\\n",
" check_permissions_on_documents, \\\n", " check_permissions_on_documents, \\\n",
" classify_documents\n", " classify_documents, \\\n",
" chunk_naive_llm_classifier\n",
"from cognee.tasks.summarization import summarize_text\n", "from cognee.tasks.summarization import summarize_text\n",
"\n", "\n",
"async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n", "async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
@ -667,6 +673,10 @@
" summarization_model = cognee_config.summarization_model,\n", " summarization_model = cognee_config.summarization_model,\n",
" collection_name = \"summaries\",\n", " collection_name = \"summaries\",\n",
" ),\n", " ),\n",
" Task(\n",
" chunk_naive_llm_classifier,\n",
" classification_model = cognee_config.classification_model,\n",
" ),\n",
" Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n", " Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n",
" ]\n", " ]\n",
"\n", "\n",
@ -876,7 +886,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "cognee-bGi0WgSG-py3.9", "display_name": ".venv",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@ -890,7 +900,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.5" "version": "3.9.6"
} }
}, },
"nbformat": 4, "nbformat": 4,

2660
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -91,20 +91,9 @@ mypy = "^1.7.1"
notebook = "^7.1.1" notebook = "^7.1.1"
[tool.poetry.group.docs.dependencies] [tool.poetry.group.docs.dependencies]
mkdocs = "^1.4.3" mkdocs-material = "^9.5.42"
mkdocs-material = {extras = ["imaging"], version = "^9.5.9"}
mkdocstrings = "^0.22.0"
mkdocstrings-python = "^1.1.2"
pytest-examples = "^0.0.10"
mkdocs-jupyter = "^0.24.6"
mkdocs-minify-plugin = "^0.8.0" mkdocs-minify-plugin = "^0.8.0"
mkdocs-redirects = "^1.2.1" mkdocstrings = {extras = ["python"], version = "^0.26.2"}
[tool.poetry.group.test-docs.dependencies]
fastapi = "^0.109.2"
diskcache = "^5.6.3"
pandas = "2.0.3"
tabulate = "^0.9.0"
[tool.ruff] # https://beta.ruff.rs/docs/ [tool.ruff] # https://beta.ruff.rs/docs/
line-length = 100 line-length = 100