Merge remote-tracking branch 'origin/main' into feat/COG-184-add-falkordb

Boris Arzentar 2024-10-29 17:25:19 +01:00
commit 62022a85f3
9 changed files with 1477 additions and 1430 deletions


@@ -17,22 +17,21 @@ permissions:
contents: write
jobs:
# get_docs_changes:
# name: run docs changes
# uses: ./.github/workflows/get_docs_changes.yml
get_docs_changes:
name: run docs changes
uses: ./.github/workflows/get_docs_changes.yml
deploy:
runs-on: ubuntu-latest
# needs: get_docs_changes
needs: get_docs_changes
if: needs.get_docs_changes.outputs.changes_outside_docs == 'false'
steps:
- name: Checkout code
uses: actions/checkout@master
- name: Install Poetry
uses: snok/install-poetry@v1.3.1
- name: Use output
run: echo "The stage is finished"
@@ -41,7 +40,6 @@ jobs:
with:
python-version: '3.11.x'
- name: Install APT packages
run: |
sudo apt-get update &&
@@ -49,9 +47,10 @@ jobs:
- name: Install via Poetry
run: poetry install --with dev,docs
env:
GH_TOKEN: ${{ secrets.PAT_FOR_CROSS_REPOS_CICD_TRIGGERING }}
- name: Build and deploy MkDocs
run: poetry run mkdocs gh-deploy --force
env:
DOCS_SEGMENT_KEY: ${{ secrets.DOCS_SEGMENT_KEY }}

.github/workflows/test_notebook.yml (new file)

@@ -0,0 +1,61 @@
name: test | notebook
on:
pull_request:
branches:
- main
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
env:
RUNTIME__LOG_LEVEL: ERROR
jobs:
get_docs_changes:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
run_notebook_test:
name: test
needs: get_docs_changes
if: needs.get_docs_changes.outputs.changes_outside_docs == 'true'
runs-on: ubuntu-latest
defaults:
run:
shell: bash
steps:
- name: Check out
uses: actions/checkout@master
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11.x'
- name: Install Poetry
uses: snok/install-poetry@v1.3.2
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true
- name: Install dependencies
run: |
poetry install --no-interaction
poetry add jupyter --no-interaction
- name: Execute Jupyter Notebook
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
run: |
poetry run jupyter nbconvert \
--to notebook \
--execute notebooks/cognee_demo.ipynb \
--output executed_notebook.ipynb \
--ExecutePreprocessor.timeout=1200
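
For a local run of the same check, the nbconvert invocation above maps roughly onto nbconvert's Python API. The sketch below is an assumption-laden equivalent, not part of the workflow; it reuses the notebook path, output name, and 1200-second timeout from the step above and expects the same environment variables (ENV, LLM_API_KEY, GRAPHISTRY_USERNAME, GRAPHISTRY_PASSWORD) to already be exported.

import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

# Read the demo notebook and execute it with the same timeout the workflow uses.
nb = nbformat.read("notebooks/cognee_demo.ipynb", as_version=4)
executor = ExecutePreprocessor(timeout=1200, kernel_name="python3")
executor.preprocess(nb, {"metadata": {"path": "notebooks"}})

# Write the executed copy, mirroring the workflow's --output executed_notebook.ipynb flag.
with open("notebooks/executed_notebook.ipynb", "w", encoding="utf-8") as handle:
    nbformat.write(nb, handle)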


@@ -200,7 +200,7 @@ Cognee supports a variety of tools and services for different operations:
## Demo
Check out our demo notebook [here](https://github.com/topoteretes/cognee/blob/main/notebooks/cognee%20-%20Get%20Started.ipynb)
Check out our demo notebook [here](https://github.com/topoteretes/cognee/blob/main/notebooks/cognee_demo.ipynb)


@@ -1,7 +1,9 @@
from os import path
from typing import AsyncGenerator
from uuid import UUID
from typing import Optional
from typing import AsyncGenerator, List
from contextlib import asynccontextmanager
from sqlalchemy import text, select
from sqlalchemy import text, select, MetaData, Table
from sqlalchemy.orm import joinedload
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
@@ -50,11 +52,14 @@ class SQLAlchemyAdapter():
await connection.execute(text(f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} ({', '.join(fields_query_parts)});"))
await connection.close()
async def delete_table(self, table_name: str):
async def delete_table(self, table_name: str, schema_name: Optional[str] = "public"):
async with self.engine.begin() as connection:
await connection.execute(text(f"DROP TABLE IF EXISTS {table_name} CASCADE;"))
await connection.close()
if self.engine.dialect.name == "sqlite":
# SQLite doesn't support schema namespaces or the CASCADE keyword.
# However, foreign key constraints can be defined with ON DELETE CASCADE during table creation.
await connection.execute(text(f"DROP TABLE IF EXISTS {table_name};"))
else:
await connection.execute(text(f"DROP TABLE IF EXISTS {schema_name}.{table_name} CASCADE;"))
async def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
columns = ", ".join(data[0].keys())
@@ -65,6 +70,55 @@ class SQLAlchemyAdapter():
await connection.execute(insert_query, data)
await connection.close()
async def get_schema_list(self) -> List[str]:
"""
Return a list of all schema names in the database.
"""
if self.engine.dialect.name == "postgresql":
async with self.engine.begin() as connection:
result = await connection.execute(
text("""
SELECT schema_name FROM information_schema.schemata
WHERE schema_name NOT IN ('pg_catalog', 'pg_toast', 'information_schema');
""")
)
return [schema[0] for schema in result.fetchall()]
return []
async def delete_data_by_id(self, table_name: str, data_id: UUID, schema_name: Optional[str] = "public"):
"""
Delete data in the given table based on its id. The table must have an id column.
"""
async with self.get_async_session() as session:
TableModel = await self.get_table(table_name, schema_name)
await session.execute(TableModel.delete().where(TableModel.c.id == data_id))
await session.commit()
async def get_table(self, table_name: str, schema_name: Optional[str] = "public") -> Table:
"""
Dynamically loads a table using the given table name and schema name.
"""
async with self.engine.begin() as connection:
if self.engine.dialect.name == "sqlite":
# Load the schema information into the MetaData object
await connection.run_sync(Base.metadata.reflect)
if table_name in Base.metadata.tables:
return Base.metadata.tables[table_name]
else:
raise ValueError(f"Table '{table_name}' not found.")
else:
# Create a MetaData instance to load table information
metadata = MetaData()
# Load table information from schema into MetaData
await connection.run_sync(metadata.reflect, schema=schema_name)
# Define the full table name
full_table_name = f"{schema_name}.{table_name}"
# Check if table is in list of tables for the given schema
if full_table_name in metadata.tables:
return metadata.tables[full_table_name]
raise ValueError(f"Table '{full_table_name}' not found.")
async def get_data(self, table_name: str, filters: dict = None):
async with self.engine.begin() as connection:
query = f"SELECT * FROM {table_name}"
@@ -119,12 +173,17 @@ class SQLAlchemyAdapter():
self.db_path = None
else:
async with self.engine.begin() as connection:
# Load the schema information into the MetaData object
await connection.run_sync(Base.metadata.reflect)
for table in Base.metadata.sorted_tables:
drop_table_query = text(f"DROP TABLE IF EXISTS {table.name} CASCADE")
await connection.execute(drop_table_query)
schema_list = await self.get_schema_list()
# Create a MetaData instance to load table information
metadata = MetaData()
# Drop all tables from all schemas
for schema_name in schema_list:
# Load the schema information into the MetaData object
await connection.run_sync(metadata.reflect, schema=schema_name)
for table in metadata.sorted_tables:
drop_table_query = text(f"DROP TABLE IF EXISTS {schema_name}.{table.name} CASCADE")
await connection.execute(drop_table_query)
metadata.clear()
except Exception as e:
print(f"Error deleting database: {e}")


@@ -0,0 +1,15 @@
<script>
var segmentKey = "{{ config.extra.analytics.key }}"
/* Wait for page to load and application to mount */
document.addEventListener("DOMContentLoaded", function() {
try {
!function(){var i="analytics",analytics=window[i]=window[i]||[];if(!analytics.initialize)if(analytics.invoked)window.console&&console.error&&console.error("Segment snippet included twice.");else{analytics.invoked=!0;analytics.methods=["trackSubmit","trackClick","trackLink","trackForm","pageview","identify","reset","group","track","ready","alias","debug","page","screen","once","off","on","addSourceMiddleware","addIntegrationMiddleware","setAnonymousId","addDestinationMiddleware","register"];analytics.factory=function(e){return function(){if(window[i].initialized)return window[i][e].apply(window[i],arguments);var n=Array.prototype.slice.call(arguments);if(["track","screen","alias","group","page","identify"].indexOf(e)>-1){var c=document.querySelector("link[rel='canonical']");n.push({__t:"bpc",c:c&&c.getAttribute("href")||void 0,p:location.pathname,u:location.href,s:location.search,t:document.title,r:document.referrer})}n.unshift(e);analytics.push(n);return analytics}};for(var n=0;n<analytics.methods.length;n++){var key=analytics.methods[n];analytics[key]=analytics.factory(key)}analytics.load=function(key,n){var t=document.createElement("script");t.type="text/javascript";t.async=!0;t.setAttribute("data-global-segment-analytics-key",i);t.src="https://cdn.segment.com/analytics.js/v1/" + key + "/analytics.min.js";var r=document.getElementsByTagName("script")[0];r.parentNode.insertBefore(t,r);analytics._loadOptions=n};analytics._writeKey=segmentKey;;analytics.SNIPPET_VERSION="5.2.0";
analytics.load(segmentKey);
analytics.page();
}}();
} catch (error) {
console.error("Failed to load Segment analytics", error);
}
});
</script>


@@ -63,6 +63,11 @@ theme:
code: Roboto Mono
custom_dir: docs/overrides
extra:
analytics:
provider: segment
key: !ENV DOCS_SEGMENT_KEY
extra_css:
- stylesheets/extra.css
@@ -120,25 +125,24 @@ nav:
- Add data: 'data_ingestion.md'
- Create LLM enriched data store: 'templates.md'
- Explore data: 'search.md'
# - SDK:
# - Overview: 'sdk_overview.md'
- Configuration: 'configuration.md'
- What is cognee:
- Introduction: 'conceptual_overview.md'
- API reference: 'api_reference.md'
- Blog:
- "blog/index.md"
plugins:
- mkdocs-jupyter:
ignore_h1_titles: true
execute: false
- social
- search:
separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])'
- minify:
minify_html: true
minify_js: true
minify_css: true
htmlmin_opts:
remove_comments: true
cache_safe: true
- mkdocstrings:
handlers:
python:
@@ -146,13 +150,3 @@ plugins:
members_order: alphabetical
allow_inspection: true
show_bases: true
- blog:
enabled: !ENV CI
blog_dir: "blog"
blog_toc: true
post_dir: blog/posts
post_date_format: yyyy/MM/dd
post_url_format: "{date}/{slug}"
authors_file: "{blog}/.authors.yml"


@@ -537,10 +537,14 @@
"import os\n",
"\n",
"# # Setting environment variables\n",
"os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
"os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
"if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
"\n",
"os.environ[\"LLM_API_KEY\"] = \"\"\n",
"if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
"\n",
"if \"LLM_API_KEY\" not in os.environ:\n",
" os.environ[\"LLM_API_KEY\"] = \"\"\n",
"\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n",
"# Not needed if using networkx\n",
@@ -577,6 +581,7 @@
"\n",
"import cognee\n",
"\n",
"await cognee.prune.prune_data()\n",
"await cognee.prune.prune_system(metadata=True)"
]
},
@@ -639,7 +644,8 @@
" chunks_into_graph, \\\n",
" source_documents_to_chunks, \\\n",
" check_permissions_on_documents, \\\n",
" classify_documents\n",
" classify_documents, \\\n",
" chunk_naive_llm_classifier\n",
"from cognee.tasks.summarization import summarize_text\n",
"\n",
"async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
@@ -667,6 +673,10 @@
" summarization_model = cognee_config.summarization_model,\n",
" collection_name = \"summaries\",\n",
" ),\n",
" Task(\n",
" chunk_naive_llm_classifier,\n",
" classification_model = cognee_config.classification_model,\n",
" ),\n",
" Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n",
" ]\n",
"\n",
@@ -876,7 +886,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "cognee-bGi0WgSG-py3.9",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
@@ -890,7 +900,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.5"
"version": "3.9.6"
}
},
"nbformat": 4,

poetry.lock (generated)

File diff suppressed because it is too large.


@@ -91,20 +91,9 @@ mypy = "^1.7.1"
notebook = "^7.1.1"
[tool.poetry.group.docs.dependencies]
mkdocs = "^1.4.3"
mkdocs-material = {extras = ["imaging"], version = "^9.5.9"}
mkdocstrings = "^0.22.0"
mkdocstrings-python = "^1.1.2"
pytest-examples = "^0.0.10"
mkdocs-jupyter = "^0.24.6"
mkdocs-material = "^9.5.42"
mkdocs-minify-plugin = "^0.8.0"
mkdocs-redirects = "^1.2.1"
[tool.poetry.group.test-docs.dependencies]
fastapi = "^0.109.2"
diskcache = "^5.6.3"
pandas = "2.0.3"
tabulate = "^0.9.0"
mkdocstrings = {extras = ["python"], version = "^0.26.2"}
[tool.ruff] # https://beta.ruff.rs/docs/
line-length = 100