chore: remove unused dependencies and make some optional (#661)


## Description

This PR removes dependencies that are no longer used (for example `httpx`, `bokeh`, and an unused NLTK/scikit-learn topic-extraction helper) and turns several heavy integrations into optional Poetry extras (`api`, `anthropic`, `chromadb`, alongside the existing optional groups). Imports of optional packages move to the call sites that need them, the codegraph API router is skipped gracefully when its dependencies are missing, and the CI workflows and Dockerfile now install the extras they exercise.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Authored by Boris on 2025-03-25 10:19:52 +01:00, committed by GitHub
parent 08b326550a · commit d192d1fe20
20 changed files with 1389 additions and 1786 deletions


@@ -44,7 +44,7 @@ jobs:
         installer-parallel: true
     - name: Install dependencies
-      run: poetry install --no-interaction
+      run: poetry install --extras chromadb --no-interaction
     - name: Run chromadb test
       env:


@@ -41,7 +41,7 @@ jobs:
     - name: Install dependencies
       run: |
-        poetry install --no-interaction
+        poetry install --extras api --no-interaction
     - name: Run cognee server
       env:


@@ -2,6 +2,8 @@ FROM python:3.11-slim

 # Define Poetry extras to install
 ARG POETRY_EXTRAS="\
+    # API \
+    api \
     # Storage & Databases \
     filesystem postgres weaviate qdrant neo4j falkordb milvus kuzu \
     # Notebooks & Interactive Environments \


@@ -169,9 +169,9 @@ app.include_router(get_settings_router(), prefix="/api/v1/settings", tags=["settings"])
 app.include_router(get_visualize_router(), prefix="/api/v1/visualize", tags=["visualize"])

-app.include_router(
-    get_code_pipeline_router(), prefix="/api/v1/code-pipeline", tags=["code-pipeline"]
-)
+codegraph_routes = get_code_pipeline_router()
+if codegraph_routes:
+    app.include_router(codegraph_routes, prefix="/api/v1/code-pipeline", tags=["code-pipeline"])


 def start_api_server(host: str = "0.0.0.0", port: int = 8000):


@@ -2,10 +2,10 @@ from fastapi import Form, UploadFile, Depends
 from fastapi.responses import JSONResponse
 from fastapi import APIRouter
 from typing import List
-import aiohttp
-import subprocess
 import logging
 import os
+import requests

 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_authenticated_user

@@ -36,17 +36,12 @@ def get_add_router() -> APIRouter:
             )
         else:
             # Fetch and store the data from other types of URL using curl
-            async with aiohttp.ClientSession() as session:
-                async with session.get(data) as resp:
-                    if resp.status == 200:
-                        file_data = await resp.read()
-                        filename = os.path.basename(data)
-                        with open(f".data/{filename}", "wb") as f:
-                            f.write(file_data)
-            await cognee_add(
-                "data://.data/",
-                f"{data.split('/')[-1]}",
-            )
+            response = requests.get(data)
+            response.raise_for_status()
+            file_data = await response.content()
+            return await cognee_add(file_data)
         else:
             await cognee_add(
                 data,
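
A review note on this hunk: `requests` is synchronous, so the added `await response.content()` would fail at runtime (`content` is a plain `bytes` property, and `requests.Response` exposes no awaitable API). A minimal sketch of the intended fetch, with `fetch_url_bytes` as a hypothetical helper rather than cognee code:

```python
import asyncio

import requests


def fetch_url_bytes(url: str) -> bytes:
    # requests is synchronous: .content is a bytes property, not a coroutine,
    # so there is nothing to await here.
    response = requests.get(url, timeout=60)
    response.raise_for_status()  # surface HTTP errors instead of saving an error page
    return response.content


async def fetch_in_async_handler(url: str) -> bytes:
    # Inside an async FastAPI handler, run the blocking call off the event loop.
    return await asyncio.to_thread(fetch_url_bytes, url)
```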


@@ -25,6 +25,7 @@ from cognee.tasks.summarization import summarize_text
 from cognee.infrastructure.llm import get_max_chunk_tokens

 monitoring = get_base_config().monitoring_tool
 if monitoring == MonitoringTool.LANGFUSE:
+    from langfuse.decorators import observe


@@ -3,7 +3,6 @@ import logging
 from fastapi import APIRouter
 from fastapi.responses import JSONResponse
 from cognee.api.DTO import InDTO
-from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
 from cognee.modules.retrieval.code_retriever import CodeRetriever
 from cognee.modules.storage.utils import JSONEncoder

@@ -22,11 +21,19 @@ class CodePipelineRetrievePayloadDTO(InDTO):
 def get_code_pipeline_router() -> APIRouter:
+    try:
+        import run_code_graph_pipeline
+    except ModuleNotFoundError:
+        logger.error("codegraph dependencies not found. Skipping codegraph API routes.")
+        return None
+
     router = APIRouter()

     @router.post("/index", response_model=None)
     async def code_pipeline_index(payload: CodePipelineIndexPayloadDTO):
         """This endpoint is responsible for running the indexation on code repo."""
+        from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
+
         try:
             async for result in run_code_graph_pipeline(payload.repo_path, payload.include_docs):
                 logger.info(result)
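
Taken together with the `main.py` hunk above, this is an optional-router pattern: probe for the extra's dependencies, return `None` when they are missing, and let the caller skip registration. A self-contained sketch of the pattern, using illustrative names and probe target rather than cognee's actual modules:

```python
import logging
from typing import Optional

from fastapi import APIRouter, FastAPI

logger = logging.getLogger(__name__)


def get_optional_router() -> Optional[APIRouter]:
    try:
        import tree_sitter  # noqa: F401 -- probe a package shipped by the extra (illustrative)
    except ModuleNotFoundError:
        logger.error("codegraph dependencies not found. Skipping codegraph API routes.")
        return None

    router = APIRouter()

    @router.post("/index", response_model=None)
    async def index():
        # Heavy imports stay deferred until the endpoint is actually hit,
        # mirroring the in-function import in the hunk above.
        ...

    return router


app = FastAPI()
codegraph_routes = get_optional_router()
if codegraph_routes:  # None means the extra is absent; skip registration
    app.include_router(codegraph_routes, prefix="/api/v1/code-pipeline")
```

The base install keeps the app importable; only route registration depends on the extra.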


@@ -1,11 +1,7 @@
-from fastapi import Form, UploadFile, Depends
+import logging
+from fastapi import Depends
 from fastapi.responses import JSONResponse
 from fastapi import APIRouter
-from typing import List
-import aiohttp
-import subprocess
-import logging
-import os
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_authenticated_user


@@ -1,8 +1,4 @@
-import requests
-import os
-import json
-import random
-from typing import Optional, Any, List, Tuple
+from typing import Any
 from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter


@@ -1,9 +1,11 @@
 import asyncio
-import httpx
+import aiohttp
 import logging
 from typing import List, Optional
 import os
+import aiohttp.http_exceptions
+
 from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
 from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
 from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer

@@ -48,14 +50,10 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         if self.mock:
             return [[0.0] * self.dimensions for _ in text]

-        embeddings = []
-        async with httpx.AsyncClient() as client:
-            for prompt in text:
-                embedding = await self._get_embedding(client, prompt)
-                embeddings.append(embedding)
+        embeddings = await asyncio.gather(*[self._get_embedding(prompt) for prompt in text])

         return embeddings

-    async def _get_embedding(self, client: httpx.AsyncClient, prompt: str) -> List[float]:
+    async def _get_embedding(self, prompt: str) -> List[float]:
         """
         Internal method to call the Ollama embeddings endpoint for a single prompt.
         """

@@ -71,13 +69,13 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         retries = 0
         while retries < self.MAX_RETRIES:
             try:
-                response = await client.post(
-                    self.endpoint, json=payload, headers=headers, timeout=60.0
-                )
-                response.raise_for_status()
-                data = response.json()
-                return data["embedding"]
-            except httpx.HTTPStatusError as e:
+                async with aiohttp.ClientSession() as session:
+                    async with session.post(
+                        self.endpoint, json=payload, headers=headers, timeout=60.0
+                    ) as response:
+                        data = await response.json()
+                        return data["embedding"]
+            except aiohttp.http_exceptions.HttpBadRequest as e:
                 logger.error(f"HTTP error on attempt {retries + 1}: {e}")
                 retries += 1
                 await asyncio.sleep(min(2**retries, 60))
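
Two observations on the new retry loop: `aiohttp.http_exceptions.HttpBadRequest` is a low-level protocol error that normal client calls rarely raise, and since `raise_for_status()` is no longer called, a non-2xx response surfaces as a `KeyError` on `data["embedding"]` instead of being retried. A conventional aiohttp variant might look like this (names and constants are illustrative, not the PR's code):

```python
import asyncio
import logging

import aiohttp

logger = logging.getLogger(__name__)

MAX_RETRIES = 5


async def post_with_retries(endpoint: str, payload: dict, headers: dict) -> dict:
    # Retry loop with exponential backoff, using aiohttp's documented
    # client exception hierarchy.
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    endpoint,
                    json=payload,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=60),
                ) as response:
                    response.raise_for_status()  # non-2xx -> ClientResponseError
                    return await response.json()
        except aiohttp.ClientError as e:  # base class for connection/response errors
            logger.error(f"HTTP error on attempt {attempt}: {e}")
            await asyncio.sleep(min(2**attempt, 60))
    raise RuntimeError("embeddings request failed after retries")
```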


@@ -1,7 +1,6 @@
 from typing import Type
 from pydantic import BaseModel
 import instructor
-import anthropic
 from cognee.exceptions import InvalidValueError
 from cognee.infrastructure.llm.llm_interface import LLMInterface

@@ -15,9 +14,12 @@ class AnthropicAdapter(LLMInterface):
     model: str

     def __init__(self, max_tokens: int, model: str = None):
+        import anthropic
+
         self.aclient = instructor.patch(
             create=anthropic.Anthropic().messages.create, mode=instructor.Mode.ANTHROPIC_TOOLS
         )
+
         self.model = model
         self.max_tokens = max_tokens
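
Moving `import anthropic` into `__init__` keeps the module importable without the extra installed; the trade-off is that a missing package only surfaces when the adapter is constructed. A small variation that fails with an actionable hint (the wrapper name is illustrative; the `cognee[anthropic]` extra matches the new pyproject entry):

```python
def _require_anthropic():
    # Deferred import: the package is only needed when this adapter is used.
    try:
        import anthropic
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError(
            "The 'anthropic' package is required for AnthropicAdapter. "
            "Install it with: pip install 'cognee[anthropic]'"
        ) from e
    return anthropic
```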


@@ -2,7 +2,6 @@ from typing import Type, Optional
 from pydantic import BaseModel
 import logging
 import litellm
-import asyncio
 from litellm import acompletion, JSONSchemaValidationError
 from cognee.shared.data_models import MonitoringTool
 from cognee.exceptions import InvalidValueError

@@ -13,6 +12,7 @@ from cognee.base_config import get_base_config
 logger = logging.getLogger(__name__)

 monitoring = get_base_config().monitoring_tool
 if monitoring == MonitoringTool.LANGFUSE:
+    from langfuse.decorators import observe


@@ -13,6 +13,7 @@ from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.base_config import get_base_config

 monitoring = get_base_config().monitoring_tool
 if monitoring == MonitoringTool.LANGFUSE:
+    from langfuse.decorators import observe


@@ -4,7 +4,6 @@ from typing import Type
 from instructor.exceptions import InstructorRetryException
 from pydantic import BaseModel
-from tenacity import RetryError
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.infrastructure.llm.prompts import read_query_prompt

@@ -36,7 +35,7 @@ async def extract_code_summary(content: str):
     else:
         try:
             result = await extract_summary(content, response_model=SummarizedCode)
-        except (RetryError, InstructorRetryException) as e:
+        except InstructorRetryException as e:
             logger.error("Failed to extract code summary, falling back to mock summary", exc_info=e)
             result = get_mock_summarized_code()


@@ -1,63 +0,0 @@
-import re
-
-from nltk.downloader import download
-from nltk.stem import WordNetLemmatizer
-from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.corpus import stopwords, wordnet
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.decomposition import TruncatedSVD
-
-
-def extract_topics(text: str):
-    sentences = sent_tokenize(text)
-
-    try:
-        wordnet.ensure_loaded()
-    except LookupError:
-        download("wordnet")
-        wordnet.ensure_loaded()
-
-    lemmatizer = WordNetLemmatizer()
-    base_notation_sentences = [lemmatizer.lemmatize(sentence) for sentence in sentences]
-
-    tf_vectorizer = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
-    transformed_corpus = tf_vectorizer.fit_transform(base_notation_sentences)
-
-    svd = TruncatedSVD(n_components=10)
-    svd_corpus = svd.fit(transformed_corpus)
-
-    feature_scores = dict(zip(tf_vectorizer.vocabulary_, svd_corpus.components_[0]))
-
-    topics = sorted(
-        feature_scores,
-        # key = feature_scores.get,
-        key=lambda x: transformed_corpus[0, tf_vectorizer.vocabulary_[x]],
-        reverse=True,
-    )[:10]
-
-    return topics
-
-
-def clean_text(text: str):
-    text = re.sub(r"[ \t]{2,}|[\n\r]", " ", text.lower())
-    return re.sub(r"[`\"'.,;!?…]", "", text).strip()
-
-
-def remove_stop_words(text: str):
-    try:
-        stopwords.ensure_loaded()
-    except LookupError:
-        download("stopwords")
-        stopwords.ensure_loaded()
-
-    stop_words = set(stopwords.words("english"))
-    text = text.split()
-    text = [word for word in text if word not in stop_words]
-    return " ".join(text)
-
-
-if __name__ == "__main__":
-    text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry... Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book… It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
-Why do we use it?
-It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout! The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like).
-    """
-    print(extract_topics(remove_stop_words(clean_text(text))))


@@ -21,7 +21,6 @@ from cognee.infrastructure.databases.graph import get_graph_engine
 from uuid import uuid4
 import pathlib
-import nltk
 from cognee.shared.exceptions import IngestionError

 # Analytics Proxy Url, currently hosted by Vercel
@@ -29,7 +28,10 @@ proxy_url = "https://test.prometh.ai"

 def get_entities(tagged_tokens):
+    import nltk
+
     nltk.download("maxent_ne_chunker", quiet=True)
+
     from nltk.chunk import ne_chunk

     return ne_chunk(tagged_tokens)
@@ -37,6 +39,7 @@ def get_entities(tagged_tokens):
 def extract_pos_tags(sentence):
     """Extract Part-of-Speech (POS) tags for words in a sentence."""
+    import nltk

     # Ensure that the necessary NLTK resources are downloaded
     nltk.download("words", quiet=True)
@@ -308,37 +311,6 @@ def embed_logo(p, layout_scale, logo_alpha, position):
     )


-def style_and_render_graph(p, G, layout_positions, node_attribute, node_colors, centrality):
-    """
-    Apply styling and render the graph into the plot.
-    """
-    from bokeh.plotting import figure, from_networkx
-    from bokeh.models import Circle, MultiLine, HoverTool, ColumnDataSource, Range1d
-    from bokeh.plotting import output_file, show
-    from bokeh.embed import file_html
-    from bokeh.resources import CDN
-
-    graph_renderer = from_networkx(G, layout_positions)
-
-    node_radii = [0.02 + 0.1 * centrality[node] for node in G.nodes()]
-    graph_renderer.node_renderer.data_source.data["radius"] = node_radii
-    graph_renderer.node_renderer.data_source.data["fill_color"] = node_colors
-
-    graph_renderer.node_renderer.glyph = Circle(
-        radius="radius",
-        fill_color="fill_color",
-        fill_alpha=0.9,
-        line_color="#000000",
-        line_width=1.5,
-    )
-    graph_renderer.edge_renderer.glyph = MultiLine(
-        line_color="#000000",
-        line_alpha=0.3,
-        line_width=1.5,
-    )
-
-    p.renderers.append(graph_renderer)
-    return graph_renderer
-
-
 def graph_to_tuple(graph):
     """
     Converts a networkx graph to a tuple of (nodes, edges).


@@ -1,14 +1,12 @@
 import os
+import logging
+import aiofiles
-import importlib
 from typing import AsyncGenerator, Optional
 from uuid import NAMESPACE_OID, uuid5
 import tree_sitter_python as tspython
 from tree_sitter import Language, Node, Parser, Tree
-import aiofiles
-import logging
-
 from cognee.low_level import DataPoint
 from cognee.shared.CodeGraphEntities import (
     CodeFile,

@@ -19,15 +17,15 @@ from cognee.shared.CodeGraphEntities import (
 logger = logging.getLogger(__name__)

-PY_LANGUAGE = Language(tspython.language())
-source_code_parser = Parser(PY_LANGUAGE)
-

 class FileParser:
     def __init__(self):
         self.parsed_files = {}

     async def parse_file(self, file_path: str) -> tuple[str, Tree]:
+        PY_LANGUAGE = Language(tspython.language())
+        source_code_parser = Parser(PY_LANGUAGE)
+
         if file_path not in self.parsed_files:
             source_code = await get_source_code(file_path)
             source_code_tree = source_code_parser.parse(bytes(source_code, "utf-8"))
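
As written, `parse_file` rebuilds the `Language` and `Parser` pair on every call. If construction cost matters, a cached accessor keeps the deferred-import benefit while building the parser once; a sketch under that assumption (`get_python_parser` is illustrative, not part of this PR):

```python
from functools import lru_cache


@lru_cache(maxsize=1)
def get_python_parser():
    # Import inside the function so tree-sitter stays an optional (codegraph) dependency;
    # lru_cache builds the Language/Parser pair once per process instead of per call.
    import tree_sitter_python as tspython
    from tree_sitter import Language, Parser

    return Parser(Language(tspython.language()))
```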


@@ -8,7 +8,6 @@ from uuid import NAMESPACE_OID, uuid5
 from cognee.infrastructure.engine import DataPoint
 from cognee.shared.CodeGraphEntities import CodeFile, Repository
-from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies


 async def get_source_code_files(repo_path):

@@ -75,6 +74,9 @@ async def get_repo_file_dependencies(
         for chunk_number in range(number_of_chunks)
     ]

+    # Codegraph dependencies are not installed by default, so we import where we use them.
+    from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
+
     for start_range, end_range in chunk_ranges:
         # with ProcessPoolExecutor(max_workers=12) as executor:
         tasks = [

poetry.lock: generated file, 2901 changed lines (diff suppressed because it is too large)


@@ -20,53 +20,51 @@ classifiers = [
[tool.poetry.dependencies]
python = ">=3.10,<=3.13"
openai = "^1.59.4"
pydantic = "2.10.5"
python-dotenv = "1.0.1"
fastapi = "0.115.7"
uvicorn = "0.34.0"
requests = "2.32.3"
aiohttp = "3.10.10"
pydantic = "2.10.5"
pydantic-settings = "^2.2.1"
typing_extensions = "4.12.2"
nest_asyncio = "1.6.0"
nltk = "3.9.1"
numpy = "1.26.4"
datasets = "3.1.0"
falkordb = {version = "1.0.9", optional = true}
kuzu = {version = "0.8.2", optional = true}
pandas = "2.2.3"
boto3 = "^1.26.125"
botocore="^1.35.54"
gunicorn = "^20.1.0"
sqlalchemy = "2.0.36"
instructor = "1.7.2"
networkx = "^3.2.1"
aiosqlite = "^0.20.0"
pandas = "2.2.3"
tiktoken = "<=0.9.0"
litellm = ">=1.57.4"
instructor = "1.7.2"
langfuse = "^2.32.0"
filetype = "^1.2.0"
dlt = {extras = ["sqlalchemy"], version = "^1.4.1"}
aiohttp = "^3.11.14"
aiofiles = "^23.2.1"
qdrant-client = {version = "^1.9.0", optional = true}
owlready2 = "^0.47"
graphistry = "^0.33.5"
tenacity = "^9.0.0"
weaviate-client = {version = "4.9.6", optional = true}
scikit-learn = "^1.5.0"
pypdf = ">=4.1.0,<6.0.0"
neo4j = {version = "^5.20.0", optional = true}
jinja2 = "^3.1.3"
matplotlib = "^3.8.3"
tiktoken = "<=0.9.0"
networkx = "^3.2.1"
lancedb = "0.16.0"
alembic = "^1.13.3"
pre-commit = "^4.0.1"
scikit-learn = "^1.6.1"
fastapi = {version = "0.115.7", optional = true}
fastapi-users = {version = "14.0.0", extras = ["sqlalchemy"]}
uvicorn = {version = "0.34.0", optional = true}
gunicorn = {version = "^20.1.0", optional = true}
dlt = {extras = ["sqlalchemy"], version = "^1.4.1"}
qdrant-client = {version = "^1.9.0", optional = true}
weaviate-client = {version = "4.9.6", optional = true}
neo4j = {version = "^5.20.0", optional = true}
falkordb = {version = "1.0.9", optional = true}
kuzu = {version = "0.8.2", optional = true}
chromadb = {version = "^0.6.0", optional = true}
langchain_text_splitters = {version = "0.3.2", optional = true}
langsmith = {version = "0.2.3", optional = true}
langdetect = "1.0.9"
posthog = {version = "^3.5.0", optional = true}
lancedb = "0.16.0"
chromadb = "^0.6.0"
litellm = ">=1.57.4"
groq = {version = "0.8.0", optional = true}
langfuse = "^2.32.0"
pydantic-settings = "^2.2.1"
anthropic = "^0.26.1"
anthropic = {version = "^0.26.1", optional = true}
sentry-sdk = {extras = ["fastapi"], version = "^2.9.0"}
fastapi-users = {version = "14.0.0", extras = ["sqlalchemy"]}
alembic = "^1.13.3"
asyncpg = {version = "0.30.0", optional = true}
pgvector = {version = "^0.3.5", optional = true}
psycopg2 = {version = "^2.9.10", optional = true}
@@ -75,24 +73,18 @@ deepeval = {version = "^2.0.1", optional = true}
 transformers = {version = "^4.46.3", optional = true}
 pymilvus = {version = "^2.5.0", optional = true}
 unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.13", optional = true }
-pre-commit = "^4.0.1"
-httpx = "0.27.0"
-bokeh="^3.6.2"
-nltk = "3.9.1"
 google-generativeai = {version = "^0.8.4", optional = true}
 mistral-common = {version = "^1.5.2", optional = true}
 fastembed = {version = "^0.6.0", optional = true, markers = "python_version < '3.13'"}
 tree-sitter = {version = "^0.24.0", optional = true}
 tree-sitter-python = {version = "^0.23.6", optional = true}
 plotly = {version = "^6.0.0", optional = true}
 gdown = {version = "^5.2.0", optional = true}
 pyside6 = {version = "^6.8.2.1", optional = true}
 qasync = {version = "^0.27.1", optional = true}
 graphiti-core = {version = "^0.7.0", optional = true}
-owlready2 = "^0.47"

 [tool.poetry.extras]
+api = ["fastapi", "fastapi-users", "uvicorn", "gunicorn"]
 filesystem = ["s3fs", "botocore"]
 weaviate = ["weaviate-client"]
 qdrant = ["qdrant-client"]
@@ -105,12 +97,14 @@ gemini = ["google-generativeai"]
 huggingface = ["transformers"]
 ollama = ["transformers"]
 mistral = ["mistral-common"]
+anthropic = ["anthropic"]
 deepeval = ["deepeval"]
 posthog = ["posthog"]
 falkordb = ["falkordb"]
 kuzu = ["kuzu"]
 groq = ["groq"]
 milvus = ["pymilvus"]
+chromadb = ["chromadb"]
 docs = ["unstructured"]
 codegraph = ["fastembed", "transformers", "tree-sitter", "tree-sitter-python"]
 evals = ["plotly", "gdown"]
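
For code that consumes these extras, a cheap runtime probe can give an actionable error before a deep import fails. A minimal sketch (the helper and the install hint are illustrative):

```python
import importlib.util


def has_optional_dependency(module_name: str) -> bool:
    # find_spec checks importability without actually importing the package.
    return importlib.util.find_spec(module_name) is not None


if not has_optional_dependency("chromadb"):
    raise RuntimeError("chromadb is not installed. Install it with: pip install 'cognee[chromadb]'")
```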