diff --git a/.data/multimedia/example.png b/.data/multimedia/example.png new file mode 100644 index 000000000..4d406cafd Binary files /dev/null and b/.data/multimedia/example.png differ diff --git a/.data/multimedia/text_to_speech.mp3 b/.data/multimedia/text_to_speech.mp3 new file mode 100644 index 000000000..e84aea505 Binary files /dev/null and b/.data/multimedia/text_to_speech.mp3 differ diff --git a/.github/workflows/test_cognee_multimedia_notebook.yml b/.github/workflows/test_cognee_multimedia_notebook.yml new file mode 100644 index 000000000..dd14fa5e6 --- /dev/null +++ b/.github/workflows/test_cognee_multimedia_notebook.yml @@ -0,0 +1,63 @@ +name: test | multimedia notebook + +on: + workflow_dispatch: + pull_request: + branches: + - main + types: [labeled, synchronize] + + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + RUNTIME__LOG_LEVEL: ERROR + +jobs: + get_docs_changes: + name: docs changes + uses: ./.github/workflows/get_docs_changes.yml + + run_notebook_test: + name: test + needs: get_docs_changes + if: ${{ needs.get_docs_changes.outputs.changes_outside_docs == 'true' && github.event.label.name == 'run-checks' }} + runs-on: ubuntu-latest + defaults: + run: + shell: bash + steps: + - name: Check out + uses: actions/checkout@master + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.11.x' + + - name: Install Poetry + uses: snok/install-poetry@v1.3.2 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Install dependencies + run: | + poetry install --no-interaction + poetry add jupyter --no-interaction + + - name: Execute Jupyter Notebook + env: + ENV: 'dev' + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} + GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} + run: | + poetry run jupyter nbconvert \ --to notebook \ --execute notebooks/cognee_multimedia_demo.ipynb \ --output executed_notebook.ipynb \ --ExecutePreprocessor.timeout=1200 \ No newline at end of file diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml index 83d794c03..5a4523853 100644 --- a/.github/workflows/test_python_3_10.yml +++ b/.github/workflows/test_python_3_10.yml @@ -13,6 +13,7 @@ concurrency: env: RUNTIME__LOG_LEVEL: ERROR + ENV: 'dev' jobs: get_docs_changes: @@ -56,12 +57,6 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ - - name: Run convert_graph_from_code_graph test - run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py - env: - ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} - - name: Run default basic pipeline env: ENV: 'dev' diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml index 9483cb687..9c79fb0ff 100644 --- a/.github/workflows/test_python_3_11.yml +++ b/.github/workflows/test_python_3_11.yml @@ -13,6 +13,7 @@ concurrency: env: RUNTIME__LOG_LEVEL: ERROR + ENV: 'dev' jobs: get_docs_changes: @@ -56,12 +57,6 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ - - name: Run convert_graph_from_code_graph test - run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py - env: - ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} - - name: Run default basic pipeline env: ENV: 'dev' diff --git a/.github/workflows/test_python_3_9.yml
b/.github/workflows/test_python_3_9.yml index c4f6d271a..9c8456536 100644 --- a/.github/workflows/test_python_3_9.yml +++ b/.github/workflows/test_python_3_9.yml @@ -13,6 +13,7 @@ concurrency: env: RUNTIME__LOG_LEVEL: ERROR + ENV: 'dev' jobs: get_docs_changes: @@ -56,12 +57,6 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ - - name: Run convert_graph_from_code_graph test - run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py - env: - ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} - - name: Run default basic pipeline env: ENV: 'dev' diff --git a/README.md b/README.md index 28d5858a0..ed4489fbf 100644 --- a/README.md +++ b/README.md @@ -105,37 +105,65 @@ import asyncio from cognee.api.v1.search import SearchType async def main(): - # Reset cognee data + # Create a clean slate for cognee -- reset data and system state + print("Resetting cognee data...") await cognee.prune.prune_data() - # Reset cognee system state await cognee.prune.prune_system(metadata=True) + print("Data reset complete.\n") + # cognee knowledge graph will be created based on this text text = """ Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. """ - - # Add text to cognee + + print("Adding text to cognee:") + print(text.strip()) + # Add the text, and make it available for cognify await cognee.add(text) + print("Text added successfully.\n") + + print("Running cognify to create knowledge graph...\n") + print("Cognify process steps:") + print("1. Classifying the document: Determining the type and category of the input text.") + print("2. Checking permissions: Ensuring the user has the necessary rights to process the text.") + print("3. Extracting text chunks: Breaking down the text into sentences or phrases for analysis.") + print("4. Adding data points: Storing the extracted chunks for processing.") + print("5. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph.") + print("6. 
Summarizing text: Creating concise summaries of the content for quick insights.\n") + # Use LLMs and cognee to create knowledge graph await cognee.cognify() + print("Cognify process complete.\n") - # Search cognee for insights + + query_text = 'Tell me about NLP' + print(f"Searching cognee for insights with query: '{query_text}'") + # Query cognee for insights on the added text search_results = await cognee.search( - SearchType.INSIGHTS, - "Tell me about NLP", + SearchType.INSIGHTS, query_text=query_text ) - + + print("Search results:") # Display results for result_text in search_results: print(result_text) - # natural_language_processing is_a field - # natural_language_processing is_subfield_of computer_science - # natural_language_processing is_subfield_of information_retrieval -asyncio.run(main()) + # Example output: + # ({'id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 'natural language processing', 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'is_a_subfield_of', 'source_node_id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'target_node_id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 15, 473137, tzinfo=datetime.timezone.utc)}, {'id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 'computer science', 'description': 'The study of computation and information processing.'}) + # (...) + # + # It represents nodes and relationships in the knowledge graph: + # - The first element is the source node (e.g., 'natural language processing'). + # - The second element is the relationship between nodes (e.g., 'is_a_subfield_of'). + # - The third element is the target node (e.g., 'computer science'). + +if __name__ == '__main__': + asyncio.run(main()) + ``` +When you run this script, you will see step-by-step messages in the console that help you trace the execution flow and understand what the script is doing at each stage. 
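Each `INSIGHTS` result follows the node--relationship--node shape shown in the example output above, so a quick way to make the output more readable is to unpack each triple. A minimal sketch, assuming the result shape shown in that example:

```python
# Each INSIGHTS result is a (source_node, relationship, target_node) triple,
# where the nodes and the relationship are dictionaries as shown above.
for source_node, relationship, target_node in search_results:
    print(f"{source_node['name']} --{relationship['relationship_name']}--> {target_node['name']}")
# Example: natural language processing --is_a_subfield_of--> computer science
```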
A runnable version of this example is available at `examples/python/simple_example.py`. ### Create your own memory store diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py index 5770bcda4..5b03cac2e 100644 --- a/cognee/infrastructure/databases/graph/get_graph_engine.py +++ b/cognee/infrastructure/databases/graph/get_graph_engine.py @@ -4,7 +4,7 @@ from .config import get_graph_config from .graph_db_interface import GraphDBInterface -async def get_graph_engine() -> GraphDBInterface : +async def get_graph_engine() -> GraphDBInterface: """Factory function to get the appropriate graph client based on the graph type.""" config = get_graph_config() diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 1121a24d5..e6520e4e2 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -2,7 +2,7 @@ import logging import asyncio from textwrap import dedent -from typing import Optional, Any, List, Dict +from typing import Optional, Any, List, Dict, Union from contextlib import asynccontextmanager from uuid import UUID from neo4j import AsyncSession @@ -432,3 +432,49 @@ class Neo4jAdapter(GraphDBInterface): ) for record in result] return (nodes, edges) + + async def get_filtered_graph_data(self, attribute_filters: List[Dict[str, List[Union[str, int]]]]): + """ + Fetches nodes and relationships filtered by specified attribute values. + + Args: + attribute_filters (list of dict): A list of dictionaries where keys are attributes and values are lists of values to filter on. + Example: [{"community": ["1", "2"]}] + + Returns: + tuple: A tuple containing two lists: nodes and edges.
+ """ + where_clauses = [] + for attribute, values in attribute_filters[0].items(): + values_str = ", ".join(f"'{value}'" if isinstance(value, str) else str(value) for value in values) + where_clauses.append(f"n.{attribute} IN [{values_str}]") + + where_clause = " AND ".join(where_clauses) + + query_nodes = f""" + MATCH (n) + WHERE {where_clause} + RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties + """ + result_nodes = await self.query(query_nodes) + + nodes = [( + record["id"], + record["properties"], + ) for record in result_nodes] + + query_edges = f""" + MATCH (n)-[r]->(m) + WHERE {where_clause} AND {where_clause.replace('n.', 'm.')} + RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties + """ + result_edges = await self.query(query_edges) + + edges = [( + record["source"], + record["target"], + record["type"], + record["properties"], + ) for record in result_edges] + + return (nodes, edges) \ No newline at end of file diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py index a72376082..d249b6336 100644 --- a/cognee/infrastructure/databases/graph/networkx/adapter.py +++ b/cognee/infrastructure/databases/graph/networkx/adapter.py @@ -6,7 +6,7 @@ import json import asyncio import logging from re import A -from typing import Dict, Any, List +from typing import Dict, Any, List, Union from uuid import UUID import aiofiles import aiofiles.os as aiofiles_os @@ -301,3 +301,39 @@ class NetworkXAdapter(GraphDBInterface): logger.info("Graph deleted successfully.") except Exception as error: logger.error("Failed to delete graph: %s", error) + + async def get_filtered_graph_data(self, attribute_filters: List[Dict[str, List[Union[str, int]]]]): + """ + Fetches nodes and relationships filtered by specified attribute values. + + Args: + attribute_filters (list of dict): A list of dictionaries where keys are attributes and values are lists of values to filter on. + Example: [{"community": ["1", "2"]}] + + Returns: + tuple: A tuple containing two lists: + - Nodes: List of tuples (node_id, node_properties). + - Edges: List of tuples (source_id, target_id, relationship_type, edge_properties). 
+ """ + # Create filters for nodes based on the attribute filters + where_clauses = [] + for attribute, values in attribute_filters[0].items(): + where_clauses.append((attribute, values)) + + # Filter nodes + filtered_nodes = [ + (node, data) for node, data in self.graph.nodes(data=True) + if all(data.get(attr) in values for attr, values in where_clauses) + ] + + # Filter edges where both source and target nodes satisfy the filters + filtered_edges = [ + (source, target, data.get('relationship_type', 'UNKNOWN'), data) + for source, target, data in self.graph.edges(data=True) + if ( + all(self.graph.nodes[source].get(attr) in values for attr, values in where_clauses) and + all(self.graph.nodes[target].get(attr) in values for attr, values in where_clauses) + ) + ] + + return filtered_nodes, filtered_edges \ No newline at end of file diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index 96f026b4f..6cbe45655 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -10,6 +10,7 @@ from cognee.infrastructure.files.storage import LocalStorage from cognee.modules.storage.utils import copy_model, get_own_properties from ..models.ScoredResult import ScoredResult from ..vector_db_interface import VectorDBInterface +from ..utils import normalize_distances from ..embeddings.EmbeddingEngine import EmbeddingEngine class IndexSchema(DataPoint): @@ -141,6 +142,34 @@ class LanceDBAdapter(VectorDBInterface): score = 0, ) for result in results.to_dict("index").values()] + async def get_distances_of_collection( + self, + collection_name: str, + query_text: str = None, + query_vector: List[float] = None, + with_vector: bool = False + ): + if query_text is None and query_vector is None: + raise ValueError("One of query_text or query_vector must be provided!") + + if query_text and not query_vector: + query_vector = (await self.embedding_engine.embed_text([query_text]))[0] + + connection = await self.get_connection() + collection = await connection.open_table(collection_name) + + results = await collection.vector_search(query_vector).to_pandas() + + result_values = list(results.to_dict("index").values()) + + normalized_values = normalize_distances(result_values) + + return [ScoredResult( + id=UUID(result["id"]), + payload=result["payload"], + score=normalized_values[value_index], + ) for value_index, result in enumerate(result_values)] + async def search( self, collection_name: str, @@ -148,6 +177,7 @@ class LanceDBAdapter(VectorDBInterface): query_vector: List[float] = None, limit: int = 5, with_vector: bool = False, + normalized: bool = True ): if query_text is None and query_vector is None: raise ValueError("One of query_text or query_vector must be provided!") @@ -162,26 +192,7 @@ class LanceDBAdapter(VectorDBInterface): result_values = list(results.to_dict("index").values()) - min_value = 100 - max_value = 0 - - for result in result_values: - value = float(result["_distance"]) - if value > max_value: - max_value = value - if value < min_value: - min_value = value - - normalized_values = [] - min_value = min(result["_distance"] for result in result_values) - max_value = max(result["_distance"] for result in result_values) - - if max_value == min_value: - # Avoid division by zero: Assign all normalized values to 0 (or any constant value like 1) - normalized_values = [0 for _ in result_values] - else: - normalized_values = 
[(result["_distance"] - min_value) / (max_value - min_value) for result in - result_values] + normalized_values = normalize_distances(result_values) return [ScoredResult( id = UUID(result["id"]), diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index 01691714b..97571a274 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -11,6 +11,7 @@ from cognee.infrastructure.engine import DataPoint from .serialize_data import serialize_data from ..models.ScoredResult import ScoredResult from ..vector_db_interface import VectorDBInterface +from ..utils import normalize_distances from ..embeddings.EmbeddingEngine import EmbeddingEngine from ...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter from ...relational.ModelBase import Base @@ -22,6 +23,19 @@ class IndexSchema(DataPoint): "index_fields": ["text"] } +def singleton(class_): + # Note: Using this singleton as a decorator to a class removes + # the option to use class methods for that class + instances = {} + + def getinstance(*args, **kwargs): + if class_ not in instances: + instances[class_] = class_(*args, **kwargs) + return instances[class_] + + return getinstance + +@singleton class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface): def __init__( @@ -162,6 +176,53 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface): ) for result in results ] + async def get_distances_of_collection( + self, + collection_name: str, + query_text: str = None, + query_vector: List[float] = None, + with_vector: bool = False + )-> List[ScoredResult]: + if query_text is None and query_vector is None: + raise ValueError("One of query_text or query_vector must be provided!") + + if query_text and not query_vector: + query_vector = (await self.embedding_engine.embed_text([query_text]))[0] + + # Get PGVectorDataPoint Table from database + PGVectorDataPoint = await self.get_table(collection_name) + + closest_items = [] + + # Use async session to connect to the database + async with self.get_async_session() as session: + # Find closest vectors to query_vector + closest_items = await session.execute( + select( + PGVectorDataPoint, + PGVectorDataPoint.c.vector.cosine_distance(query_vector).label( + "similarity" + ), + ) + .order_by("similarity") + ) + + vector_list = [] + + # Extract distances and find min/max for normalization + for vector in closest_items: + # TODO: Add normalization of similarity score + vector_list.append(vector) + + # Create and return ScoredResult objects + return [ + ScoredResult( + id = UUID(str(row.id)), + payload = row.payload, + score = row.similarity + ) for row in vector_list + ] + async def search( self, collection_name: str, diff --git a/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py b/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py index ef27e2889..f40299939 100644 --- a/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py +++ b/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py @@ -10,5 +10,3 @@ async def create_db_and_tables(): await vector_engine.create_database() async with vector_engine.engine.begin() as connection: await connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector;")) - - diff --git a/cognee/infrastructure/databases/vector/utils.py b/cognee/infrastructure/databases/vector/utils.py new file mode 100644 
index 000000000..ced161ea3 --- /dev/null +++ b/cognee/infrastructure/databases/vector/utils.py @@ -0,0 +1,12 @@ +from typing import List + + +def normalize_distances(result_values: List[dict]) -> List[float]: + min_value = min(result["_distance"] for result in result_values) + max_value = max(result["_distance"] for result in result_values) + + if max_value == min_value: + # Avoid division by zero: assign all normalized values to 0 (or any constant value like 1) + return [0 for _ in result_values] + + return [(result["_distance"] - min_value) / (max_value - min_value) for result in result_values] \ No newline at end of file diff --git a/cognee/infrastructure/engine/models/DataPoint.py index f8ea1c9f0..d08c52fa8 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -11,6 +11,7 @@ class DataPoint(BaseModel): __tablename__ = "data_point" id: UUID = Field(default_factory = uuid4) updated_at: Optional[datetime] = datetime.now(timezone.utc) + topological_rank: Optional[int] = 0 _metadata: Optional[MetaData] = { "index_fields": [] } diff --git a/cognee/infrastructure/llm/openai/adapter.py index 28cdfff4e..1dc9b70f5 100644 --- a/cognee/infrastructure/llm/openai/adapter.py +++ b/cognee/infrastructure/llm/openai/adapter.py @@ -87,6 +87,9 @@ class OpenAIAdapter(LLMInterface): transcription = litellm.transcription( model = self.transcription_model, file = Path(input), + api_key=self.api_key, + api_base=self.endpoint, + api_version=self.api_version, max_retries = 5, ) @@ -112,6 +115,9 @@ class OpenAIAdapter(LLMInterface): }, ], }], + api_key=self.api_key, + api_base=self.endpoint, + api_version=self.api_version, max_tokens = 300, max_retries = 5, ) diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index d15d93b73..158fb9d07 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -1,9 +1,11 @@ -from typing import List, Dict, Union +import numpy as np +from typing import List, Dict, Union from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface from cognee.modules.graph.cognee_graph.CogneeGraphElements import Node, Edge from cognee.modules.graph.cognee_graph.CogneeAbstractGraph import CogneeAbstractGraph -from cognee.infrastructure.databases.graph import get_graph_engine +import heapq + class CogneeGraph(CogneeAbstractGraph): """ @@ -39,26 +42,33 @@ class CogneeGraph(CogneeAbstractGraph): def get_node(self, node_id: str) -> Node: return self.nodes.get(node_id, None) - def get_edges(self, node_id: str) -> List[Edge]: + def get_edges_of_node(self, node_id: str) -> List[Edge]: node = self.get_node(node_id) if node: return node.skeleton_edges else: raise ValueError(f"Node with id {node_id} does not exist.") + def get_edges(self) -> List[Edge]: + return self.edges + async def project_graph_from_db(self, adapter: Union[GraphDBInterface], node_properties_to_project: List[str], edge_properties_to_project: List[str], directed = True, node_dimension = 1, - edge_dimension = 1) -> None: + edge_dimension = 1,
+ memory_fragment_filter = []) -> None: if node_dimension < 1 or edge_dimension < 1: raise ValueError("Dimensions must be positive integers") try: - nodes_data, edges_data = await adapter.get_graph_data() + if len(memory_fragment_filter) == 0: + nodes_data, edges_data = await adapter.get_graph_data() + else: + nodes_data, edges_data = await adapter.get_filtered_graph_data(attribute_filters = memory_fragment_filter) if not nodes_data: raise ValueError("No node data retrieved from the database.") @@ -89,3 +99,81 @@ class CogneeGraph(CogneeAbstractGraph): print(f"Error projecting graph: {e}") except Exception as ex: print(f"Unexpected error: {ex}") + + async def map_vector_distances_to_graph_nodes(self, node_distances) -> None: + for category, scored_results in node_distances.items(): + for scored_result in scored_results: + node_id = str(scored_result.id) + score = scored_result.score + node = self.get_node(node_id) + if node: + node.add_attribute("vector_distance", score) + else: + print(f"Node with id {node_id} not found in the graph.") + + async def map_vector_distances_to_graph_edges(self, vector_engine, query) -> None: # :TODO: When we calculate edge embeddings in vector db change this similarly to node mapping + try: + # Step 1: Generate the query embedding + query_vector = await vector_engine.embed_data([query]) + query_vector = query_vector[0] + if query_vector is None or len(query_vector) == 0: + raise ValueError("Failed to generate query embedding.") + + # Step 2: Collect all unique relationship types + unique_relationship_types = set() + for edge in self.edges: + relationship_type = edge.attributes.get('relationship_type') + if relationship_type: + unique_relationship_types.add(relationship_type) + + # Step 3: Embed all unique relationship types + unique_relationship_types = list(unique_relationship_types) + relationship_type_embeddings = await vector_engine.embed_data(unique_relationship_types) + + # Step 4: Map relationship types to their embeddings and calculate distances + embedding_map = {} + for relationship_type, embedding in zip(unique_relationship_types, relationship_type_embeddings): + edge_vector = np.array(embedding) + + # Calculate cosine similarity + similarity = np.dot(query_vector, edge_vector) / ( + np.linalg.norm(query_vector) * np.linalg.norm(edge_vector) ) + distance = 1 - similarity + + # Round the distance to 4 decimal places and store it + embedding_map[relationship_type] = round(distance, 4) + + # Step 5: Assign precomputed distances to edges + for edge in self.edges: + relationship_type = edge.attributes.get('relationship_type') + if not relationship_type or relationship_type not in embedding_map: + print(f"Edge {edge} has an unknown or missing relationship type.") + continue + + # Assign the precomputed distance + edge.attributes["vector_distance"] = embedding_map[relationship_type] + + except Exception as ex: + print(f"Error mapping vector distances to edges: {ex}") + + + async def calculate_top_triplet_importances(self, k: int) -> List: + min_heap = [] + for i, edge in enumerate(self.edges): + source_node = self.get_node(edge.node1.id) + target_node = self.get_node(edge.node2.id) + + source_distance = source_node.attributes.get("vector_distance", 0) if source_node else 0 + target_distance = target_node.attributes.get("vector_distance", 0) if target_node else 0 + edge_distance = edge.attributes.get("vector_distance", 0) + + total_distance = source_distance + target_distance + edge_distance + + heapq.heappush(min_heap, (-total_distance, i, edge)) + if len(min_heap) > k: + heapq.heappop(min_heap) + + + return [edge for _, _, edge in sorted(min_heap)] + diff --git a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py index 8235cb24d..cecb0a272 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py @@ -1,5 +1,5 @@ import numpy as np -from typing import List, Dict, Optional, Any +from typing import List, Dict, Optional, Any, Union class Node: """ @@ -21,6 +21,7 @@ raise ValueError("Dimension must be a positive integer") self.id = node_id self.attributes = attributes if attributes is not None else {} + self.attributes["vector_distance"] = float('inf') self.skeleton_neighbours = [] self.skeleton_edges = [] self.status = np.ones(dimension, dtype=int) @@ -55,6 +56,12 @@ raise ValueError(f"Dimension {dimension} is out of range. Valid range is 0 to {len(self.status) - 1}.") return self.status[dimension] == 1 + def add_attribute(self, key: str, value: Any) -> None: + self.attributes[key] = value + + def get_attribute(self, key: str) -> Union[str, int, float]: + return self.attributes[key] + def __repr__(self) -> str: return f"Node({self.id}, attributes={self.attributes})" @@ -87,6 +94,7 @@ self.node1 = node1 self.node2 = node2 self.attributes = attributes if attributes is not None else {} + self.attributes["vector_distance"] = float('inf') self.directed = directed self.status = np.ones(dimension, dtype=int) @@ -95,6 +103,12 @@ raise ValueError(f"Dimension {dimension} is out of range. Valid range is 0 to {len(self.status) - 1}.") return self.status[dimension] == 1 + def add_attribute(self, key: str, value: Any) -> None: + self.attributes[key] = value + + def get_attribute(self, key: str) -> Union[str, int, float]: + return self.attributes[key] + def __repr__(self) -> str: direction = "->" if self.directed else "--" return f"Edge({self.node1.id} {direction} {self.node2.id}, attributes={self.attributes})" diff --git a/cognee/modules/graph/utils/__init__.py b/cognee/modules/graph/utils/__init__.py index 6fbe2ee99..c4fa0d654 100644 --- a/cognee/modules/graph/utils/__init__.py +++ b/cognee/modules/graph/utils/__init__.py @@ -2,3 +2,4 @@ from .expand_with_nodes_and_edges import expand_with_nodes_and_edges from .get_graph_from_model import get_graph_from_model from .get_model_instance_from_graph import get_model_instance_from_graph from .retrieve_existing_edges import retrieve_existing_edges +from .convert_node_to_data_point import convert_node_to_data_point diff --git a/cognee/modules/graph/utils/convert_node_to_data_point.py b/cognee/modules/graph/utils/convert_node_to_data_point.py new file mode 100644 index 000000000..292f53733 --- /dev/null +++ b/cognee/modules/graph/utils/convert_node_to_data_point.py @@ -0,0 +1,26 @@ +from cognee.infrastructure.engine import DataPoint + + +def convert_node_to_data_point(node_data: dict) -> DataPoint: + subclass = find_subclass_by_name(DataPoint, node_data["type"]) + + if subclass is None: + raise ValueError(f"No DataPoint subclass found for type: {node_data['type']}") + + return subclass(**node_data) + + +def get_all_subclasses(cls): + subclasses = [] + for subclass in cls.__subclasses__(): + subclasses.append(subclass) + subclasses.extend(get_all_subclasses(subclass)) # Recursively get subclasses + + return subclasses + +def find_subclass_by_name(cls, name): + for subclass in get_all_subclasses(cls): + if subclass.__name__ == name: + return subclass + + return None diff --git
a/cognee/modules/graph/utils/get_graph_from_model.py b/cognee/modules/graph/utils/get_graph_from_model.py index 29137ddc7..d2c6269ab 100644 --- a/cognee/modules/graph/utils/get_graph_from_model.py +++ b/cognee/modules/graph/utils/get_graph_from_model.py @@ -2,9 +2,18 @@ from datetime import datetime, timezone from cognee.infrastructure.engine import DataPoint from cognee.modules.storage.utils import copy_model -def get_graph_from_model(data_point: DataPoint, include_root = True, added_nodes = {}, added_edges = {}): +async def get_graph_from_model( + data_point: DataPoint, + include_root = True, + added_nodes = None, + added_edges = None, + visited_properties = None, +): nodes = [] edges = [] + added_nodes = added_nodes or {} + added_edges = added_edges or {} + visited_properties = visited_properties or {} data_point_properties = {} excluded_properties = set() @@ -13,10 +22,27 @@ def get_graph_from_model(data_point: DataPoint, include_root = True, added_nodes if field_name == "_metadata": continue + if field_value is None: + excluded_properties.add(field_name) + continue + if isinstance(field_value, DataPoint): excluded_properties.add(field_name) - property_nodes, property_edges = get_graph_from_model(field_value, True, added_nodes, added_edges) + property_key = f"{str(data_point.id)}{field_name}{str(field_value.id)}" + + if property_key in visited_properties: + return [], [] + + visited_properties[property_key] = 0 + + property_nodes, property_edges = await get_graph_from_model( + field_value, + True, + added_nodes, + added_edges, + visited_properties, + ) for node in property_nodes: if str(node.id) not in added_nodes: @@ -47,7 +73,20 @@ def get_graph_from_model(data_point: DataPoint, include_root = True, added_nodes excluded_properties.add(field_name) for item in field_value: - property_nodes, property_edges = get_graph_from_model(item, True, added_nodes, added_edges) + property_key = f"{str(data_point.id)}{field_name}{str(item.id)}" + + if property_key in visited_properties: + return [], [] + + visited_properties[property_key] = 0 + + property_nodes, property_edges = await get_graph_from_model( + item, + True, + added_nodes, + added_edges, + visited_properties, + ) for node in property_nodes: if str(node.id) not in added_nodes: diff --git a/cognee/pipelines/__init__.py b/cognee/pipelines/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cognee/pipelines/retriever/__init__.py b/cognee/pipelines/retriever/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cognee/pipelines/retriever/diffusion_retriever.py b/cognee/pipelines/retriever/diffusion_retriever.py new file mode 100644 index 000000000..a6b79310e --- /dev/null +++ b/cognee/pipelines/retriever/diffusion_retriever.py @@ -0,0 +1,25 @@ +from uuid import UUID +from enum import Enum +from typing import Callable, Dict +from cognee.shared.utils import send_telemetry +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_default_user +from cognee.modules.users.permissions.methods import get_document_ids_for_user + +async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: + if user is None: + user = await get_default_user() + + if user is None: + raise PermissionError("No user found in the system. 
Please create a user.") + + own_document_ids = await get_document_ids_for_user(user.id) + retrieved_results = await diffusion_retriever(query, user) + + filtered_search_results = [] + + + return retrieved_results + +async def diffusion_retriever(query: str, user, community_filter = []) -> list: + raise(NotImplementedError) \ No newline at end of file diff --git a/cognee/pipelines/retriever/g_retriever.py b/cognee/pipelines/retriever/g_retriever.py new file mode 100644 index 000000000..4b319acd9 --- /dev/null +++ b/cognee/pipelines/retriever/g_retriever.py @@ -0,0 +1,25 @@ +from uuid import UUID +from enum import Enum +from typing import Callable, Dict +from cognee.shared.utils import send_telemetry +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_default_user +from cognee.modules.users.permissions.methods import get_document_ids_for_user + +async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: + if user is None: + user = await get_default_user() + + if user is None: + raise PermissionError("No user found in the system. Please create a user.") + + own_document_ids = await get_document_ids_for_user(user.id) + retrieved_results = await g_retriever(query, user) + + filtered_search_results = [] + + + return retrieved_results + +async def g_retriever(query: str, user, community_filter = []) -> list: + raise(NotImplementedError) \ No newline at end of file diff --git a/cognee/pipelines/retriever/two_steps_retriever.py b/cognee/pipelines/retriever/two_steps_retriever.py new file mode 100644 index 000000000..92ef2be2e --- /dev/null +++ b/cognee/pipelines/retriever/two_steps_retriever.py @@ -0,0 +1,119 @@ +import asyncio +from uuid import UUID +from enum import Enum +from typing import Callable, Dict +from cognee.shared.utils import send_telemetry +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_default_user +from cognee.modules.users.permissions.methods import get_document_ids_for_user +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.graph import get_graph_engine + + +def format_triplets(edges): + print("\n\n\n") + def filter_attributes(obj, attributes): + """Helper function to filter out non-None properties, including nested dicts.""" + result = {} + for attr in attributes: + value = getattr(obj, attr, None) + if value is not None: + # If the value is a dict, extract relevant keys from it + if isinstance(value, dict): + nested_values = {k: v for k, v in value.items() if k in attributes and v is not None} + result[attr] = nested_values + else: + result[attr] = value + return result + + triplets = [] + for edge in edges: + node1 = edge.node1 + node2 = edge.node2 + edge_attributes = edge.attributes + node1_attributes = node1.attributes + node2_attributes = node2.attributes + + # Filter only non-None properties + node1_info = {key: value for key, value in node1_attributes.items() if value is not None} + node2_info = {key: value for key, value in node2_attributes.items() if value is not None} + edge_info = {key: value for key, value in edge_attributes.items() if value is not None} + + # Create the formatted triplet + triplet = ( + f"Node1: {node1_info}\n" + f"Edge: {edge_info}\n" + f"Node2: {node2_info}\n\n\n" # Add three blank lines for separation + ) + triplets.append(triplet) + + return "".join(triplets) + + +async def two_step_retriever(query: Dict[str, str], 
user: User = None) -> list: + if user is None: + user = await get_default_user() + + if user is None: + raise PermissionError("No user found in the system. Please create a user.") + + own_document_ids = await get_document_ids_for_user(user.id) + retrieved_results = await run_two_step_retriever(query, user) + + filtered_search_results = [] + + return retrieved_results + + +def delete_duplicated_vector_db_elements(collections, results): # :TODO: Temporary fix for vector db duplicates + results_dict = {} + for collection, collection_results in zip(collections, results): + seen_ids = set() + unique_results = [] + for result in collection_results: + if result.id not in seen_ids: + unique_results.append(result) + seen_ids.add(result.id) + else: + print(f"Duplicate found in collection '{collection}': {result.id}") + results_dict[collection] = unique_results + + return results_dict + + +async def run_two_step_retriever(query: str, user, community_filter = []) -> list: + vector_engine = get_vector_engine() + graph_engine = await get_graph_engine() + + collections = ["Entity_name", "TextSummary_text", "EntityType_name", "DocumentChunk_text"] + results = await asyncio.gather( + *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections] + ) + + ############################################# This part is a quick fix until we fix the vector db inconsistency + node_distances = delete_duplicated_vector_db_elements(collections, results) # :TODO: Change when vector db is fixed + # results_dict = {collection: result for collection, result in zip(collections, results)} + ############################################## + + memory_fragment = CogneeGraph() + + await memory_fragment.project_graph_from_db(graph_engine, + node_properties_to_project=['id', + 'description', + 'name', + 'type', + 'text'], + edge_properties_to_project=['id', + 'relationship_name']) + + await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances) + + await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query) # :TODO: This should be coming from vector db + + results = await memory_fragment.calculate_top_triplet_importances(k=5) + + print(format_triplets(results)) + print(f"Query was the following: {query}") + + return results diff --git a/cognee/shared/CodeGraphEntities.py index 9052cf89e..4811106e5 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -1,13 +1,28 @@ +from typing import List, Optional from cognee.infrastructure.engine import DataPoint class Repository(DataPoint): path: str + type: Optional[str] = "Repository" class CodeFile(DataPoint): extracted_id: str # actually file path + type: Optional[str] = "CodeFile" + source_code: Optional[str] = None + part_of: Optional[Repository] = None + depends_on: Optional[List["CodeFile"]] = None + depends_directly_on: Optional[List["CodeFile"]] = None + contains: Optional[List["CodePart"]] = None + + _metadata: dict = { + "index_fields": ["source_code"] + } + +class CodePart(DataPoint): type: str + # part_of: Optional[CodeFile] source_code: str - part_of: Repository + type: Optional[str] = "CodePart" _metadata: dict = { "index_fields": ["source_code"] @@ -18,3 +33,6 @@ class CodeRelationship(DataPoint): source_id: str target_id: str type: str # between files relation: str # depends on or depends directly + +CodeFile.model_rebuild() +CodePart.model_rebuild() diff --git a/cognee/tasks/code/enrich_dependency_graph_checker.py
b/cognee/tasks/code/enrich_dependency_graph_checker.py index 3e56fab59..7b04e0357 100644 --- a/cognee/tasks/code/enrich_dependency_graph_checker.py +++ b/cognee/tasks/code/enrich_dependency_graph_checker.py @@ -1,7 +1,7 @@ import os import asyncio import argparse -from cognee.tasks.repo_processor.get_repo_dependency_graph import get_repo_dependency_graph +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph @@ -15,7 +15,7 @@ def main(): print(f"Error: The provided repository path does not exist: {repo_path}") return - graph = asyncio.run(get_repo_dependency_graph(repo_path)) + graph = asyncio.run(get_repo_file_dependencies(repo_path)) graph = asyncio.run(enrich_dependency_graph(graph)) for node in graph.nodes: print(f"Node: {node}") diff --git a/cognee/tasks/code/expand_dependency_graph_checker.py b/cognee/tasks/code/expand_dependency_graph_checker.py index 1b3ce3246..bdef34162 100644 --- a/cognee/tasks/code/expand_dependency_graph_checker.py +++ b/cognee/tasks/code/expand_dependency_graph_checker.py @@ -1,7 +1,7 @@ import os import asyncio import argparse -from cognee.tasks.repo_processor.get_repo_dependency_graph import get_repo_dependency_graph +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph from cognee.tasks.repo_processor.expand_dependency_graph import expand_dependency_graph @@ -16,7 +16,7 @@ def main(): print(f"Error: The provided repository path does not exist: {repo_path}") return - graph = asyncio.run(get_repo_dependency_graph(repo_path)) + graph = asyncio.run(get_repo_file_dependencies(repo_path)) graph = asyncio.run(enrich_dependency_graph(graph)) graph = expand_dependency_graph(graph) for node in graph.nodes: diff --git a/cognee/tasks/code/get_repo_dependency_graph_checker.py b/cognee/tasks/code/get_repo_dependency_graph_checker.py index 57afc7274..3a393d3f3 100644 --- a/cognee/tasks/code/get_repo_dependency_graph_checker.py +++ b/cognee/tasks/code/get_repo_dependency_graph_checker.py @@ -1,7 +1,7 @@ import os import asyncio import argparse -from cognee.tasks.repo_processor.get_repo_dependency_graph import get_repo_dependency_graph +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies def main(): @@ -14,7 +14,7 @@ def main(): print(f"Error: The provided repository path does not exist: {repo_path}") return - graph = asyncio.run(get_repo_dependency_graph(repo_path)) + graph = asyncio.run(get_repo_file_dependencies(repo_path)) for node in graph.nodes: print(f"Node: {node}") diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index d881514a2..8ee87bcad 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -1,16 +1,51 @@ from cognee.modules.data.models import Data -from cognee.modules.data.processing.document_types import Document, PdfDocument, AudioDocument, ImageDocument, TextDocument +from cognee.modules.data.processing.document_types import ( + Document, + PdfDocument, + AudioDocument, + ImageDocument, + TextDocument, +) EXTENSION_TO_DOCUMENT_CLASS = { - "pdf": PdfDocument, - "audio": AudioDocument, - "image": ImageDocument, - "txt": TextDocument + "pdf": PdfDocument, # Text documents + "txt": TextDocument, + "png": ImageDocument, # Image documents + "dwg": 
ImageDocument, + "xcf": ImageDocument, + "jpg": ImageDocument, + "jpx": ImageDocument, + "apng": ImageDocument, + "gif": ImageDocument, + "webp": ImageDocument, + "cr2": ImageDocument, + "tif": ImageDocument, + "bmp": ImageDocument, + "jxr": ImageDocument, + "psd": ImageDocument, + "ico": ImageDocument, + "heic": ImageDocument, + "avif": ImageDocument, + "aac": AudioDocument, # Audio documents + "mid": AudioDocument, + "mp3": AudioDocument, + "m4a": AudioDocument, + "ogg": AudioDocument, + "flac": AudioDocument, + "wav": AudioDocument, + "amr": AudioDocument, + "aiff": AudioDocument, } + def classify_documents(data_documents: list[Data]) -> list[Document]: documents = [ - EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location, name=data_item.name) + EXTENSION_TO_DOCUMENT_CLASS[data_item.extension]( + id=data_item.id, + title=f"{data_item.name}.{data_item.extension}", + raw_data_location=data_item.raw_data_location, + name=data_item.name, + ) for data_item in data_documents ] return documents diff --git a/cognee/tasks/graph/convert_graph_from_code_graph.py b/cognee/tasks/graph/convert_graph_from_code_graph.py deleted file mode 100644 index bc8544994..000000000 --- a/cognee/tasks/graph/convert_graph_from_code_graph.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -import networkx as nx - -from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository -from cognee.tasks.storage import add_data_points - - -async def convert_graph_from_code_graph( - graph: nx.DiGraph, repo_path: str -) -> tuple[str, list[CodeFile], list[CodeRelationship]]: - code_objects = code_objects_from_di_graph(graph, repo_path) - - add_data_points(code_objects) - - return code_objects - - -def create_code_file(path, type, repo): - abspath = os.path.abspath(path) - - with open(abspath, "r") as f: - source_code = f.read() - - code_file = CodeFile( - extracted_id = abspath, - type = type, - source_code = source_code, - part_of = repo, - ) - - return code_file - - -def code_objects_from_di_graph( - graph: nx.DiGraph, repo_path: str -) -> tuple[Repository, list[CodeFile], list[CodeRelationship]]: - repo = Repository(path=repo_path) - - code_files = [ - create_code_file(os.path.join(repo_path, path), "python_file", repo) - for path in graph.nodes - ] - - code_relationships = [ - CodeRelationship( - os.path.join(repo_path, source), - os.path.join(repo_path, target), - "python_file", - graph.get_edge_data(source, target)["relation"], - ) - for source, target in graph.edges - ] - - return (repo, code_files, code_relationships) diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index ad6ae19d2..2bf4aeba4 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -24,11 +24,13 @@ async def extract_graph_from_data( (chunk, chunk_graph) for chunk, chunk_graph in zip(data_chunks, chunk_graphs) ] existing_edges_map = await retrieve_existing_edges( - chunk_and_chunk_graphs, graph_engine + chunk_and_chunk_graphs, + graph_engine, ) graph_nodes, graph_edges = expand_with_nodes_and_edges( - chunk_and_chunk_graphs, existing_edges_map + chunk_and_chunk_graphs, + existing_edges_map, ) if len(graph_nodes) > 0: diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index a1aeabcdc..05e111b29 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ 
b/cognee/tasks/repo_processor/__init__.py @@ -4,4 +4,4 @@ logger = logging.getLogger("task:repo_processor") from .enrich_dependency_graph import enrich_dependency_graph from .expand_dependency_graph import expand_dependency_graph -from .get_repo_dependency_graph import get_repo_dependency_graph +from .get_repo_file_dependencies import get_repo_file_dependencies diff --git a/cognee/tasks/repo_processor/enrich_dependency_graph.py b/cognee/tasks/repo_processor/enrich_dependency_graph.py index c3aa9fd74..ba222ef3f 100644 --- a/cognee/tasks/repo_processor/enrich_dependency_graph.py +++ b/cognee/tasks/repo_processor/enrich_dependency_graph.py @@ -1,25 +1,38 @@ +import asyncio import networkx as nx from typing import Dict, List +from tqdm.asyncio import tqdm + +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodeFile +from cognee.modules.graph.utils import get_graph_from_model, convert_node_to_data_point +from cognee.infrastructure.databases.graph import get_graph_engine def topologically_sort_subgraph(subgraph_node_to_indegree: Dict[str, int], graph: nx.DiGraph) -> List[str]: """Performs a topological sort on a subgraph based on node indegrees.""" results = [] remaining_nodes = subgraph_node_to_indegree.copy() + while remaining_nodes: next_node = min(remaining_nodes, key=remaining_nodes.get) results.append(next_node) + for successor in graph.successors(next_node): if successor in remaining_nodes: remaining_nodes[successor] -= 1 + remaining_nodes.pop(next_node) + return results def topologically_sort(graph: nx.DiGraph) -> List[str]: """Performs a topological sort on the entire graph.""" subgraphs = (graph.subgraph(c).copy() for c in nx.weakly_connected_components(graph)) + topological_order = [] + for subgraph in subgraphs: node_to_indegree = { node: len(list(subgraph.successors(node))) @@ -28,29 +41,84 @@ def topologically_sort(graph: nx.DiGraph) -> List[str]: topological_order.extend( topologically_sort_subgraph(node_to_indegree, subgraph) ) + return topological_order -def node_enrich_and_connect(graph: nx.MultiDiGraph, topological_order: List[str], node: str) -> None: +async def node_enrich_and_connect( + graph: nx.MultiDiGraph, + topological_order: List[str], + node: CodeFile, + data_points_map: Dict[str, DataPoint], +) -> None: """Adds 'depends_on' edges to the graph based on topological order.""" - topological_rank = topological_order.index(node) - graph.nodes[node]['topological_rank'] = topological_rank - node_descendants = nx.descendants(graph, node) - if graph.has_edge(node,node): - node_descendants.add(node) - for desc in node_descendants: - if desc not in topological_order[:topological_rank+1]: + topological_rank = topological_order.index(node.id) + node.topological_rank = topological_rank + node_descendants = nx.descendants(graph, node.id) + + if graph.has_edge(node.id, node.id): + node_descendants.add(node.id) + + new_connections = [] + graph_engine = await get_graph_engine() + + for desc_id in node_descendants: + if desc_id not in topological_order[:topological_rank + 1]: continue - graph.add_edge(node, desc, relation='depends_on') -async def enrich_dependency_graph(graph: nx.DiGraph) -> nx.MultiDiGraph: + if desc_id in data_points_map: + desc = data_points_map[desc_id] + else: + node_data = await graph_engine.extract_node(desc_id) + desc = convert_node_to_data_point(node_data) + + new_connections.append(desc) + + node.depends_directly_on = node.depends_directly_on or [] + node.depends_directly_on.extend(new_connections) + + +async def 
enrich_dependency_graph(data_points: list[DataPoint]) -> list[DataPoint]: """Enriches the graph with topological ranks and 'depends_on' edges.""" - graph = nx.MultiDiGraph(graph) + nodes = [] + edges = [] + + for data_point in data_points: + graph_nodes, graph_edges = await get_graph_from_model(data_point) + nodes.extend(graph_nodes) + edges.extend(graph_edges) + + graph = nx.MultiDiGraph() + + simple_nodes = [(node.id, node.model_dump()) for node in nodes] + + graph.add_nodes_from(simple_nodes) + + graph.add_edges_from(edges) + topological_order = topologically_sort(graph) + node_rank_map = {node: idx for idx, node in enumerate(topological_order)} - for node in graph.nodes: - if node not in node_rank_map: + + # for node_id, node in tqdm(graph.nodes(data = True), desc = "Enriching dependency graph", unit = "node"): + # if node_id not in node_rank_map: + # continue + + # data_points.append(node_enrich_and_connect(graph, topological_order, node)) + + data_points_map = {data_point.id: data_point for data_point in data_points} + data_points_futures = [] + + for data_point in tqdm(data_points, desc = "Enriching dependency graph", unit = "data_point"): + if data_point.id not in node_rank_map: continue - node_enrich_and_connect(graph, topological_order, node) - return graph + + if isinstance(data_point, CodeFile): + data_points_futures.append(node_enrich_and_connect(graph, topological_order, data_point, data_points_map)) + + # yield data_point + + await asyncio.gather(*data_points_futures) + + return data_points diff --git a/cognee/tasks/repo_processor/expand_dependency_graph.py b/cognee/tasks/repo_processor/expand_dependency_graph.py index a6b724302..722bfa5c6 100644 --- a/cognee/tasks/repo_processor/expand_dependency_graph.py +++ b/cognee/tasks/repo_processor/expand_dependency_graph.py @@ -1,28 +1,43 @@ -import networkx as nx - +from uuid import NAMESPACE_OID, uuid5 +# from tqdm import tqdm +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodeFile, CodePart from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts from cognee.tasks.repo_processor import logger - -def _add_code_parts_nodes_and_edges(graph, parent_node_id, part_type, code_parts): +def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None: """Add code part nodes and edges for a specific part type.""" if not code_parts: - logger.debug(f"No code parts to add for parent_node_id {parent_node_id} and part_type {part_type}.") + logger.debug(f"No code parts to add for node {code_file.id} and part_type {part_type}.") return + part_nodes = [] + for idx, code_part in enumerate(code_parts): if not code_part.strip(): - logger.warning(f"Empty code part in parent_node_id {parent_node_id} and part_type {part_type}.") + logger.warning(f"Empty code part in node {code_file.id} and part_type {part_type}.") continue - part_node_id = f"{parent_node_id}_{part_type}_{idx}" - graph.add_node(part_node_id, source_code=code_part, node_type=part_type) - graph.add_edge(parent_node_id, part_node_id, relation="contains") + + part_node_id = uuid5(NAMESPACE_OID, f"{code_file.id}_{part_type}_{idx}") + + part_nodes.append(CodePart( + id = part_node_id, + type = part_type, + # part_of = code_file, + source_code = code_part, + )) + + # graph.add_node(part_node_id, source_code=code_part, node_type=part_type) + # graph.add_edge(parent_node_id, part_node_id, relation="contains") + + code_file.contains = code_file.contains or [] + code_file.contains.extend(part_nodes) -def 
_process_single_node(graph, node_id, node_data): +def _process_single_node(code_file: CodeFile) -> None: """Process a single Python file node.""" - graph.nodes[node_id]["node_type"] = "python_file" - source_code = node_data.get("source_code", "") + node_id = code_file.id + source_code = code_file.source_code if not source_code.strip(): logger.warning(f"Node {node_id} has no or empty 'source_code'. Skipping.") @@ -35,15 +50,14 @@ def _process_single_node(graph, node_id, node_data): return for part_type, code_parts in code_parts_dict.items(): - _add_code_parts_nodes_and_edges(graph, node_id, part_type, code_parts) + _add_code_parts_nodes_and_edges(code_file, part_type, code_parts) -def expand_dependency_graph(graph: nx.MultiDiGraph) -> nx.MultiDiGraph: +async def expand_dependency_graph(data_points: list[DataPoint]) -> list[DataPoint]: """Process Python file nodes, adding code part nodes and edges.""" - expanded_graph = graph.copy() - for node_id, node_data in graph.nodes(data=True): - if not node_data: # Check if node_data is empty - logger.warning(f"Node {node_id} has no data. Skipping.") - continue - _process_single_node(expanded_graph, node_id, node_data) - return expanded_graph + # for data_point in tqdm(data_points, desc = "Expand dependency graph", unit = "data_point"): + for data_point in data_points: + if isinstance(data_point, CodeFile): + _process_single_node(data_point) + + return data_points diff --git a/cognee/tasks/repo_processor/get_repo_dependency_graph.py b/cognee/tasks/repo_processor/get_repo_dependency_graph.py deleted file mode 100644 index 7f96bd49c..000000000 --- a/cognee/tasks/repo_processor/get_repo_dependency_graph.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import aiofiles -import networkx as nx - -from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies - - -async def get_py_path_and_source(file_path, repo_path): - relative_path = os.path.relpath(file_path, repo_path) - try: - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - source_code = await f.read() - return relative_path, source_code - except Exception as e: - print(f"Error reading file {file_path}: {e}") - return relative_path, None - - -async def get_py_files_dict(repo_path): - """Get .py files and their source code""" - if not os.path.exists(repo_path): - return {} - - py_files_paths = ( - os.path.join(root, file) - for root, _, files in os.walk(repo_path) for file in files if file.endswith(".py") - ) - - py_files_dict = {} - for file_path in py_files_paths: - relative_path, source_code = await get_py_path_and_source(file_path, repo_path) - py_files_dict[relative_path] = {"source_code": source_code} - - return py_files_dict - - -def get_edge(file_path: str, dependency: str, repo_path: str, relative_paths: bool = True) -> tuple: - if relative_paths: - file_path = os.path.relpath(file_path, repo_path) - dependency = os.path.relpath(dependency, repo_path) - return (file_path, dependency, {"relation": "depends_directly_on"}) - - -async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph: - """Generate a dependency graph for Python files in the given repository path.""" - py_files_dict = await get_py_files_dict(repo_path) - - dependency_graph = nx.DiGraph() - - dependency_graph.add_nodes_from(py_files_dict.items()) - - for file_path, metadata in py_files_dict.items(): - source_code = metadata.get("source_code") - if source_code is None: - continue - - dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path) 
- dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies] - dependency_graph.add_edges_from(dependency_edges) - - return dependency_graph diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py new file mode 100644 index 000000000..58f3857a9 --- /dev/null +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -0,0 +1,87 @@ +import os +from uuid import NAMESPACE_OID, uuid5 +import aiofiles +from tqdm.asyncio import tqdm + +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodeFile, Repository +from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies + + +async def get_py_path_and_source(file_path): + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + source_code = await f.read() + return file_path, source_code + except Exception as e: + print(f"Error reading file {file_path}: {e}") + return file_path, None + + +async def get_py_files_dict(repo_path): + """Get .py files and their source code""" + if not os.path.exists(repo_path): + return {} + + py_files_paths = ( + os.path.join(root, file) + for root, _, files in os.walk(repo_path) for file in files if file.endswith(".py") + ) + + py_files_dict = {} + for file_path in py_files_paths: + absolute_path = os.path.abspath(file_path) + relative_path, source_code = await get_py_path_and_source(absolute_path) + py_files_dict[relative_path] = {"source_code": source_code} + + return py_files_dict + + +def get_edge(file_path: str, dependency: str, repo_path: str, relative_paths: bool = False) -> tuple: + if relative_paths: + file_path = os.path.relpath(file_path, repo_path) + dependency = os.path.relpath(dependency, repo_path) + return (file_path, dependency, {"relation": "depends_directly_on"}) + + +async def get_repo_file_dependencies(repo_path: str) -> list[DataPoint]: + """Generate a dependency graph for Python files in the given repository path.""" + py_files_dict = await get_py_files_dict(repo_path) + + repo = Repository( + id = uuid5(NAMESPACE_OID, repo_path), + path = repo_path, + ) + + data_points = [repo] + + # dependency_graph = nx.DiGraph() + + # dependency_graph.add_nodes_from(py_files_dict.items()) + + async for file_path, metadata in tqdm(py_files_dict.items(), desc="Repo dependency graph", unit="file"): + source_code = metadata.get("source_code") + if source_code is None: + continue + + dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path) + + data_points.append(CodeFile( + id = uuid5(NAMESPACE_OID, file_path), + source_code = source_code, + extracted_id = file_path, + part_of = repo, + depends_on = [ + CodeFile( + id = uuid5(NAMESPACE_OID, dependency), + extracted_id = dependency, + part_of = repo, + ) for dependency in dependencies + ] if len(dependencies) else None, + )) + # dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies] + + # dependency_graph.add_edges_from(dependency_edges) + + return data_points + # return dependency_graph diff --git a/cognee/tasks/storage/add_data_points.py b/cognee/tasks/storage/add_data_points.py index b803c9dfd..33f9d7a70 100644 --- a/cognee/tasks/storage/add_data_points.py +++ b/cognee/tasks/storage/add_data_points.py @@ -1,3 +1,4 @@ +import asyncio from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.databases.graph import get_graph_engine from 
cognee.modules.graph.utils import get_graph_from_model @@ -8,11 +9,13 @@ async def add_data_points(data_points: list[DataPoint]): nodes = [] edges = [] - for data_point in data_points: - property_nodes, property_edges = get_graph_from_model(data_point) + results = await asyncio.gather(*[ + get_graph_from_model(data_point) for data_point in data_points + ]) - nodes.extend(property_nodes) - edges.extend(property_edges) + for result_nodes, result_edges in results: + nodes.extend(result_nodes) + edges.extend(result_edges) graph_engine = await get_graph_engine() diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 12903173a..03ad30f9d 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -16,6 +16,9 @@ async def index_data_points(data_points: list[DataPoint]): data_point_type = type(data_point) for field_name in data_point._metadata["index_fields"]: + if getattr(data_point, field_name, None) is None: + continue + index_name = f"{data_point_type.__tablename__}.{field_name}" if index_name not in created_indexes: @@ -35,12 +38,21 @@ async def index_data_points(data_points: list[DataPoint]): return data_points -def get_data_points_from_model(data_point: DataPoint, added_data_points = {}) -> list[DataPoint]: +def get_data_points_from_model(data_point: DataPoint, added_data_points = None, visited_properties = None) -> list[DataPoint]: data_points = [] + added_data_points = added_data_points or {} + visited_properties = visited_properties or {} for field_name, field_value in data_point: if isinstance(field_value, DataPoint): - new_data_points = get_data_points_from_model(field_value, added_data_points) + property_key = f"{str(data_point.id)}{field_name}{str(field_value.id)}" + + if property_key in visited_properties: + continue + + visited_properties[property_key] = True + + new_data_points = get_data_points_from_model(field_value, added_data_points, visited_properties) for new_point in new_data_points: if str(new_point.id) not in added_data_points: @@ -49,7 +61,14 @@ def get_data_points_from_model(data_point: DataPoint, added_data_points = {}) -> if isinstance(field_value, list) and len(field_value) > 0 and isinstance(field_value[0], DataPoint): for field_value_item in field_value: - new_data_points = get_data_points_from_model(field_value_item, added_data_points) + property_key = f"{str(data_point.id)}{field_name}{str(field_value_item.id)}" + + if property_key in visited_properties: + continue + + visited_properties[property_key] = True + + new_data_points = get_data_points_from_model(field_value_item, added_data_points, visited_properties) for new_point in new_data_points: if str(new_point.id) not in added_data_points: @@ -79,4 +98,3 @@ if __name__ == "__main__": data_points = get_data_points_from_model(person) print(data_points) - \ No newline at end of file diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py index 31b86d325..277081f40 100644 --- a/cognee/tasks/summarization/summarize_code.py +++ b/cognee/tasks/summarization/summarize_code.py @@ -4,6 +4,7 @@ from uuid import uuid5 from pydantic import BaseModel +from cognee.infrastructure.engine import DataPoint from cognee.modules.data.extraction.extract_summary import extract_summary from cognee.shared.CodeGraphEntities import CodeFile from cognee.tasks.storage import add_data_points @@ -12,13 +13,16 @@ from .models import CodeSummary async def summarize_code( - code_files: list[CodeFile],
summarization_model: Type[BaseModel] -) -> list[CodeFile]: + code_files: list[DataPoint], + summarization_model: Type[BaseModel], +) -> list[DataPoint]: if len(code_files) == 0: return code_files + code_files_data_points = [file for file in code_files if isinstance(file, CodeFile)] + file_summaries = await asyncio.gather( - *[extract_summary(file.source_code, summarization_model) for file in code_files] + *[extract_summary(file.source_code, summarization_model) for file in code_files_data_points] ) summaries = [ @@ -27,9 +31,9 @@ async def summarize_code( made_from = file, text = file_summaries[file_index].summary, ) - for (file_index, file) in enumerate(code_files) + for (file_index, file) in enumerate(code_files_data_points) ] await add_data_points(summaries) - return code_files, summaries + return code_files diff --git a/cognee/tests/tasks/graph/code_graph_test_data_generation.py b/cognee/tests/tasks/graph/code_graph_test_data_generation.py deleted file mode 100644 index 74ca2de71..000000000 --- a/cognee/tests/tasks/graph/code_graph_test_data_generation.py +++ /dev/null @@ -1,51 +0,0 @@ -import random -import string - -import numpy as np - -from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship - - -def random_str(n, spaces=True): - candidates = string.ascii_letters + string.digits - if spaces: - candidates += " " - return "".join(random.choice(candidates) for _ in range(n)) - - -def code_graph_test_data_generation(): - nodes = [ - CodeFile( - extracted_id=random_str(10, spaces=False), - type="file", - source_code=random_str(random.randrange(50, 500)), - ) - for _ in range(100) - ] - n_nodes = len(nodes) - first_source = np.random.randint(0, n_nodes) - reached_nodes = {first_source} - last_iteration = [first_source] - edges = [] - while len(reached_nodes) < n_nodes: - for source in last_iteration: - last_iteration = [] - tries = 0 - while ((len(last_iteration) == 0 or tries < 500)) and ( - len(reached_nodes) < n_nodes - ): - tries += 1 - target = np.random.randint(n_nodes) - if target not in reached_nodes: - last_iteration.append(target) - edges.append( - CodeRelationship( - source_id=nodes[source].extracted_id, - target_id=nodes[target].extracted_id, - type="files", - relation="depends", - ) - ) - reached_nodes = reached_nodes.union(set(last_iteration)) - - return (nodes, edges) diff --git a/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py b/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py deleted file mode 100644 index 755840b01..000000000 --- a/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py +++ /dev/null @@ -1,27 +0,0 @@ -import asyncio - -import pytest - -from cognee.shared.CodeGraphEntities import Repository -from cognee.tasks.graph.convert_graph_from_code_graph import ( - convert_graph_from_code_graph, -) -from cognee.tests.tasks.graph.code_graph_test_data_generation import ( - code_graph_test_data_generation, -) - - -def test_convert_graph_from_code_graph(): - repo = Repository(path="test/repo/path") - nodes, edges = code_graph_test_data_generation() - repo_out, nodes_out, edges_out = asyncio.run( - convert_graph_from_code_graph(repo, nodes, edges) - ) - - assert repo == repo_out, f"{repo = } != {repo_out = }" - - for node_in, node_out in zip(nodes, nodes_out): - assert node_in == node_out, f"{node_in = } != {node_out = }" - - for edge_in, edge_out in zip(edges, edges_out): - assert edge_in == edge_out, f"{edge_in = } != {edge_out = }" diff --git a/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py 
b/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py new file mode 100644 index 000000000..f75b84826 --- /dev/null +++ b/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py @@ -0,0 +1,100 @@ +import asyncio +import random +import time +from typing import List +from uuid import uuid5, NAMESPACE_OID + +from cognee.infrastructure.engine import DataPoint +from cognee.modules.graph.utils import get_graph_from_model + +random.seed(1500) + +class Repository(DataPoint): + path: str + +class CodeFile(DataPoint): + part_of: Repository + contains: List["CodePart"] = [] + depends_on: List["CodeFile"] = [] + source_code: str + +class CodePart(DataPoint): + part_of: CodeFile + source_code: str + +CodeFile.model_rebuild() +CodePart.model_rebuild() + + +def nanoseconds_to_largest_unit(nanoseconds): + # Define conversion factors + conversion_factors = { + 'weeks': 7 * 24 * 60 * 60 * 1e9, + 'days': 24 * 60 * 60 * 1e9, + 'hours': 60 * 60 * 1e9, + 'minutes': 60 * 1e9, + 'seconds': 1e9, + 'milliseconds': 1e6, + 'microseconds': 1e3, + } + + # Iterate through conversion factors to find the largest unit + for unit, factor in conversion_factors.items(): + converted_value = nanoseconds / factor + if converted_value >= 1: + return converted_value, unit + + # If the duration is smaller than a microsecond + return nanoseconds, 'nanoseconds' + + +async def test_circular_reference_extraction(): + repo = Repository(path = "repo1") + + code_files = [CodeFile( + id = uuid5(NAMESPACE_OID, f"file{file_index}"), + source_code = "source code", + part_of = repo, + contains = [], + depends_on = [CodeFile( + id = uuid5(NAMESPACE_OID, f"file{random_id}"), + source_code = "source code", + part_of = repo, + depends_on = [], + ) for random_id in [random.randint(0, 1499) for _ in range(random.randint(0, 5))]], + ) for file_index in range(1500)] + + for code_file in code_files: + code_file.contains.extend([CodePart( + part_of = code_file, + source_code = f"Part {part_index}", + ) for part_index in range(random.randint(1, 20))]) + + nodes = [] + edges = [] + + start = time.perf_counter_ns() + + results = await asyncio.gather(*[ + get_graph_from_model(code_file) for code_file in code_files + ]) + + time_to_run = time.perf_counter_ns() - start + + print(nanoseconds_to_largest_unit(time_to_run)) + + for result_nodes, result_edges in results: + nodes.extend(result_nodes) + edges.extend(result_edges) + + # for code_file in code_files: + # model_nodes, model_edges = get_graph_from_model(code_file) + + # nodes.extend(model_nodes) + # edges.extend(model_edges) + + assert len(nodes) == 1501 + assert len(edges) == 1501 * 20 + 1500 * 5 + +if __name__ == "__main__": + asyncio.run(test_circular_reference_extraction()) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py index d2a1b6c59..a3755a58f 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py @@ -8,7 +8,7 @@ def test_node_initialization(): """Test that a Node is initialized correctly.""" node = Node("node1", {"attr1": "value1"}, dimension=2) assert node.id == "node1" - assert node.attributes == {"attr1": "value1"} + assert node.attributes == {"attr1": "value1", 'vector_distance': np.inf} assert len(node.status) == 2 assert np.all(node.status == 1) @@ -95,7 +95,7 @@ def test_edge_initialization(): edge = Edge(node1, node2, {"weight": 10}, directed=False, dimension=2) assert edge.node1 == node1
assert edge.node2 == node2 - assert edge.attributes == {"weight": 10} + assert edge.attributes == {'vector_distance': np.inf,"weight": 10} assert edge.directed is False assert len(edge.status) == 2 assert np.all(edge.status == 1) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_test.py b/cognee/tests/unit/modules/graph/cognee_graph_test.py index d05292d75..bad474023 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_test.py @@ -77,11 +77,11 @@ def test_get_edges_success(setup_graph): graph.add_node(node2) edge = Edge(node1, node2) graph.add_edge(edge) - assert edge in graph.get_edges("node1") + assert edge in graph.get_edges_of_node("node1") def test_get_edges_nonexistent_node(setup_graph): """Test retrieving edges for a nonexistent node raises an exception.""" graph = setup_graph with pytest.raises(ValueError, match="Node with id nonexistent does not exist."): - graph.get_edges("nonexistent") + graph.get_edges_of_node("nonexistent") diff --git a/docker-compose.yml b/docker-compose.yml index afb216169..9c40979bc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -46,7 +46,7 @@ services: - 7687:7687 environment: - NEO4J_AUTH=neo4j/pleaseletmein - - NEO4J_PLUGINS=["apoc"] + - NEO4J_PLUGINS=["apoc", "graph-data-science"] networks: - cognee-network diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index ec93bda07..0a4806e3f 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -8,30 +8,63 @@ from swebench.harness.utils import load_swebench_dataset from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE import cognee + +from cognee.shared.data_models import SummarizedContent +from cognee.shared.utils import render_graph +from cognee.tasks.repo_processor import ( + enrich_dependency_graph, + expand_dependency_graph, + get_repo_file_dependencies, +) +from cognee.tasks.storage import add_data_points +from cognee.tasks.summarization import summarize_code +from cognee.modules.pipelines import Task, run_tasks from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt from evals.eval_utils import download_instances +from evals.eval_utils import ingest_repos +from evals.eval_utils import download_github_repo +from evals.eval_utils import delete_repo - -async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): +async def generate_patch_with_cognee(instance): await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) + await cognee.prune.prune_system() - dataset_name = "SWE_test_data" - code_text = instance["text"] - await cognee.add([code_text], dataset_name) - await code_graph_pipeline([dataset_name]) - graph_engine = await get_graph_engine() - with open(graph_engine.filename, "r") as f: - graph_str = f.read() + #dataset_name = "SWE_test_data" + + #await cognee.add('', dataset_name = dataset_name) + + # repo_path = download_github_repo(instance, '../RAW_GIT_REPOS') + + repo_path = '/Users/borisarzentar/Projects/graphrag' + + tasks = [ + Task(get_repo_file_dependencies), + Task(add_data_points), + Task(enrich_dependency_graph), + Task(expand_dependency_graph), + Task(add_data_points), + # Task(summarize_code, summarization_model = SummarizedContent), + ] + + pipeline = 
run_tasks(tasks, repo_path, "cognify_code_pipeline") + + async for result in pipeline: + print(result) + + print('The repository is now ingested under repo_path') + + await render_graph() problem_statement = instance['problem_statement'] instructions = read_query_prompt("patch_gen_instructions.txt") + graph_str = 'HERE WE SHOULD PASS THE TRIPLETS FROM GRAPHRAG' + prompt = "\n".join([ instructions, "", @@ -41,14 +74,18 @@ async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): graph_str ]) + return 0 + + ''' :TODO: We still have to figure out how to run the generation llm_client = get_llm_client() answer_prediction = await llm_client.acreate_structured_output( text_input=problem_statement, system_prompt=prompt, response_model=str, ) - return answer_prediction + return answer_prediction + ''' async def generate_patch_without_cognee(instance): problem_statement = instance['problem_statement'] @@ -71,11 +108,16 @@ async def get_preds(dataset, with_cognee=True): model_name = "without_cognee" pred_func = generate_patch_without_cognee + + for instance in dataset: + await pred_func(instance) + + ''' preds = [{"instance_id": instance["instance_id"], "model_patch": await pred_func(instance), "model_name_or_path": model_name} for instance in dataset] - - return preds + ''' + return 0 async def main(): @@ -115,4 +157,5 @@ if __name__ == "__main__": import asyncio + asyncio.run(main(), debug=True) diff --git a/evals/eval_utils.py b/evals/eval_utils.py index e95a84cec..3192127dc 100644 --- a/evals/eval_utils.py +++ b/evals/eval_utils.py @@ -8,7 +8,8 @@ from swebench.inference.make_datasets.create_instance import make_code_text from swebench.inference.make_datasets.utils import (AutoContextManager, ingest_directory_contents) from tqdm.auto import tqdm - +from git import Repo +import shutil def ingest_files(filenames): files_dict = dict() @@ -101,3 +102,56 @@ def download_instances( dataset = create_dataset(input_instances_with_text) dataset.save_to_disk(path) return dataset + + +def download_github_repo(instance, output_dir): + """ + Downloads a GitHub repository and checks out the specified commit. + + Args: + instance (dict): Dictionary containing 'repo', 'base_commit', and 'instance_id'. + output_dir (str): Directory to store the downloaded repositories. + + Returns: + str: Path to the downloaded repository. + """ + repo_owner_repo = instance['repo'] + base_commit = instance['base_commit'] + instance_id = instance['instance_id'] + + repo_url = f"https://github.com/{repo_owner_repo}.git" + + repo_path = os.path.abspath(os.path.join(output_dir, instance_id)) + + # Clone repository if it doesn't already exist + if not os.path.exists(repo_path): + print(f"Cloning {repo_url} to {repo_path}...") + Repo.clone_from(repo_url, repo_path) + else: + print(f"Repository already exists at {repo_path}.") + + + repo = Repo(repo_path) + repo.git.checkout(base_commit) + + return repo_path + + +def delete_repo(repo_path): + """ + Deletes the specified repository directory. + + Args: + repo_path (str): Path to the repository to delete. + + Returns: + None + """ + try: + if os.path.exists(repo_path): + shutil.rmtree(repo_path) + print(f"Deleted repository at {repo_path}.") + else: + print(f"Repository path {repo_path} does not exist.
Nothing to delete.") + except Exception as e: + print(f"Error deleting repository at {repo_path}: {e}") \ No newline at end of file diff --git a/examples/python/code_graph_pipeline.py b/examples/python/code_graph_pipeline.py index fe4be371b..52c1e0474 100644 --- a/examples/python/code_graph_pipeline.py +++ b/examples/python/code_graph_pipeline.py @@ -1,22 +1,11 @@ -import argparse import asyncio -import os - from cognee.modules.pipelines import Task, run_tasks -from cognee.shared.CodeGraphEntities import CodeRelationship, Repository -from cognee.shared.data_models import SummarizedContent -from cognee.tasks.code.get_local_dependencies_checker import ( - get_local_script_dependencies, -) -from cognee.tasks.graph.convert_graph_from_code_graph import ( - create_code_file, - convert_graph_from_code_graph, -) from cognee.tasks.repo_processor import ( enrich_dependency_graph, expand_dependency_graph, - get_repo_dependency_graph, + get_repo_file_dependencies, ) +from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_code @@ -24,58 +13,24 @@ async def print_results(pipeline): async for result in pipeline: print(result) - -async def get_local_script_dependencies_wrapper(script_path, repo_path): - dependencies = await get_local_script_dependencies(script_path, repo_path) - return (script_path, dependencies) - - -async def scan_repo(path, condition): - futures = [] - for root, dirs, files in os.walk(path): - for file in files: - if condition(file): - futures.append( - get_local_script_dependencies_wrapper( - os.path.abspath(f"{root}/{file}"), path - ) - ) - results = await asyncio.gather(*futures) - - code_files = {} - code_relationships = [] - for abspath, dependencies in results: - code_file, abspath = create_code_file(abspath, "python_file") - code_files[abspath] = code_file - - for dependency in dependencies: - dependency_code_file, dependency_abspath = create_code_file( - dependency, "python_file" - ) - code_files[dependency_abspath] = dependency_code_file - code_relationship = CodeRelationship( - source_id=abspath, - target_id=dependency_abspath, - type="files", - relation="depends_on", - ) - code_relationships.append(code_relationship) - - return (Repository(path=path), list(code_files.values()), code_relationships) - - if __name__ == "__main__": + ''' parser = argparse.ArgumentParser(description="Process a file path") parser.add_argument("path", help="Path to the file") args = parser.parse_args() abspath = os.path.abspath(args.path or ".") + ''' + + abspath = '/Users/laszlohajdu/Documents/Github/RAW_GIT_REPOS/astropy__astropy-12907' tasks = [ - Task(get_repo_dependency_graph), + Task(get_repo_file_dependencies), + Task(add_data_points), Task(enrich_dependency_graph), Task(expand_dependency_graph), - Task(convert_graph_from_code_graph), - Task(summarize_code, summarization_model = SummarizedContent), + Task(add_data_points), + # Task(summarize_code, summarization_model = SummarizedContent), ] pipeline = run_tasks(tasks, abspath, "cognify_code_pipeline") + asyncio.run(print_results(pipeline)) diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 309aea82c..49b41db1c 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -1,32 +1,6 @@ import cognee import asyncio -from cognee.api.v1.search import SearchType - -job_position = """0:Senior Data Scientist (Machine Learning) - -Company: TechNova Solutions -Location: San Francisco, CA - -Job Description: - 
-TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights. - -Responsibilities: - -Develop and implement advanced machine learning algorithms and models. -Analyze large, complex datasets to extract meaningful patterns and insights. -Collaborate with cross-functional teams to integrate predictive models into products. -Stay updated with the latest advancements in machine learning and data science. -Mentor junior data scientists and provide technical guidance. -Qualifications: - -Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field. -5+ years of experience in data science and machine learning. -Proficient in Python, R, and SQL. -Experience with deep learning frameworks (e.g., TensorFlow, PyTorch). -Strong problem-solving skills and attention to detail. -Candidate CVs -""" +from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever job_1 = """ CV 1: Relevant @@ -195,7 +169,7 @@ async def main(enable_steps): # Step 2: Add text if enable_steps.get("add_text"): - text_list = [job_position, job_1, job_2, job_3, job_4, job_5] + text_list = [job_1, job_2, job_3, job_4, job_5] for text in text_list: await cognee.add(text) print(f"Added text: {text[:35]}...") @@ -206,24 +180,21 @@ async def main(enable_steps): print("Knowledge graph created.") # Step 4: Query insights - if enable_steps.get("search_insights"): - search_results = await cognee.search( - SearchType.INSIGHTS, - {'query': 'Which applicant has the most relevant experience in data science?'} - ) - print("Search results:") - for result_text in search_results: - print(result_text) + if enable_steps.get("retriever"): + await two_step_retriever('Who has a PhD?') if __name__ == '__main__': # Flags to enable/disable steps + + rebuild_kg = True + retrieve = True steps_to_enable = { - "prune_data": True, - "prune_system": True, - "add_text": True, - "cognify": True, - "search_insights": True + "prune_data": rebuild_kg, + "prune_system": rebuild_kg, + "add_text": rebuild_kg, + "cognify": rebuild_kg, + "retriever": retrieve } asyncio.run(main(steps_to_enable)) diff --git a/examples/python/multimedia_example.py b/examples/python/multimedia_example.py new file mode 100644 index 000000000..6c8bc5995 --- /dev/null +++ b/examples/python/multimedia_example.py @@ -0,0 +1,48 @@ +import os +import asyncio +import pathlib + +import cognee +from cognee.api.v1.search import SearchType + +# Prerequisites: +# 1. Copy `.env.template` and rename it to `.env`. +# 2.
Add your OpenAI API key to the `.env` file in the `LLM_API_KEY` field: +# LLM_API_KEY = "your_key_here" + + +async def main(): + # Create a clean slate for cognee -- reset data and system state + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + # cognee knowledge graph will be created based on the text + # and description of these files + mp3_file_path = os.path.join( + pathlib.Path(__file__).parent.parent.parent, + ".data/multimedia/text_to_speech.mp3", + ) + png_file_path = os.path.join( + pathlib.Path(__file__).parent.parent.parent, + ".data/multimedia/example.png", + ) + + # Add the files, and make them available for cognify + await cognee.add([mp3_file_path, png_file_path]) + + # Use LLMs and cognee to create knowledge graph + await cognee.cognify() + + # Query cognee for summaries of the data in the multimedia files + search_results = await cognee.search( + SearchType.SUMMARIES, + query_text="What is in the multimedia files?", + ) + + # Display search results + for result_text in search_results: + print(result_text) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/python/simple_example.py b/examples/python/simple_example.py index 47940ca6e..55b07c4c3 100644 --- a/examples/python/simple_example.py +++ b/examples/python/simple_example.py @@ -1,5 +1,4 @@ import asyncio - import cognee from cognee.api.v1.search import SearchType @@ -11,29 +10,57 @@ from cognee.api.v1.search import SearchType async def main(): # Create a clean slate for cognee -- reset data and system state + print("Resetting cognee data...") await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) + print("Data reset complete.\n") # cognee knowledge graph will be created based on this text text = """ Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. """ - + + print("Adding text to cognee:") + print(text.strip()) # Add the text, and make it available for cognify await cognee.add(text) + print("Text added successfully.\n") + + print("Running cognify to create knowledge graph...\n") + print("Cognify process steps:") + print("1. Classifying the document: Determining the type and category of the input text.") + print("2. Checking permissions: Ensuring the user has the necessary rights to process the text.") + print("3. Extracting text chunks: Breaking down the text into sentences or phrases for analysis.") + print("4. Adding data points: Storing the extracted chunks for processing.") + print("5. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph.") + print("6.
Summarizing text: Creating concise summaries of the content for quick insights.\n") + # Use LLMs and cognee to create knowledge graph await cognee.cognify() + print("Cognify process complete.\n") + + query_text = 'Tell me about NLP' + print(f"Searching cognee for insights with query: '{query_text}'") # Query cognee for insights on the added text search_results = await cognee.search( - SearchType.INSIGHTS, query_text='Tell me about NLP' + SearchType.INSIGHTS, query_text=query_text ) - - # Display search results + + print("Search results:") + # Display results for result_text in search_results: print(result_text) + # Example output: + # ({'id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 'natural language processing', 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'is_a_subfield_of', 'source_node_id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'target_node_id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 15, 473137, tzinfo=datetime.timezone.utc)}, {'id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 'computer science', 'description': 'The study of computation and information processing.'}) + # (...) + # It represents nodes and relationships in the knowledge graph: + # - The first element is the source node (e.g., 'natural language processing'). + # - The second element is the relationship between nodes (e.g., 'is_a_subfield_of'). + # - The third element is the target node (e.g., 'computer science'). + if __name__ == '__main__': asyncio.run(main()) diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb index 67bb4e07f..13fcb8cb4 100644 --- a/notebooks/cognee_demo.ipynb +++ b/notebooks/cognee_demo.ipynb @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "df16431d0f48b006", "metadata": { "ExecuteTime": { @@ -304,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9086abf3af077ab4", "metadata": { "ExecuteTime": { @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a9de0cc07f798b7f", "metadata": { "ExecuteTime": { @@ -393,7 +393,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "185ff1c102d06111", "metadata": { "ExecuteTime": { @@ -437,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "d55ce4c58f8efb67", "metadata": { "ExecuteTime": { @@ -479,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "ca4ecc32721ad332", "metadata": { "ExecuteTime": { @@ -529,14 +529,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "bce39dc6", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", - "# # Setting environment variables\n", + "# Setting environment variables\n", "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n", " os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n", "\n", @@ -546,24 +546,26 @@ "if \"LLM_API_KEY\" not in os.environ:\n", " os.environ[\"LLM_API_KEY\"] = \"\"\n", "\n", - "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n", + "# \"neo4j\" or \"networkx\"\n", + 
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n", "# Not needed if using networkx\n", - "#GRAPH_DATABASE_URL=\"\"\n", - "#GRAPH_DATABASE_USERNAME=\"\"\n", - "#GRAPH_DATABASE_PASSWORD=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n", "\n", - "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" # \"qdrant\", \"weaviate\" or \"lancedb\"\n", - "# Not needed if using \"lancedb\"\n", + "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n", + "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n", + "# Not needed if using \"lancedb\" or \"pgvector\"\n", "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n", "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n", "\n", - "# Database provider\n", - "os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n", + "# Relational Database provider \"sqlite\" or \"postgres\"\n", + "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n", "\n", "# Database name\n", "os.environ[\"DB_NAME\"]=\"cognee_db\"\n", "\n", - "# Postgres specific parameters (Only if Postgres is run)\n", + "# Postgres specific parameters (Only if Postgres or PGVector is used)\n", "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n", "# os.environ[\"DB_PORT\"]=\"5432\"\n", "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n", @@ -620,7 +622,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "7c431fdef4921ae0", "metadata": { "ExecuteTime": { diff --git a/notebooks/cognee_llama_index.ipynb b/notebooks/cognee_llama_index.ipynb index 742c2f51c..ec899aaea 100644 --- a/notebooks/cognee_llama_index.ipynb +++ b/notebooks/cognee_llama_index.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -90,23 +90,23 @@ "# \"neo4j\" or \"networkx\"\n", "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n", "# Not needed if using networkx\n", - "#GRAPH_DATABASE_URL=\"\"\n", - "#GRAPH_DATABASE_USERNAME=\"\"\n", - "#GRAPH_DATABASE_PASSWORD=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n", "\n", - "# \"qdrant\", \"weaviate\" or \"lancedb\"\n", + "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n", "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n", - "# Not needed if using \"lancedb\"\n", + "# Not needed if using \"lancedb\" or \"pgvector\"\n", "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n", "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n", "\n", - "# Database provider\n", - "os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n", + "# Relational Database provider \"sqlite\" or \"postgres\"\n", + "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n", "\n", "# Database name\n", "os.environ[\"DB_NAME\"]=\"cognee_db\"\n", "\n", - "# Postgres specific parameters (Only if Postgres is run)\n", + "# Postgres specific parameters (Only if Postgres or PGVector is used)\n", "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n", "# os.environ[\"DB_PORT\"]=\"5432\"\n", "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n", @@ -130,8 +130,6 @@ "\n", "from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables\n", "from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables\n", - "from 
cognee.infrastructure.databases.graph import get_graph_engine\n", - "from cognee.shared.utils import render_graph\n", "from cognee.modules.users.models import User\n", "from cognee.modules.users.methods import get_default_user\n", "from cognee.tasks.ingestion.ingest_data_with_metadata import ingest_data_with_metadata\n", @@ -196,6 +194,9 @@ "source": [ "import graphistry\n", "\n", + "from cognee.infrastructure.databases.graph import get_graph_engine\n", + "from cognee.shared.utils import render_graph\n", + "\n", "# Get graph\n", "graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n", "graph_engine = await get_graph_engine()\n", diff --git a/notebooks/cognee_multimedia_demo.ipynb b/notebooks/cognee_multimedia_demo.ipynb new file mode 100644 index 000000000..2d35132f6 --- /dev/null +++ b/notebooks/cognee_multimedia_demo.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cognee GraphRAG with Multimedia files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "## Load Data\n", + "\n", + "We will use a few sample multimedia files which we have on GitHub for easy access." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "\n", + "# cognee knowledge graph will be created based on the text\n", + "# and description of these files\n", + "mp3_file_path = os.path.join(\n", + " os.path.abspath(''), \"../\",\n", + " \".data/multimedia/text_to_speech.mp3\",\n", + ")\n", + "png_file_path = os.path.join(\n", + " os.path.abspath(''), \"../\",\n", + " \".data/multimedia/example.png\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set environment variables" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Setting environment variables\n", + "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n", + " os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n", + "\n", + "if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n", + " os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n", + "\n", + "if \"LLM_API_KEY\" not in os.environ:\n", + " os.environ[\"LLM_API_KEY\"] = \"\"\n", + "\n", + "# \"neo4j\" or \"networkx\"\n", + "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n", + "# Not needed if using networkx\n", + "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n", + "\n", + "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n", + "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n", + "# Not needed if using \"lancedb\" or \"pgvector\"\n", + "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n", + "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n", + "\n", + "# Relational Database provider \"sqlite\" or \"postgres\"\n", + "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n", + "\n", + "# Database name\n", + "os.environ[\"DB_NAME\"]=\"cognee_db\"\n", + "\n", + "# Postgres specific parameters (Only if Postgres or PGVector is used)\n", + "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n", + "# os.environ[\"DB_PORT\"]=\"5432\"\n", + "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n", + "# os.environ[\"DB_PASSWORD\"]=\"cognee\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Cognee with multimedia files" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cognee\n", + "\n", + "# Create a clean slate for cognee -- reset data and system state\n", + "await cognee.prune.prune_data()\n", + "await cognee.prune.prune_system(metadata=True)\n", + "\n", + "# Add multimedia files and make them available for cognify\n", + "await cognee.add([mp3_file_path, png_file_path])\n", + "\n", + "# Create knowledge graph with cognee\n", + "await cognee.cognify()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query Cognee for summaries related to multimedia files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from cognee.api.v1.search import SearchType\n", + "\n", + "# Query cognee for summaries of the data in the multimedia files\n", + "search_results = await cognee.search(\n", + " SearchType.SUMMARIES,\n", + " query_text=\"What is in the multimedia files?\",\n", + ")\n", + "\n", + "# Display search results\n", + "for result_text in search_results:\n", + " print(result_text)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index 96f9aec27..01f7e44a4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6171,11 +6171,6 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, - {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},