diff --git a/.gitignore b/.gitignore index c1c9f7970..83d66bd38 100644 --- a/.gitignore +++ b/.gitignore @@ -165,5 +165,6 @@ cython_debug/ .vscode/ database/data/ cognee/data/ +cognee/cache/ -.DS_Store +# .DS_Store diff --git a/.test_data/062c22df-d99b-599f-90cd-2d325c8bcf69.txt b/.test_data/062c22df-d99b-599f-90cd-2d325c8bcf69.txt new file mode 100644 index 000000000..a80bf8869 --- /dev/null +++ b/.test_data/062c22df-d99b-599f-90cd-2d325c8bcf69.txt @@ -0,0 +1,6 @@ +A quantum computer is a computer that takes advantage of quantum mechanical phenomena. +At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states. +Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling)[2] than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible. +The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly. +Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate. +In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. 
Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited. diff --git a/README.md b/README.md index 6669bbf0c..71f3b2448 100644 --- a/README.md +++ b/README.md @@ -99,13 +99,13 @@ For more info here are the docs< With pip: ```bash -pip install cognee["weaviate"] +pip install "cognee[weaviate]" ``` With poetry: ```bash -poetry add "cognee["weaviate"]" +poetry add "cognee[weaviate]" ``` ## 💻 Usage diff --git a/cognee/__init__.py b/cognee/__init__.py index 0d90ec84a..5aa772537 100644 --- a/cognee/__init__.py +++ b/cognee/__init__.py @@ -1,3 +1,4 @@ +from .api.v1.config.config import config from .api.v1.add.add import add from .api.v1.cognify.cognify import cognify from .api.v1.list_datasets.list_datasets import list_datasets diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index b91a979a8..b71a47b06 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -5,51 +5,46 @@ import dlt import duckdb from cognee.root_dir import get_absolute_path import cognee.modules.ingestion as ingestion +from cognee.infrastructure import infrastructure_config from cognee.infrastructure.files import get_file_metadata from cognee.infrastructure.files.storage import LocalStorage -async def add(file_paths: Union[str, List[str]], dataset_name: str = None): - if isinstance(file_paths, str): - # Directory path provided, we need to extract the file paths and dataset name - - def list_dir_files(root_dir_path: str, parent_dir: str = "root"): - datasets = {} - - for file_or_dir in listdir(root_dir_path): - if path.isdir(path.join(root_dir_path, file_or_dir)): - dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir - dataset_name = dataset_name.strip().replace(" ", "_") - - nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name) - - for dataset in nested_datasets.keys(): - datasets[dataset] = nested_datasets[dataset] - else: - if parent_dir not in datasets: - datasets[parent_dir] = [] - - datasets[parent_dir].append(path.join(root_dir_path, file_or_dir)) - - return datasets - - datasets = list_dir_files(file_paths) - - results = [] - - for key in datasets.keys(): - if dataset_name is not None and not key.startswith(dataset_name): - continue - - results.append(add(datasets[key], dataset_name = key)) - - return await asyncio.gather(*results) +async def add(data_path: Union[str, List[str]], dataset_name: str = None): + if isinstance(data_path, str): + # data_path is a data directory path + if "data://" in data_path: + return await add_data_directory(data_path.replace("data://", ""), dataset_name) + # data_path is a file path + if "file://" in data_path: + return await add([data_path], dataset_name) + # data_path is a text + else: + return await add_text(data_path, dataset_name) + # data_path is a list of file paths + return await add_files(data_path, dataset_name) +async def add_files(file_paths: List[str], dataset_name: str): + data_directory_path = infrastructure_config.get_config()["data_path"] db_path = get_absolute_path("./data/cognee") db_location = f"{db_path}/cognee.duckdb" LocalStorage.ensure_directory_exists(db_path) + processed_file_paths = [] + + for file_path in file_paths: + file_path = file_path.replace("file://", "") + + if data_directory_path not in file_path: + file_name = file_path.split("/")[-1] + dataset_file_path = data_directory_path + "/" + dataset_name.replace('.', "/") + "/" + file_name + + 
LocalStorage.copy_file(file_path, dataset_file_path) + processed_file_paths.append(dataset_file_path) + else: + processed_file_paths.append(file_path) + db = duckdb.connect(db_location) destination = dlt.destinations.duckdb( @@ -81,10 +76,56 @@ async def add(file_paths: Union[str, List[str]], dataset_name: str = None): } run_info = pipeline.run( - data_resources(file_paths), + data_resources(processed_file_paths), table_name = "file_metadata", - dataset_name = dataset_name, + dataset_name = dataset_name.replace(" ", "_").replace(".", "_") if dataset_name is not None else "main_dataset", write_disposition = "merge", ) return run_info + +def extract_datasets_from_data(root_dir_path: str, parent_dir: str = "root"): + datasets = {} + + root_dir_path = root_dir_path.replace("file://", "") + + for file_or_dir in listdir(root_dir_path): + if path.isdir(path.join(root_dir_path, file_or_dir)): + dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir + + nested_datasets = extract_datasets_from_data("file://" + path.join(root_dir_path, file_or_dir), dataset_name) + + for dataset in nested_datasets.keys(): + datasets[dataset] = nested_datasets[dataset] + else: + if parent_dir not in datasets: + datasets[parent_dir] = [] + + datasets[parent_dir].append(path.join(root_dir_path, file_or_dir)) + + return datasets + +async def add_data_directory(data_path: str, dataset_name: str = None): + datasets = extract_datasets_from_data(data_path) + + results = [] + + for key in datasets.keys(): + if dataset_name is None or key.startswith(dataset_name): + results.append(add(datasets[key], dataset_name = key)) + + return await asyncio.gather(*results) + +async def add_text(text: str, dataset_name: str): + data_directory_path = infrastructure_config.get_config()["data_path"] + + classified_data = ingestion.classify(text) + data_id = ingestion.identify(classified_data) + + storage_path = data_directory_path + "/" + dataset_name.replace(".", "/") + LocalStorage.ensure_directory_exists(storage_path) + + text_file_name = str(data_id) + ".txt" + LocalStorage(storage_path).store(text_file_name, classified_data.get_data()) + + return await add(["file://" + storage_path + "/" + text_file_name], dataset_name) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index f6b2d9534..98e617a8c 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -3,7 +3,6 @@ import asyncio from typing import List, Union import instructor from openai import OpenAI -from pypdf import PdfReader from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes from cognee.modules.cognify.llm.label_content import label_content from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes @@ -26,6 +25,8 @@ from cognee.shared.data_models import GraphDBType from cognee.infrastructure.databases.relational import DuckDBAdapter from cognee.modules.cognify.graph.add_document_node import add_document_node from cognee.modules.cognify.graph.initialize_graph import initialize_graph +from cognee.infrastructure.files.utils.guess_file_type import guess_file_type +from cognee.infrastructure.files.utils.extract_text_from_file import extract_text_from_file from cognee.infrastructure import infrastructure_config config = Config() @@ -35,7 +36,7 @@ aclient = instructor.patch(OpenAI()) USER_ID = "default_user" -async def cognify(datasets: Union[str, List[str]] = None, graphdatamodel: object = None): +async def cognify(datasets: Union[str, 
List[str]] = None, graph_data_model: object = None): """This function is responsible for the cognitive processing of the content.""" db = DuckDBAdapter() @@ -45,6 +46,7 @@ async def cognify(datasets: Union[str, List[str]] = None, graphdatamodel: object awaitables = [] + # datasets is a list of dataset names if isinstance(datasets, list): for dataset in datasets: awaitables.append(cognify(dataset)) @@ -52,17 +54,24 @@ async def cognify(datasets: Union[str, List[str]] = None, graphdatamodel: object graphs = await asyncio.gather(*awaitables) return graphs[0] - files_metadata = db.get_files_metadata(datasets) + # datasets is a dataset name string + added_datasets = db.get_datasets() + + files_metadata = [] + dataset_name = datasets.replace(".", "_").replace(" ", "_") + + for added_dataset in added_datasets: + if dataset_name in added_dataset: + files_metadata.extend(db.get_files_metadata(added_dataset)) awaitables = [] - await initialize_graph(USER_ID,graphdatamodel) + await initialize_graph(USER_ID, graph_data_model) for file_metadata in files_metadata: with open(file_metadata["file_path"], "rb") as file: - reader = PdfReader(stream = file) - pages = list(reader.pages[:3]) - text = "\n".join([page.extract_text().strip() for page in pages]) + file_type = guess_file_type(file) + text = extract_text_from_file(file, file_type) awaitables.append(process_text(text, file_metadata)) diff --git a/cognee/api/v1/config/__init__.py b/cognee/api/v1/config/__init__.py new file mode 100644 index 000000000..8dcce2096 --- /dev/null +++ b/cognee/api/v1/config/__init__.py @@ -0,0 +1 @@ +from .config import config diff --git a/cognee/api/v1/config/config.py b/cognee/api/v1/config/config.py new file mode 100644 index 000000000..333257687 --- /dev/null +++ b/cognee/api/v1/config/config.py @@ -0,0 +1,9 @@ +from typing import Optional +from cognee.infrastructure import infrastructure_config + +class config(): + @staticmethod + def data_path(data_path: Optional[str] = None) -> str: + infrastructure_config.set_config({ + "data_path": data_path + }) diff --git a/cognee/config.py b/cognee/config.py index bb97d4372..90db9e49e 100644 --- a/cognee/config.py +++ b/cognee/config.py @@ -19,14 +19,16 @@ load_dotenv(dotenv_path=dotenv_path) class Config: """ Configuration for cognee - cognitive architecture framework. 
""" cognee_dir: str = field( - default_factory=lambda: os.getenv("COG_ARCH_DIR", "cognitive_achitecture") + default_factory=lambda: os.getenv("COG_ARCH_DIR", "cognee") ) config_path: str = field( default_factory=lambda: os.path.join( - os.getenv("COG_ARCH_DIR", "cognitive_achitecture"), "config" + os.getenv("COG_ARCH_DIR", "cognee"), "config" ) ) + data_path = os.getenv("DATA_PATH", str(Path(__file__).resolve().parent.parent / ".data")) + db_path = str(Path(__file__).resolve().parent / "data/system") vectordb: str = os.getenv("VECTORDB", "weaviate") @@ -59,7 +61,7 @@ class Config: graphistry_password = os.getenv("GRAPHISTRY_PASSWORD") # Embedding parameters - embedding_model: str = "openai" + embedding_model: str = "BAAI/bge-large-en-v1.5" embedding_dim: int = 1536 embedding_chunk_size: int = 300 diff --git a/cognee/infrastructure/InfrastructureConfig.py b/cognee/infrastructure/InfrastructureConfig.py index 22a9a7d6b..abd7f02a1 100644 --- a/cognee/infrastructure/InfrastructureConfig.py +++ b/cognee/infrastructure/InfrastructureConfig.py @@ -1,14 +1,16 @@ from cognee.config import Config from .databases.relational import SqliteEngine, DatabaseEngine -from .databases.vector import WeaviateAdapter, VectorDBInterface +from .databases.vector.weaviate_db import WeaviateAdapter +from .databases.vector.vector_db_interface import VectorDBInterface +from .databases.vector.embeddings.DefaultEmbeddingEngine import DefaultEmbeddingEngine from .llm.llm_interface import LLMInterface from .llm.openai.adapter import OpenAIAdapter -from .databases.vector import WeaviateAdapter, VectorDBInterface, DefaultEmbeddingEngine config = Config() config.load() class InfrastructureConfig(): + data_path: str = config.data_path database_engine: DatabaseEngine = None vector_engine: VectorDBInterface = None llm_engine: LLMInterface = None @@ -28,14 +30,23 @@ class InfrastructureConfig(): ) return { - "database_engine": self.database_engine, + "data_path": self.data_path, + "llm_engine": self.llm_engine, "vector_engine": self.vector_engine, - "llm_engine": self.llm_engine + "database_engine": self.database_engine, } def set_config(self, new_config: dict): - self.database_engine = new_config["database_engine"] - self.vector_engine = new_config["vector_engine"] - self.llm_engine = new_config["llm_engine"] + if "data_path" in new_config: + self.data_path = new_config["data_path"] + + if "database_engine" in new_config: + self.database_engine = new_config["database_engine"] + + if "vector_engine" in new_config: + self.vector_engine = new_config["vector_engine"] + + if "llm_engine" in new_config: + self.llm_engine = new_config["llm_engine"] infrastructure_config = InfrastructureConfig() diff --git a/cognee/infrastructure/data/__init__.py b/cognee/infrastructure/data/__init__.py index 86683ba77..6aaaf6b41 100644 --- a/cognee/infrastructure/data/__init__.py +++ b/cognee/infrastructure/data/__init__.py @@ -1,4 +1,3 @@ from .models.Data import Data from .models.Dataset import Dataset from .models.DatasetData import DatasetData -from .add_data_to_dataset import add_data_to_dataset diff --git a/cognee/infrastructure/files/utils/extract_keywords.py b/cognee/infrastructure/data/utils/extract_keywords.py similarity index 85% rename from cognee/infrastructure/files/utils/extract_keywords.py rename to cognee/infrastructure/data/utils/extract_keywords.py index 3aa633a96..c69f590e1 100644 --- a/cognee/infrastructure/files/utils/extract_keywords.py +++ b/cognee/infrastructure/data/utils/extract_keywords.py @@ -2,6 +2,9 @@ import nltk from 
sklearn.feature_extraction.text import TfidfVectorizer def extract_keywords(text: str) -> list[str]: + if len(text) == 0: + raise ValueError("extract_keywords cannot extract keywords from empty text.") + tokens = nltk.word_tokenize(text) tags = nltk.pos_tag(tokens) diff --git a/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py index 3192cb5ef..4dd5ad4c0 100644 --- a/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py @@ -1,13 +1,14 @@ from typing import List from fastembed import TextEmbedding +from cognee.config import Config +from cognee.root_dir import get_absolute_path from .EmbeddingEngine import EmbeddingEngine -from cognitive_architecture.config import Config config = Config() config.load() class DefaultEmbeddingEngine(EmbeddingEngine): async def embed_text(self, text: List[str]) -> List[float]: - embedding_model = TextEmbedding(model_name = config.embedding_model) + embedding_model = TextEmbedding(model_name = config.embedding_model, cache_dir = get_absolute_path("cache/embeddings")) embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text))) return embeddings_list diff --git a/cognee/infrastructure/files/storage/LocalStorage.py b/cognee/infrastructure/files/storage/LocalStorage.py index ddabc6f49..766e82e77 100644 --- a/cognee/infrastructure/files/storage/LocalStorage.py +++ b/cognee/infrastructure/files/storage/LocalStorage.py @@ -1,5 +1,6 @@ import os -from typing import BinaryIO +import shutil +from typing import BinaryIO, Union from .StorageManager import Storage class LocalStorage(Storage): @@ -8,13 +9,17 @@ class LocalStorage(Storage): def __init__(self, storage_path: str): self.storage_path = storage_path - def store(self, file_path: str, data: BinaryIO): + def store(self, file_path: str, data: Union[BinaryIO, str]): full_file_path = self.storage_path + "/" + file_path LocalStorage.ensure_directory_exists(self.storage_path) - with open(full_file_path, "wb") as f: - f.write(data.read()) + with open( + full_file_path, + mode = "w" if isinstance(data, str) else "wb", + encoding = "utf-8" if isinstance(data, str) else None + ) as f: + f.write(data if isinstance(data, str) else data.read()) def retrieve(self, file_path: str): full_file_path = self.storage_path + "/" + file_path @@ -30,8 +35,6 @@ class LocalStorage(Storage): def remove(self, file_path: str): os.remove(self.storage_path + "/" + file_path) - # def get_directory(self, file_path: str): - # [path, __] = file_path.split(".") - # directory = "/".join(path.split("/")[:-1]) - - # return directory if directory != "" else None + @staticmethod + def copy_file(source_file_path: str, destination_file_path: str): + return shutil.copy2(source_file_path, destination_file_path) diff --git a/cognee/infrastructure/files/utils/extract_text_from_file.py b/cognee/infrastructure/files/utils/extract_text_from_file.py new file mode 100644 index 000000000..2e413ab6a --- /dev/null +++ b/cognee/infrastructure/files/utils/extract_text_from_file.py @@ -0,0 +1,11 @@ +from typing import BinaryIO +from pypdf import PdfReader + +def extract_text_from_file(file: BinaryIO, file_type) -> str: + if file_type.extension == "pdf": + reader = PdfReader(stream = file) + pages = list(reader.pages[:3]) + return "\n".join([page.extract_text().strip() for page in pages]) + + if file_type.extension == "txt": + return 
file.read().decode("utf-8") diff --git a/cognee/infrastructure/files/utils/get_file_metadata.py b/cognee/infrastructure/files/utils/get_file_metadata.py index 9fda433fa..2b9d6ed0a 100644 --- a/cognee/infrastructure/files/utils/get_file_metadata.py +++ b/cognee/infrastructure/files/utils/get_file_metadata.py @@ -1,13 +1,7 @@ from typing import BinaryIO, TypedDict -import filetype -from pypdf import PdfReader -from .extract_keywords import extract_keywords - -class FileTypeException(Exception): - message: str - - def __init__(self, message: str): - self.message = message +from cognee.infrastructure.data.utils.extract_keywords import extract_keywords +from .extract_text_from_file import extract_text_from_file +from .guess_file_type import guess_file_type class FileMetadata(TypedDict): @@ -17,18 +11,12 @@ class FileMetadata(TypedDict): keywords: list[str] def get_file_metadata(file: BinaryIO) -> FileMetadata: - file_type = filetype.guess(file) + file.seek(0) + file_type = guess_file_type(file) - if file_type is None: - raise FileTypeException("Unknown file detected.") - - keywords: list = [] - - if file_type.extension == "pdf": - reader = PdfReader(stream = file) - pages = list(reader.pages[:3]) - text = "\n".join([page.extract_text().strip() for page in pages]) - keywords = extract_keywords(text) + file.seek(0) + file_text = extract_text_from_file(file, file_type) + keywords = extract_keywords(file_text) file_path = file.name file_name = file_path.split("/")[-1].split(".")[0] diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py new file mode 100644 index 000000000..923833bdc --- /dev/null +++ b/cognee/infrastructure/files/utils/guess_file_type.py @@ -0,0 +1,31 @@ +from typing import BinaryIO +import filetype +from .is_text_content import is_text_content + +class FileTypeException(Exception): + message: str + + def __init__(self, message: str): + self.message = message + +class TxtFileType(filetype.Type): + MIME = "text/plain" + EXTENSION = "txt" + + def __init__(self): + super(TxtFileType, self).__init__(mime = TxtFileType.MIME, extension = TxtFileType.EXTENSION) + + def match(self, buf): + return is_text_content(buf) + +txt_file_type = TxtFileType() + +filetype.add_type(txt_file_type) + +def guess_file_type(file: BinaryIO) -> filetype.Type: + file_type = filetype.guess(file) + + if file_type is None: + raise FileTypeException("Unknown file detected.") + + return file_type diff --git a/cognee/infrastructure/files/utils/is_text_content.py b/cognee/infrastructure/files/utils/is_text_content.py new file mode 100644 index 000000000..b30327f0f --- /dev/null +++ b/cognee/infrastructure/files/utils/is_text_content.py @@ -0,0 +1,24 @@ +def is_text_content(content): + # Check for null bytes + if b'\0' in content: + return False + + # Check for common text encodings (BOMs) + if content.startswith((b'\xEF\xBB\xBF', # UTF-8 + b'\xFF\xFE', # UTF-16 LE + b'\xFE\xFF', # UTF-16 BE + b'\x00\x00\xFE\xFF', # UTF-32 LE + b'\xFF\xFE\x00\x00', # UTF-32 BE + )): + return True + + # Check for ASCII characters + if all(0x20 <= byte <= 0x7E or byte in (b'\n', b'\r', b'\t') for byte in content): + return True + + # Check for common line break characters + if b'\n' in content or b'\r' in content: + return True + + # If no obvious indicators found, assume it's a text file + return True diff --git a/cognee/modules/cognify/graph/add_node_connections.py b/cognee/modules/cognify/graph/add_node_connections.py index 9f7f47f41..ae6a0e23b 100644 --- 
a/cognee/modules/cognify/graph/add_node_connections.py +++ b/cognee/modules/cognify/graph/add_node_connections.py @@ -124,7 +124,7 @@ if __name__ == "__main__": connect_nodes_in_graph(graph, relationships) - from cognitive_architecture.utils import render_graph + from cognee.utils import render_graph graph_url = await render_graph(graph, graph_type="networkx") diff --git a/cognee/modules/cognify/llm/resolve_cross_graph_references.py b/cognee/modules/cognify/llm/resolve_cross_graph_references.py index 55ede0e92..653b51bc5 100644 --- a/cognee/modules/cognify/llm/resolve_cross_graph_references.py +++ b/cognee/modules/cognify/llm/resolve_cross_graph_references.py @@ -1,5 +1,4 @@ from typing import Dict, List -from cognee.infrastructure.databases.vector import get_vector_database from cognee.infrastructure import infrastructure_config async def resolve_cross_graph_references(nodes_by_layer: Dict): diff --git a/cognee/modules/ingestion/__init__.py b/cognee/modules/ingestion/__init__.py index 2846a949a..6b0049b52 100644 --- a/cognee/modules/ingestion/__init__.py +++ b/cognee/modules/ingestion/__init__.py @@ -1,3 +1,2 @@ from .classify import classify from .identify import identify -from .save import save diff --git a/cognee/infrastructure/data/add_data_to_dataset.py b/cognee/modules/ingestion/add_data_to_dataset.py similarity index 96% rename from cognee/infrastructure/data/add_data_to_dataset.py rename to cognee/modules/ingestion/add_data_to_dataset.py index 1aa5d754a..9dbb5e511 100644 --- a/cognee/infrastructure/data/add_data_to_dataset.py +++ b/cognee/modules/ingestion/add_data_to_dataset.py @@ -1,8 +1,8 @@ import logging -from . import Dataset, Data from cognee.infrastructure import infrastructure_config -from cognee.infrastructure.databases.relational import DatabaseEngine +from cognee.infrastructure.data import Dataset, Data from cognee.infrastructure.files import remove_file_from_storage +from cognee.infrastructure.databases.relational import DatabaseEngine logger = logging.getLogger(__name__) diff --git a/cognee/modules/ingestion/classify.py b/cognee/modules/ingestion/classify.py index 0eb807f72..b13db649b 100644 --- a/cognee/modules/ingestion/classify.py +++ b/cognee/modules/ingestion/classify.py @@ -10,4 +10,4 @@ def classify(data: Union[str, BinaryIO]): if isinstance(data, BufferedReader): return create_binary_data(data) - raise IngestionException(f"Data sent to cognee.classify(data: any) not supported: {type(data)}") + raise IngestionException(f"Type of data sent to cognee.add(data_path: string | List[string]) not supported: {type(data)}") diff --git a/cognee/modules/ingestion/data_types/BinaryData.py b/cognee/modules/ingestion/data_types/BinaryData.py index bd949ae5b..82afb6dd1 100644 --- a/cognee/modules/ingestion/data_types/BinaryData.py +++ b/cognee/modules/ingestion/data_types/BinaryData.py @@ -17,11 +17,6 @@ class BinaryData(IngestionData): return self.metadata["mime_type"] + "_" + "|".join(self.metadata["keywords"]) - def get_extension(self): - self.ensure_metadata() - - return self.metadata["extension"] - def ensure_metadata(self): if self.metadata is None: self.metadata = get_file_metadata(self.data) diff --git a/cognee/modules/ingestion/data_types/IngestionData.py b/cognee/modules/ingestion/data_types/IngestionData.py index ee9514bc8..147dbda4e 100644 --- a/cognee/modules/ingestion/data_types/IngestionData.py +++ b/cognee/modules/ingestion/data_types/IngestionData.py @@ -2,10 +2,9 @@ from typing import Protocol, BinaryIO class IngestionData(Protocol): data: str | BinaryIO = 
None - metadata: dict = None def get_data(self): - pass + raise NotImplementedError() - def get_extension(self): - pass + def get_identifier(self): + raise NotImplementedError() diff --git a/cognee/modules/ingestion/data_types/TextData.py b/cognee/modules/ingestion/data_types/TextData.py index 9ec302047..8b152e986 100644 --- a/cognee/modules/ingestion/data_types/TextData.py +++ b/cognee/modules/ingestion/data_types/TextData.py @@ -1,3 +1,5 @@ +from typing import BinaryIO +from cognee.infrastructure.data.utils.extract_keywords import extract_keywords from .IngestionData import IngestionData def create_text_data(data: str): @@ -6,11 +8,13 @@ def create_text_data(data: str): class TextData(IngestionData): data: str = None - def __init__(self, data: str): + def __init__(self, data: BinaryIO): self.data = data + def get_identifier(self): + keywords = extract_keywords(self.data) + + return "text/plain" + "_" + "|".join(keywords) + def get_data(self): return self.data - - def get_chunks(self): - pass diff --git a/cognee/modules/ingestion/save.py b/cognee/modules/ingestion/save.py index 5ba47e0e5..ba5b465a1 100644 --- a/cognee/modules/ingestion/save.py +++ b/cognee/modules/ingestion/save.py @@ -1,7 +1,7 @@ import asyncio from uuid import UUID, uuid4 -from cognee.infrastructure.files import add_file_to_storage -from cognee.infrastructure.data import add_data_to_dataset, Data, Dataset +from cognee.infrastructure.data import Data, Dataset +from .add_data_to_dataset import add_data_to_dataset from .data_types import IngestionData async def save(dataset_id: UUID, dataset_name: str, data_id: UUID, data: IngestionData): @@ -9,8 +9,6 @@ async def save(dataset_id: UUID, dataset_name: str, data_id: UUID, data: Ingesti promises = [] - # promises.append(add_file_to_storage(file_path, data.get_data())) - promises.append( add_data_to_dataset( Dataset( diff --git a/notebooks/full_run.ipynb b/notebooks/full_run.ipynb index 4541db14a..4b783f2f0 100644 --- a/notebooks/full_run.ipynb +++ b/notebooks/full_run.ipynb @@ -7,19 +7,74 @@ "metadata": {}, "outputs": [], "source": [ - "from cognee import add, cognify, search\n", + "from cognee import config, add, cognify, search\n", "from cognee.utils import render_graph\n", "from os import listdir, path\n", "\n", - "data_path = path.abspath(\"../.data\")\n", + "data_directory_path = path.abspath(\"../.data\")\n", "\n", - "print(data_path)\n", + "print(data_directory_path)\n", "\n", - "await add(path.abspath(\"../.data\"), \"izmene\")\n", + "config.data_path(data_directory_path)\n", "\n", - "graph = await cognify(\"izmene\")\n", + "# dataset_name = \"pravilnik.energetska efikasnost.sertifikati\"\n", + "# await add(\"file://\" + path.abspath(\"../.test_data/062c22df-d99b-599f-90cd-2d325c8bcf69.txt\"), dataset_name)\n", "\n", - "await render_graph(graph, graph_type=\"networkx\")" + "\n", + "dataset_name = \"izmene\"\n", + "await add(\"data://\" + path.abspath(\"../.data\"), dataset_name)\n", + "\n", + "# test_text = \"\"\"A quantum computer is a computer that takes advantage of quantum mechanical phenomena.\n", + "# At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.\n", + "# Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size 
scaling)[2] than any modern \"classical\" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.\n", + "# The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two \"basis\" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.\n", + "# Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.\n", + "# In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. 
Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.\n", + "# \"\"\"\n", + "\n", + "# dataset_name = \"pravilnik.energetska efikasnost\"\n", + "# await add(test_text, dataset_name)\n", + "\n", + "\n", + "graph = await cognify(dataset_name)\n", + "\n", + "await render_graph(graph, graph_type = \"networkx\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44603a2a", + "metadata": {}, + "outputs": [], + "source": [ + "from cognee import list_datasets\n", + "\n", + "print(list_datasets())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66ad66ca", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "from cognee.root_dir import get_absolute_path\n", + "\n", + "db_path = get_absolute_path(\"./data/cognee\")\n", + "db_location = db_path + \"/cognee.duckdb\"\n", + "print(db_location)\n", + "\n", + "db = duckdb.connect(db_location)\n", + "\n", + "izmene = db.sql(f\"SELECT * FROM pravilnik_energetska_efikasnost_sertifikati.file_metadata;\")\n", + "print(izmene)\n", + "\n", + "# pravilnik = db.sql(f\"SELECT * FROM pravilnik.file_metadata;\")\n", + "\n", + "# print(pravilnik)" ] } ],