diff --git a/.env.template b/.env.template
index 77c845d4e..ff591c0a7 100644
--- a/.env.template
+++ b/.env.template
@@ -8,14 +8,14 @@ GRAPHISTRY_PASSWORD=
SENTRY_REPORTING_URL=
# "neo4j" or "networkx"
-GRAPH_DATABASE_PROVIDER="neo4j"
+GRAPH_DATABASE_PROVIDER="networkx"
# Not needed if using networkx
GRAPH_DATABASE_URL=
GRAPH_DATABASE_USERNAME=
GRAPH_DATABASE_PASSWORD=
# "qdrant", "pgvector", "weaviate" or "lancedb"
-VECTOR_DB_PROVIDER="qdrant"
+VECTOR_DB_PROVIDER="lancedb"
# Not needed if using "lancedb" or "pgvector"
VECTOR_DB_URL=
VECTOR_DB_KEY=
diff --git a/.github/workflows/test_cognee_llama_index_notebook.yml b/.github/workflows/test_cognee_llama_index_notebook.yml
index c46d0de0d..860eec92b 100644
--- a/.github/workflows/test_cognee_llama_index_notebook.yml
+++ b/.github/workflows/test_cognee_llama_index_notebook.yml
@@ -46,7 +46,7 @@ jobs:
- name: Install dependencies
run: |
- poetry install --no-interaction --all-extras --no-root
+ poetry install --no-interaction --all-extras
poetry add jupyter --no-interaction
- name: Execute Jupyter Notebook
diff --git a/README.md b/README.md
index 82c3730dc..41d0ac7cd 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,13 @@
[](https://GitHub.com/topoteretes/cognee/commit/)
[](https://github.com/topoteretes/cognee/tags/)
[](https://pepy.tech/project/cognee)
-[](https://github.com/topoteretes/cognee/blob/master/LICENSE)
+
We build for developers who need a reliable, production-ready data layer for AI applications
-## What is cognee?
+## What is cognee?
Cognee implements scalable, modular ECL (Extract, Cognify, Load) pipelines that allow you to interconnect and retrieve past conversations, documents, and audio transcriptions while reducing hallucinations, developer effort, and cost.
Try it in a Google Colab notebook or have a look at our documentation
@@ -18,9 +18,6 @@ Try it in a Google Colab Discord community
-
-
-
## 📦 Installation
### With pip
@@ -47,6 +44,7 @@ poetry add cognee
poetry add cognee -E postgres
```
+
## 💻 Basic Usage
### Setup
@@ -83,7 +81,7 @@ docker-compose up
```
Then navigate to localhost:3000
-If you want to use the UI with PostgreSQL through docker-compose make sure to set the following values in the .env file:
+If you want to use Cognee with PostgreSQL, make sure to set the following values in the .env file:
```
DB_PROVIDER=postgres
@@ -97,9 +95,7 @@ DB_PASSWORD=cognee
### Simple example
-First, copy `.env.template` to `.env` and add your OpenAI API key to the LLM_API_KEY field.
-
-Optionally, set `VECTOR_DB_PROVIDER="lancedb"` in `.env` to simplify setup.
+First, copy `.env.template` to `.env` and add your OpenAI API key to the LLM_API_KEY field.
This script will run the default pipeline:
@@ -140,7 +136,7 @@ async def main():
asyncio.run(main())
```
-A version of this example is here: `examples/pyton/simple_example.py`
+A version of this example is here: `examples/python/simple_example.py`
### Create your own memory store
@@ -251,7 +247,6 @@ Cognee supports a variety of tools and services for different operations:
Check out our demo notebook [here](https://github.com/topoteretes/cognee/blob/main/notebooks/cognee_demo.ipynb)
-
[
](https://www.youtube.com/watch?v=BDFt4xVPmro "Learn about cognee: 55")
@@ -274,11 +269,6 @@ Please see the cognee [Development Guide](https://topoteretes.github.io/cognee/q
pip install cognee
```
-## Star History
-
-[](https://star-history.com/#topoteretes/cognee&Date)
-
-
## 💫 Contributors
@@ -286,3 +276,25 @@ pip install cognee
+## Star History
+
+[](https://star-history.com/#topoteretes/cognee&Date)
+
+
+## Vector & Graph Databases Implementation State
+
+
+
+| Name | Type | Current state | Known Issues |
+|------------------|--------------------|-------------------|---------------------------------------|
+| Qdrant | Vector | Stable ✅ | |
+| Weaviate | Vector | Stable ✅ | |
+| LanceDB | Vector | Stable ✅ | |
+| Neo4j | Graph | Stable ✅ | |
+| NetworkX | Graph | Stable ✅ | |
+| FalkorDB | Vector/Graph | Unstable ❌ | |
+| PGVector | Vector | Unstable ❌ | Postgres DB returns a timeout error |
diff --git a/cognee-frontend/src/ui/Partials/SettingsModal/Settings.tsx b/cognee-frontend/src/ui/Partials/SettingsModal/Settings.tsx
index 9d0744323..08574bd6b 100644
--- a/cognee-frontend/src/ui/Partials/SettingsModal/Settings.tsx
+++ b/cognee-frontend/src/ui/Partials/SettingsModal/Settings.tsx
@@ -30,8 +30,8 @@ const defaultProvider = {
};
const defaultModel = {
- label: 'gpt-4o',
- value: 'gpt-4o',
+ label: 'gpt-4o-mini',
+ value: 'gpt-4o-mini',
};
export default function Settings({ onDone = () => {}, submitButtonText = 'Save' }) {
diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py
index 222b11ad7..337306cb6 100644
--- a/cognee/infrastructure/engine/models/DataPoint.py
+++ b/cognee/infrastructure/engine/models/DataPoint.py
@@ -20,5 +20,8 @@ class DataPoint(BaseModel):
def get_embeddable_data(self):
if self._metadata and len(self._metadata["index_fields"]) > 0 \
and hasattr(self, self._metadata["index_fields"][0]):
-
- return getattr(self, self._metadata["index_fields"][0])
+            attribute = getattr(self, self._metadata["index_fields"][0])
+            # Strip surrounding whitespace from string attributes before they are embedded.
+            if isinstance(attribute, str):
+                return attribute.strip()
+            else:
+                return attribute
diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py
index a30fa75c7..37541adf2 100644
--- a/cognee/infrastructure/llm/config.py
+++ b/cognee/infrastructure/llm/config.py
@@ -4,7 +4,7 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
class LLMConfig(BaseSettings):
llm_provider: str = "openai"
- llm_model: str = "gpt-4o"
+ llm_model: str = "gpt-4o-mini"
llm_endpoint: str = ""
llm_api_key: Optional[str] = None
llm_temperature: float = 0.0
diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py
index f9a222904..f0a72b58a 100644
--- a/cognee/modules/chunking/TextChunker.py
+++ b/cognee/modules/chunking/TextChunker.py
@@ -9,7 +9,6 @@ class TextChunker():
chunk_index = 0
chunk_size = 0
- paragraph_chunks = []
def __init__(self, document, get_text: callable, chunk_size: int = 1024):
self.document = document
@@ -17,7 +16,7 @@ class TextChunker():
self.get_text = get_text
def read(self):
- self.paragraph_chunks = []
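+        # Keep the paragraph buffer local to each read() call instead of on the instance.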
+ paragraph_chunks = []
for content_text in self.get_text():
for chunk_data in chunk_by_paragraph(
content_text,
@@ -25,10 +24,10 @@ class TextChunker():
batch_paragraphs = True,
):
if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
- self.paragraph_chunks.append(chunk_data)
+ paragraph_chunks.append(chunk_data)
self.chunk_size += chunk_data["word_count"]
else:
- if len(self.paragraph_chunks) == 0:
+ if len(paragraph_chunks) == 0:
yield DocumentChunk(
id = chunk_data["chunk_id"],
text = chunk_data["text"],
@@ -37,10 +36,10 @@ class TextChunker():
chunk_index = self.chunk_index,
cut_type = chunk_data["cut_type"],
)
- self.paragraph_chunks = []
+ paragraph_chunks = []
self.chunk_size = 0
else:
- chunk_text = " ".join(chunk["text"] for chunk in self.paragraph_chunks)
+ chunk_text = " ".join(chunk["text"] for chunk in paragraph_chunks)
try:
yield DocumentChunk(
id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
@@ -48,24 +47,24 @@ class TextChunker():
word_count = self.chunk_size,
is_part_of = self.document,
chunk_index = self.chunk_index,
- cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"],
+ cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
)
except Exception as e:
print(e)
- self.paragraph_chunks = [chunk_data]
+ paragraph_chunks = [chunk_data]
self.chunk_size = chunk_data["word_count"]
self.chunk_index += 1
- if len(self.paragraph_chunks) > 0:
+ if len(paragraph_chunks) > 0:
try:
yield DocumentChunk(
id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
- text = " ".join(chunk["text"] for chunk in self.paragraph_chunks),
+ text = " ".join(chunk["text"] for chunk in paragraph_chunks),
word_count = self.chunk_size,
is_part_of = self.document,
chunk_index = self.chunk_index,
- cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"],
+ cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
)
except Exception as e:
print(e)
diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index d3ae0974d..0d2cddd3d 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -5,11 +5,15 @@ from .Document import Document
class AudioDocument(Document):
type: str = "audio"
+ def create_transcript(self):
+ result = get_llm_client().create_transcript(self.raw_data_location)
+        return result.text
+
def read(self, chunk_size: int):
# Transcribe the audio file
- result = get_llm_client().create_transcript(self.raw_data_location)
- text = result.text
+
+ text = self.create_transcript()
- chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text)
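+        # TextChunker iterates over get_text, so pass the transcript as a one-element list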
+ chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])
yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index 5571b3bd8..e8f0dd8ee 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -5,11 +5,15 @@ from .Document import Document
class ImageDocument(Document):
type: str = "image"
+
+ def transcribe_image(self):
+ result = get_llm_client().transcribe_image(self.raw_data_location)
+        return result.choices[0].message.content
+
def read(self, chunk_size: int):
# Transcribe the image file
- result = get_llm_client().transcribe_image(self.raw_data_location)
- text = result.choices[0].message.content
+ text = self.transcribe_image()
- chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text)
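+        # TextChunker iterates over get_text, so pass the transcription as a one-element list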
+ chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])
yield from chunker.read()
diff --git a/cognee/modules/settings/get_settings.py b/cognee/modules/settings/get_settings.py
index 95f2f5924..b67b9d6ab 100644
--- a/cognee/modules/settings/get_settings.py
+++ b/cognee/modules/settings/get_settings.py
@@ -73,6 +73,9 @@ def get_settings() -> SettingsDict:
"providers": llm_providers,
"models": {
"openai": [{
+ "value": "gpt-4o-mini",
+ "label": "gpt-4o-mini",
+ }, {
"value": "gpt-4o",
"label": "gpt-4o",
}, {
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index eae5f812f..00bb5670c 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -1,69 +1,72 @@
from uuid import uuid5, NAMESPACE_OID
+from typing import Dict, Any, Iterator
from .chunk_by_sentence import chunk_by_sentence
-def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs = True):
- paragraph = ""
+def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]:
+ """
+ Chunks text by paragraph while preserving exact text reconstruction capability.
+ When chunks are joined with empty string "", they reproduce the original text exactly.
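+    Example: "".join(chunk["text"] for chunk in chunk_by_paragraph(data)) == data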
+ """
+ current_chunk = ""
+ current_word_count = 0
+ chunk_index = 0
+ paragraph_ids = []
last_cut_type = None
- last_paragraph_id = None
- paragraph_word_count = 0
- paragraph_chunk_index = 0
-
- for (paragraph_id, __, sentence, word_count, end_type) in chunk_by_sentence(data):
- if paragraph_word_count > 0 and paragraph_word_count + word_count > paragraph_length:
- if batch_paragraphs is True:
- chunk_id = uuid5(NAMESPACE_OID, paragraph)
- yield dict(
- text = paragraph.strip(),
- word_count = paragraph_word_count,
- id = chunk_id, # When batching paragraphs, the paragraph_id is the same as chunk_id.
- # paragraph_id doens't mean anything since multiple paragraphs are merged.
- chunk_id = chunk_id,
- chunk_index = paragraph_chunk_index,
- cut_type = last_cut_type,
- )
- else:
- yield dict(
- text = paragraph.strip(),
- word_count = paragraph_word_count,
- id = last_paragraph_id,
- chunk_id = uuid5(NAMESPACE_OID, paragraph),
- chunk_index = paragraph_chunk_index,
- cut_type = last_cut_type,
- )
-
- paragraph_chunk_index += 1
- paragraph_word_count = 0
- paragraph = ""
-
- paragraph += (" " if len(paragraph) > 0 else "") + sentence
- paragraph_word_count += word_count
-
- if end_type == "paragraph_end" or end_type == "sentence_cut":
- if batch_paragraphs is True:
- paragraph += "\n\n" if end_type == "paragraph_end" else ""
- else:
- yield dict(
- text = paragraph.strip(),
- word_count = paragraph_word_count,
- paragraph_id = paragraph_id,
- chunk_id = uuid5(NAMESPACE_OID, paragraph),
- chunk_index = paragraph_chunk_index,
- cut_type = end_type,
- )
-
- paragraph_chunk_index = 0
- paragraph_word_count = 0
- paragraph = ""
+
+ for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
+ # Check if this sentence would exceed length limit
+ if current_word_count > 0 and current_word_count + word_count > paragraph_length:
+ # Yield current chunk
+ chunk_dict = {
+ "text": current_chunk,
+ "word_count": current_word_count,
+ "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+ "paragraph_ids": paragraph_ids,
+ "chunk_index": chunk_index,
+ "cut_type": last_cut_type,
+ }
+
+ yield chunk_dict
+
+ # Start new chunk with current sentence
+ paragraph_ids = []
+ current_chunk = ""
+ current_word_count = 0
+ chunk_index += 1
+ paragraph_ids.append(paragraph_id)
+ current_chunk += sentence
+ current_word_count += word_count
+
+ # Handle end of paragraph
+ if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
+ # For non-batch mode, yield each paragraph separately
+ chunk_dict = {
+ "text": current_chunk,
+ "word_count": current_word_count,
+ "paragraph_ids": paragraph_ids,
+ "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+ "chunk_index": chunk_index,
+ "cut_type": end_type
+ }
+ yield chunk_dict
+ paragraph_ids = []
+ current_chunk = ""
+ current_word_count = 0
+ chunk_index += 1
+
last_cut_type = end_type
- last_paragraph_id = paragraph_id
-
- if len(paragraph) > 0:
- yield dict(
- chunk_id = uuid5(NAMESPACE_OID, paragraph),
- text = paragraph,
- word_count = paragraph_word_count,
- paragraph_id = last_paragraph_id,
- chunk_index = paragraph_chunk_index,
- cut_type = last_cut_type,
- )
+
+ # Yield any remaining text
+ if current_chunk:
+ chunk_dict = {
+ "text": current_chunk,
+ "word_count": current_word_count,
+ "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+ "paragraph_ids": paragraph_ids,
+ "chunk_index": chunk_index,
+ "cut_type": "sentence_cut" if last_cut_type == "word" else last_cut_type
+ }
+ yield chunk_dict
\ No newline at end of file
diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py
index 5b4a40bc1..c6848f066 100644
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@@ -2,30 +2,43 @@
from uuid import uuid4
+from typing import Optional
from .chunk_by_word import chunk_by_word
-def chunk_by_sentence(data: str):
+def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
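+    """
+    Splits data into sentences, yielding (paragraph_id, sentence, word_count, word_type) tuples.
+    When maximum_length is given, a sentence is yielded as soon as it reaches that many words.
+    """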
sentence = ""
paragraph_id = uuid4()
- chunk_index = 0
word_count = 0
+ section_end = False
+ word_type_state = None
+    # word_type_state mirrors word_type, except when word_type is 'word' and the
+    # word contains no letters; such words keep the previous state (for example the
+    # 'paragraph_end' or 'sentence_end' of the word that precedes them).
for (word, word_type) in chunk_by_word(data):
- sentence += (" " if len(sentence) > 0 else "") + word
+ sentence += word
word_count += 1
- if word_type == "paragraph_end" or word_type == "sentence_end":
- yield (paragraph_id, chunk_index, sentence, word_count, word_type)
+ if word_type in ["paragraph_end", "sentence_end"]:
+ word_type_state = word_type
+ else:
+ for character in word:
+ if character.isalpha():
+ word_type_state = word_type
+ break
+
+ if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
+ yield (paragraph_id, sentence, word_count, word_type_state)
sentence = ""
word_count = 0
paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
- chunk_index = 0 if word_type == "paragraph_end" else chunk_index + 1
if len(sentence) > 0:
+ section_end = "sentence_cut" if word_type_state == "word" else word_type_state
yield (
paragraph_id,
- chunk_index,
sentence,
word_count,
- "sentence_cut",
+ section_end,
)
diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py
index 8621754d5..ab4d8343e 100644
--- a/cognee/tasks/chunks/chunk_by_word.py
+++ b/cognee/tasks/chunks/chunk_by_word.py
@@ -1,60 +1,71 @@
import re
+SENTENCE_ENDINGS = r"[.;!?…]"
+PARAGRAPH_ENDINGS = r"[\n\r]"
+
+def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
+ """
+ Determines if the current position represents a real paragraph ending.
+
+ Args:
+ last_char: The last processed character
+ current_pos: Current position in the text
+ text: The input text
+
+ Returns:
+ bool: True if this is a real paragraph end, False otherwise
+ """
+ if re.match(SENTENCE_ENDINGS, last_char):
+ return True
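+    # Otherwise, skip the whitespace and newlines that follow; an uppercase letter next signals a new paragraph.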
+ j = current_pos + 1
+ if j >= len(text):
+ return False
+
+ next_character = text[j]
+ while j < len(text) and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "):
+ j += 1
+ if j >= len(text):
+ return False
+ next_character = text[j]
+
+ if next_character.isupper():
+ return True
+ return False
+
def chunk_by_word(data: str):
- sentence_endings = r"[.;!?…]"
- paragraph_endings = r"[\n\r]"
- last_processed_character = ""
-
- word = ""
+ """
+ Chunks text into words and endings while preserving whitespace.
+ Whitespace is included with the preceding word.
+ Outputs can be joined with "" to recreate the original input.
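+    Yields (chunk, type) tuples where type is "word", "sentence_end", or "paragraph_end".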
+ """
+ current_chunk = ""
i = 0
-
+
while i < len(data):
character = data[i]
-
- if word == "" and (re.match(paragraph_endings, character) or character == " "):
- i = i + 1
- continue
-
- def is_real_paragraph_end():
- if re.match(sentence_endings, last_processed_character):
- return True
-
- j = i + 1
- next_character = data[j] if j < len(data) else None
- while next_character is not None and (re.match(paragraph_endings, next_character) or next_character == " "):
- j += 1
- next_character = data[j] if j < len(data) else None
- if next_character and next_character.isupper():
- return True
-
- return False
-
- if re.match(paragraph_endings, character):
- yield (word, "paragraph_end" if is_real_paragraph_end() else "word")
- word = ""
- i = i + 1
- continue
-
+
+ current_chunk += character
+
if character == " ":
- yield [word, "word"]
- word = ""
- i = i + 1
+ yield (current_chunk, "word")
+ current_chunk = ""
+ i += 1
continue
-
- word += character
- last_processed_character = character
-
- if re.match(sentence_endings, character):
- # Check for ellipses.
- if i + 2 <= len(data) and data[i] == "." and data[i + 1] == "." and data[i + 2] == ".":
- word += ".."
- i = i + 2
-
- is_paragraph_end = i + 1 < len(data) and re.match(paragraph_endings, data[i + 1])
- yield (word, "paragraph_end" if is_paragraph_end else "sentence_end")
- word = ""
-
+
+ if re.match(SENTENCE_ENDINGS, character):
+ # Look ahead for whitespace
+ next_i = i + 1
+ while next_i < len(data) and data[next_i] == " ":
+ current_chunk += data[next_i]
+ next_i += 1
+
+ is_paragraph_end = next_i < len(data) and re.match(PARAGRAPH_ENDINGS, data[next_i])
+ yield (current_chunk, "paragraph_end" if is_paragraph_end else "sentence_end")
+ current_chunk = ""
+ i = next_i
+ continue
+
i += 1
-
- if len(word) > 0:
- yield (word, "word")
+
+ if current_chunk:
+ yield (current_chunk, "word")
\ No newline at end of file
diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py
new file mode 100644
index 000000000..f133ef811
--- /dev/null
+++ b/cognee/tests/integration/documents/AudioDocument_test.py
@@ -0,0 +1,44 @@
+import uuid
+from unittest.mock import patch
+
+from cognee.modules.data.processing.document_types.AudioDocument import AudioDocument
+
+GROUND_TRUTH = [
+ {"word_count": 57, "len_text": 353, "cut_type": "sentence_end"},
+ {"word_count": 58, "len_text": 358, "cut_type": "sentence_end"},
+ {"word_count": 41, "len_text": 219, "cut_type": "sentence_end"},
+]
+
+TEST_TEXT = """
+"Mike, we need to talk about the payment processing service."
+"Good timing. The board wants one-click checkout by end of quarter."
+"That's exactly the problem. The service is held together with duct tape. One wrong move and—"
+"Sarah, we've been over this. The market won't wait."
+"And neither will a system collapse! The technical debt is crushing us. Every new feature takes twice as long as it should."
+"Then work twice as hard. Our competitors—"
+"Our competitors will laugh when our whole system goes down during Black Friday! We're talking about financial transactions here, not some blog comments section."
+"Write up your concerns in a doc. Right now, we ship one-click."
+"Then you'll ship it without me. I won't stake my reputation on a house of cards."
+"Are you threatening to quit?"
+"No, I'm threatening to be right. And when it breaks, I want it in writing that you chose this."
+"The feature ships, Sarah. That's final.\""""
+
+
+def test_AudioDocument():
+
+ document = AudioDocument(
+ id=uuid.uuid4(), name="audio-dummy-test", raw_data_location=""
+ )
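+    # Patch the transcript creation so the test runs without calling the LLM.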
+ with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
+ for ground_truth, paragraph_data in zip(
+ GROUND_TRUTH, document.read(chunk_size=64)
+ ):
+ assert (
+ ground_truth["word_count"] == paragraph_data.word_count
+ ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+ assert ground_truth["len_text"] == len(
+ paragraph_data.text
+ ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+ assert (
+ ground_truth["cut_type"] == paragraph_data.cut_type
+ ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py
new file mode 100644
index 000000000..e9caf3634
--- /dev/null
+++ b/cognee/tests/integration/documents/ImageDocument_test.py
@@ -0,0 +1,34 @@
+import uuid
+from unittest.mock import patch
+
+from cognee.modules.data.processing.document_types.ImageDocument import ImageDocument
+
+GROUND_TRUTH = [
+ {"word_count": 51, "len_text": 298, "cut_type": "sentence_end"},
+ {"word_count": 62, "len_text": 369, "cut_type": "sentence_end"},
+ {"word_count": 44, "len_text": 294, "cut_type": "sentence_end"},
+]
+
+TEST_TEXT = """A dramatic confrontation unfolds as a red fox and river otter engage in an energetic wrestling match at the water's edge. The fox, teeth bared in a playful snarl, has its front paws locked with the otter's flippers as they roll through the shallow stream, sending water spraying in all directions. The otter, displaying its surprising agility on land, counters by twisting its sleek body and attempting to wrap itself around the fox's shoulders, its whiskered face inches from the fox's muzzle.
+The commotion has attracted an audience: a murder of crows has gathered in the low branches, their harsh calls adding to the chaos as they hop excitedly from limb to limb. One particularly bold crow dive-bombs the wrestling pair, causing both animals to momentarily freeze mid-tussle, creating a perfect snapshot of suspended action—the fox's fur dripping wet, the otter's body coiled like a spring, and the crow's wings spread wide against the golden morning light."""
+
+
+def test_ImageDocument():
+
+ document = ImageDocument(
+ id=uuid.uuid4(), name="image-dummy-test", raw_data_location=""
+ )
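+    # Patch the image transcription so the test runs without calling the LLM.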
+ with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
+
+ for ground_truth, paragraph_data in zip(
+ GROUND_TRUTH, document.read(chunk_size=64)
+ ):
+ assert (
+ ground_truth["word_count"] == paragraph_data.word_count
+ ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+ assert ground_truth["len_text"] == len(
+ paragraph_data.text
+ ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+ assert (
+ ground_truth["cut_type"] == paragraph_data.cut_type
+ ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/unit/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py
similarity index 82%
rename from cognee/tests/unit/documents/PdfDocument_test.py
rename to cognee/tests/integration/documents/PdfDocument_test.py
index 917e9c3e0..d8ddbe23c 100644
--- a/cognee/tests/unit/documents/PdfDocument_test.py
+++ b/cognee/tests/integration/documents/PdfDocument_test.py
@@ -4,8 +4,8 @@ import uuid
from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
GROUND_TRUTH = [
- {"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"},
- {"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"},
+ {"word_count": 879, "len_text": 5607, "cut_type": "sentence_end"},
+ {"word_count": 953, "len_text": 6363, "cut_type": "sentence_end"},
]
@@ -16,12 +16,12 @@ def test_PdfDocument():
"test_data",
"artificial-intelligence.pdf",
)
- pdf_doc = PdfDocument(
+ document = PdfDocument(
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path
)
for ground_truth, paragraph_data in zip(
- GROUND_TRUTH, pdf_doc.read(chunk_size=1024)
+ GROUND_TRUTH, document.read(chunk_size=1024)
):
assert (
ground_truth["word_count"] == paragraph_data.word_count
diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py
new file mode 100644
index 000000000..ef7d42272
--- /dev/null
+++ b/cognee/tests/integration/documents/TextDocument_test.py
@@ -0,0 +1,46 @@
+import os
+import uuid
+
+import pytest
+
+from cognee.modules.data.processing.document_types.TextDocument import TextDocument
+
+GROUND_TRUTH = {
+ "code.txt": [
+ {"word_count": 205, "len_text": 1024, "cut_type": "sentence_cut"},
+ {"word_count": 104, "len_text": 833, "cut_type": "paragraph_end"},
+ ],
+ "Natural_language_processing.txt": [
+ {"word_count": 128, "len_text": 984, "cut_type": "paragraph_end"},
+ {"word_count": 1, "len_text": 1, "cut_type": "paragraph_end"},
+ ],
+}
+
+
+@pytest.mark.parametrize(
+ "input_file,chunk_size",
+ [("code.txt", 256), ("Natural_language_processing.txt", 128)],
+)
+def test_TextDocument(input_file, chunk_size):
+ test_file_path = os.path.join(
+ os.sep,
+ *(os.path.dirname(__file__).split(os.sep)[:-2]),
+ "test_data",
+ input_file,
+ )
+ document = TextDocument(
+ id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path
+ )
+
+ for ground_truth, paragraph_data in zip(
+ GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size)
+ ):
+ assert (
+ ground_truth["word_count"] == paragraph_data.word_count
+ ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+ assert ground_truth["len_text"] == len(
+ paragraph_data.text
+ ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+ assert (
+ ground_truth["cut_type"] == paragraph_data.cut_type
+ ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/test_code_generation.py b/cognee/tests/test_code_generation.py
index aad59ace8..a21925585 100755
--- a/cognee/tests/test_code_generation.py
+++ b/cognee/tests/test_code_generation.py
@@ -26,7 +26,7 @@ async def main():
await render_graph(None, include_nodes = True, include_labels = True)
- search_results = await cognee.search(SearchType.CHUNKS, query = "Student")
+ search_results = await cognee.search(SearchType.CHUNKS, query_text = "Student")
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n")
for result in search_results:
diff --git a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py
index 137b9f7e2..d2a1b6c59 100644
--- a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py
+++ b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py
@@ -1,7 +1,7 @@
-import pytest
import numpy as np
+import pytest
-from cognee.modules.graph.cognee_graph.CogneeGraphElements import Node, Edge
+from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge, Node
def test_node_initialization():
@@ -12,11 +12,13 @@ def test_node_initialization():
assert len(node.status) == 2
assert np.all(node.status == 1)
+
def test_node_invalid_dimension():
"""Test that initializing a Node with a non-positive dimension raises an error."""
with pytest.raises(ValueError, match="Dimension must be a positive integer"):
Node("node1", dimension=0)
+
def test_add_skeleton_neighbor():
"""Test adding a neighbor to a node."""
node1 = Node("node1")
@@ -24,6 +26,7 @@ def test_add_skeleton_neighbor():
node1.add_skeleton_neighbor(node2)
assert node2 in node1.skeleton_neighbours
+
def test_remove_skeleton_neighbor():
"""Test removing a neighbor from a node."""
node1 = Node("node1")
@@ -32,6 +35,7 @@ def test_remove_skeleton_neighbor():
node1.remove_skeleton_neighbor(node2)
assert node2 not in node1.skeleton_neighbours
+
def test_add_skeleton_edge():
"""Test adding an edge updates both skeleton_edges and skeleton_neighbours."""
node1 = Node("node1")
@@ -41,6 +45,7 @@ def test_add_skeleton_edge():
assert edge in node1.skeleton_edges
assert node2 in node1.skeleton_neighbours
+
def test_remove_skeleton_edge():
"""Test removing an edge updates both skeleton_edges and skeleton_neighbours."""
node1 = Node("node1")
@@ -51,6 +56,7 @@ def test_remove_skeleton_edge():
assert edge not in node1.skeleton_edges
assert node2 not in node1.skeleton_neighbours
+
def test_is_node_alive_in_dimension():
"""Test checking node's alive status in a specific dimension."""
node = Node("node1", dimension=2)
@@ -58,25 +64,30 @@ def test_is_node_alive_in_dimension():
node.status[1] = 0
assert not node.is_node_alive_in_dimension(1)
+
def test_node_alive_invalid_dimension():
"""Test that checking alive status with an invalid dimension raises an error."""
node = Node("node1", dimension=1)
with pytest.raises(ValueError, match="Dimension 1 is out of range"):
node.is_node_alive_in_dimension(1)
+
def test_node_equality():
"""Test equality between nodes."""
node1 = Node("node1")
node2 = Node("node1")
assert node1 == node2
+
def test_node_hash():
"""Test hashing for Node."""
node = Node("node1")
assert hash(node) == hash("node1")
+
### Tests for Edge ###
+
def test_edge_initialization():
"""Test that an Edge is initialized correctly."""
node1 = Node("node1")
@@ -89,6 +100,7 @@ def test_edge_initialization():
assert len(edge.status) == 2
assert np.all(edge.status == 1)
+
def test_edge_invalid_dimension():
"""Test that initializing an Edge with a non-positive dimension raises an error."""
node1 = Node("node1")
@@ -96,6 +108,7 @@ def test_edge_invalid_dimension():
with pytest.raises(ValueError, match="Dimensions must be a positive integer."):
Edge(node1, node2, dimension=0)
+
def test_is_edge_alive_in_dimension():
"""Test checking edge's alive status in a specific dimension."""
node1 = Node("node1")
@@ -105,6 +118,7 @@ def test_is_edge_alive_in_dimension():
edge.status[1] = 0
assert not edge.is_edge_alive_in_dimension(1)
+
def test_edge_alive_invalid_dimension():
"""Test that checking alive status with an invalid dimension raises an error."""
node1 = Node("node1")
@@ -113,6 +127,7 @@ def test_edge_alive_invalid_dimension():
with pytest.raises(ValueError, match="Dimension 1 is out of range"):
edge.is_edge_alive_in_dimension(1)
+
def test_edge_equality_directed():
"""Test equality between directed edges."""
node1 = Node("node1")
@@ -121,6 +136,7 @@ def test_edge_equality_directed():
edge2 = Edge(node1, node2, directed=True)
assert edge1 == edge2
+
def test_edge_equality_undirected():
"""Test equality between undirected edges."""
node1 = Node("node1")
@@ -129,6 +145,7 @@ def test_edge_equality_undirected():
edge2 = Edge(node2, node1, directed=False)
assert edge1 == edge2
+
def test_edge_hash_directed():
"""Test hashing for directed edges."""
node1 = Node("node1")
@@ -136,9 +153,10 @@ def test_edge_hash_directed():
edge = Edge(node1, node2, directed=True)
assert hash(edge) == hash((node1, node2))
+
def test_edge_hash_undirected():
"""Test hashing for undirected edges."""
node1 = Node("node1")
node2 = Node("node2")
edge = Edge(node1, node2, directed=False)
- assert hash(edge) == hash(frozenset({node1, node2}))
\ No newline at end of file
+ assert hash(edge) == hash(frozenset({node1, node2}))
diff --git a/cognee/tests/unit/modules/graph/cognee_graph_test.py b/cognee/tests/unit/modules/graph/cognee_graph_test.py
index 235ccf11d..d05292d75 100644
--- a/cognee/tests/unit/modules/graph/cognee_graph_test.py
+++ b/cognee/tests/unit/modules/graph/cognee_graph_test.py
@@ -1,7 +1,7 @@
import pytest
-from cognee.modules.graph.cognee_graph.CogneeGraphElements import Node, Edge
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
+from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge, Node
@pytest.fixture
@@ -9,6 +9,7 @@ def setup_graph():
"""Fixture to initialize a CogneeGraph instance."""
return CogneeGraph()
+
def test_add_node_success(setup_graph):
"""Test successful addition of a node."""
graph = setup_graph
@@ -16,6 +17,7 @@ def test_add_node_success(setup_graph):
graph.add_node(node)
assert graph.get_node("node1") == node
+
def test_add_duplicate_node(setup_graph):
"""Test adding a duplicate node raises an exception."""
graph = setup_graph
@@ -24,6 +26,7 @@ def test_add_duplicate_node(setup_graph):
with pytest.raises(ValueError, match="Node with id node1 already exists."):
graph.add_node(node)
+
def test_add_edge_success(setup_graph):
"""Test successful addition of an edge."""
graph = setup_graph
@@ -37,6 +40,7 @@ def test_add_edge_success(setup_graph):
assert edge in node1.skeleton_edges
assert edge in node2.skeleton_edges
+
def test_add_duplicate_edge(setup_graph):
"""Test adding a duplicate edge raises an exception."""
graph = setup_graph
@@ -49,6 +53,7 @@ def test_add_duplicate_edge(setup_graph):
with pytest.raises(ValueError, match="Edge .* already exists in the graph."):
graph.add_edge(edge)
+
def test_get_node_success(setup_graph):
"""Test retrieving an existing node."""
graph = setup_graph
@@ -56,11 +61,13 @@ def test_get_node_success(setup_graph):
graph.add_node(node)
assert graph.get_node("node1") == node
+
def test_get_node_nonexistent(setup_graph):
"""Test retrieving a nonexistent node returns None."""
graph = setup_graph
assert graph.get_node("nonexistent") is None
+
def test_get_edges_success(setup_graph):
"""Test retrieving edges of a node."""
graph = setup_graph
@@ -72,6 +79,7 @@ def test_get_edges_success(setup_graph):
graph.add_edge(edge)
assert edge in graph.get_edges("node1")
+
def test_get_edges_nonexistent_node(setup_graph):
"""Test retrieving edges for a nonexistent node raises an exception."""
graph = setup_graph
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
new file mode 100644
index 000000000..8e900727d
--- /dev/null
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
@@ -0,0 +1,53 @@
+from itertools import product
+
+import numpy as np
+import pytest
+
+from cognee.tasks.chunks import chunk_by_paragraph, chunk_by_word
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+
+paragraph_lengths = [64, 256, 1024]
+batch_paragraphs_vals = [True, False]
+
+
+@pytest.mark.parametrize(
+ "input_text,paragraph_length,batch_paragraphs",
+ list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
+)
+def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
+ chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
+ reconstructed_text = "".join([chunk["text"] for chunk in chunks])
+ assert (
+ reconstructed_text == input_text
+ ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
+
+
+@pytest.mark.parametrize(
+ "input_text,paragraph_length,batch_paragraphs",
+ list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
+)
+def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
+ chunks = list(chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs))
+
+ chunk_lengths = np.array(
+ [len(list(chunk_by_word(chunk["text"]))) for chunk in chunks]
+ )
+
+ larger_chunks = chunk_lengths[chunk_lengths > paragraph_length]
+ assert np.all(
+ chunk_lengths <= paragraph_length
+ ), f"{paragraph_length = }: {larger_chunks} are too large"
+
+
+@pytest.mark.parametrize(
+ "input_text,paragraph_length,batch_paragraphs",
+ list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
+)
+def test_chunk_by_paragraph_chunk_numbering(
+ input_text, paragraph_length, batch_paragraphs
+):
+ chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
+ chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
+ assert np.all(
+ chunk_indices == np.arange(len(chunk_indices))
+ ), f"{chunk_indices = } are not monotonically increasing"
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
index 24c3cc147..3ddc6f4f5 100644
--- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
@@ -8,12 +8,12 @@ GROUND_TRUTH = {
"cut_type": "paragraph_end",
},
{
- "text": "This is a second paragraph. First two paragraphs are whole.",
+ "text": "\nThis is a second paragraph. First two paragraphs are whole.",
"word_count": 10,
"cut_type": "paragraph_end",
},
{
- "text": "Third paragraph is a bit longer and is finished with a dot.",
+ "text": "\nThird paragraph is a bit longer and is finished with a dot.",
"word_count": 12,
"cut_type": "sentence_end",
},
@@ -25,12 +25,12 @@ GROUND_TRUTH = {
"cut_type": "paragraph_end",
},
{
- "text": "This is a second paragraph. First two paragraphs are whole.",
+ "text": "\nThis is a second paragraph. First two paragraphs are whole.",
"word_count": 10,
"cut_type": "paragraph_end",
},
{
- "text": "Third paragraph is cut and is missing the dot at the end",
+ "text": "\nThird paragraph is cut and is missing the dot at the end",
"word_count": 12,
"cut_type": "sentence_cut",
},
@@ -39,11 +39,11 @@ GROUND_TRUTH = {
INPUT_TEXT = {
"whole_text": """This is example text. It contains multiple sentences.
- This is a second paragraph. First two paragraphs are whole.
- Third paragraph is a bit longer and is finished with a dot.""",
+This is a second paragraph. First two paragraphs are whole.
+Third paragraph is a bit longer and is finished with a dot.""",
"cut_text": """This is example text. It contains multiple sentences.
- This is a second paragraph. First two paragraphs are whole.
- Third paragraph is cut and is missing the dot at the end""",
+This is a second paragraph. First two paragraphs are whole.
+Third paragraph is cut and is missing the dot at the end""",
}
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
new file mode 100644
index 000000000..d1c75d7ed
--- /dev/null
+++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
@@ -0,0 +1,41 @@
+from itertools import product
+
+import numpy as np
+import pytest
+
+from cognee.tasks.chunks import chunk_by_sentence, chunk_by_word
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+
+maximum_length_vals = [None, 8, 64]
+
+
+@pytest.mark.parametrize(
+ "input_text,maximum_length",
+ list(product(list(INPUT_TEXTS.values()), maximum_length_vals)),
+)
+def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
+ chunks = chunk_by_sentence(input_text, maximum_length)
+ reconstructed_text = "".join([chunk[1] for chunk in chunks])
+ assert (
+ reconstructed_text == input_text
+ ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
+
+
+@pytest.mark.parametrize(
+ "input_text,maximum_length",
+ list(
+ product(
+ list(INPUT_TEXTS.values()),
+ [val for val in maximum_length_vals if val is not None],
+ )
+ ),
+)
+def test_paragraph_chunk_length(input_text, maximum_length):
+ chunks = list(chunk_by_sentence(input_text, maximum_length))
+
+ chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks])
+
+ larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
+ assert np.all(
+ chunk_lengths <= maximum_length
+ ), f"{maximum_length = }: {larger_chunks} are too large"
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py
new file mode 100644
index 000000000..42523c106
--- /dev/null
+++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pytest
+
+from cognee.tasks.chunks import chunk_by_word
+from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
+
+
+@pytest.mark.parametrize(
+ "input_text",
+ [
+ INPUT_TEXTS["english_text"],
+ INPUT_TEXTS["english_lists"],
+ INPUT_TEXTS["python_code"],
+ INPUT_TEXTS["chinese_text"],
+ ],
+)
+def test_chunk_by_word_isomorphism(input_text):
+ chunks = chunk_by_word(input_text)
+ reconstructed_text = "".join([chunk[0] for chunk in chunks])
+ assert (
+ reconstructed_text == input_text
+ ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
+
+
+@pytest.mark.parametrize(
+ "input_text",
+ [
+ INPUT_TEXTS["english_text"],
+ INPUT_TEXTS["english_lists"],
+ INPUT_TEXTS["python_code"],
+ INPUT_TEXTS["chinese_text"],
+ ],
+)
+def test_chunk_by_word_splits(input_text):
+ chunks = np.array(list(chunk_by_word(input_text)))
+ space_test = np.array([" " not in chunk[0].strip() for chunk in chunks])
+
+ assert np.all(
+ space_test
+ ), f"These chunks contain spaces within them: {chunks[space_test == False]}"
diff --git a/cognee/tests/unit/processing/chunks/test_input.py b/cognee/tests/unit/processing/chunks/test_input.py
new file mode 100644
index 000000000..21a0e3165
--- /dev/null
+++ b/cognee/tests/unit/processing/chunks/test_input.py
@@ -0,0 +1,284 @@
+import pytest
+
+INPUT_TEXTS = {
+ "empty": "",
+ "single_char": "x",
+ "whitespace": " \n\t \r\n ",
+ "unicode_special": "Hello 👋 مرحبا שָׁלוֹם",
+ "mixed_endings": "line1\r\nline2\nline3\r\nline4",
+ "many_newlines": "\n\n\n\ntext\n\n\n\n",
+ "html_mixed": "
Hello
\nPlain text\n