test: Add test for text deduplication
Added end-to-end test for text deduplication. Test COG-505
This commit is contained in:
parent
378e7b81a5
commit
813b76c9c2
8 changed files with 211 additions and 1 deletions
54
.github/workflows/test_deduplication.yml
vendored
Normal file
54
.github/workflows/test_deduplication.yml
vendored
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
# End-to-end deduplication test workflow (COG-505).
name: test | deduplication

on:
  workflow_dispatch:
  pull_request:
    branches:
      - main
    types: [labeled, synchronize]

# Cancel superseded runs for the same PR / ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  RUNTIME__LOG_LEVEL: ERROR

jobs:
  get_docs_changes:
    name: docs changes
    uses: ./.github/workflows/get_docs_changes.yml

  start_postgres:
    name: test
    needs: get_docs_changes
    # FIX: the original mixed a bare expression with an inline ${{ }} block
    # ("a == 'true' && ${{ b }}"). GitHub evaluates the ${{ }} part first and
    # splices its *string* result into the surrounding text, which is not the
    # boolean composition the author intended. Wrapping the whole condition
    # in a single expression is the documented, unambiguous form.
    # NOTE(review): on 'synchronize' events github.event.label is empty, so
    # this job effectively runs only on 'labeled' events — confirm intended.
    if: ${{ needs.get_docs_changes.outputs.changes_outside_docs == 'true' && github.event.label.name == 'run-checks' }}
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash
    services:
      postgres:
        image: pgvector/pgvector:pg17
        env:
          POSTGRES_USER: cognee
          POSTGRES_PASSWORD: cognee
          POSTGRES_DB: cognee_db
        # Health options make dependent steps wait until the server
        # actually accepts connections.
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 5432:5432

  run_simple_example_test:
    needs: start_postgres
    uses: ./.github/workflows/reusable_python_example.yml
    with:
      example-location: ./cognee/tests/test_deduplication.py
    secrets:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
      GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
|
@ -131,10 +131,11 @@ async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
|
|||
write_disposition="merge",
|
||||
)
|
||||
else:
|
||||
# Data should be stored in the same schema to allow deduplication
|
||||
run_info = pipeline.run(
|
||||
data_resources(file_paths, user),
|
||||
table_name="file_metadata",
|
||||
dataset_name=dataset_name,
|
||||
dataset_name="public",
|
||||
write_disposition="merge",
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,2 @@
|
|||
Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of "understanding"[citation needed] the contents of documents, including the contextual nuances of the language within them. To this end, natural language processing often borrows ideas from theoretical linguistics. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
|
||||
Challenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.
|
||||
BIN
cognee/tests/test_data/example.png
Normal file
BIN
cognee/tests/test_data/example.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 10 KiB |
BIN
cognee/tests/test_data/example_copy.png
Normal file
BIN
cognee/tests/test_data/example_copy.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 10 KiB |
BIN
cognee/tests/test_data/text_to_speech.mp3
Normal file
BIN
cognee/tests/test_data/text_to_speech.mp3
Normal file
Binary file not shown.
BIN
cognee/tests/test_data/text_to_speech_copy.mp3
Normal file
BIN
cognee/tests/test_data/text_to_speech_copy.mp3
Normal file
Binary file not shown.
153
cognee/tests/test_deduplication.py
Normal file
153
cognee/tests/test_deduplication.py
Normal file
|
|
@ -0,0 +1,153 @@
|
|||
import hashlib
|
||||
import os
|
||||
import logging
|
||||
import pathlib
|
||||
|
||||
import cognee
|
||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
async def test_deduplication():
    """End-to-end check that identical content added to two datasets is stored once.

    Exercises deduplication across four input kinds: local text files, raw
    text input, image files, and sound files.  After each pair of adds the
    ``data`` table must contain exactly one row.  Requires a configured
    ``cognee`` backend (see ``test_deduplication_postgres`` /
    ``test_deduplication_sqlite``).
    """
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    relational_engine = get_relational_engine()

    dataset_name = "test_deduplication"
    dataset_name2 = "test_deduplication2"

    # --- Test deduplication of local files (two files, identical content) ---
    explanation_file_path = os.path.join(
        pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
    )
    explanation_file_path2 = os.path.join(
        pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
    )
    await cognee.add([explanation_file_path], dataset_name)
    await cognee.add([explanation_file_path2], dataset_name2)

    result = await relational_engine.get_all_data_from_table("data")
    assert len(result) == 1, "More than one data entity was found."
    # The second add wins the name; the content row itself is shared.
    assert result[0]["name"] == "Natural_language_processing_copy", "Result name does not match expected value."

    result = await relational_engine.get_all_data_from_table("datasets")
    assert len(result) == 2, "Unexpected number of datasets found."
    # NOTE(review): assumes rows come back in insertion order — confirm the
    # relational engine guarantees this ordering.
    assert result[0]["name"] == dataset_name, "Result name does not match expected value."
    assert result[1]["name"] == dataset_name2, "Result name does not match expected value."

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # --- Test deduplication of text input ---
    text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
    At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
    Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
    The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
    Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
    In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
    """

    await cognee.add([text], dataset_name)
    await cognee.add([text], dataset_name2)

    result = await relational_engine.get_all_data_from_table("data")
    assert len(result) == 1, "More than one data entity was found."
    # md5 is used here only as a content fingerprint, not for security.
    assert hashlib.md5(text.encode('utf-8')).hexdigest() in result[0]["name"], "Content hash is not a part of file name."

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # --- Test deduplication of image files ---
    explanation_file_path = os.path.join(
        pathlib.Path(__file__).parent, "test_data/example.png"
    )
    explanation_file_path2 = os.path.join(
        pathlib.Path(__file__).parent, "test_data/example_copy.png"
    )

    await cognee.add([explanation_file_path], dataset_name)
    await cognee.add([explanation_file_path2], dataset_name2)

    # BUG FIX: re-query the table. The original asserted on the stale
    # `result` left over from the text-input section, so the image check
    # never looked at the newly added data.
    result = await relational_engine.get_all_data_from_table("data")
    assert len(result) == 1, "More than one data entity was found."

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # --- Test deduplication of sound files ---
    explanation_file_path = os.path.join(
        pathlib.Path(__file__).parent, "test_data/text_to_speech.mp3"
    )
    explanation_file_path2 = os.path.join(
        pathlib.Path(__file__).parent, "test_data/text_to_speech_copy.mp3"
    )

    await cognee.add([explanation_file_path], dataset_name)
    await cognee.add([explanation_file_path2], dataset_name2)

    # BUG FIX: same stale-`result` problem as the image section above.
    result = await relational_engine.get_all_data_from_table("data")
    assert len(result) == 1, "More than one data entity was found."

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
||||
async def test_deduplication_postgres():
    """Run the deduplication suite against Postgres (relational) + pgvector (vector)."""
    vector_config = {
        "vector_db_url": "",
        "vector_db_key": "",
        "vector_db_provider": "pgvector",
    }
    relational_config = {
        "db_name": "cognee_db",
        "db_host": "127.0.0.1",
        "db_port": "5432",
        "db_username": "cognee",
        "db_password": "cognee",
        "db_provider": "postgres",
    }

    cognee.config.set_vector_db_config(vector_config)
    cognee.config.set_relational_db_config(relational_config)

    await test_deduplication()
||||
async def test_deduplication_sqlite():
    """Run the deduplication suite against SQLite (relational) + LanceDB (vector)."""
    vector_config = {
        "vector_db_url": "",
        "vector_db_key": "",
        "vector_db_provider": "lancedb",
    }
    relational_config = {"db_provider": "sqlite"}

    cognee.config.set_vector_db_config(vector_config)
    cognee.config.set_relational_db_config(relational_config)

    await test_deduplication()
||||
async def main():
    """Point cognee at test-local storage directories, then run both backends."""
    base_dir = pathlib.Path(__file__).parent

    data_directory_path = str((base_dir / ".data_storage/test_deduplication").resolve())
    cognee.config.data_root_directory(data_directory_path)

    cognee_directory_path = str((base_dir / ".cognee_system/test_deduplication").resolve())
    cognee.config.system_root_directory(cognee_directory_path)

    await test_deduplication_postgres()
    await test_deduplication_sqlite()
||||
# Script entry point: drive the async test suite with a fresh event loop.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
|
||||
Loading…
Add table
Reference in a new issue