from uuid import UUID
from typing import Union, BinaryIO, List, Optional

from cognee.modules.pipelines import Task
from cognee.modules.users.models import User
from cognee.modules.pipelines import cognee_pipeline
from cognee.tasks.ingestion import ingest_data, resolve_data_directories


async def add(
    data: Union[BinaryIO, List[BinaryIO], str, List[str]],
    dataset_name: str = "main_dataset",
    user: Optional[User] = None,
    node_set: Optional[List[str]] = None,
    vector_db_config: Optional[dict] = None,
    graph_db_config: Optional[dict] = None,
    dataset_id: Optional[UUID] = None,
    preferred_loaders: Optional[List[str]] = None,
    incremental_loading: bool = True,
):
"""
|
|
Add data to Cognee for knowledge graph processing.
|
|
|
|
This is the first step in the Cognee workflow - it ingests raw data and prepares it
|
|
for processing. The function accepts various data formats including text, files, and
|
|
binary streams, then stores them in a specified dataset for further processing.
|
|
|
|
Prerequisites:
|
|
- **LLM_API_KEY**: Must be set in environment variables for content processing
|
|
- **Database Setup**: Relational and vector databases must be configured
|
|
- **User Authentication**: Uses default user if none provided (created automatically)
|
|
|
|
Supported Input Types:
|
|
- **Text strings**: Direct text content (str) - any string not starting with "/" or "file://"
|
|
- **File paths**: Local file paths as strings in these formats:
|
|
* Absolute paths: "/path/to/document.pdf"
|
|
* File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt"
|
|
* S3 paths: "s3://bucket-name/path/to/file.pdf"
|
|
- **Binary file objects**: File handles/streams (BinaryIO)
|
|
- **Lists**: Multiple files or text strings in a single call
|
|
|
|
Supported File Formats:
|
|
- Text files (.txt, .md, .csv)
|
|
- PDFs (.pdf)
|
|
- Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
|
|
- Audio files (.mp3, .wav) - transcribed to text
|
|
- Code files (.py, .js, .ts, etc.) - parsed for structure and content
|
|
- Office documents (.docx, .pptx)
|
|
|
|
Workflow:
|
|
1. **Data Resolution**: Resolves file paths and validates accessibility
|
|
2. **Content Extraction**: Extracts text content from various file formats
|
|
3. **Dataset Storage**: Stores processed content in the specified dataset
|
|
4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
|
|
5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset

    Args:
        data: The data to ingest. Can be:
            - Single text string: "Your text content here"
            - Absolute file path: "/path/to/document.pdf"
            - File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt"
            - S3 path: "s3://my-bucket/documents/file.pdf"
            - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
            - Binary file object: open("file.txt", "rb")
        dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
            Create separate datasets to organize different knowledge domains.
        user: User object for authentication and permissions. Uses default user if None.
            Default user: "default_user@example.com" (created automatically on first use).
            Users can only access datasets they have permissions for.
        node_set: Optional list of node identifiers for graph organization and access control.
            Used for grouping related data points in the knowledge graph.
        vector_db_config: Optional configuration for vector database (for custom setups).
        graph_db_config: Optional configuration for graph database (for custom setups).
        dataset_id: Optional specific dataset UUID to use instead of dataset_name.
        preferred_loaders: Optional list of preferred loader names to use when ingesting files.
        incremental_loading: Whether to run the ingestion pipeline incrementally. Defaults to True.

    Returns:
        PipelineRunInfo: Information about the ingestion pipeline execution including:
            - Pipeline run ID for tracking
            - Dataset ID where data was stored
            - Processing status and any errors
            - Execution timestamps and metadata
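
        The returned run info can be captured and inspected directly (a minimal sketch;
        the exact attributes depend on the PipelineRunInfo model):

        ```python
        run_info = await cognee.add("Some text to ingest")
        print(run_info)  # identifiers, status, and timestamps for the ingestion run
        ```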

    Next Steps:
        After successfully adding data, call `cognify()` to process the ingested content:

        ```python
        import cognee

        # Step 1: Add your data (text content or file path)
        await cognee.add("Your document content")  # Raw text
        # OR
        await cognee.add("/path/to/your/file.pdf")  # File path

        # Step 2: Process into knowledge graph
        await cognee.cognify()

        # Step 3: Search and query
        results = await cognee.search("What insights can you find?")
        ```

    Example Usage:
        ```python
        # Add a single text document
        await cognee.add("Natural language processing is a field of AI...")

        # Add multiple files with different path formats
        await cognee.add([
            "/absolute/path/to/research_paper.pdf",  # Absolute path
            "file://relative/path/to/dataset.csv",  # Relative file URL
            "file:///absolute/path/to/report.docx",  # Absolute file URL
            "s3://my-bucket/documents/data.json",  # S3 path
            "Additional context text"  # Raw text content
        ])

        # Add to a specific dataset
        await cognee.add(
            data="Project documentation content",
            dataset_name="project_docs"
        )

        # Add a single file
        await cognee.add("/home/user/documents/analysis.pdf")
        ```

    Environment Variables:
        Required:
            - LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.)

        Optional:
            - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama"
            - LLM_MODEL: Model name (default: "gpt-4o-mini")
            - DEFAULT_USER_EMAIL: Custom default user email
            - DEFAULT_USER_PASSWORD: Custom default user password
            - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "pgvector"
            - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx"
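
        Minimal setup sketch (the key value is a placeholder; in practice it is usually
        supplied via a .env file or the shell environment):

        ```python
        import os

        os.environ["LLM_API_KEY"] = "your-llm-api-key"
        ```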

    """
    # Ingestion pipeline: resolve file paths/directories, then ingest the data.
    tasks = [
        Task(resolve_data_directories, include_subdirectories=True),
        Task(ingest_data, dataset_name, user, node_set, dataset_id, preferred_loaders),
    ]

    pipeline_run_info = None

    # Run the add pipeline and keep the most recent run info to return.
    async for run_info in cognee_pipeline(
        tasks=tasks,
        datasets=dataset_id if dataset_id else dataset_name,
        data=data,
        user=user,
        pipeline_name="add_pipeline",
        vector_db_config=vector_db_config,
        graph_db_config=graph_db_config,
        incremental_loading=incremental_loading,
    ):
        pipeline_run_info = run_info

    return pipeline_run_info