Merge pull request #468 from topoteretes/COG-970-refactor-tokenizing
Cog 970 refactor tokenizing
This commit is contained in: commit d900060e2b
44 changed files with 633 additions and 134 deletions
@@ -1,12 +1,28 @@
ENV="local"
TOKENIZERS_PARALLELISM="false"
LLM_API_KEY=

# LLM settings
LLM_API_KEY=""
LLM_MODEL="openai/gpt-4o-mini"
LLM_PROVIDER="openai"
LLM_ENDPOINT=""
LLM_API_VERSION=""
LLM_MAX_TOKENS="16384"

GRAPHISTRY_USERNAME=
GRAPHISTRY_PASSWORD=

SENTRY_REPORTING_URL=

# Embedding settings
EMBEDDING_PROVIDER="openai"
EMBEDDING_API_KEY=""
EMBEDDING_MODEL="openai/text-embedding-3-large"
EMBEDDING_ENDPOINT=""
EMBEDDING_API_VERSION=""
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191

# "neo4j" or "networkx"
GRAPH_DATABASE_PROVIDER="networkx"
# Not needed if using networkx

@@ -20,6 +20,7 @@ from cognee.tasks.repo_processor import (
from cognee.tasks.repo_processor.get_source_code_chunks import get_source_code_chunks
from cognee.tasks.storage import add_data_points
from cognee.tasks.summarization import summarize_code, summarize_text
from cognee.infrastructure.llm import get_max_chunk_tokens

monitoring = get_base_config().monitoring_tool
if monitoring == MonitoringTool.LANGFUSE:

@@ -57,7 +58,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
Task(ingest_data, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
Task(extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()),
Task(
extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
),

@@ -4,6 +4,7 @@ from typing import Union

from pydantic import BaseModel

from cognee.infrastructure.llm import get_max_chunk_tokens
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
from cognee.modules.data.methods.get_dataset_data import get_dataset_data

@@ -151,7 +152,9 @@ async def get_default_tasks(
default_tasks = [
Task(classify_documents),
Task(check_permissions_on_documents, user=user, permissions=["write"]),
Task(extract_chunks_from_documents), # Extract text chunks based on the document type.
Task(
extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()
), # Extract text chunks based on the document type.
Task(
extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
), # Generate knowledge graphs from the document chunks.

@@ -6,6 +6,9 @@ import litellm
import os
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
from cognee.infrastructure.llm.tokenizer.Gemini import GeminiTokenizer
from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer

litellm.set_verbose = False
logger = logging.getLogger("LiteLLMEmbeddingEngine")

@@ -15,23 +18,29 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
api_key: str
endpoint: str
api_version: str
provider: str
model: str
dimensions: int
mock: bool

def __init__(
self,
provider: str = "openai",
model: Optional[str] = "text-embedding-3-large",
dimensions: Optional[int] = 3072,
api_key: str = None,
endpoint: str = None,
api_version: str = None,
max_tokens: int = 512,
):
self.api_key = api_key
self.endpoint = endpoint
self.api_version = api_version
self.provider = provider
self.model = model
self.dimensions = dimensions
self.max_tokens = max_tokens
self.tokenizer = self.get_tokenizer()

enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
if isinstance(enable_mocking, bool):

@@ -104,3 +113,18 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):

def get_vector_size(self) -> int:
return self.dimensions

def get_tokenizer(self):
logger.debug(f"Loading tokenizer for model {self.model}...")
# If model also contains provider information, extract only model information
model = self.model.split("/")[-1]

if "openai" in self.provider.lower():
tokenizer = TikTokenTokenizer(model=model, max_tokens=self.max_tokens)
elif "gemini" in self.provider.lower():
tokenizer = GeminiTokenizer(model=model, max_tokens=self.max_tokens)
else:
tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)

logger.debug(f"Tokenizer loaded for model: {self.model}")
return tokenizer

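For orientation, a minimal usage sketch of the tokenizer wiring above; the exact import path of LiteLLMEmbeddingEngine is an assumption, and the model and dimension values are only illustrative:

# Hedged sketch: the provider string selects the tokenizer inside get_tokenizer() (import path assumed).
from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import LiteLLMEmbeddingEngine

engine = LiteLLMEmbeddingEngine(
    provider="openai",
    model="openai/text-embedding-3-large",
    dimensions=3072,
    max_tokens=8191,
)
# "openai" -> TikTokenTokenizer, "gemini" -> GeminiTokenizer, anything else -> HuggingFaceTokenizer
print(type(engine.tokenizer).__name__)
print(engine.tokenizer.count_tokens("hello world"))  # small integer token count
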
@@ -4,12 +4,13 @@ from pydantic_settings import BaseSettings, SettingsConfigDict


class EmbeddingConfig(BaseSettings):
embedding_model: Optional[str] = "text-embedding-3-large"
embedding_provider: Optional[str] = "openai"
embedding_model: Optional[str] = "openai/text-embedding-3-large"
embedding_dimensions: Optional[int] = 3072
embedding_endpoint: Optional[str] = None
embedding_api_key: Optional[str] = None
embedding_api_version: Optional[str] = None

embedding_max_tokens: Optional[int] = 8191
model_config = SettingsConfigDict(env_file=".env", extra="allow")

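A small sketch of how these settings are consumed via pydantic-settings; the import path matches the one used elsewhere in this diff, and the printed values are simply the defaults declared above:

# Hedged sketch: reading the embedding settings added above.
from cognee.infrastructure.databases.vector.embeddings.config import get_embedding_config

config = get_embedding_config()
print(config.embedding_model)       # "openai/text-embedding-3-large" unless overridden in .env
print(config.embedding_max_tokens)  # 8191 unless EMBEDDING_MAX_TOKENS overrides it
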
@@ -10,9 +10,11 @@ def get_embedding_engine() -> EmbeddingEngine:

return LiteLLMEmbeddingEngine(
# If OpenAI API is used for embeddings, litellm needs only the api_key.
provider=config.embedding_provider,
api_key=config.embedding_api_key or llm_config.llm_api_key,
endpoint=config.embedding_endpoint,
api_version=config.embedding_api_version,
model=config.embedding_model,
dimensions=config.embedding_dimensions,
max_tokens=config.embedding_max_tokens,
)

@@ -1 +1,2 @@
from .config import get_llm_config
from .utils import get_max_chunk_tokens

@@ -14,11 +14,12 @@ class AnthropicAdapter(LLMInterface):
name = "Anthropic"
model: str

def __init__(self, model: str = None):
def __init__(self, max_tokens: int, model: str = None):
self.aclient = instructor.patch(
create=anthropic.Anthropic().messages.create, mode=instructor.Mode.ANTHROPIC_TOOLS
)
self.model = model
self.max_tokens = max_tokens

async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel]

@@ -11,6 +11,7 @@ class LLMConfig(BaseSettings):
llm_api_version: Optional[str] = None
llm_temperature: float = 0.0
llm_streaming: bool = False
llm_max_tokens: int = 16384
transcription_model: str = "whisper-1"

model_config = SettingsConfigDict(env_file=".env", extra="allow")

@@ -24,6 +25,7 @@ class LLMConfig(BaseSettings):
"api_version": self.llm_api_version,
"temperature": self.llm_temperature,
"streaming": self.llm_streaming,
"max_tokens": self.llm_max_tokens,
"transcription_model": self.transcription_model,
}

@@ -2,6 +2,7 @@

import asyncio
from typing import List, Type

from pydantic import BaseModel
import instructor
from cognee.infrastructure.llm.llm_interface import LLMInterface

@@ -16,11 +17,12 @@ class GenericAPIAdapter(LLMInterface):
model: str
api_key: str

def __init__(self, endpoint, api_key: str, model: str, name: str):
def __init__(self, endpoint, api_key: str, model: str, name: str, max_tokens: int):
self.name = name
self.model = model
self.api_key = api_key
self.endpoint = endpoint
self.max_tokens = max_tokens

llm_config = get_llm_config()

@@ -20,6 +20,15 @@ def get_llm_client():

provider = LLMProvider(llm_config.llm_provider)

# Check if max_token value is defined in liteLLM for given model
# if not use value from cognee configuration
from cognee.infrastructure.llm.utils import (
get_model_max_tokens,
) # imported here to avoid circular imports

model_max_tokens = get_model_max_tokens(llm_config.llm_model)
max_tokens = model_max_tokens if model_max_tokens else llm_config.llm_max_tokens

if provider == LLMProvider.OPENAI:
if llm_config.llm_api_key is None:
raise InvalidValueError(message="LLM API key is not set.")

@@ -32,6 +41,7 @@ def get_llm_client():
api_version=llm_config.llm_api_version,
model=llm_config.llm_model,
transcription_model=llm_config.transcription_model,
max_tokens=max_tokens,
streaming=llm_config.llm_streaming,
)

@@ -42,13 +52,17 @@ def get_llm_client():
from .generic_llm_api.adapter import GenericAPIAdapter

return GenericAPIAdapter(
llm_config.llm_endpoint, llm_config.llm_api_key, llm_config.llm_model, "Ollama"
llm_config.llm_endpoint,
llm_config.llm_api_key,
llm_config.llm_model,
"Ollama",
max_tokens=max_tokens,
)

elif provider == LLMProvider.ANTHROPIC:
from .anthropic.adapter import AnthropicAdapter

return AnthropicAdapter(llm_config.llm_model)
return AnthropicAdapter(max_tokens=max_tokens, model=llm_config.llm_model)

elif provider == LLMProvider.CUSTOM:
if llm_config.llm_api_key is None:

@@ -57,7 +71,11 @@ def get_llm_client():
from .generic_llm_api.adapter import GenericAPIAdapter

return GenericAPIAdapter(
llm_config.llm_endpoint, llm_config.llm_api_key, llm_config.llm_model, "Custom"
llm_config.llm_endpoint,
llm_config.llm_api_key,
llm_config.llm_model,
"Custom",
max_tokens=max_tokens,
)

else:

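The max-token resolution above can be illustrated in isolation; this sketch repeats the same fallback, assuming only that litellm exposes its model_cost table, with the model name and default taken from the new .env template:

# Hedged sketch: prefer litellm's known limit for the model, else the configured default.
import litellm

model = "openai/gpt-4o-mini"  # default LLM_MODEL in the new .env template
model_max_tokens = (
    litellm.model_cost[model]["max_tokens"] if model in litellm.model_cost else None
)
max_tokens = model_max_tokens if model_max_tokens else 16384  # 16384 is the llm_max_tokens default
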
@@ -32,6 +32,7 @@ class OpenAIAdapter(LLMInterface):
api_version: str,
model: str,
transcription_model: str,
max_tokens: int,
streaming: bool = False,
):
self.aclient = instructor.from_litellm(litellm.acompletion)

@@ -41,6 +42,7 @@ class OpenAIAdapter(LLMInterface):
self.api_key = api_key
self.endpoint = endpoint
self.api_version = api_version
self.max_tokens = max_tokens
self.streaming = streaming

@observe(as_type="generation")

cognee/infrastructure/llm/tokenizer/Gemini/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
from .adapter import GeminiTokenizer

cognee/infrastructure/llm/tokenizer/Gemini/adapter.py (new file, 44 lines)

@@ -0,0 +1,44 @@
from typing import List, Any

from ..tokenizer_interface import TokenizerInterface


class GeminiTokenizer(TokenizerInterface):
def __init__(
self,
model: str,
max_tokens: int = 3072,
):
self.model = model
self.max_tokens = max_tokens

# Get LLM API key from config
from cognee.infrastructure.databases.vector.embeddings.config import get_embedding_config
from cognee.infrastructure.llm.config import get_llm_config

config = get_embedding_config()
llm_config = get_llm_config()

import google.generativeai as genai

genai.configure(api_key=config.embedding_api_key or llm_config.llm_api_key)

def extract_tokens(self, text: str) -> List[Any]:
raise NotImplementedError

def count_tokens(self, text: str) -> int:
"""
Returns the number of tokens in the given text.
Args:
text: str

Returns:
number of tokens in the given text

"""
import google.generativeai as genai

return len(genai.embed_content(model=f"models/{self.model}", content=text))

def trim_text_to_max_tokens(self, text: str) -> str:
raise NotImplementedError

cognee/infrastructure/llm/tokenizer/HuggingFace/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
from .adapter import HuggingFaceTokenizer

cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py (new file, 36 lines)

@@ -0,0 +1,36 @@
from typing import List, Any

from transformers import AutoTokenizer

from ..tokenizer_interface import TokenizerInterface


class HuggingFaceTokenizer(TokenizerInterface):
def __init__(
self,
model: str,
max_tokens: int = 512,
):
self.model = model
self.max_tokens = max_tokens

self.tokenizer = AutoTokenizer.from_pretrained(model)

def extract_tokens(self, text: str) -> List[Any]:
tokens = self.tokenizer.tokenize(text)
return tokens

def count_tokens(self, text: str) -> int:
"""
Returns the number of tokens in the given text.
Args:
text: str

Returns:
number of tokens in the given text

"""
return len(self.tokenizer.tokenize(text))

def trim_text_to_max_tokens(self, text: str) -> str:
raise NotImplementedError

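An illustrative use of the HuggingFace adapter above; the model name is an assumption for the example (any tokenizer published on the Hugging Face Hub would do) and is not one this PR configures:

# Hedged sketch: counting tokens with HuggingFaceTokenizer (model name is illustrative).
from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer

hf_tokenizer = HuggingFaceTokenizer(model="bert-base-uncased", max_tokens=512)
print(hf_tokenizer.count_tokens("cognee refactors tokenizing"))
print(hf_tokenizer.extract_tokens("cognee refactors tokenizing"))
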
cognee/infrastructure/llm/tokenizer/TikToken/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
from .adapter import TikTokenTokenizer

cognee/infrastructure/llm/tokenizer/TikToken/adapter.py (new file, 69 lines)

@@ -0,0 +1,69 @@
from typing import List, Any
import tiktoken

from ..tokenizer_interface import TokenizerInterface


class TikTokenTokenizer(TokenizerInterface):
"""
Tokenizer adapter for OpenAI.
Intended to be used as part of LLM Embedding and LLM Adapter classes
"""

def __init__(
self,
model: str,
max_tokens: int = 8191,
):
self.model = model
self.max_tokens = max_tokens
# Initialize TikToken for GPT based on model
self.tokenizer = tiktoken.encoding_for_model(self.model)

def extract_tokens(self, text: str) -> List[Any]:
tokens = []
# Using TikToken's method to tokenize text
token_ids = self.tokenizer.encode(text)
# Go through tokens and decode them to text value
for token_id in token_ids:
token = self.tokenizer.decode([token_id])
tokens.append(token)
return tokens

def count_tokens(self, text: str) -> int:
"""
Returns the number of tokens in the given text.
Args:
text: str

Returns:
number of tokens in the given text

"""
num_tokens = len(self.tokenizer.encode(text))
return num_tokens

def trim_text_to_max_tokens(self, text: str) -> str:
"""
Trims the text so that the number of tokens does not exceed max_tokens.

Args:
text (str): Original text string to be trimmed.

Returns:
str: Trimmed version of text or original text if under the limit.
"""
# First check the number of tokens
num_tokens = self.count_tokens(text)

# If the number of tokens is within the limit, return the text as is
if num_tokens <= self.max_tokens:
return text

# If the number exceeds the limit, trim the text
# This is a simple trim, it may cut words in half; consider using word boundaries for a cleaner cut
encoded_text = self.tokenizer.encode(text)
trimmed_encoded_text = encoded_text[: self.max_tokens]
# Decoding the trimmed text
trimmed_text = self.tokenizer.decode(trimmed_encoded_text)
return trimmed_text

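An illustrative use of the TikToken adapter above; "text-embedding-3-large" is assumed here only because it matches the embedding default in this PR and is a model tiktoken recognizes:

# Hedged sketch: counting and trimming with TikTokenTokenizer.
from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer

tokenizer = TikTokenTokenizer(model="text-embedding-3-large", max_tokens=8191)
print(tokenizer.count_tokens("cognee chunks text by tokens"))
trimmed = tokenizer.trim_text_to_max_tokens("some very long text " * 10_000)
print(tokenizer.count_tokens(trimmed) <= 8191)  # expected: True
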
cognee/infrastructure/llm/tokenizer/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
from .tokenizer_interface import TokenizerInterface

cognee/infrastructure/llm/tokenizer/tokenizer_interface.py (new file, 18 lines)

@@ -0,0 +1,18 @@
from typing import List, Protocol, Any
from abc import abstractmethod


class TokenizerInterface(Protocol):
"""Tokenizer interface"""

@abstractmethod
def extract_tokens(self, text: str) -> List[Any]:
raise NotImplementedError

@abstractmethod
def count_tokens(self, text: str) -> int:
raise NotImplementedError

@abstractmethod
def trim_text_to_max_tokens(self, text: str) -> str:
raise NotImplementedError

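Because TokenizerInterface is a Protocol, any class exposing these three methods can be plugged in structurally; the whitespace tokenizer below is a hypothetical example, not part of this PR:

# Hedged sketch: a minimal whitespace tokenizer that satisfies TokenizerInterface.
from typing import Any, List


class WhitespaceTokenizer:
    def __init__(self, max_tokens: int = 512):
        self.max_tokens = max_tokens

    def extract_tokens(self, text: str) -> List[Any]:
        return text.split()

    def count_tokens(self, text: str) -> int:
        return len(text.split())

    def trim_text_to_max_tokens(self, text: str) -> str:
        return " ".join(text.split()[: self.max_tokens])
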
cognee/infrastructure/llm/utils.py (new file, 38 lines)

@@ -0,0 +1,38 @@
import logging
import litellm

from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.llm.get_llm_client import get_llm_client

logger = logging.getLogger(__name__)


def get_max_chunk_tokens():
# Calculate max chunk size based on the following formula
embedding_engine = get_vector_engine().embedding_engine
llm_client = get_llm_client()

# We need to make sure chunk size won't take more than half of LLM max context token size
# but it also can't be bigger than the embedding engine max token size
llm_cutoff_point = llm_client.max_tokens // 2  # Round down the division
max_chunk_tokens = min(embedding_engine.max_tokens, llm_cutoff_point)

return max_chunk_tokens


def get_model_max_tokens(model_name: str):
"""
Args:
model_name: name of LLM or embedding model

Returns: Number of max tokens of model, or None if model is unknown
"""
max_tokens = None

if model_name in litellm.model_cost:
max_tokens = litellm.model_cost[model_name]["max_tokens"]
logger.debug(f"Max input tokens for {model_name}: {max_tokens}")
else:
logger.info("Model not found in LiteLLM's model_cost.")

return max_tokens

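A worked example of the formula above using the defaults introduced in this PR's .env template (just the arithmetic, no engines required):

# LLM_MAX_TOKENS=16384 and EMBEDDING_MAX_TOKENS=8191 are the template defaults.
llm_max_tokens = 16384
embedding_max_tokens = 8191

llm_cutoff_point = llm_max_tokens // 2                          # 8192, half the LLM context
max_chunk_tokens = min(embedding_max_tokens, llm_cutoff_point)  # 8191, the embedding limit wins
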
@@ -14,17 +14,15 @@ class TextChunker:
chunk_size = 0
token_count = 0

def __init__(
self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024
):
def __init__(self, document, get_text: callable, max_chunk_tokens: int, chunk_size: int = 1024):
self.document = document
self.max_chunk_size = chunk_size
self.get_text = get_text
self.max_tokens = max_tokens if max_tokens else float("inf")
self.max_chunk_tokens = max_chunk_tokens

def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_chunk_tokens
return word_count_fits and token_count_fits

def read(self):

@@ -32,7 +30,7 @@ class TextChunker:
for content_text in self.get_text():
for chunk_data in chunk_by_paragraph(
content_text,
self.max_tokens,
self.max_chunk_tokens,
self.max_chunk_size,
batch_paragraphs=True,
):

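The renamed check above gates each chunk on both limits; a small illustrative trace with made-up counts:

# Hedged sketch of check_word_count_and_token_count with illustrative numbers.
word_count_before, token_count_before = 800, 500
chunk_data = {"word_count": 150, "token_count": 40}
max_chunk_size, max_chunk_tokens = 1024, 512

word_count_fits = word_count_before + chunk_data["word_count"] <= max_chunk_size       # 950 <= 1024 -> True
token_count_fits = token_count_before + chunk_data["token_count"] <= max_chunk_tokens  # 540 <= 512 -> False
fits = word_count_fits and token_count_fits  # False: the token limit forces a new chunk
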
@@ -8,7 +8,6 @@ import os
class CognifyConfig(BaseSettings):
classification_model: object = DefaultContentPrediction
summarization_model: object = SummarizedContent
max_tokens: Optional[int] = os.getenv("MAX_TOKENS")
model_config = SettingsConfigDict(env_file=".env", extra="allow")

def to_dict(self) -> dict:

@@ -13,14 +13,14 @@ class AudioDocument(Document):
result = get_llm_client().create_transcript(self.raw_data_location)
return result.text

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int):
# Transcribe the audio file

text = self.create_transcript()

chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
)

yield from chunker.read()

@@ -11,5 +11,5 @@ class Document(DataPoint):
mime_type: str
_metadata: dict = {"index_fields": ["name"], "type": "Document"}

def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
def read(self, chunk_size: int, chunker=str, max_chunk_tokens: Optional[int] = None) -> str:
pass

@@ -13,13 +13,13 @@ class ImageDocument(Document):
result = get_llm_client().transcribe_image(self.raw_data_location)
return result.choices[0].message.content

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int):
# Transcribe the image file
text = self.transcribe_image()

chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
)

yield from chunker.read()

@@ -9,7 +9,7 @@ from .Document import Document
class PdfDocument(Document):
type: str = "pdf"

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int):
file = PdfReader(self.raw_data_location)

def get_text():

@@ -19,7 +19,7 @@ class PdfDocument(Document):

chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
)

yield from chunker.read()

@@ -7,7 +7,7 @@ from .Document import Document
class TextDocument(Document):
type: str = "text"

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int):
def get_text():
with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
while True:

@@ -21,7 +21,7 @@ class TextDocument(Document):
chunker_func = ChunkerConfig.get_chunker(chunker)

chunker = chunker_func(
self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
)

yield from chunker.read()

@@ -10,7 +10,7 @@ from .Document import Document
class UnstructuredDocument(Document):
type: str = "unstructured"

def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int) -> str:
def get_text():
try:
from unstructured.partition.auto import partition

@@ -29,6 +29,8 @@ class UnstructuredDocument(Document):

yield text

chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens)
chunker = TextChunker(
self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
)

yield from chunker.read()

@@ -10,8 +10,6 @@ import graphistry
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import tiktoken
import time

import logging
import sys

@@ -100,15 +98,6 @@ def send_telemetry(event_name: str, user_id, additional_properties: dict = {}):
print(f"Error sending telemetry through proxy: {response.status_code}")


def num_tokens_from_string(string: str, encoding_name: str) -> int:
"""Returns the number of tokens in a text string."""

# tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model(encoding_name)
num_tokens = len(encoding.encode(string))
return num_tokens


def get_file_content_hash(file_obj: Union[str, BinaryIO]) -> str:
h = hashlib.md5()

@@ -134,34 +123,6 @@ def get_file_content_hash(file_obj: Union[str, BinaryIO]) -> str:
raise IngestionError(message=f"Failed to load data from {file}: {e}")


def trim_text_to_max_tokens(text: str, max_tokens: int, encoding_name: str) -> str:
"""
Trims the text so that the number of tokens does not exceed max_tokens.

Args:
text (str): Original text string to be trimmed.
max_tokens (int): Maximum number of tokens allowed.
encoding_name (str): The name of the token encoding to use.

Returns:
str: Trimmed version of text or original text if under the limit.
"""
# First check the number of tokens
num_tokens = num_tokens_from_string(text, encoding_name)

# If the number of tokens is within the limit, return the text as is
if num_tokens <= max_tokens:
return text

# If the number exceeds the limit, trim the text
# This is a simple trim, it may cut words in half; consider using word boundaries for a cleaner cut
encoded_text = tiktoken.get_encoding(encoding_name).encode(text)
trimmed_encoded_text = encoded_text[:max_tokens]
# Decoding the trimmed text
trimmed_text = tiktoken.get_encoding(encoding_name).decode(trimmed_encoded_text)
return trimmed_text


def generate_color_palette(unique_layers):
colormap = plt.cm.get_cmap("viridis", len(unique_layers))
colors = [colormap(i) for i in range(len(unique_layers))]

@@ -10,7 +10,7 @@ from .chunk_by_sentence import chunk_by_sentence

def chunk_by_paragraph(
data: str,
max_tokens: Optional[Union[int, float]] = None,
max_chunk_tokens,
paragraph_length: int = 1024,
batch_paragraphs: bool = True,
) -> Iterator[Dict[str, Any]]:

@@ -30,8 +30,6 @@ def chunk_by_paragraph(
paragraph_ids = []
last_cut_type = None
current_token_count = 0
if not max_tokens:
max_tokens = float("inf")

vector_engine = get_vector_engine()
embedding_model = vector_engine.embedding_engine.model

@@ -47,7 +45,7 @@ def chunk_by_paragraph(

if current_word_count > 0 and (
current_word_count + word_count > paragraph_length
or current_token_count + token_count > max_tokens
or current_token_count + token_count > max_chunk_tokens
):
# Yield current chunk
chunk_dict = {

@@ -5,9 +5,9 @@ from cognee.modules.data.processing.document_types.Document import Document

async def extract_chunks_from_documents(
documents: list[Document],
max_chunk_tokens: int,
chunk_size: int = 1024,
chunker="text_chunker",
max_tokens: Optional[int] = None,
) -> AsyncGenerator:
"""
Extracts chunks of data from a list of documents based on the specified chunking parameters.

@@ -18,6 +18,6 @@ async def extract_chunks_from_documents(
"""
for document in documents:
for document_chunk in document.read(
chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens
chunk_size=chunk_size, chunker=chunker, max_chunk_tokens=max_chunk_tokens
):
yield document_chunk

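A hedged usage sketch tying the pieces together: max_chunk_tokens now comes from get_max_chunk_tokens() rather than a per-call max_tokens. The document list and the consuming coroutine are assumptions made for the example:

# Hedged sketch: consuming the async generator with the new max_chunk_tokens argument.
from cognee.infrastructure.llm import get_max_chunk_tokens
from cognee.tasks.documents import extract_chunks_from_documents


async def chunk_documents(documents):
    async for chunk in extract_chunks_from_documents(
        documents,
        max_chunk_tokens=get_max_chunk_tokens(),
        chunk_size=1024,
        chunker="text_chunker",
    ):
        print(chunk.word_count, chunk.cut_type)
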
@@ -89,26 +89,29 @@ def _get_subchunk_token_counts(


def _get_chunk_source_code(
code_token_counts: list[tuple[str, int]], overlap: float, max_tokens: int
code_token_counts: list[tuple[str, int]], overlap: float
) -> tuple[list[tuple[str, int]], str]:
"""Generates a chunk of source code from tokenized subchunks with overlap handling."""
current_count = 0
cumulative_counts = []
current_source_code = ""

# Get embedding engine used in vector database
embedding_engine = get_vector_engine().embedding_engine

for i, (child_code, token_count) in enumerate(code_token_counts):
current_count += token_count
cumulative_counts.append(current_count)
if current_count > max_tokens:
if current_count > embedding_engine.max_tokens:
break
current_source_code += f"\n{child_code}"

if current_count <= max_tokens:
if current_count <= embedding_engine.max_tokens:
return [], current_source_code.strip()

cutoff = 1
for i, cum_count in enumerate(cumulative_counts):
if cum_count > (1 - overlap) * max_tokens:
if cum_count > (1 - overlap) * embedding_engine.max_tokens:
break
cutoff = i

@@ -117,21 +120,18 @@ def _get_chunk_source_code(

def get_source_code_chunks_from_code_part(
code_file_part: CodePart,
max_tokens: int = 8192,
overlap: float = 0.25,
granularity: float = 0.1,
model_name: str = "text-embedding-3-large",
) -> Generator[SourceCodeChunk, None, None]:
"""Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
if not code_file_part.source_code:
logger.error(f"No source code in CodeFile {code_file_part.id}")
return

vector_engine = get_vector_engine()
embedding_model = vector_engine.embedding_engine.model
model_name = embedding_model.split("/")[-1]
tokenizer = tiktoken.encoding_for_model(model_name)
max_subchunk_tokens = max(1, int(granularity * max_tokens))
embedding_engine = get_vector_engine().embedding_engine
tokenizer = embedding_engine.tokenizer

max_subchunk_tokens = max(1, int(granularity * embedding_engine.max_tokens))
subchunk_token_counts = _get_subchunk_token_counts(
tokenizer, code_file_part.source_code, max_subchunk_tokens
)

@@ -139,7 +139,7 @@ def get_source_code_chunks_from_code_part(
previous_chunk = None
while subchunk_token_counts:
subchunk_token_counts, chunk_source_code = _get_chunk_source_code(
subchunk_token_counts, overlap, max_tokens
subchunk_token_counts, overlap
)
if not chunk_source_code:
continue

@@ -34,7 +34,7 @@ def test_AudioDocument():
)
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker", max_chunk_tokens=512)
):
assert ground_truth["word_count"] == paragraph_data.word_count, (
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'

@@ -23,7 +23,7 @@ def test_ImageDocument():
)
with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker", max_chunk_tokens=512)
):
assert ground_truth["word_count"] == paragraph_data.word_count, (
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'

@@ -25,7 +25,7 @@ def test_PdfDocument():
)

for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker")
GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker", max_chunk_tokens=2048)
):
assert ground_truth["word_count"] == paragraph_data.word_count, (
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'

@@ -37,7 +37,8 @@ def test_TextDocument(input_file, chunk_size):
)

for ground_truth, paragraph_data in zip(
GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker")
GROUND_TRUTH[input_file],
document.read(chunk_size=chunk_size, chunker="text_chunker", max_chunk_tokens=1024),
):
assert ground_truth["word_count"] == paragraph_data.word_count, (
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'

@@ -68,7 +68,9 @@ def test_UnstructuredDocument():
)

# Test PPTX
for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
for paragraph_data in pptx_document.read(
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
):
assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
assert "sentence_cut" == paragraph_data.cut_type, (

@@ -76,7 +78,9 @@ def test_UnstructuredDocument():
)

# Test DOCX
for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
for paragraph_data in docx_document.read(
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
):
assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
assert "sentence_end" == paragraph_data.cut_type, (

@@ -84,7 +88,9 @@ def test_UnstructuredDocument():
)

# TEST CSV
for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
for paragraph_data in csv_document.read(
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
):
assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
f"Read text doesn't match expected text: {paragraph_data.text}"

@@ -94,7 +100,9 @@ def test_UnstructuredDocument():
)

# Test XLSX
for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
for paragraph_data in xlsx_document.read(
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
):
assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
assert "sentence_cut" == paragraph_data.cut_type, (

@@ -8,14 +8,24 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS

paragraph_lengths = [64, 256, 1024]
batch_paragraphs_vals = [True, False]
max_chunk_tokens_vals = [512, 1024, 4096]


@pytest.mark.parametrize(
"input_text,paragraph_length,batch_paragraphs",
list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
"input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
list(
product(
list(INPUT_TEXTS.values()),
max_chunk_tokens_vals,
paragraph_lengths,
batch_paragraphs_vals,
)
),
)
def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
def test_chunk_by_paragraph_isomorphism(
input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
):
chunks = chunk_by_paragraph(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs)
reconstructed_text = "".join([chunk["text"] for chunk in chunks])
assert reconstructed_text == input_text, (
f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"

@@ -23,13 +33,23 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para


@pytest.mark.parametrize(
"input_text,paragraph_length,batch_paragraphs",
list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
"input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
list(
product(
list(INPUT_TEXTS.values()),
max_chunk_tokens_vals,
paragraph_lengths,
batch_paragraphs_vals,
)
),
)
def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
def test_paragraph_chunk_length(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs):
chunks = list(
chunk_by_paragraph(
data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
data=input_text,
max_chunk_tokens=max_chunk_tokens,
paragraph_length=paragraph_length,
batch_paragraphs=batch_paragraphs,
)
)

@@ -42,12 +62,24 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):


@pytest.mark.parametrize(
"input_text,paragraph_length,batch_paragraphs",
list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
"input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
list(
product(
list(INPUT_TEXTS.values()),
max_chunk_tokens_vals,
paragraph_lengths,
batch_paragraphs_vals,
)
),
)
def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_paragraphs):
def test_chunk_by_paragraph_chunk_numbering(
input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
):
chunks = chunk_by_paragraph(
data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
data=input_text,
max_chunk_tokens=max_chunk_tokens,
paragraph_length=paragraph_length,
batch_paragraphs=batch_paragraphs,
)
chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
assert np.all(chunk_indices == np.arange(len(chunk_indices))), (

@@ -50,7 +50,7 @@ Third paragraph is cut and is missing the dot at the end""",
def run_chunking_test(test_text, expected_chunks):
chunks = []
for chunk_data in chunk_by_paragraph(
data=test_text, paragraph_length=12, batch_paragraphs=False
data=test_text, paragraph_length=12, batch_paragraphs=False, max_chunk_tokens=512
):
chunks.append(chunk_data)

@@ -11,9 +11,7 @@ from cognee.shared.exceptions import IngestionError
from cognee.shared.utils import (
get_anonymous_id,
send_telemetry,
num_tokens_from_string,
get_file_content_hash,
trim_text_to_max_tokens,
prepare_edges,
prepare_nodes,
create_cognee_style_network_with_logo,

@@ -45,15 +43,6 @@ def test_get_anonymous_id(mock_open_file, mock_makedirs, temp_dir):
# args, kwargs = mock_post.call_args
# assert kwargs["json"]["event_name"] == "test_event"

#
# @patch("tiktoken.encoding_for_model")
# def test_num_tokens_from_string(mock_encoding):
# mock_encoding.return_value.encode = lambda x: list(x)
#
# assert num_tokens_from_string("hello", "test_encoding") == 5
# assert num_tokens_from_string("world", "test_encoding") == 5
#


@patch("builtins.open", new_callable=mock_open, read_data=b"test_data")
def test_get_file_content_hash_file(mock_open_file):

@@ -73,18 +62,6 @@ def test_get_file_content_hash_stream():
assert result == expected_hash


# def test_trim_text_to_max_tokens():
# text = "This is a test string with multiple words."
# encoding_name = "test_encoding"
#
# with patch("tiktoken.get_encoding") as mock_get_encoding:
# mock_get_encoding.return_value.encode = lambda x: list(x)
# mock_get_encoding.return_value.decode = lambda x: "".join(x)
#
# result = trim_text_to_max_tokens(text, 5, encoding_name)
# assert result == text[:5]


def test_prepare_edges():
graph = nx.MultiDiGraph()
graph.add_edge("A", "B", key="AB", weight=1)

@@ -650,6 +650,7 @@
"from cognee.modules.pipelines import run_tasks\n",
"from cognee.modules.users.models import User\n",
"from cognee.tasks.documents import check_permissions_on_documents, classify_documents, extract_chunks_from_documents\n",
"from cognee.infrastructure.llm import get_max_chunk_tokens\n",
"from cognee.tasks.graph import extract_graph_from_data\n",
"from cognee.tasks.storage import add_data_points\n",
"from cognee.tasks.summarization import summarize_text\n",

@@ -663,7 +664,7 @@
" tasks = [\n",
" Task(classify_documents),\n",
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
" Task(extract_chunks_from_documents), # Extract text chunks based on the document type.\n",
" Task(extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()), # Extract text chunks based on the document type.\n",
" Task(extract_graph_from_data, graph_model = KnowledgeGraph, task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks.\n",
" Task(\n",
" summarize_text,\n",

poetry.lock (generated, 246 changed lines)
|
@ -1,4 +1,4 @@
|
|||
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiofiles"
|
||||
|
|
@ -645,6 +645,17 @@ urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >
|
|||
[package.extras]
|
||||
crt = ["awscrt (==0.23.4)"]
|
||||
|
||||
[[package]]
|
||||
name = "cachetools"
|
||||
version = "5.5.1"
|
||||
description = "Extensible memoizing collections and decorators"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "cachetools-5.5.1-py3-none-any.whl", hash = "sha256:b76651fdc3b24ead3c648bbdeeb940c1b04d365b38b4af66788f9ec4a81d42bb"},
|
||||
{file = "cachetools-5.5.1.tar.gz", hash = "sha256:70f238fbba50383ef62e55c6aff6d9673175fe59f7c6782c7a0b9e38f4a9df95"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2024.12.14"
|
||||
|
|
@ -1995,6 +2006,135 @@ files = [
|
|||
{file = "giturlparse-0.12.0.tar.gz", hash = "sha256:c0fff7c21acc435491b1779566e038757a205c1ffdcb47e4f81ea52ad8c3859a"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "google-ai-generativelanguage"
|
||||
version = "0.6.15"
|
||||
description = "Google Ai Generativelanguage API client library"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "google_ai_generativelanguage-0.6.15-py3-none-any.whl", hash = "sha256:5a03ef86377aa184ffef3662ca28f19eeee158733e45d7947982eb953c6ebb6c"},
|
||||
{file = "google_ai_generativelanguage-0.6.15.tar.gz", hash = "sha256:8f6d9dc4c12b065fe2d0289026171acea5183ebf2d0b11cefe12f3821e159ec3"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]}
|
||||
google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev"
|
||||
proto-plus = ">=1.22.3,<2.0.0dev"
|
||||
protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev"
|
||||
|
||||
[[package]]
|
||||
name = "google-api-core"
|
||||
version = "2.24.0"
|
||||
description = "Google API client core library"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "google_api_core-2.24.0-py3-none-any.whl", hash = "sha256:10d82ac0fca69c82a25b3efdeefccf6f28e02ebb97925a8cce8edbfe379929d9"},
|
||||
{file = "google_api_core-2.24.0.tar.gz", hash = "sha256:e255640547a597a4da010876d333208ddac417d60add22b6851a0c66a831fcaf"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
google-auth = ">=2.14.1,<3.0.dev0"
|
||||
googleapis-common-protos = ">=1.56.2,<2.0.dev0"
|
||||
grpcio = [
|
||||
{version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
|
||||
{version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
|
||||
]
|
||||
grpcio-status = [
|
||||
{version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
|
||||
{version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
|
||||
]
|
||||
proto-plus = ">=1.22.3,<2.0.0dev"
|
||||
protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
|
||||
requests = ">=2.18.0,<3.0.0.dev0"
|
||||
|
||||
[package.extras]
|
||||
async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"]
|
||||
grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"]
|
||||
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
|
||||
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
|
||||
|
||||
[[package]]
|
||||
name = "google-api-python-client"
|
||||
version = "2.159.0"
|
||||
description = "Google API Client Library for Python"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "google_api_python_client-2.159.0-py2.py3-none-any.whl", hash = "sha256:baef0bb631a60a0bd7c0bf12a5499e3a40cd4388484de7ee55c1950bf820a0cf"},
|
||||
{file = "google_api_python_client-2.159.0.tar.gz", hash = "sha256:55197f430f25c907394b44fa078545ffef89d33fd4dca501b7db9f0d8e224bd6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0.dev0"
|
||||
google-auth = ">=1.32.0,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0.dev0"
|
||||
google-auth-httplib2 = ">=0.2.0,<1.0.0"
|
||||
httplib2 = ">=0.19.0,<1.dev0"
|
||||
uritemplate = ">=3.0.1,<5"
|
||||
|
||||
[[package]]
|
||||
name = "google-auth"
|
||||
version = "2.38.0"
|
||||
description = "Google Authentication Library"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a"},
|
||||
{file = "google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cachetools = ">=2.0.0,<6.0"
|
||||
pyasn1-modules = ">=0.2.1"
|
||||
rsa = ">=3.1.4,<5"
|
||||
|
||||
[package.extras]
|
||||
aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"]
|
||||
enterprise-cert = ["cryptography", "pyopenssl"]
|
||||
pyjwt = ["cryptography (>=38.0.3)", "pyjwt (>=2.0)"]
|
||||
pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"]
|
||||
reauth = ["pyu2f (>=0.1.5)"]
|
||||
requests = ["requests (>=2.20.0,<3.0.0.dev0)"]
|
||||
|
||||
[[package]]
|
||||
name = "google-auth-httplib2"
|
||||
version = "0.2.0"
|
||||
description = "Google Authentication Library: httplib2 transport"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "google-auth-httplib2-0.2.0.tar.gz", hash = "sha256:38aa7badf48f974f1eb9861794e9c0cb2a0511a4ec0679b1f886d108f5640e05"},
|
||||
{file = "google_auth_httplib2-0.2.0-py2.py3-none-any.whl", hash = "sha256:b65a0a2123300dd71281a7bf6e64d65a0759287df52729bdd1ae2e47dc311a3d"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
google-auth = "*"
|
||||
httplib2 = ">=0.19.0"
|
||||
|
||||
[[package]]
|
||||
name = "google-generativeai"
|
||||
version = "0.8.4"
|
||||
description = "Google Generative AI High level API client library and tools."
|
||||
optional = true
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "google_generativeai-0.8.4-py3-none-any.whl", hash = "sha256:e987b33ea6decde1e69191ddcaec6ef974458864d243de7191db50c21a7c5b82"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
google-ai-generativelanguage = "0.6.15"
|
||||
google-api-core = "*"
|
||||
google-api-python-client = "*"
|
||||
google-auth = ">=2.15.0"
|
||||
protobuf = "*"
|
||||
pydantic = "*"
|
||||
tqdm = "*"
|
||||
typing-extensions = "*"
|
||||
|
||||
[package.extras]
|
||||
dev = ["Pillow", "absl-py", "black", "ipython", "nose2", "pandas", "pytype", "pyyaml"]
|
||||
|
||||
[[package]]
|
||||
name = "googleapis-common-protos"
|
||||
version = "1.66.0"
|
||||
|
|
@ -2251,6 +2391,22 @@ files = [
|
|||
grpcio = ">=1.67.1"
|
||||
protobuf = ">=5.26.1,<6.0dev"
|
||||
|
||||
[[package]]
|
||||
name = "grpcio-status"
|
||||
version = "1.67.1"
|
||||
description = "Status proto mapping for gRPC"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "grpcio_status-1.67.1-py3-none-any.whl", hash = "sha256:16e6c085950bdacac97c779e6a502ea671232385e6e37f258884d6883392c2bd"},
|
||||
{file = "grpcio_status-1.67.1.tar.gz", hash = "sha256:2bf38395e028ceeecfd8866b081f61628114b384da7d51ae064ddc8d766a5d11"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
googleapis-common-protos = ">=1.5.5"
|
||||
grpcio = ">=1.67.1"
|
||||
protobuf = ">=5.26.1,<6.0dev"
|
||||
|
||||
[[package]]
|
||||
name = "grpcio-tools"
|
||||
version = "1.67.1"
|
||||
|
|
@ -2445,6 +2601,20 @@ http2 = ["h2 (>=3,<5)"]
|
|||
socks = ["socksio (==1.*)"]
|
||||
trio = ["trio (>=0.22.0,<1.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "httplib2"
|
||||
version = "0.22.0"
|
||||
description = "A comprehensive HTTP client library."
|
||||
optional = true
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
files = [
|
||||
{file = "httplib2-0.22.0-py3-none-any.whl", hash = "sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc"},
|
||||
{file = "httplib2-0.22.0.tar.gz", hash = "sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""}
|
||||
|
||||
[[package]]
|
||||
name = "httpx"
|
||||
version = "0.27.0"
|
||||
|
|
@ -4998,8 +5168,8 @@ files = [
|
|||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
]
|
||||
python-dateutil = ">=2.8.2"
|
||||
pytz = ">=2020.1"
|
||||
|
|
@ -5520,6 +5690,23 @@ files = [
|
|||
{file = "propcache-0.2.1.tar.gz", hash = "sha256:3f77ce728b19cb537714499928fe800c3dda29e8d9428778fc7c186da4c09a64"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proto-plus"
|
||||
version = "1.25.0"
|
||||
description = "Beautiful, Pythonic protocol buffers."
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "proto_plus-1.25.0-py3-none-any.whl", hash = "sha256:c91fc4a65074ade8e458e95ef8bac34d4008daa7cce4a12d6707066fca648961"},
|
||||
{file = "proto_plus-1.25.0.tar.gz", hash = "sha256:fbb17f57f7bd05a68b7707e745e26528b0b3c34e378db91eef93912c54982d91"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
protobuf = ">=3.19.0,<6.0.0dev"
|
||||
|
||||
[package.extras]
|
||||
testing = ["google-api-core (>=1.31.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf"
|
||||
version = "5.29.3"
|
||||
|
|
@ -5686,6 +5873,31 @@ files = [
|
|||
[package.extras]
|
||||
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
|
||||
|
||||
[[package]]
|
||||
name = "pyasn1"
|
||||
version = "0.6.1"
|
||||
description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"},
|
||||
{file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyasn1-modules"
|
||||
version = "0.4.1"
|
||||
description = "A collection of ASN.1-based protocols modules"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd"},
|
||||
{file = "pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyasn1 = ">=0.4.6,<0.7.0"
|
||||
|
||||
[[package]]
|
||||
name = "pycparser"
|
||||
version = "2.22"
|
||||
|
|
@ -5927,8 +6139,8 @@ astroid = ">=3.3.8,<=3.4.0-dev0"
|
|||
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
|
||||
dill = [
|
||||
{version = ">=0.2", markers = "python_version < \"3.11\""},
|
||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=0.3.7", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
]
|
||||
isort = ">=4.2.5,<5.13.0 || >5.13.0,<6"
|
||||
mccabe = ">=0.6,<0.8"
|
||||
|
|
@ -6967,6 +7179,20 @@ files = [
|
|||
{file = "rpds_py-0.22.3.tar.gz", hash = "sha256:e32fee8ab45d3c2db6da19a5323bc3362237c8b653c70194414b892fd06a080d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rsa"
|
||||
version = "4.9"
|
||||
description = "Pure-Python RSA implementation"
|
||||
optional = true
|
||||
python-versions = ">=3.6,<4"
|
||||
files = [
|
||||
{file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"},
|
||||
{file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyasn1 = ">=0.1.3"
|
||||
|
||||
[[package]]
|
||||
name = "ruff"
|
||||
version = "0.9.3"
|
||||
|
|
@ -8218,6 +8444,17 @@ files = [
|
|||
[package.extras]
|
||||
dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake8-commas", "flake8-comprehensions", "flake8-continuation", "flake8-datetimez", "flake8-docstrings", "flake8-import-order", "flake8-literal", "flake8-modern-annotations", "flake8-noqa", "flake8-pyproject", "flake8-requirements", "flake8-typechecking-import", "flake8-use-fstring", "mypy", "pep8-naming", "types-PyYAML"]
|
||||
|
||||
[[package]]
|
||||
name = "uritemplate"
|
||||
version = "4.1.1"
|
||||
description = "Implementation of RFC 6570 URI Templates"
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"},
|
||||
{file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.3.0"
|
||||
|
|
@ -8801,6 +9038,7 @@ deepeval = ["deepeval"]
|
|||
docs = ["unstructured"]
|
||||
falkordb = ["falkordb"]
|
||||
filesystem = ["botocore"]
|
||||
gemini = ["google-generativeai"]
|
||||
groq = ["groq"]
|
||||
langchain = ["langchain_text_splitters", "langsmith"]
|
||||
llama-index = ["llama-index-core"]
|
||||
|
|
@ -8815,4 +9053,4 @@ weaviate = ["weaviate-client"]
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.10.0,<3.13"
|
||||
content-hash = "1cc352109264d0e3add524cdc15c9b2e6153e1bab20d968b40e42a4d5138967f"
|
||||
content-hash = "480675c274cd85a76a95bf03af865b1a0b462f25bbc21d7427b0a0b8e21c13db"
|
||||
|
|
|
|||
@@ -77,6 +77,7 @@ pre-commit = "^4.0.1"
httpx = "0.27.0"
bokeh="^3.6.2"
nltk = "3.9.1"
google-generativeai = {version = "^0.8.4", optional = true}
parso = {version = "^0.8.4", optional = true}
jedi = {version = "^0.19.2", optional = true}

@@ -90,6 +91,7 @@ postgres = ["psycopg2", "pgvector", "asyncpg"]
notebook = ["notebook", "ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
langchain = ["langsmith", "langchain_text_splitters"]
llama-index = ["llama-index-core"]
gemini = ["google-generativeai"]
deepeval = ["deepeval"]
posthog = ["posthog"]
falkordb = ["falkordb"]