Merge branch 'dev' into add-column-value-db-migration

Commit 6f78462f3c
28 changed files with 3924 additions and 3817 deletions
@@ -97,7 +97,7 @@ git push origin feature/your-feature-name
 
 2. Create a Pull Request:
    - Go to the [**cognee** repository](https://github.com/topoteretes/cognee)
-   - Click "Compare & Pull Request" and make sure to open PR against dev branch
+   - Click "Compare & Pull Request"
    - Fill in the PR template with details about your changes
 
 ## 5. 📜 Developer Certificate of Origin (DCO)
Dockerfile (83 changes)

@@ -1,60 +1,61 @@
-FROM python:3.11-slim
+# Use a Python image with uv pre-installed
+FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv
 
-# Define Poetry extras to install
-ARG POETRY_EXTRAS="\
-    # API \
-    api \
-    # Storage & Databases \
-    postgres weaviate qdrant neo4j falkordb milvus kuzu chromadb \
-    # Notebooks & Interactive Environments \
-    notebook \
-    # LLM & AI Frameworks \
-    langchain llama-index gemini huggingface ollama mistral groq anthropic \
-    # Evaluation & Monitoring \
-    deepeval evals posthog \
-    # Graph Processing & Code Analysis \
-    codegraph graphiti \
-    # Document Processing \
-    docs"
+# Install the project into `/app`
+WORKDIR /app
+
+# Enable bytecode compilation
+# ENV UV_COMPILE_BYTECODE=1
+
+# Copy from the cache instead of linking since it's a mounted volume
+ENV UV_LINK_MODE=copy
 
 # Set build argument
 ARG DEBUG
 
 # Set environment variable based on the build argument
 ENV DEBUG=${DEBUG}
-ENV PIP_NO_CACHE_DIR=true
-ENV PATH="${PATH}:/root/.poetry/bin"
 
-RUN apt-get update
-
-RUN apt-get install -y \
-    gcc \
-    build-essential \
-    libpq-dev
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    gcc \
+    libpq-dev \
+    git \
+    curl \
+    clang \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
 
-WORKDIR /app
-COPY pyproject.toml poetry.lock /app/
-
-RUN pip install poetry
-
-# Don't create virtualenv since Docker is already isolated
-RUN poetry config virtualenvs.create false
-
-# Install the dependencies using the defined extras
-RUN poetry install --extras "${POETRY_EXTRAS}" --no-root
-
-# Set the PYTHONPATH environment variable to include the /app directory
-ENV PYTHONPATH=/app
-
-COPY cognee/ /app/cognee
+# Copy pyproject.toml and lockfile first for better caching
+COPY README.md pyproject.toml uv.lock entrypoint.sh ./
+
+# Install the project's dependencies using the lockfile and settings
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --extra debug --extra api --extra postgres --extra weaviate --extra qdrant --extra neo4j --extra kuzu --extra llama-index --extra gemini --extra ollama --extra mistral --extra groq --extra anthropic --frozen --no-install-project --no-dev --no-editable
 
 # Copy Alembic configuration
 COPY alembic.ini /app/alembic.ini
 COPY alembic/ /app/alembic
 
-COPY entrypoint.sh /app/entrypoint.sh
+# Then, add the rest of the project source code and install it
+# Installing separately from its dependencies allows optimal layer caching
+COPY ./cognee /app/cognee
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --extra debug --extra api --extra postgres --extra weaviate --extra qdrant --extra neo4j --extra kuzu --extra llama-index --extra gemini --extra ollama --extra mistral --extra groq --extra anthropic --frozen --no-dev --no-editable
+
+FROM python:3.12-slim-bookworm
+
+WORKDIR /app
+
+COPY --from=uv /app /app
+# COPY --from=uv /app/.venv /app/.venv
+# COPY --from=uv /root/.local /root/.local
 
 RUN chmod +x /app/entrypoint.sh
 
-RUN sed -i 's/\r$//' /app/entrypoint.sh
+# Place executables in the environment at the front of the path
+ENV PATH="/app/.venv/bin:$PATH"
+
+ENV PYTHONPATH=/app
 
 ENTRYPOINT ["/app/entrypoint.sh"]
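
The runtime stage relies on the uv-created virtualenv at /app/.venv being first on PATH. A quick way to verify a built image is a smoke test along these lines (a hypothetical check, not part of this commit):

    # check_venv.py - hypothetical smoke test; run inside the final image, e.g.
    #   docker run --rm --entrypoint python <image> - < check_venv.py
    # Confirms the interpreter in use is the uv-managed virtualenv from the build stage.
    import sys

    assert sys.prefix.startswith("/app/.venv"), f"unexpected prefix: {sys.prefix}"
    print("OK: running from", sys.executable)
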
Binary image file not shown (before: 2.3 MiB, after: 353 KiB).
@@ -43,7 +43,7 @@ export default function Home() {
   const onDataAdd = useCallback((dataset: { id: string }, files: File[]) => {
     return addData(dataset, files)
       .then(() => {
-        showNotification("Data added successfully.", 5000);
+        showNotification("Data added successfully. Please run \"Cognify\" when ready.", 5000);
         openDatasetData(dataset);
       });
   }, [showNotification])

@@ -60,6 +60,14 @@ export default function Home() {
     });
   }, [showNotification]);
 
+  const onCognify = useCallback(() => {
+    const dataset = datasets.find((dataset) => dataset.id === selectedDataset);
+    return onDatasetCognify({
+      id: dataset!.id,
+      name: dataset!.name,
+    });
+  }, [datasets, onDatasetCognify, selectedDataset]);
+
   const {
     value: isSettingsModalOpen,
     setTrue: openSettingsModal,

@@ -95,6 +103,7 @@ export default function Home() {
           datasetId={selectedDataset}
           onClose={closeDatasetData}
           onDataAdd={onDataAdd}
+          onCognify={onCognify}
         />
       </div>
     )}
@@ -7,7 +7,7 @@ import cognifyDataset from '@/modules/datasets/cognifyDataset';
 
 interface ConfigStepProps {
   onNext: () => void;
-  dataset: { id: string }
+  dataset: { name: string }
 }
 
 export default function CognifyStep({ onNext, dataset }: ConfigStepProps) {
@@ -2,13 +2,13 @@ import { Explorer } from '@/ui/Partials';
 import { Spacer } from 'ohmy-ui';
 
 interface ExploreStepProps {
-  dataset: { id: string };
+  dataset: { name: string };
 }
 
 export default function ExploreStep({ dataset }: ExploreStepProps) {
   return (
     <Spacer horizontal="3">
-      <Explorer dataset={dataset!} />
+      <Explorer dataset={dataset} />
     </Spacer>
   )
 }
@@ -29,7 +29,7 @@ export default function WizardPage({
     setFalse: closeSettingsModal,
   } = useBoolean(false);
 
-  const dataset = { id: 'main' };
+  const dataset = { name: 'main' };
 
   return (
     <main className={styles.main}>
@@ -1,13 +1,13 @@
 import { fetch } from '@/utils';
 
-export default function cognifyDataset(dataset: { id: string, name: string }) {
+export default function cognifyDataset(dataset: { id?: string, name?: string }) {
   return fetch('/v1/cognify', {
     method: 'POST',
     headers: {
       'Content-Type': 'application/json',
     },
     body: JSON.stringify({
-      datasets: [dataset.id],
+      datasets: [dataset.id || dataset.name],
     }),
   }).then((response) => response.json());
 }
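
With `id` and `name` both optional, the request body falls back to the dataset name when no id is known yet. The equivalent call against the backend, sketched in Python (assumes a local server on port 8000 and the `requests` package; the dataset name is illustrative):

    import requests

    # The /v1/cognify body carries a list of dataset identifiers; after this
    # change the frontend sends the dataset name when no id is available.
    response = requests.post(
        "http://localhost:8000/api/v1/cognify",
        json={"datasets": ["main"]},
        headers={"Content-Type": "application/json"},
    )
    print(response.json())
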
@@ -1,6 +1,6 @@
 import { fetch } from '@/utils';
 
-export default function getExplorationGraphUrl(dataset: { id: string }) {
+export default function getExplorationGraphUrl(dataset: { name: string }) {
   return fetch('/v1/visualize')
     .then(async (response) => {
       if (response.status !== 200) {
@@ -6,6 +6,8 @@ import {
   Text,
   UploadInput,
   CloseIcon,
+  CTAButton,
+  useBoolean,
 } from "ohmy-ui";
 import { fetch } from '@/utils';
 import RawDataPreview from './RawDataPreview';

@@ -28,9 +30,10 @@ interface DataViewProps {
   datasetId: string;
   onClose: () => void;
   onDataAdd: (dataset: DatasetLike, files: File[]) => void;
+  onCognify: () => Promise<any>;
 }
 
-export default function DataView({ datasetId, data, onClose, onDataAdd }: DataViewProps) {
+export default function DataView({ datasetId, data, onClose, onDataAdd, onCognify }: DataViewProps) {
   // const handleDataDelete = () => {};
   const [rawData, setRawData] = useState<ArrayBuffer | null>(null);
   const [selectedData, setSelectedData] = useState<Data | null>(null);

@@ -52,7 +55,19 @@ export default function DataView({ datasetId, data, onClose, onDataAdd }: DataViewProps) {
 
   const handleDataAdd = (files: File[]) => {
     onDataAdd({ id: datasetId }, files);
-  }
+  };
+
+  const {
+    value: isCognifyButtonDisabled,
+    setTrue: disableCognifyButton,
+    setFalse: enableCognifyButton,
+  } = useBoolean(false);
+
+  const handleCognify = () => {
+    disableCognifyButton();
+    onCognify()
+      .finally(() => enableCognifyButton());
+  };
 
   return (
     <Stack orientation="vertical" gap="4">

@@ -62,6 +77,11 @@ export default function DataView({ datasetId, data, onClose, onDataAdd }: DataViewProps) {
             <Text>Add data</Text>
           </UploadInput>
         </div>
+        <div>
+          <CTAButton disabled={isCognifyButtonDisabled} onClick={handleCognify}>
+            <Text>Cognify</Text>
+          </CTAButton>
+        </div>
         <GhostButton hugContent onClick={onClose}>
           <CloseIcon />
         </GhostButton>
@@ -7,7 +7,7 @@ import { getExplorationGraphUrl } from '@/modules/exploration';
 import styles from './Explorer.module.css';
 
 interface ExplorerProps {
-  dataset: { id: string };
+  dataset: { name: string };
   className?: string;
   style?: React.CSSProperties;
 }
@@ -28,9 +28,6 @@ export default function SearchView() {
   }, []);
 
   const searchOptions = [{
-    value: 'INSIGHTS',
-    label: 'Query insights from documents',
-  }, {
     value: 'GRAPH_COMPLETION',
     label: 'Completion using Cognee\'s graph based memory',
   }, {

@@ -81,6 +78,8 @@ export default function SearchView() {
 
     scrollToBottom();
 
+    setInputValue('');
+
     const searchTypeValue = searchType.value;
 
     fetch('/v1/search', {

@@ -103,10 +102,12 @@ export default function SearchView() {
           text: convertToSearchTypeOutput(systemMessage, searchTypeValue),
         },
       ]);
-      setInputValue('');
 
       scrollToBottom();
     })
+    .catch(() => {
+      setInputValue(inputValue);
+    });
   }, [inputValue, scrollToBottom, searchType.value]);
 
   const {
@@ -1,7 +1,7 @@
 import handleServerErrors from './handleServerErrors';
 
 export default function fetch(url: string, options: RequestInit = {}): Promise<Response> {
-  return global.fetch('http://127.0.0.1:8000/api' + url, {
+  return global.fetch('http://localhost:8000/api' + url, {
     ...options,
     headers: {
       ...options.headers,
@@ -144,7 +144,9 @@ async def cognify_status():
     """Get status of cognify pipeline"""
     with redirect_stdout(sys.stderr):
         user = await get_default_user()
-        status = await get_pipeline_status([await get_unique_dataset_id("main_dataset", user)])
+        status = await get_pipeline_status(
+            [await get_unique_dataset_id("main_dataset", user)], "cognify_pipeline"
+        )
         return [types.TextContent(type="text", text=str(status))]
 
 

@@ -153,7 +155,9 @@ async def codify_status():
     """Get status of codify pipeline"""
     with redirect_stdout(sys.stderr):
         user = await get_default_user()
-        status = await get_pipeline_status([await get_unique_dataset_id("codebase", user)])
+        status = await get_pipeline_status(
+            [await get_unique_dataset_id("codebase", user)], "cognify_code_pipeline"
+        )
         return [types.TextContent(type="text", text=str(status))]
 
 
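
Both status tools now pass an explicit pipeline name, since a dataset can accumulate runs from more than one pipeline. A rough sketch of consuming the new signature (import paths as in this diff; treating the result as a dict keyed by the stringified dataset id follows run_pipeline further down):

    from cognee.modules.data.methods.get_unique_dataset_id import get_unique_dataset_id
    from cognee.modules.pipelines.models import PipelineRun, PipelineRunStatus
    from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
    from cognee.modules.users.methods import get_default_user

    async def is_cognify_done() -> bool:
        user = await get_default_user()
        dataset_id = await get_unique_dataset_id("main_dataset", user)
        # Only runs logged under "cognify_pipeline" are considered now.
        statuses = await get_pipeline_status([dataset_id], "cognify_pipeline")
        return statuses.get(str(dataset_id)) == PipelineRunStatus.DATASET_PROCESSING_COMPLETED
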
@@ -1,8 +1,9 @@
 from typing import Union, BinaryIO, List, Optional
-from cognee.modules.users.models import User
+
 from cognee.modules.pipelines import Task
-from cognee.tasks.ingestion import ingest_data, resolve_data_directories
+from cognee.modules.users.models import User
 from cognee.modules.pipelines import cognee_pipeline
+from cognee.tasks.ingestion import ingest_data, resolve_data_directories
 
 
 async def add(
@@ -34,7 +34,9 @@ async def cognify(
 ):
     tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path)
 
-    return await cognee_pipeline(tasks=tasks, datasets=datasets, user=user)
+    return await cognee_pipeline(
+        tasks=tasks, datasets=datasets, user=user, pipeline_name="cognify_pipeline"
+    )
 
 
 async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
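
For orientation, the user-facing flow that exercises these named pipelines looks roughly like this (a minimal sketch; the top-level `cognee.add`/`cognee.cognify` entry points and the default dataset name are assumed from the public API, not shown in this diff):

    import asyncio
    import cognee

    async def main():
        # add() runs under "add_pipeline" and pre-logs an INITIATED run for
        # "cognify_pipeline", so the cognify below is not skipped as COMPLETED.
        await cognee.add("Natural language processing is a field of AI.")
        await cognee.cognify(datasets=["main_dataset"])

    asyncio.run(main())
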
@@ -28,7 +28,7 @@ class datasets:
 
     @staticmethod
     async def get_status(dataset_ids: list[UUID]) -> dict:
-        return await get_pipeline_status(dataset_ids)
+        return await get_pipeline_status(dataset_ids, pipeline_name="cognify_pipeline")
 
     @staticmethod
     async def delete_dataset(dataset_id: str):
@@ -1,8 +1,8 @@
 import asyncio
-from typing import Generic, List, Optional, TypeVar, Union, get_args, get_origin, get_type_hints
 import lancedb
-from lancedb.pydantic import LanceModel, Vector
 from pydantic import BaseModel
+from lancedb.pydantic import LanceModel, Vector
+from typing import Generic, List, Optional, TypeVar, Union, get_args, get_origin, get_type_hints
 
 from cognee.exceptions import InvalidValueError
 from cognee.infrastructure.engine import DataPoint

@@ -16,8 +16,6 @@ from ..models.ScoredResult import ScoredResult
 from ..utils import normalize_distances
 from ..vector_db_interface import VectorDBInterface
 
-from tenacity import retry, stop_after_attempt, wait_exponential
-
 
 class IndexSchema(DataPoint):
     id: str
@@ -6,6 +6,7 @@ from cognee.infrastructure.databases.relational import Base
 
 
 class PipelineRunStatus(enum.Enum):
+    DATASET_PROCESSING_INITIATED = "DATASET_PROCESSING_INITIATED"
     DATASET_PROCESSING_STARTED = "DATASET_PROCESSING_STARTED"
     DATASET_PROCESSING_COMPLETED = "DATASET_PROCESSING_COMPLETED"
     DATASET_PROCESSING_ERRORED = "DATASET_PROCESSING_ERRORED"
@@ -1,3 +1,4 @@
+from .log_pipeline_run_initiated import log_pipeline_run_initiated
 from .log_pipeline_run_start import log_pipeline_run_start
 from .log_pipeline_run_complete import log_pipeline_run_complete
 from .log_pipeline_run_error import log_pipeline_run_error
@@ -5,7 +5,7 @@ from ..models import PipelineRun
 from sqlalchemy.orm import aliased
 
 
-async def get_pipeline_status(dataset_ids: list[UUID]):
+async def get_pipeline_status(dataset_ids: list[UUID], pipeline_name: str):
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:

@@ -20,6 +20,7 @@ async def get_pipeline_status(dataset_ids: list[UUID]):
                 .label("rn"),
             )
             .filter(PipelineRun.dataset_id.in_(dataset_ids))
+            .filter(PipelineRun.pipeline_name == pipeline_name)
             .subquery()
         )
 
@@ -0,0 +1,22 @@
+from uuid import UUID, uuid4
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.pipelines.models import PipelineRun, PipelineRunStatus
+
+
+async def log_pipeline_run_initiated(pipeline_id: str, pipeline_name: str, dataset_id: UUID):
+    pipeline_run = PipelineRun(
+        pipeline_run_id=uuid4(),
+        pipeline_name=pipeline_name,
+        pipeline_id=pipeline_id,
+        status=PipelineRunStatus.DATASET_PROCESSING_INITIATED,
+        dataset_id=dataset_id,
+        run_info={},
+    )
+
+    db_engine = get_relational_engine()
+
+    async with db_engine.get_async_session() as session:
+        session.add(pipeline_run)
+        await session.commit()
+
+    return pipeline_run
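
A direct usage sketch of the new helper, mirroring how run_pipeline calls it further down (the wrapper function name is illustrative):

    from uuid import NAMESPACE_OID, uuid5

    from cognee.modules.pipelines.operations import log_pipeline_run_initiated

    async def mark_cognify_initiated(dataset_id):
        # Pipeline ids are stable uuid5 hashes of the pipeline name, so repeated
        # calls attach new runs to the same logical pipeline.
        return await log_pipeline_run_initiated(
            pipeline_id=uuid5(NAMESPACE_OID, "cognify_pipeline"),
            pipeline_name="cognify_pipeline",
            dataset_id=dataset_id,
        )
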
@@ -1,9 +1,9 @@
 import asyncio
-from cognee.shared.logging_utils import get_logger
 from typing import Union
-from uuid import uuid5, NAMESPACE_OID
+from uuid import NAMESPACE_OID, uuid5
 
-from cognee.modules.data.methods import get_datasets, get_datasets_by_name
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.data.methods import get_datasets
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
 from cognee.modules.data.methods.get_unique_dataset_id import get_unique_dataset_id
 from cognee.modules.data.models import Data, Dataset
@@ -13,6 +13,7 @@ from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
 from cognee.modules.pipelines.tasks.task import Task
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.models import User
+from cognee.modules.pipelines.operations import log_pipeline_run_initiated
 
 from cognee.infrastructure.databases.relational import (
     create_db_and_tables as create_relational_db_and_tables,
@@ -59,15 +60,36 @@ async def cognee_pipeline(
 
     # If no datasets are provided, work with all existing datasets.
     existing_datasets = await get_datasets(user.id)
-    if datasets is None or len(datasets) == 0:
+
+    if not datasets:
+        # Get datasets from database if none sent.
         datasets = existing_datasets
-    if isinstance(datasets[0], str):
-        datasets = await get_datasets_by_name(datasets, user.id)
     else:
-        # Try to get datasets objects from database, if they don't exist use dataset name
-        datasets_names = await get_datasets_by_name(datasets, user.id)
-        if datasets_names:
-            datasets = datasets_names
+        # If dataset is already in database, use it, otherwise create a new instance.
+        dataset_instances = []
+
+        for dataset_name in datasets:
+            is_dataset_found = False
+
+            for existing_dataset in existing_datasets:
+                if (
+                    existing_dataset.name == dataset_name
+                    or str(existing_dataset.id) == dataset_name
+                ):
+                    dataset_instances.append(existing_dataset)
+                    is_dataset_found = True
+                    break
+
+            if not is_dataset_found:
+                dataset_instances.append(
+                    Dataset(
+                        id=await get_unique_dataset_id(dataset_name=dataset_name, user=user),
+                        name=dataset_name,
+                        owner_id=user.id,
+                    )
+                )
+
+        datasets = dataset_instances
 
     awaitables = []
 
@@ -88,31 +110,48 @@ async def run_pipeline(
     data=None,
     pipeline_name: str = "custom_pipeline",
 ):
-    if isinstance(dataset, Dataset):
-        check_dataset_name(dataset.name)
-        dataset_id = dataset.id
-    elif isinstance(dataset, str):
-        check_dataset_name(dataset)
-        # Generate id based on unique dataset_id formula
-        dataset_id = await get_unique_dataset_id(dataset_name=dataset, user=user)
+    check_dataset_name(dataset.name)
+
+    # Ugly hack, but no easier way to do this.
+    if pipeline_name == "add_pipeline":
+        # Refresh the add pipeline status so data is added to a dataset.
+        # Without this the app_pipeline status will be DATASET_PROCESSING_COMPLETED and will skip the execution.
+        dataset_id = uuid5(NAMESPACE_OID, f"{dataset.name}{str(user.id)}")
+
+        await log_pipeline_run_initiated(
+            pipeline_id=uuid5(NAMESPACE_OID, "add_pipeline"),
+            pipeline_name="add_pipeline",
+            dataset_id=dataset_id,
+        )
+
+        # Refresh the cognify pipeline status after we add new files.
+        # Without this the cognify_pipeline status will be DATASET_PROCESSING_COMPLETED and will skip the execution.
+        await log_pipeline_run_initiated(
+            pipeline_id=uuid5(NAMESPACE_OID, "cognify_pipeline"),
+            pipeline_name="cognify_pipeline",
+            dataset_id=dataset_id,
+        )
+
+    dataset_id = dataset.id
 
     if not data:
         data: list[Data] = await get_dataset_data(dataset_id=dataset_id)
 
     # async with update_status_lock: TODO: Add UI lock to prevent multiple backend requests
     if isinstance(dataset, Dataset):
-        task_status = await get_pipeline_status([dataset_id])
+        task_status = await get_pipeline_status([dataset_id], pipeline_name)
     else:
         task_status = [
             PipelineRunStatus.DATASET_PROCESSING_COMPLETED
         ]  # TODO: this is a random assignment, find permanent solution
 
-    if (
-        str(dataset_id) in task_status
-        and task_status[str(dataset_id)] == PipelineRunStatus.DATASET_PROCESSING_STARTED
-    ):
-        logger.info("Dataset %s is already being processed.", dataset_id)
-        return
+    if str(dataset_id) in task_status:
+        if task_status[str(dataset_id)] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
+            logger.info("Dataset %s is already being processed.", dataset_id)
+            return
+        if task_status[str(dataset_id)] == PipelineRunStatus.DATASET_PROCESSING_COMPLETED:
+            logger.info("Dataset %s is already processed.", dataset_id)
+            return
 
     if not isinstance(tasks, list):
         raise ValueError("Tasks must be a list")
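
The "ugly hack" above works because dataset ids are deterministic: uuid5 of the dataset name concatenated with the owner id always yields the same UUID, so the INITIATED rows logged during add_pipeline attach to exactly the dataset a later cognify run will query. A standard-library sketch (the user id is a placeholder):

    from uuid import NAMESPACE_OID, uuid5

    def derive_dataset_id(dataset_name: str, user_id: str):
        # Mirrors dataset_id = uuid5(NAMESPACE_OID, f"{dataset.name}{str(user.id)}")
        # from the hunk above; purely illustrative.
        return uuid5(NAMESPACE_OID, f"{dataset_name}{user_id}")

    first = derive_dataset_id("main_dataset", "11111111-1111-1111-1111-111111111111")
    second = derive_dataset_id("main_dataset", "11111111-1111-1111-1111-111111111111")
    assert first == second  # stable across processes and runs
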
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import Optional
 from pydantic import BaseModel
 from cognee.infrastructure.databases.vector import get_vectordb_config
 from cognee.infrastructure.llm import get_llm_config

@@ -20,8 +21,8 @@ class LLMConfig(BaseModel):
     api_key: str
     model: str
     provider: str
-    endpoint: str
-    api_version: str
+    endpoint: Optional[str]
+    api_version: Optional[str]
     models: dict[str, list[ConfigChoice]]
     providers: list[ConfigChoice]
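
One caveat on the `Optional` fields: under Pydantic v2 semantics an `Optional[str]` annotation alone only widens the accepted type to include None; without a default such as `= None` the key itself stays required. A small illustration (assumes Pydantic v2; the model name is illustrative):

    from typing import Optional

    from pydantic import BaseModel, ValidationError

    class Cfg(BaseModel):
        endpoint: Optional[str]  # may be None, but the key must still be supplied

    print(Cfg(endpoint=None))  # OK: endpoint=None

    try:
        Cfg()  # key missing entirely
    except ValidationError as err:
        print(err.errors()[0]["type"])  # "missing"
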
@@ -13,7 +13,7 @@ echo "Environment: $ENVIRONMENT"
 # inconsistencies and should cause the startup to fail. This check allows for
 # smooth redeployments and container restarts while maintaining data integrity.
 echo "Running database migrations..."
-MIGRATION_OUTPUT=$(poetry run alembic upgrade head 2>&1) || {
+MIGRATION_OUTPUT=$(alembic upgrade head 2>&1) || {
     if [[ $MIGRATION_OUTPUT == *"UserAlreadyExists"* ]] || [[ $MIGRATION_OUTPUT == *"User default_user@example.com already exists"* ]]; then
         echo "Warning: Default user already exists, continuing startup..."
     else

@@ -22,8 +22,9 @@ MIGRATION_OUTPUT=$(poetry run alembic upgrade head 2>&1) || {
         exit 1
     fi
 }
+echo "Database migrations done."
 
-echo "Starting Gunicorn"
+echo "Starting server..."
 
 # Add startup delay to ensure DB is ready
 sleep 2

@@ -32,10 +33,10 @@ sleep 2
 if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then
     if [ "$DEBUG" = "true" ]; then
         echo "Waiting for the debugger to attach..."
-        exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --log-level debug --reload cognee.api.client:app
+        debugpy --wait-for-client --listen 0.0.0.0:5678 -m gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --log-level debug --reload cognee.api.client:app
     else
-        exec gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --log-level debug --reload cognee.api.client:app
+        gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --log-level debug --reload cognee.api.client:app
     fi
 else
-    exec gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --log-level error cognee.api.client:app
+    gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --log-level error cognee.api.client:app
 fi
poetry.lock (7 changes, generated)

@@ -1538,7 +1538,7 @@ description = "An implementation of the Debug Adapter Protocol for Python"
 optional = true
 python-versions = ">=3.8"
 groups = ["main"]
-markers = "extra == \"notebook\" or extra == \"dev\""
+markers = "extra == \"notebook\" or extra == \"dev\" or extra == \"debug\""
 files = [
     {file = "debugpy-1.8.9-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:cfe1e6c6ad7178265f74981edf1154ffce97b69005212fbc90ca22ddfe3d017e"},
     {file = "debugpy-1.8.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ada7fb65102a4d2c9ab62e8908e9e9f12aed9d76ef44880367bc9308ebe49a0f"},

@@ -11961,8 +11961,9 @@ anthropic = ["anthropic"]
 api = ["gunicorn", "uvicorn"]
 chromadb = ["chromadb", "pypika"]
 codegraph = ["fastembed", "transformers", "tree-sitter", "tree-sitter-python"]
+debug = ["debugpy"]
 deepeval = ["deepeval"]
-dev = ["coverage", "debugpy", "deptry", "gitpython", "mkdocs-material", "mkdocs-minify-plugin", "mkdocstrings", "mypy", "notebook", "pylint", "pytest", "pytest-asyncio", "pytest-cov", "ruff", "tweepy"]
+dev = ["coverage", "deptry", "gitpython", "mkdocs-material", "mkdocs-minify-plugin", "mkdocstrings", "mypy", "notebook", "pylint", "pytest", "pytest-asyncio", "pytest-cov", "ruff", "tweepy"]
 docs = ["unstructured"]
 evals = ["gdown", "plotly"]
 falkordb = ["falkordb"]

@@ -11987,4 +11988,4 @@ weaviate = ["weaviate-client"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<=3.13"
-content-hash = "9d2a3beda2b6c329e69319e51c28c90d2806d8a257cd971e990600e99c1d96bd"
+content-hash = "15b319ff8dbe5bd88e41ead93f4e9140b2b7d86d57a707682dd3a308e78ef245"
@@ -118,7 +118,6 @@ dev = [
     "mypy>=1.7.1,<2",
     "notebook>=7.1.0,<8",
     "deptry>=0.20.0,<0.21",
-    "debugpy==1.8.9",
    "pylint>=3.0.3,<4",
     "ruff>=0.9.2,<1.0.0",
     "tweepy==4.14.0",

@@ -127,6 +126,7 @@ dev = [
     "mkdocs-minify-plugin>=0.8.0,<0.9",
     "mkdocstrings[python]>=0.26.2,<0.27",
 ]
+debug = ["debugpy==1.8.9"]
 
 [project.urls]
 Homepage = "https://www.cognee.ai"