Merge branch 'dev' into feat/add-pdfproloader

Signed-off-by: EricXiao <taoiaox@gmail.com>
EricXiao 2025-09-30 17:08:28 +08:00
commit 4938ad9fe9
90 changed files with 7255 additions and 4084 deletions

View file

@ -176,7 +176,14 @@ ENABLE_BACKEND_ACCESS_CONTROL=False
# Cognee Cloud API settings for syncing data to/from cloud infrastructure
COGNEE_CLOUD_API_URL="http://localhost:8001"
COGNEE_CLOUD_AUTH_TOKEN="your-auth-token"
COGNEE_CLOUD_AUTH_TOKEN="your-api-key"
################################################################################
# UI Settings
################################################################################
# URL where the frontend is served, defaults to http://localhost:3000
UI_APP_URL=http://localhost:3000
################################################################################
# 🛠️ DEV Settings
@ -196,6 +203,16 @@ LITELLM_LOG="ERROR"
# DEFAULT_USER_EMAIL=""
# DEFAULT_USER_PASSWORD=""
################################################################################
# 📂 AWS Settings
################################################################################
#AWS_REGION=""
#AWS_ENDPOINT_URL=""
#AWS_ACCESS_KEY_ID=""
#AWS_SECRET_ACCESS_KEY=""
#AWS_SESSION_TOKEN=""
------------------------------- END OF POSSIBLE SETTINGS -------------------------------
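These variables follow the standard AWS SDK naming, so a client library such as boto3 can pick most of them up from the environment automatically. A minimal, illustrative sketch (not part of this commit) of wiring them into an S3 client:

```python
import os

import boto3  # assumed available, e.g. via cognee's "aws" extra

# boto3 reads AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_SESSION_TOKEN
# from the environment on its own; region and endpoint are passed explicitly here.
s3 = boto3.client(
    "s3",
    region_name=os.getenv("AWS_REGION"),
    endpoint_url=os.getenv("AWS_ENDPOINT_URL"),  # e.g. a MinIO or LocalStack URL
)
```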

View file

@ -24,7 +24,7 @@ runs:
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
- name: Rebuild uv lockfile
shell: bash
run: |

View file

@ -1,8 +1,8 @@
<!-- .github/pull_request_template.md -->
## Description
<!--
Please provide a clear, human-generated description of the changes in this PR.
DO NOT use AI-generated descriptions. We want to understand your thought process and reasoning.
-->
@ -16,15 +16,6 @@ DO NOT use AI-generated descriptions. We want to understand your thought process
- [ ] Performance improvement
- [ ] Other (please specify):
## Changes Made
<!-- List the specific changes made in this PR -->
-
-
-
## Testing
<!-- Describe how you tested your changes -->
## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->
@ -40,11 +31,5 @@ DO NOT use AI-generated descriptions. We want to understand your thought process
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages
## Related Issues
<!-- Link any related issues using "Fixes #issue_number" or "Relates to #issue_number" -->
## Additional Notes
<!-- Add any additional notes, concerns, or context for reviewers -->
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

View file

@ -188,6 +188,7 @@ jobs:
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
extra-dependencies: "baml"
- name: Run Simple Examples
run: uv run python ./examples/python/simple_example.py

View file

@ -166,9 +166,6 @@ jobs:
python-version: '3.11.x'
extra-dependencies: "aws"
- name: Dependencies already installed
run: echo "Dependencies already installed in setup"
- name: Run S3 Bucket Test
env:
ENV: 'dev'

View file

@ -1,5 +1,8 @@
name: Reusable Examples Tests
permissions:
contents: read
on:
workflow_call:
@ -131,3 +134,28 @@ jobs:
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./examples/python/memify_coding_agent_example.py
test-permissions-example:
name: Run Permissions Example
runs-on: ubuntu-22.04
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: '3.11.x'
- name: Run Permissions Example
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./examples/python/permissions_example.py

View file

@ -18,6 +18,7 @@ jobs:
uses: ./.github/actions/cognee_setup
with:
python-version: '3.11.x'
extra-dependencies: "aws"
- name: Run S3 File Storage Test
env:

View file

@ -101,3 +101,30 @@ jobs:
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./cognee/tests/test_pgvector.py
run-lancedb-tests:
name: LanceDB Tests
runs-on: ubuntu-22.04
steps:
- name: Check out
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
- name: Run LanceDB Tests
env:
ENV: 'dev'
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./cognee/tests/test_lancedb.py

README.md (130 changes)
View file

@ -5,7 +5,7 @@
<br />
cognee - Memory for AI Agents in 5 lines of code
cognee - Memory for AI Agents in 6 lines of code
<p align="center">
<a href="https://www.youtube.com/watch?v=1bezuvLwJmw&t=2s">Demo</a>
@ -43,12 +43,10 @@
**🚀 We launched Cogwit beta (Fully-hosted AI Memory): Sign up [here](https://platform.cognee.ai/)! 🚀**
Build dynamic memory for Agents and replace RAG using scalable, modular ECL (Extract, Cognify, Load) pipelines.
More on [use-cases](https://docs.cognee.ai/use-cases) and [evals](https://github.com/topoteretes/cognee/tree/main/evals)
<p align="center">
🌐 Available Languages
:
@ -70,53 +68,47 @@ More on [use-cases](https://docs.cognee.ai/use-cases) and [evals](https://github
</div>
## Features
- Interconnect and retrieve your past conversations, documents, images and audio transcriptions
- Replaces RAG systems and reduces developer effort and cost
- Load data to graph and vector databases using only Pydantic
- Manipulate your data while ingesting from 30+ data sources
## Get Started
Get started quickly with a Google Colab <a href="https://colab.research.google.com/drive/1jHbWVypDgCLwjE71GSXhRL3YxYhCZzG1?usp=sharing">notebook</a>, <a href="https://deepnote.com/workspace/cognee-382213d0-0444-4c89-8265-13770e333c02/project/cognee-demo-78ffacb9-5832-4611-bb1a-560386068b30/notebook/Notebook-1-75b24cda566d4c24ab348f7150792601?utm_source=share-modal&utm_medium=product-shared-content&utm_campaign=notebook&utm_content=78ffacb9-5832-4611-bb1a-560386068b30">Deepnote notebook</a> or <a href="https://github.com/topoteretes/cognee/tree/main/cognee-starter-kit">starter repo</a>
## About cognee
Self-hosted package:
## Contributing
Your contributions are at the core of making this a true open source project. Any contributions you make are **greatly appreciated**. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for more information.
- Interconnects any kind of document: past conversations, files, images, and audio transcriptions
- Replaces RAG systems with a memory layer based on graphs and vectors
- Reduces developer effort and cost, while increasing quality and precision
- Provides Pythonic data pipelines that manage data ingestion from 30+ data sources
- Is highly customizable with custom tasks, pipelines, and a set of built-in search endpoints
Hosted platform:
- Includes a managed UI and a [hosted solution](https://www.cognee.ai)
## Self-Hosted (Open Source)
## 📦 Installation
### 📦 Installation
You can install Cognee using **pip**, **poetry**, **uv**, or any other Python package manager.
Cognee supports Python 3.10 to 3.13
Cognee supports Python 3.10 to 3.12
### With pip
#### With uv
```bash
pip install cognee
uv pip install cognee
```
## Local Cognee installation
Detailed instructions can be found in our [docs](https://docs.cognee.ai/getting-started/installation#environment-configuration)
You can install the local Cognee repo using **uv**, **pip**, and **poetry**.
For a local pip installation, please make sure your pip version is above 21.3.
### 💻 Basic Usage
### With uv (all optional dependencies)
```bash
uv sync --all-extras
```
## 💻 Basic Usage
### Setup
#### Setup
```
import os
@ -125,10 +117,14 @@ os.environ["LLM_API_KEY"] = "YOUR OPENAI_API_KEY"
```
You can also set the variables by creating a .env file, using our <a href="https://github.com/topoteretes/cognee/blob/main/.env.template">template.</a>
To use different LLM providers, check out our <a href="https://docs.cognee.ai">documentation</a> for more info
To use different LLM providers, check out our <a href="https://docs.cognee.ai/setup-configuration/llm-providers">documentation</a> for more info
### Simple example
#### Simple example
##### Python
This script will run the default pipeline:
@ -139,13 +135,16 @@ import asyncio
async def main():
# Add text to cognee
await cognee.add("Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval.")
await cognee.add("Cognee turns documents into AI memory.")
# Generate the knowledge graph
await cognee.cognify()
# Add memory algorithms to the graph
await cognee.memify()
# Query the knowledge graph
results = await cognee.search("Tell me about NLP")
results = await cognee.search("What does cognee do?")
# Display the results
for result in results:
@ -158,33 +157,38 @@ if __name__ == '__main__':
```
Example output:
```
Natural Language Processing (NLP) is a cross-disciplinary and interdisciplinary field that involves computer science and information retrieval. It focuses on the interaction between computers and human language, enabling machines to understand and process natural language.
Cognee turns documents into AI memory.
```
##### Via CLI
## Our paper is out! <a href="https://arxiv.org/abs/2505.24478" target="_blank" rel="noopener noreferrer">Read here</a>
Let's get the basics covered:
```
cognee-cli add "Cognee turns documents into AI memory."
cognee-cli cognify
cognee-cli search "What does cognee do?"
cognee-cli delete --all
```
or run
```
cognee-cli -ui
```
<div style="text-align: center">
<img src="assets/cognee-paper.png" alt="cognee paper" width="100%" />
</div>
</div>
## Cognee UI
You can also cognify your files and query them using the cognee UI.
### Hosted Platform
<img src="assets/cognee-new-ui.webp" width="100%" alt="Cognee UI 2"></a>
Get up and running in minutes with automatic updates, analytics, and enterprise security.
### Running the UI
1. Sign up on [Cogwit](https://www.cognee.ai)
2. Add your API key to the local UI and sync your data to Cogwit
Try the cognee UI by setting LLM_API_KEY and running the ``` cognee-cli -ui ``` command in your terminal.
## Understand our architecture
<div style="text-align: center">
<img src="assets/cognee_diagram.png" alt="cognee concept diagram" width="100%" />
</div>
@ -203,22 +207,26 @@ Try cognee UI by setting LLM_API_KEY and running ``` cognee-cli -ui ``` command
[cognee with local models](https://github.com/user-attachments/assets/8621d3e8-ecb8-4860-afb2-5594f2ee17db)
## Contributing
Your contributions are at the core of making this a true open source project. Any contributions you make are **greatly appreciated**. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for more information.
## Code of Conduct
We are committed to making open source an enjoyable and respectful experience for our community. See <a href="https://github.com/topoteretes/cognee/blob/main/CODE_OF_CONDUCT.md"><code>CODE_OF_CONDUCT</code></a> for more information.
## 💫 Contributors
## Citation
<a href="https://github.com/topoteretes/cognee/graphs/contributors">
<img alt="contributors" src="https://contrib.rocks/image?repo=topoteretes/cognee"/>
</a>
We now have a paper you can cite:
## Sponsors
Thanks to the following companies for sponsoring the ongoing development of cognee.
- [GitHub's Secure Open Source Fund](https://resources.github.com/github-secure-open-source-fund/)
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=topoteretes/cognee&type=Date)](https://star-history.com/#topoteretes/cognee&Date)
```bibtex
@misc{markovic2025optimizinginterfaceknowledgegraphs,
title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning},
author={Vasilije Markovic and Lazar Obradovic and Laszlo Hajdu and Jovan Pavlovic},
year={2025},
eprint={2505.24478},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2505.24478},
}
```

View file

@ -3,10 +3,18 @@
import classNames from "classnames";
import { MutableRefObject, useEffect, useImperativeHandle, useRef, useState, useCallback } from "react";
import { forceCollide, forceManyBody } from "d3-force-3d";
import ForceGraph, { ForceGraphMethods, GraphData, LinkObject, NodeObject } from "react-force-graph-2d";
import dynamic from "next/dynamic";
import { GraphControlsAPI } from "./GraphControls";
import getColorForNodeType from "./getColorForNodeType";
// Dynamically import ForceGraph to prevent SSR issues
const ForceGraph = dynamic(() => import("react-force-graph-2d"), {
ssr: false,
loading: () => <div className="w-full h-full flex items-center justify-center">Loading graph...</div>
});
import type { ForceGraphMethods, GraphData, LinkObject, NodeObject } from "react-force-graph-2d";
interface GraphVisuzaliationProps {
ref: MutableRefObject<GraphVisualizationAPI>;
data?: GraphData<NodeObject, LinkObject>;
@ -200,7 +208,7 @@ export default function GraphVisualization({ ref, data, graphControls, className
const graphRef = useRef<ForceGraphMethods>();
useEffect(() => {
if (typeof window !== "undefined" && data && graphRef.current) {
if (data && graphRef.current) {
// add collision force
graphRef.current.d3Force("collision", forceCollide(nodeSize * 1.5));
graphRef.current.d3Force("charge", forceManyBody().strength(-10).distanceMin(10).distanceMax(50));
@ -216,56 +224,34 @@ export default function GraphVisualization({ ref, data, graphControls, className
return (
<div ref={containerRef} className={classNames("w-full h-full", className)} id="graph-container">
{(data && typeof window !== "undefined") ? (
<ForceGraph
ref={graphRef}
width={dimensions.width}
height={dimensions.height}
dagMode={graphShape as unknown as undefined}
dagLevelDistance={300}
onDagError={handleDagError}
graphData={data}
<ForceGraph
ref={graphRef}
width={dimensions.width}
height={dimensions.height}
dagMode={graphShape as unknown as undefined}
dagLevelDistance={data ? 300 : 100}
onDagError={handleDagError}
graphData={data || {
nodes: [{ id: 1, label: "Add" }, { id: 2, label: "Cognify" }, { id: 3, label: "Search" }],
links: [{ source: 1, target: 2, label: "but don't forget to" }, { source: 2, target: 3, label: "and after that you can" }],
}}
nodeLabel="label"
nodeRelSize={nodeSize}
nodeCanvasObject={renderNode}
nodeCanvasObjectMode={() => "replace"}
nodeLabel="label"
nodeRelSize={data ? nodeSize : 20}
nodeCanvasObject={data ? renderNode : renderInitialNode}
nodeCanvasObjectMode={() => data ? "replace" : "after"}
nodeAutoColorBy={data ? undefined : "type"}
linkLabel="label"
linkCanvasObject={renderLink}
linkCanvasObjectMode={() => "after"}
linkDirectionalArrowLength={3.5}
linkDirectionalArrowRelPos={1}
linkLabel="label"
linkCanvasObject={renderLink}
linkCanvasObjectMode={() => "after"}
linkDirectionalArrowLength={3.5}
linkDirectionalArrowRelPos={1}
onNodeClick={handleNodeClick}
onBackgroundClick={handleBackgroundClick}
d3VelocityDecay={0.3}
/>
) : (
<ForceGraph
ref={graphRef}
width={dimensions.width}
height={dimensions.height}
dagMode={graphShape as unknown as undefined}
dagLevelDistance={100}
graphData={{
nodes: [{ id: 1, label: "Add" }, { id: 2, label: "Cognify" }, { id: 3, label: "Search" }],
links: [{ source: 1, target: 2, label: "but don't forget to" }, { source: 2, target: 3, label: "and after that you can" }],
}}
nodeLabel="label"
nodeRelSize={20}
nodeCanvasObject={renderInitialNode}
nodeCanvasObjectMode={() => "after"}
nodeAutoColorBy="type"
linkLabel="label"
linkCanvasObject={renderLink}
linkCanvasObjectMode={() => "after"}
linkDirectionalArrowLength={3.5}
linkDirectionalArrowRelPos={1}
/>
)}
onNodeClick={handleNodeClick}
onBackgroundClick={handleBackgroundClick}
d3VelocityDecay={data ? 0.3 : undefined}
/>
</div>
);
}

View file

@ -2,10 +2,11 @@
import Link from "next/link";
import Image from "next/image";
import { useBoolean } from "@/utils";
import { useEffect } from "react";
import { useBoolean, fetch } from "@/utils";
import { CloseIcon, CloudIcon, CogneeIcon } from "../Icons";
import { CTAButton, GhostButton, IconButton, Modal } from "../elements";
import { CTAButton, GhostButton, IconButton, Modal, StatusDot } from "../elements";
import syncData from "@/modules/cloud/syncData";
interface HeaderProps {
@ -23,6 +24,12 @@ export default function Header({ user }: HeaderProps) {
setFalse: closeSyncModal,
} = useBoolean(false);
const {
value: isMCPConnected,
setTrue: setMCPConnected,
setFalse: setMCPDisconnected,
} = useBoolean(false);
const handleDataSyncConfirm = () => {
syncData()
.finally(() => {
@ -30,6 +37,19 @@ export default function Header({ user }: HeaderProps) {
});
};
useEffect(() => {
const checkMCPConnection = () => {
fetch.checkMCPHealth()
.then(() => setMCPConnected())
.catch(() => setMCPDisconnected());
};
checkMCPConnection();
const interval = setInterval(checkMCPConnection, 30000);
return () => clearInterval(interval);
}, [setMCPConnected, setMCPDisconnected]);
return (
<>
<header className="relative flex flex-row h-14 min-h-14 px-5 items-center justify-between w-full max-w-[1920px] mx-auto">
@ -39,6 +59,10 @@ export default function Header({ user }: HeaderProps) {
</div>
<div className="flex flex-row items-center gap-2.5">
<Link href="/mcp-status" className="!text-indigo-600 pl-4 pr-4">
<StatusDot className="mr-2" isActive={isMCPConnected} />
{ isMCPConnected ? "MCP connected" : "MCP disconnected" }
</Link>
<GhostButton onClick={openSyncModal} className="text-indigo-600 gap-3 pl-4 pr-4">
<CloudIcon />
<div>Sync</div>

View file

@ -0,0 +1,13 @@
import React from "react";
const StatusDot = ({ isActive, className }: { isActive: boolean, className?: string }) => {
return (
<span
className={`inline-block w-3 h-3 rounded-full ${className ?? ""} ${
isActive ? "bg-green-500" : "bg-red-500"
}`}
/>
);
};
export default StatusDot;

View file

@ -8,5 +8,6 @@ export { default as IconButton } from "./IconButton";
export { default as GhostButton } from "./GhostButton";
export { default as NeutralButton } from "./NeutralButton";
export { default as StatusIndicator } from "./StatusIndicator";
export { default as StatusDot } from "./StatusDot";
export { default as Accordion } from "./Accordion";
export { default as Notebook } from "./Notebook";

View file

@ -9,6 +9,8 @@ const backendApiUrl = process.env.NEXT_PUBLIC_BACKEND_API_URL || "http://localho
const cloudApiUrl = process.env.NEXT_PUBLIC_CLOUD_API_URL || "http://localhost:8001";
const mcpApiUrl = process.env.NEXT_PUBLIC_MCP_API_URL || "http://localhost:8001";
let apiKey: string | null = process.env.NEXT_PUBLIC_COGWIT_API_KEY || null;
let accessToken: string | null = null;
@ -49,6 +51,13 @@ export default async function fetch(url: string, options: RequestInit = {}, useC
)
.then((response) => handleServerErrors(response, retry, useCloud))
.catch((error) => {
// Handle network errors more gracefully
if (error.name === 'TypeError' && error.message.includes('fetch')) {
return Promise.reject(
new Error("Backend server is not responding. Please check if the server is running.")
);
}
if (error.detail === undefined) {
return Promise.reject(
new Error("No connection to the server.")
@ -62,8 +71,31 @@ export default async function fetch(url: string, options: RequestInit = {}, useC
});
}
fetch.checkHealth = () => {
return global.fetch(`${backendApiUrl.replace("/api", "")}/health`);
fetch.checkHealth = async () => {
const maxRetries = 5;
const retryDelay = 1000; // 1 second
for (let i = 0; i < maxRetries; i++) {
try {
const response = await global.fetch(`${backendApiUrl.replace("/api", "")}/health`);
if (response.ok) {
return response;
}
} catch (error) {
// If this is the last retry, throw the error
if (i === maxRetries - 1) {
throw error;
}
// Wait before retrying
await new Promise(resolve => setTimeout(resolve, retryDelay));
}
}
throw new Error("Backend server is not responding after multiple attempts");
};
fetch.checkMCPHealth = () => {
return global.fetch(`${mcpApiUrl.replace("/api", "")}/health`);
};
fetch.setApiKey = (newApiKey: string) => {

View file

@ -1,153 +0,0 @@
import sys
import asyncio
try:
import cognee
from PySide6.QtWidgets import (
QApplication,
QWidget,
QPushButton,
QLineEdit,
QFileDialog,
QVBoxLayout,
QHBoxLayout,
QLabel,
QMessageBox,
QTextEdit,
QProgressDialog,
)
from PySide6.QtCore import Qt
from qasync import QEventLoop # Import QEventLoop from qasync
except ImportError as e:
print(
"\nPlease install Cognee with optional gui dependencies or manually install missing dependencies.\n"
)
print("\nTo install with poetry use:")
print("\npoetry install -E gui\n")
print("\nOr to install with poetry and all dependencies use:")
print("\npoetry install --all-extras\n")
print("\nTo install with pip use: ")
print('\npip install ".[gui]"\n')
raise e
class FileSearchApp(QWidget):
def __init__(self):
super().__init__()
self.selected_file = None
self.init_ui()
def init_ui(self):
# Horizontal layout for file upload and visualization buttons
button_layout = QHBoxLayout()
# Button to open file dialog
self.file_button = QPushButton("Upload File to Cognee", parent=self)
self.file_button.clicked.connect(self.open_file_dialog)
button_layout.addWidget(self.file_button)
# Button to visualize data
self.visualize_button = QPushButton("Visualize Data", parent=self)
self.visualize_button.clicked.connect(lambda: asyncio.ensure_future(self.visualize_data()))
button_layout.addWidget(self.visualize_button)
# Label to display selected file path
self.file_label = QLabel("No file selected", parent=self)
# Line edit for search input
self.search_input = QLineEdit(parent=self)
self.search_input.setPlaceholderText("Enter text to search...")
# Button to perform search; schedule the async search on click
self.search_button = QPushButton("Cognee Search", parent=self)
self.search_button.clicked.connect(lambda: asyncio.ensure_future(self._cognee_search()))
# Text output area for search results
self.result_output = QTextEdit(parent=self)
self.result_output.setReadOnly(True)
self.result_output.setPlaceholderText("Search results will appear here...")
# Progress dialog
self.progress_dialog = QProgressDialog("Processing...", None, 0, 0, parent=self)
self.progress_dialog.setWindowModality(Qt.WindowModal)
self.progress_dialog.setCancelButton(None) # Remove the cancel button
self.progress_dialog.close()
# Layout setup
layout = QVBoxLayout()
layout.addLayout(button_layout)
layout.addWidget(self.file_label)
layout.addWidget(self.search_input)
layout.addWidget(self.search_button)
layout.addWidget(self.result_output)
self.setLayout(layout)
self.setWindowTitle("Cognee")
self.resize(500, 300)
def open_file_dialog(self):
file_path, _ = QFileDialog.getOpenFileName(
self, "Select a File", "", "All Files (*.*);;Text Files (*.txt)"
)
if file_path:
self.selected_file = file_path
self.file_label.setText(f"Selected: {file_path}")
asyncio.ensure_future(self.process_file_async())
async def process_file_async(self):
"""Asynchronously add and process the selected file."""
# Disable the entire window
self.progress_dialog.show()
self.setEnabled(False)
try:
await cognee.add(self.selected_file)
await cognee.cognify()
except Exception as e:
QMessageBox.critical(self, "Error", f"File processing failed: {str(e)}")
# Once finished, re-enable the window
self.setEnabled(True)
self.progress_dialog.close()
async def _cognee_search(self):
"""Performs an async search and updates the result output."""
# Disable the entire window
self.setEnabled(False)
self.progress_dialog.show()
try:
search_text = self.search_input.text().strip()
result = await cognee.search(query_text=search_text)
print(result)
# Assuming result is a list-like object; adjust if necessary
self.result_output.setText(result[0])
except Exception as e:
QMessageBox.critical(self, "Error", f"Search failed: {str(e)}")
# Once finished, re-enable the window
self.setEnabled(True)
self.progress_dialog.close()
async def visualize_data(self):
"""Async slot for handling visualize data button press."""
import webbrowser
from cognee.api.v1.visualize.visualize import visualize_graph
import os
import pathlib
html_file = os.path.join(pathlib.Path(__file__).parent, ".data", "graph_visualization.html")
await visualize_graph(html_file)
webbrowser.open(f"file://{html_file}")
if __name__ == "__main__":
app = QApplication(sys.argv)
# Create a qasync event loop and set it as the current event loop
loop = QEventLoop(app)
asyncio.set_event_loop(loop)
window = FileSearchApp()
window.show()
with loop:
loop.run_forever()

View file

@ -48,27 +48,27 @@ if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then
if [ "$DEBUG" = "true" ]; then
echo "Waiting for the debugger to attach..."
if [ "$TRANSPORT_MODE" = "sse" ]; then
exec python -m debugpy --wait-for-client --listen 0.0.0.0:$DEBUG_PORT -m cognee --transport sse --host 0.0.0.0 --port $HTTP_PORT --no-migration
exec python -m debugpy --wait-for-client --listen 0.0.0.0:$DEBUG_PORT -m cognee-mcp --transport sse --host 0.0.0.0 --port $HTTP_PORT --no-migration
elif [ "$TRANSPORT_MODE" = "http" ]; then
exec python -m debugpy --wait-for-client --listen 0.0.0.0:$DEBUG_PORT -m cognee --transport http --host 0.0.0.0 --port $HTTP_PORT --no-migration
exec python -m debugpy --wait-for-client --listen 0.0.0.0:$DEBUG_PORT -m cognee-mcp --transport http --host 0.0.0.0 --port $HTTP_PORT --no-migration
else
exec python -m debugpy --wait-for-client --listen 0.0.0.0:$DEBUG_PORT -m cognee --transport stdio --no-migration
exec python -m debugpy --wait-for-client --listen 0.0.0.0:$DEBUG_PORT -m cognee-mcp --transport stdio --no-migration
fi
else
if [ "$TRANSPORT_MODE" = "sse" ]; then
exec cognee --transport sse --host 0.0.0.0 --port $HTTP_PORT --no-migration
exec cognee-mcp --transport sse --host 0.0.0.0 --port $HTTP_PORT --no-migration
elif [ "$TRANSPORT_MODE" = "http" ]; then
exec cognee --transport http --host 0.0.0.0 --port $HTTP_PORT --no-migration
exec cognee-mcp --transport http --host 0.0.0.0 --port $HTTP_PORT --no-migration
else
exec cognee --transport stdio --no-migration
exec cognee-mcp --transport stdio --no-migration
fi
fi
else
if [ "$TRANSPORT_MODE" = "sse" ]; then
exec cognee --transport sse --host 0.0.0.0 --port $HTTP_PORT --no-migration
exec cognee-mcp --transport sse --host 0.0.0.0 --port $HTTP_PORT --no-migration
elif [ "$TRANSPORT_MODE" = "http" ]; then
exec cognee --transport http --host 0.0.0.0 --port $HTTP_PORT --no-migration
exec cognee-mcp --transport http --host 0.0.0.0 --port $HTTP_PORT --no-migration
else
exec cognee --transport stdio --no-migration
exec cognee-mcp --transport stdio --no-migration
fi
fi

View file

@ -8,7 +8,7 @@ requires-python = ">=3.10"
dependencies = [
# For local cognee repo usage remove the comment below and add an absolute path to cognee. Then run `uv sync --reinstall` in the mcp folder on local cognee changes.
#"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j] @ file:/Users/igorilic/Desktop/cognee",
"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j]==0.3.2",
"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j]==0.3.4",
"fastmcp>=2.10.0,<3.0.0",
"mcp>=1.12.0,<2.0.0",
"uv>=0.6.3,<1.0.0",
@ -36,4 +36,4 @@ dev = [
allow-direct-references = true
[project.scripts]
cognee = "src:main"
cognee-mcp = "src:main"

View file

@ -19,6 +19,10 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
from cognee.modules.search.types import SearchType
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.storage.utils import JSONEncoder
from starlette.responses import JSONResponse
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
import uvicorn
try:
@ -38,6 +42,53 @@ mcp = FastMCP("Cognee")
logger = get_logger()
async def run_sse_with_cors():
"""Custom SSE transport with CORS middleware."""
sse_app = mcp.sse_app()
sse_app.add_middleware(
CORSMiddleware,
allow_origins=["http://localhost:3000"],
allow_credentials=True,
allow_methods=["GET"],
allow_headers=["*"],
)
config = uvicorn.Config(
sse_app,
host=mcp.settings.host,
port=mcp.settings.port,
log_level=mcp.settings.log_level.lower(),
)
server = uvicorn.Server(config)
await server.serve()
async def run_http_with_cors():
"""Custom HTTP transport with CORS middleware."""
http_app = mcp.streamable_http_app()
http_app.add_middleware(
CORSMiddleware,
allow_origins=["http://localhost:3000"],
allow_credentials=True,
allow_methods=["GET"],
allow_headers=["*"],
)
config = uvicorn.Config(
http_app,
host=mcp.settings.host,
port=mcp.settings.port,
log_level=mcp.settings.log_level.lower(),
)
server = uvicorn.Server(config)
await server.serve()
@mcp.custom_route("/health", methods=["GET"])
async def health_check(request):
return JSONResponse({"status": "ok"})
@mcp.tool()
async def cognee_add_developer_rules(
base_path: str = ".", graph_model_file: str = None, graph_model_name: str = None
@ -975,12 +1026,12 @@ async def main():
await mcp.run_stdio_async()
elif args.transport == "sse":
logger.info(f"Running MCP server with SSE transport on {args.host}:{args.port}")
await mcp.run_sse_async()
await run_sse_with_cors()
elif args.transport == "http":
logger.info(
f"Running MCP server with Streamable HTTP transport on {args.host}:{args.port}{args.path}"
)
await mcp.run_streamable_http_async()
await run_http_with_cors()
if __name__ == "__main__":

cognee-mcp/uv.lock (generated, 3529 changes)

File diff suppressed because it is too large.

View file

@ -3,7 +3,6 @@
import os
import uvicorn
import sentry_sdk
from traceback import format_exc
from contextlib import asynccontextmanager
from fastapi import Request
@ -29,6 +28,7 @@ from cognee.api.v1.add.routers import get_add_router
from cognee.api.v1.delete.routers import get_delete_router
from cognee.api.v1.responses.routers import get_responses_router
from cognee.api.v1.sync.routers import get_sync_router
from cognee.api.v1.update.routers import get_update_router
from cognee.api.v1.users.routers import (
get_auth_router,
get_register_router,
@ -42,11 +42,18 @@ from cognee.modules.users.methods.get_authenticated_user import REQUIRE_AUTHENTI
logger = get_logger()
if os.getenv("ENV", "prod") == "prod":
sentry_sdk.init(
dsn=os.getenv("SENTRY_REPORTING_URL"),
traces_sample_rate=1.0,
profiles_sample_rate=1.0,
)
try:
import sentry_sdk
sentry_sdk.init(
dsn=os.getenv("SENTRY_REPORTING_URL"),
traces_sample_rate=1.0,
profiles_sample_rate=1.0,
)
except ImportError:
logger.info(
"Sentry SDK not available. Install with 'pip install cognee\"[monitoring]\"' to enable error monitoring."
)
app_environment = os.getenv("ENV", "prod")
@ -81,7 +88,7 @@ if CORS_ALLOWED_ORIGINS:
]
else:
allowed_origins = [
"http://localhost:3000",
os.getenv("UI_APP_URL", "http://localhost:3000"),
] # Block all except explicitly set origins
app.add_middleware(
@ -257,6 +264,8 @@ app.include_router(get_visualize_router(), prefix="/api/v1/visualize", tags=["vi
app.include_router(get_delete_router(), prefix="/api/v1/delete", tags=["delete"])
app.include_router(get_update_router(), prefix="/api/v1/update", tags=["update"])
app.include_router(get_responses_router(), prefix="/api/v1/responses", tags=["responses"])
app.include_router(get_sync_router(), prefix="/api/v1/sync", tags=["sync"])

View file

@ -194,7 +194,7 @@ class HealthChecker:
config = get_llm_config()
# Test actual API connection with minimal request
LLMGateway.show_prompt("test", "test")
LLMGateway.show_prompt("test", "test.txt")
response_time = int((time.time() - start_time) * 1000)
return ComponentHealth(

View file

@ -20,4 +20,4 @@ def get_checks_router():
return await check_api_key(api_token)
return router

View file

@ -94,9 +94,11 @@ def get_permissions_router() -> APIRouter:
from cognee.modules.users.roles.methods import create_role as create_role_method
await create_role_method(role_name=role_name, owner_id=user.id)
role_id = await create_role_method(role_name=role_name, owner_id=user.id)
return JSONResponse(status_code=200, content={"message": "Role created for tenant"})
return JSONResponse(
status_code=200, content={"message": "Role created for tenant", "role_id": str(role_id)}
)
@permissions_router.post("/users/{user_id}/roles")
async def add_user_to_role(
@ -212,8 +214,10 @@ def get_permissions_router() -> APIRouter:
from cognee.modules.users.tenants.methods import create_tenant as create_tenant_method
await create_tenant_method(tenant_name=tenant_name, user_id=user.id)
tenant_id = await create_tenant_method(tenant_name=tenant_name, user_id=user.id)
return JSONResponse(status_code=200, content={"message": "Tenant created."})
return JSONResponse(
status_code=200, content={"message": "Tenant created.", "tenant_id": str(tenant_id)}
)
return permissions_router
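With the ids now returned, a client can chain the calls instead of re-querying for the new role or tenant. An illustrative sketch; the exact route paths and parameter shapes outside this hunk are assumptions:

```python
import requests  # illustrative client; auth handling omitted

BASE = "http://localhost:8000/api/v1/permissions"

# Create a role; the response now carries the new role_id.
created = requests.post(f"{BASE}/roles", params={"role_name": "editors"}).json()
role_id = created["role_id"]

# The id can be used directly in follow-up calls, e.g. the
# POST /users/{user_id}/roles route shown above (parameter shape assumed).
user_id = "00000000-0000-0000-0000-000000000000"
requests.post(f"{BASE}/users/{user_id}/roles", params={"role_id": role_id})
```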

View file

@ -1 +1 @@
from .ui import start_ui, stop_ui, ui
from .ui import start_ui

View file

@ -1,5 +1,7 @@
import os
import platform
import signal
import socket
import subprocess
import threading
import time
@ -7,7 +9,7 @@ import webbrowser
import zipfile
import requests
from pathlib import Path
from typing import Callable, Optional, Tuple
from typing import Callable, Optional, Tuple, List
import tempfile
import shutil
@ -17,6 +19,80 @@ from cognee.version import get_cognee_version
logger = get_logger()
def _stream_process_output(
process: subprocess.Popen, stream_name: str, prefix: str, color_code: str = ""
) -> threading.Thread:
"""
Stream output from a process with a prefix to identify the source.
Args:
process: The subprocess to monitor
stream_name: 'stdout' or 'stderr'
prefix: Text prefix for each line (e.g., '[BACKEND]', '[FRONTEND]')
color_code: ANSI color code for the prefix (optional)
Returns:
Thread that handles the streaming
"""
def stream_reader():
stream = getattr(process, stream_name)
if stream is None:
return
reset_code = "\033[0m" if color_code else ""
try:
for line in iter(stream.readline, b""):
if line:
line_text = line.decode("utf-8").rstrip()
if line_text:
print(f"{color_code}{prefix}{reset_code} {line_text}", flush=True)
except Exception:
pass
finally:
if stream:
stream.close()
thread = threading.Thread(target=stream_reader, daemon=True)
thread.start()
return thread
def _is_port_available(port: int) -> bool:
"""
Check if a port is available on localhost.
Returns True if the port is available, False otherwise.
"""
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.settimeout(1) # 1 second timeout
result = sock.connect_ex(("localhost", port))
return result != 0 # Port is available if connection fails
except Exception:
return False
def _check_required_ports(ports_to_check: List[Tuple[int, str]]) -> Tuple[bool, List[str]]:
"""
Check if all required ports are available on localhost.
Args:
ports_to_check: List of (port, service_name) tuples
Returns:
Tuple of (all_available: bool, unavailable_services: List[str])
"""
unavailable = []
for port, service_name in ports_to_check:
if not _is_port_available(port):
unavailable.append(f"{service_name} (port {port})")
logger.error(f"Port {port} is already in use for {service_name}")
return len(unavailable) == 0, unavailable
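A quick usage sketch of these helpers, matching how start_ui calls them further down:

```python
# Sketch: verify the default UI/backend/MCP ports before starting anything.
ok, busy = _check_required_ports(
    [(3000, "Frontend UI"), (8000, "Backend API"), (8001, "MCP Server")]
)
if not ok:
    print("Ports already in use: " + ", ".join(busy))
```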
def normalize_version_for_comparison(version: str) -> str:
"""
Normalize version string for comparison.
@ -214,6 +290,7 @@ def check_node_npm() -> tuple[bool, str]:
Check if Node.js and npm are available.
Returns (is_available, error_message)
"""
try:
# Check Node.js
result = subprocess.run(["node", "--version"], capture_output=True, text=True, timeout=10)
@ -223,8 +300,17 @@ def check_node_npm() -> tuple[bool, str]:
node_version = result.stdout.strip()
logger.debug(f"Found Node.js version: {node_version}")
# Check npm
result = subprocess.run(["npm", "--version"], capture_output=True, text=True, timeout=10)
# Check npm - handle Windows PowerShell scripts
if platform.system() == "Windows":
# On Windows, npm might be a PowerShell script, so we need to use shell=True
result = subprocess.run(
["npm", "--version"], capture_output=True, text=True, timeout=10, shell=True
)
else:
result = subprocess.run(
["npm", "--version"], capture_output=True, text=True, timeout=10
)
if result.returncode != 0:
return False, "npm is not installed or not in PATH"
@ -246,6 +332,7 @@ def install_frontend_dependencies(frontend_path: Path) -> bool:
Install frontend dependencies if node_modules doesn't exist.
This is needed for both development and downloaded frontends since both use npm run dev.
"""
node_modules = frontend_path / "node_modules"
if node_modules.exists():
logger.debug("Frontend dependencies already installed")
@ -254,13 +341,24 @@ def install_frontend_dependencies(frontend_path: Path) -> bool:
logger.info("Installing frontend dependencies (this may take a few minutes)...")
try:
result = subprocess.run(
["npm", "install"],
cwd=frontend_path,
capture_output=True,
text=True,
timeout=300, # 5 minutes timeout
)
# Use shell=True on Windows for npm commands
if platform.system() == "Windows":
result = subprocess.run(
["npm", "install"],
cwd=frontend_path,
capture_output=True,
text=True,
timeout=300, # 5 minutes timeout
shell=True,
)
else:
result = subprocess.run(
["npm", "install"],
cwd=frontend_path,
capture_output=True,
text=True,
timeout=300, # 5 minutes timeout
)
if result.returncode == 0:
logger.info("Frontend dependencies installed successfully")
@ -327,55 +425,111 @@ def prompt_user_for_download() -> bool:
def start_ui(
pid_callback: Callable[[int], None],
host: str = "localhost",
port: int = 3000,
open_browser: bool = True,
auto_download: bool = False,
start_backend: bool = False,
backend_host: str = "localhost",
backend_port: int = 8000,
start_mcp: bool = False,
mcp_port: int = 8001,
) -> Optional[subprocess.Popen]:
"""
Start the cognee frontend UI server, optionally with the backend API server.
Start the cognee frontend UI server, optionally with the backend API server and MCP server.
This function will:
1. Optionally start the cognee backend API server
2. Find the cognee-frontend directory (development) or download it (pip install)
3. Check if Node.js and npm are available (for development mode)
4. Install dependencies if needed (development mode)
5. Start the frontend server
6. Optionally open the browser
2. Optionally start the cognee MCP server
3. Find the cognee-frontend directory (development) or download it (pip install)
4. Check if Node.js and npm are available (for development mode)
5. Install dependencies if needed (development mode)
6. Start the frontend server
7. Optionally open the browser
Args:
pid_callback: Callback to notify with PID of each spawned process
host: Host to bind the frontend server to (default: localhost)
port: Port to run the frontend server on (default: 3000)
open_browser: Whether to open the browser automatically (default: True)
auto_download: If True, download frontend without prompting (default: False)
start_backend: If True, also start the cognee API backend server (default: False)
backend_host: Host to bind the backend server to (default: localhost)
backend_port: Port to run the backend server on (default: 8000)
start_mcp: If True, also start the cognee MCP server (default: False)
mcp_port: Port to run the MCP server on (default: 8001)
Returns:
subprocess.Popen object representing the running frontend server, or None if failed
Note: If backend is started, it runs in a separate process that will be cleaned up
when the frontend process is terminated.
Note: If backend and/or MCP server are started, they run in separate processes
that will be cleaned up when the frontend process is terminated.
Example:
>>> import cognee
>>> def dummy_callback(pid): pass
>>> # Start just the frontend
>>> server = cognee.start_ui()
>>> server = cognee.start_ui(dummy_callback)
>>>
>>> # Start both frontend and backend
>>> server = cognee.start_ui(start_backend=True)
>>> server = cognee.start_ui(dummy_callback, start_backend=True)
>>> # UI will be available at http://localhost:3000
>>> # API will be available at http://localhost:8000
>>> # To stop both servers later:
>>>
>>> # Start frontend with MCP server
>>> server = cognee.start_ui(dummy_callback, start_mcp=True)
>>> # UI will be available at http://localhost:3000
>>> # MCP server will be available at http://127.0.0.1:8001/sse
>>> # To stop all servers later:
>>> server.terminate()
"""
logger.info("Starting cognee UI...")
ports_to_check = [(port, "Frontend UI")]
if start_backend:
ports_to_check.append((backend_port, "Backend API"))
if start_mcp:
ports_to_check.append((mcp_port, "MCP Server"))
logger.info("Checking port availability...")
all_ports_available, unavailable_services = _check_required_ports(ports_to_check)
if not all_ports_available:
error_msg = f"Cannot start cognee UI: The following services have ports already in use: {', '.join(unavailable_services)}"
logger.error(error_msg)
logger.error("Please stop the conflicting services or change the port configuration.")
return None
logger.info("✓ All required ports are available")
backend_process = None
if start_mcp:
logger.info("Starting Cognee MCP server with Docker...")
cwd = os.getcwd()
env_file = os.path.join(cwd, ".env")
try:
mcp_process = subprocess.Popen(
[
"docker",
"run",
"-p",
f"{mcp_port}:8000",
"--rm",
"--env-file",
env_file,
"-e",
"TRANSPORT_MODE=sse",
"cognee/cognee-mcp:daulet-dev",
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
preexec_fn=os.setsid if hasattr(os, "setsid") else None,
)
_stream_process_output(mcp_process, "stdout", "[MCP]", "\033[34m") # Blue
_stream_process_output(mcp_process, "stderr", "[MCP]", "\033[34m") # Blue
pid_callback(mcp_process.pid)
logger.info(f"✓ Cognee MCP server starting on http://127.0.0.1:{mcp_port}/sse")
except Exception as e:
logger.error(f"Failed to start MCP server with Docker: {str(e)}")
# Start backend server if requested
if start_backend:
logger.info("Starting cognee backend API server...")
@ -389,16 +543,19 @@ def start_ui(
"uvicorn",
"cognee.api.client:app",
"--host",
backend_host,
"localhost",
"--port",
str(backend_port),
],
# Inherit stdout/stderr from parent process to show logs
stdout=None,
stderr=None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
preexec_fn=os.setsid if hasattr(os, "setsid") else None,
)
# Start threads to stream backend output with prefix
_stream_process_output(backend_process, "stdout", "[BACKEND]", "\033[32m") # Green
_stream_process_output(backend_process, "stderr", "[BACKEND]", "\033[32m") # Green
pid_callback(backend_process.pid)
# Give the backend a moment to start
@ -408,7 +565,7 @@ def start_ui(
logger.error("Backend server failed to start - process exited early")
return None
logger.info(f"✓ Backend API started at http://{backend_host}:{backend_port}")
logger.info(f"✓ Backend API started at http://localhost:{backend_port}")
except Exception as e:
logger.error(f"Failed to start backend server: {str(e)}")
@ -453,24 +610,40 @@ def start_ui(
# Prepare environment variables
env = os.environ.copy()
env["HOST"] = host
env["HOST"] = "localhost"
env["PORT"] = str(port)
# Start the development server
logger.info(f"Starting frontend server at http://{host}:{port}")
logger.info(f"Starting frontend server at http://localhost:{port}")
logger.info("This may take a moment to compile and start...")
try:
# Create frontend in its own process group for clean termination
process = subprocess.Popen(
["npm", "run", "dev"],
cwd=frontend_path,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
preexec_fn=os.setsid if hasattr(os, "setsid") else None,
)
# Use shell=True on Windows for npm commands
if platform.system() == "Windows":
process = subprocess.Popen(
["npm", "run", "dev"],
cwd=frontend_path,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
shell=True,
)
else:
process = subprocess.Popen(
["npm", "run", "dev"],
cwd=frontend_path,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
preexec_fn=os.setsid if hasattr(os, "setsid") else None,
)
# Start threads to stream frontend output with prefix
_stream_process_output(process, "stdout", "[FRONTEND]", "\033[33m") # Yellow
_stream_process_output(process, "stderr", "[FRONTEND]", "\033[33m") # Yellow
pid_callback(process.pid)
@ -479,10 +652,7 @@ def start_ui(
# Check if process is still running
if process.poll() is not None:
stdout, stderr = process.communicate()
logger.error("Frontend server failed to start:")
logger.error(f"stdout: {stdout}")
logger.error(f"stderr: {stderr}")
logger.error("Frontend server failed to start - check the logs above for details")
return None
# Open browser if requested
@ -491,7 +661,7 @@ def start_ui(
def open_browser_delayed():
time.sleep(5) # Give Next.js time to fully start
try:
webbrowser.open(f"http://{host}:{port}") # TODO: use dashboard url?
webbrowser.open(f"http://localhost:{port}")
except Exception as e:
logger.warning(f"Could not open browser automatically: {e}")
@ -499,13 +669,9 @@ def start_ui(
browser_thread.start()
logger.info("✓ Cognee UI is starting up...")
logger.info(f"✓ Open your browser to: http://{host}:{port}")
logger.info(f"✓ Open your browser to: http://localhost:{port}")
logger.info("✓ The UI will be available once Next.js finishes compiling")
# Store backend process reference in the frontend process for cleanup
if backend_process:
process._cognee_backend_process = backend_process
return process
except Exception as e:
@ -523,102 +689,3 @@ def start_ui(
except (OSError, ProcessLookupError):
pass
return None
def stop_ui(process: subprocess.Popen) -> bool:
"""
Stop a running UI server process and backend process (if started), along with all their children.
Args:
process: The subprocess.Popen object returned by start_ui()
Returns:
bool: True if stopped successfully, False otherwise
"""
if not process:
return False
success = True
try:
# First, stop the backend process if it exists
backend_process = getattr(process, "_cognee_backend_process", None)
if backend_process:
logger.info("Stopping backend server...")
try:
backend_process.terminate()
try:
backend_process.wait(timeout=5)
logger.info("Backend server stopped gracefully")
except subprocess.TimeoutExpired:
logger.warning("Backend didn't terminate gracefully, forcing kill")
backend_process.kill()
backend_process.wait()
logger.info("Backend server stopped")
except Exception as e:
logger.error(f"Error stopping backend server: {str(e)}")
success = False
# Now stop the frontend process
logger.info("Stopping frontend server...")
# Try to terminate the process group (includes child processes like Next.js)
if hasattr(os, "killpg"):
try:
# Kill the entire process group
os.killpg(os.getpgid(process.pid), signal.SIGTERM)
logger.debug("Sent SIGTERM to process group")
except (OSError, ProcessLookupError):
# Fall back to terminating just the main process
process.terminate()
logger.debug("Terminated main process only")
else:
process.terminate()
logger.debug("Terminated main process (Windows)")
try:
process.wait(timeout=10)
logger.info("Frontend server stopped gracefully")
except subprocess.TimeoutExpired:
logger.warning("Frontend didn't terminate gracefully, forcing kill")
# Force kill the process group
if hasattr(os, "killpg"):
try:
os.killpg(os.getpgid(process.pid), signal.SIGKILL)
logger.debug("Sent SIGKILL to process group")
except (OSError, ProcessLookupError):
process.kill()
logger.debug("Force killed main process only")
else:
process.kill()
logger.debug("Force killed main process (Windows)")
process.wait()
if success:
logger.info("UI servers stopped successfully")
return success
except Exception as e:
logger.error(f"Error stopping UI servers: {str(e)}")
return False
# Convenience function similar to DuckDB's approach
def ui() -> Optional[subprocess.Popen]:
"""
Convenient alias for start_ui() with default parameters.
Similar to how DuckDB provides simple ui() function.
"""
return start_ui()
if __name__ == "__main__":
# Test the UI startup
server = start_ui()
if server:
try:
input("Press Enter to stop the server...")
finally:
stop_ui(server)

View file

@ -0,0 +1 @@
from .update import update

View file

@ -0,0 +1 @@
from .get_update_router import get_update_router

View file

@ -0,0 +1,90 @@
from fastapi.responses import JSONResponse
from fastapi import File, UploadFile, Depends, Form
from typing import Optional
from fastapi import APIRouter
from fastapi.encoders import jsonable_encoder
from typing import List
from uuid import UUID
from cognee.shared.logging_utils import get_logger
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user
from cognee.shared.utils import send_telemetry
from cognee.modules.pipelines.models.PipelineRunInfo import (
PipelineRunErrored,
)
logger = get_logger()
def get_update_router() -> APIRouter:
router = APIRouter()
@router.patch("", response_model=None)
async def update(
data_id: UUID,
dataset_id: UUID,
data: List[UploadFile] = File(default=None),
node_set: Optional[List[str]] = Form(default=[""], example=[""]),
user: User = Depends(get_authenticated_user),
):
"""
Update data in a dataset.
This endpoint updates an existing document in a specified dataset: provide the data_id of the document to update and the new version of the document as the data.
The document is updated, analyzed, and the changes are integrated into the knowledge graph.
## Request Parameters
- **data_id** (UUID): UUID of the document to update in Cognee memory
- **data** (List[UploadFile]): List of files to upload.
- **dataset_id** (UUID): UUID of the dataset that contains the document
- **node_set** Optional[list[str]]: List of node identifiers for graph organization and access control.
Used for grouping related data points in the knowledge graph.
## Response
Returns information about the update operation containing:
- Status of the operation
- Details about the processed data
- Any relevant metadata from the ingestion process
## Error Codes
- **400 Bad Request**: data_id or dataset_id not provided
- **409 Conflict**: Error during update operation
- **403 Forbidden**: User doesn't have permission to add to dataset
## Notes
- To update data in datasets not owned by the user, use dataset_id (when ENABLE_BACKEND_ACCESS_CONTROL is set to True)
- The dataset_id value can only be the UUID of an already existing dataset
"""
send_telemetry(
"Update API Endpoint Invoked",
user.id,
additional_properties={
"endpoint": "PATCH /v1/update",
"dataset_id": str(dataset_id),
"data_id": str(data_id),
"node_set": str(node_set),
},
)
from cognee.api.v1.update import update as cognee_update
try:
update_run = await cognee_update(
data_id=data_id,
data=data,
dataset_id=dataset_id,
user=user,
node_set=node_set,
)
# If any cognify run errored return JSONResponse with proper error status code
if any(isinstance(v, PipelineRunErrored) for v in update_run.values()):
return JSONResponse(status_code=420, content=jsonable_encoder(update_run))
return update_run
except Exception as error:
logger.error(f"Error during deletion by data_id: {str(error)}")
return JSONResponse(status_code=409, content={"error": str(error)})
return router
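For reference, a hedged client-side sketch of calling the new endpoint; the host, auth handling, ids, and file name are placeholders:

```python
import requests  # illustrative client; cookie/token auth omitted

response = requests.patch(
    "http://localhost:8000/api/v1/update",
    params={
        "data_id": "00000000-0000-0000-0000-000000000001",    # existing document
        "dataset_id": "00000000-0000-0000-0000-000000000002",  # its dataset
    },
    files=[("data", open("updated_document.pdf", "rb"))],
)
response.raise_for_status()
```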

View file

@ -0,0 +1,100 @@
from uuid import UUID
from typing import Union, BinaryIO, List, Optional
from cognee.modules.users.models import User
from cognee.api.v1.delete import delete
from cognee.api.v1.add import add
from cognee.api.v1.cognify import cognify
async def update(
data_id: UUID,
data: Union[BinaryIO, list[BinaryIO], str, list[str]],
user: User = None,
node_set: Optional[List[str]] = None,
dataset_id: Optional[UUID] = None,
vector_db_config: dict = None,
graph_db_config: dict = None,
preferred_loaders: List[str] = None,
incremental_loading: bool = True,
):
"""
Update existing data in Cognee.
Supported Input Types:
- **Text strings**: Direct text content (str) - any string not starting with "/" or "file://"
- **File paths**: Local file paths as strings in these formats:
* Absolute paths: "/path/to/document.pdf"
* File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt"
* S3 paths: "s3://bucket-name/path/to/file.pdf"
- **Binary file objects**: File handles/streams (BinaryIO)
- **Lists**: Multiple files or text strings in a single call
Supported File Formats:
- Text files (.txt, .md, .csv)
- PDFs (.pdf)
- Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
- Audio files (.mp3, .wav) - transcribed to text
- Code files (.py, .js, .ts, etc.) - parsed for structure and content
- Office documents (.docx, .pptx)
Workflow:
1. **Data Resolution**: Resolves file paths and validates accessibility
2. **Content Extraction**: Extracts text content from various file formats
3. **Dataset Storage**: Stores processed content in the specified dataset
4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset
Args:
data_id: UUID of existing data to update
data: The latest version of the data. Can be:
- Single text string: "Your text content here"
- Absolute file path: "/path/to/document.pdf"
- File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt"
- S3 path: "s3://my-bucket/documents/file.pdf"
- List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
- Binary file object: open("file.txt", "rb")
user: User object for authentication and permissions. Uses default user if None.
Default user: "default_user@example.com" (created automatically on first use).
Users can only access datasets they have permissions for.
node_set: Optional list of node identifiers for graph organization and access control.
Used for grouping related data points in the knowledge graph.
vector_db_config: Optional configuration for vector database (for custom setups).
graph_db_config: Optional configuration for graph database (for custom setups).
dataset_id: Optional specific dataset UUID to use instead of dataset_name.
Returns:
PipelineRunInfo: Information about the ingestion pipeline execution including:
- Pipeline run ID for tracking
- Dataset ID where data was stored
- Processing status and any errors
- Execution timestamps and metadata
"""
await delete(
data_id=data_id,
dataset_id=dataset_id,
user=user,
)
await add(
data=data,
dataset_id=dataset_id,
user=user,
node_set=node_set,
vector_db_config=vector_db_config,
graph_db_config=graph_db_config,
preferred_loaders=preferred_loaders,
incremental_loading=incremental_loading,
)
cognify_run = await cognify(
datasets=[dataset_id],
user=user,
vector_db_config=vector_db_config,
graph_db_config=graph_db_config,
incremental_loading=incremental_loading,
)
return cognify_run
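A minimal usage sketch of the new Python API; the UUIDs are placeholders for an existing data item and its dataset:

```python
import asyncio
from uuid import UUID

from cognee.api.v1.update import update

async def main():
    # Replaces the stored document, then re-runs cognify on the dataset.
    await update(
        data_id=UUID("00000000-0000-0000-0000-000000000001"),
        data="Cognee turns documents into AI memory. (revised)",
        dataset_id=UUID("00000000-0000-0000-0000-000000000002"),
    )

asyncio.run(main())
```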

View file

@ -183,10 +183,20 @@ def main() -> int:
for pid in spawned_pids:
try:
pgid = os.getpgid(pid)
os.killpg(pgid, signal.SIGTERM)
fmt.success(f"✓ Process group {pgid} (PID {pid}) terminated.")
except (OSError, ProcessLookupError) as e:
if hasattr(os, "killpg"):
# Unix-like systems: Use process groups
pgid = os.getpgid(pid)
os.killpg(pgid, signal.SIGTERM)
fmt.success(f"✓ Process group {pgid} (PID {pid}) terminated.")
else:
# Windows: Use taskkill to terminate process and its children
subprocess.run(
["taskkill", "/F", "/T", "/PID", str(pid)],
capture_output=True,
check=False,
)
fmt.success(f"✓ Process {pid} and its children terminated.")
except (OSError, ProcessLookupError, subprocess.SubprocessError) as e:
fmt.warning(f"Could not terminate process {pid}: {e}")
sys.exit(0)
@ -204,19 +214,27 @@ def main() -> int:
nonlocal spawned_pids
spawned_pids.append(pid)
frontend_port = 3000
start_backend, backend_port = True, 8000
start_mcp, mcp_port = True, 8001
server_process = start_ui(
host="localhost",
port=3000,
open_browser=True,
start_backend=True,
auto_download=True,
pid_callback=pid_callback,
port=frontend_port,
open_browser=True,
auto_download=True,
start_backend=start_backend,
backend_port=backend_port,
start_mcp=start_mcp,
mcp_port=mcp_port,
)
if server_process:
fmt.success("UI server started successfully!")
fmt.echo("The interface is available at: http://localhost:3000")
fmt.echo("The API backend is available at: http://localhost:8000")
fmt.echo(f"The interface is available at: http://localhost:{frontend_port}")
if start_backend:
fmt.echo(f"The API backend is available at: http://localhost:{backend_port}")
if start_mcp:
fmt.echo(f"The MCP server is available at: http://localhost:{mcp_port}")
fmt.note("Press Ctrl+C to stop the server...")
try:

View file

@ -1,6 +1,6 @@
import os
import json
import pandas as pd
import subprocess
import modal
import streamlit as st
@ -78,6 +78,14 @@ def main():
}
)
try:
import pandas as pd
except ImportError:
st.error(
"Pandas is required for the evaluation dashboard. Install with 'pip install cognee\"[evals]\"' to use this feature."
)
return
df = pd.DataFrame(records)
if df.empty:
st.warning("No JSON files found in the volume.")
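The same lazy-import guard recurs below for asyncpg, PGVector, s3fs, and BAML; a generic sketch of the pattern (the helper name and extras mapping are illustrative, not part of the codebase):

def require_optional(module_name: str, extra: str):
    """Import an optional dependency or fail with an actionable install hint."""
    try:
        return __import__(module_name)
    except ImportError:
        raise ImportError(
            f"{module_name} is not installed. "
            f'Install it with: pip install cognee"[{extra}]"'
        )

# Example: pandas is only needed by the evaluation dashboard.
pd = require_optional("pandas", "evals")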

View file

@ -1,48 +0,0 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
from cognee.shared.utils import extract_pos_tags
def extract_keywords(text: str) -> list[str]:
"""
Extract keywords from the provided text string.
This function raises a KeywordExtractionError if the input text is empty. It processes the
text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
relevant keywords based on their frequency. The function returns a list of up to 15
keywords, each having more than 3 characters.
Parameters:
-----------
- text (str): The input text from which to extract keywords.
Returns:
--------
- list[str]: A list of keywords extracted from the text, containing up to 15 nouns
with more than 3 characters.
"""
if len(text) == 0:
raise KeywordExtractionError()
tags = extract_pos_tags(text)
nouns = [word for (word, tag) in tags if tag == "NN"]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(nouns)
top_nouns = sorted(
vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
)
keywords = []
for word in top_nouns:
if len(word) > 3:
keywords.append(word)
if len(keywords) >= 15:
break
return keywords

View file

@ -234,7 +234,7 @@ class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface):
collection_name: str,
query_text: Optional[str] = None,
query_vector: Optional[List[float]] = None,
limit: int = None,
limit: Optional[int] = None,
with_vector: bool = False,
):
"""
@ -265,10 +265,10 @@ class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface):
"Use this option only when vector data is required."
)
# In the case of excessive limit, or zero / negative value, limit will be set to 10.
# In the case of excessive limit, or None / zero / negative value, limit will be set to 10.
if not limit or limit <= self._TOPK_LOWER_BOUND or limit > self._TOPK_UPPER_BOUND:
logger.warning(
"Provided limit (%s) is invalid (zero, negative, or exceeds maximum). "
"Provided limit (%s) is invalid (None, zero, negative, or exceeds maximum). "
"Defaulting to limit=10.",
limit,
)
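Stated as a tiny sketch, the clamping rule above (the bound values here are assumed for illustration; the adapter reads them from its class constants):

_TOPK_LOWER_BOUND = 0    # assumed
_TOPK_UPPER_BOUND = 100  # assumed
DEFAULT_TOPK = 10

def clamp_limit(limit):
    # None, zero, negative, or excessive values all fall back to the default.
    if not limit or limit <= _TOPK_LOWER_BOUND or limit > _TOPK_UPPER_BOUND:
        return DEFAULT_TOPK
    return limit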

View file

@ -39,8 +39,16 @@ def create_relational_engine(
connection_string = f"sqlite+aiosqlite:///{db_path}/{db_name}"
if db_provider == "postgres":
connection_string = (
f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
)
try:
# Test if asyncpg is available
import asyncpg
connection_string = (
f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
)
except ImportError:
raise ImportError(
"PostgreSQL dependencies are not installed. Please install with 'pip install cognee\"[postgres]\"' or 'pip install cognee\"[postgres-binary]\"' to use PostgreSQL functionality."
)
return SQLAlchemyAdapter(connection_string)
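For reference, the resulting DSN follows SQLAlchemy's asyncpg driver format; a sketch with placeholder credentials:

# Placeholder values; the real settings come from the relational DB config.
db_username, db_password = "cognee", "secret"
db_host, db_port, db_name = "localhost", 5432, "cognee_db"

connection_string = (
    f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
)
# -> "postgresql+asyncpg://cognee:secret@localhost:5432/cognee_db"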

View file

@ -352,7 +352,7 @@ class ChromaDBAdapter(VectorDBInterface):
collection_name: str,
query_text: str = None,
query_vector: List[float] = None,
limit: int = 15,
limit: Optional[int] = 15,
with_vector: bool = False,
normalized: bool = True,
):
@ -386,9 +386,13 @@ class ChromaDBAdapter(VectorDBInterface):
try:
collection = await self.get_collection(collection_name)
if limit == 0:
if limit is None:
limit = await collection.count()
# If limit is still 0, no need to do the search, just return empty results
if limit <= 0:
return []
results = await collection.query(
query_embeddings=[query_vector],
include=["metadatas", "distances", "embeddings"]
@ -428,7 +432,7 @@ class ChromaDBAdapter(VectorDBInterface):
for row in vector_list
]
except Exception as e:
logger.error(f"Error in search: {str(e)}")
logger.warning(f"Error in search: {str(e)}")
return []
async def batch_search(

View file

@ -66,7 +66,12 @@ def create_vector_engine(
f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
)
from .pgvector.PGVectorAdapter import PGVectorAdapter
try:
from .pgvector.PGVectorAdapter import PGVectorAdapter
except ImportError:
raise ImportError(
"PostgreSQL dependencies are not installed. Please install with 'pip install cognee\"[postgres]\"' or 'pip install cognee\"[postgres-binary]\"' to use PGVector functionality."
)
return PGVectorAdapter(
connection_string,

View file

@ -205,9 +205,12 @@ class LanceDBAdapter(VectorDBInterface):
collection = await self.get_collection(collection_name)
if len(data_point_ids) == 1:
results = await collection.query().where(f"id = '{data_point_ids[0]}'").to_pandas()
results = await collection.query().where(f"id = '{data_point_ids[0]}'")
else:
results = await collection.query().where(f"id IN {tuple(data_point_ids)}").to_pandas()
results = await collection.query().where(f"id IN {tuple(data_point_ids)}")
# Convert query results to list format
results_list = results.to_list() if hasattr(results, "to_list") else list(results)
return [
ScoredResult(
@ -215,7 +218,7 @@ class LanceDBAdapter(VectorDBInterface):
payload=result["payload"],
score=0,
)
for result in results.to_dict("index").values()
for result in results_list
]
async def search(
@ -223,7 +226,7 @@ class LanceDBAdapter(VectorDBInterface):
collection_name: str,
query_text: str = None,
query_vector: List[float] = None,
limit: int = 15,
limit: Optional[int] = 15,
with_vector: bool = False,
normalized: bool = True,
):
@ -235,16 +238,14 @@ class LanceDBAdapter(VectorDBInterface):
collection = await self.get_collection(collection_name)
if limit == 0:
if limit is None:
limit = await collection.count_rows()
# LanceDB search will break if limit is 0 so we must return
if limit == 0:
if limit <= 0:
return []
results = await collection.vector_search(query_vector).limit(limit).to_pandas()
result_values = list(results.to_dict("index").values())
result_values = await collection.vector_search(query_vector).limit(limit).to_list()
if not result_values:
return []
@ -264,7 +265,7 @@ class LanceDBAdapter(VectorDBInterface):
self,
collection_name: str,
query_texts: List[str],
limit: int = None,
limit: Optional[int] = None,
with_vectors: bool = False,
):
query_vectors = await self.embedding_engine.embed_text(query_texts)

View file

@ -3,13 +3,12 @@ from typing import List, Optional, get_type_hints
from sqlalchemy.inspection import inspect
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy import JSON, Column, Table, select, delete, MetaData
from sqlalchemy import JSON, Column, Table, select, delete, MetaData, func
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker
from sqlalchemy.exc import ProgrammingError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
from asyncpg import DeadlockDetectedError, DuplicateTableError, UniqueViolationError
from cognee.shared.logging_utils import get_logger
from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.engine.utils import parse_id
@ -299,7 +298,7 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
collection_name: str,
query_text: Optional[str] = None,
query_vector: Optional[List[float]] = None,
limit: int = 15,
limit: Optional[int] = 15,
with_vector: bool = False,
) -> List[ScoredResult]:
if query_text is None and query_vector is None:
@ -311,6 +310,16 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
# Get PGVectorDataPoint Table from database
PGVectorDataPoint = await self.get_table(collection_name)
if limit is None:
async with self.get_async_session() as session:
query = select(func.count()).select_from(PGVectorDataPoint)
result = await session.execute(query)
limit = result.scalar_one()
# If limit is still 0, no need to do the search, just return empty results
if limit <= 0:
return []
# NOTE: This needs to be initialized in case search doesn't return a value
closest_items = []

View file

@ -83,7 +83,7 @@ class VectorDBInterface(Protocol):
collection_name: str,
query_text: Optional[str],
query_vector: Optional[List[float]],
limit: int,
limit: Optional[int],
with_vector: bool = False,
):
"""
@ -98,7 +98,7 @@ class VectorDBInterface(Protocol):
collection.
- query_vector (Optional[List[float]]): An optional vector representation for
searching the collection.
- limit (int): The maximum number of results to return from the search.
- limit (Optional[int]): The maximum number of results to return from the search.
- with_vector (bool): Whether to return the vector representations with search
results. (default False)
"""
@ -106,7 +106,11 @@ class VectorDBInterface(Protocol):
@abstractmethod
async def batch_search(
self, collection_name: str, query_texts: List[str], limit: int, with_vectors: bool = False
self,
collection_name: str,
query_texts: List[str],
limit: Optional[int],
with_vectors: bool = False,
):
"""
Perform a batch search using multiple text queries against a collection.
@ -116,7 +120,7 @@ class VectorDBInterface(Protocol):
- collection_name (str): The name of the collection to conduct the batch search in.
- query_texts (List[str]): A list of text queries to use for the search.
- limit (int): The maximum number of results to return for each query.
- limit (Optional[int]): The maximum number of results to return for each query.
- with_vectors (bool): Whether to include vector representations with search
results. (default False)
"""

View file

@ -1,6 +1,5 @@
import os
import s3fs
from typing import BinaryIO, Union
from typing import BinaryIO, Union, TYPE_CHECKING
from contextlib import asynccontextmanager
from cognee.infrastructure.files.storage.s3_config import get_s3_config
@ -8,23 +7,34 @@ from cognee.infrastructure.utils.run_async import run_async
from cognee.infrastructure.files.storage.FileBufferedReader import FileBufferedReader
from .storage import Storage
if TYPE_CHECKING:
import s3fs
class S3FileStorage(Storage):
"""
Manage local file storage operations such as storing, retrieving, and managing files on
the filesystem.
Manage S3 file storage operations such as storing and retrieving files on
S3-compatible storage.
"""
storage_path: str
s3: s3fs.S3FileSystem
s3: "s3fs.S3FileSystem"
def __init__(self, storage_path: str):
try:
import s3fs
except ImportError:
raise ImportError(
's3fs is required for S3FileStorage. Install it with: pip install cognee"[aws]"'
)
self.storage_path = storage_path
s3_config = get_s3_config()
if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
self.s3 = s3fs.S3FileSystem(
key=s3_config.aws_access_key_id,
secret=s3_config.aws_secret_access_key,
token=s3_config.aws_session_token,
anon=False,
endpoint_url=s3_config.aws_endpoint_url,
client_kwargs={"region_name": s3_config.aws_region},

View file

@ -8,6 +8,7 @@ class S3Config(BaseSettings):
aws_endpoint_url: Optional[str] = None
aws_access_key_id: Optional[str] = None
aws_secret_access_key: Optional[str] = None
aws_session_token: Optional[str] = None
model_config = SettingsConfigDict(env_file=".env", extra="allow")
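With the new aws_session_token field, temporary STS credentials can be supplied through the environment; a hedged example (the credential values are placeholders):

import os

# Placeholder credentials; in practice these come from an AWS STS session.
os.environ["AWS_ACCESS_KEY_ID"] = "AKIA..."
os.environ["AWS_SECRET_ACCESS_KEY"] = "..."
os.environ["AWS_SESSION_TOKEN"] = "FwoGZXIvYXdzE..."  # now forwarded to s3fs as `token`

from cognee.infrastructure.files.storage.s3_config import get_s3_config

config = get_s3_config()
assert config.aws_session_token is not None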

View file

@ -4,7 +4,6 @@ from urllib.parse import urlparse
from contextlib import asynccontextmanager
from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
from cognee.infrastructure.files.storage.S3FileStorage import S3FileStorage
from cognee.infrastructure.files.storage.LocalFileStorage import LocalFileStorage
@ -23,23 +22,17 @@ async def open_data_file(file_path: str, mode: str = "rb", encoding: str = None,
yield file
elif file_path.startswith("s3://"):
try:
from cognee.infrastructure.files.storage.S3FileStorage import S3FileStorage
except ImportError:
raise ImportError(
"S3 dependencies are not installed. Please install with 'pip install cognee\"[aws]\"' to use S3 functionality."
)
normalized_url = get_data_file_path(file_path)
s3_dir_path = os.path.dirname(normalized_url)
s3_filename = os.path.basename(normalized_url)
# if "/" in s3_path:
# s3_dir = "/".join(s3_path.split("/")[:-1])
# s3_filename = s3_path.split("/")[-1]
# else:
# s3_dir = ""
# s3_filename = s3_path
# Extract filesystem path from S3 URL structure
# file_dir_path = (
# f"s3://{parsed_url.netloc}/{s3_dir}" if s3_dir else f"s3://{parsed_url.netloc}"
# )
# file_name = s3_filename
file_storage = S3FileStorage(s3_dir_path)
async with file_storage.open(s3_filename, mode=mode, **kwargs) as file:

View file

@ -1,9 +1,13 @@
import os
from typing import Optional, ClassVar
from typing import Optional, ClassVar, Any
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import model_validator
from baml_py import ClientRegistry
try:
from baml_py import ClientRegistry
except ImportError:
ClientRegistry = None
class LLMConfig(BaseSettings):
@ -65,27 +69,36 @@ class LLMConfig(BaseSettings):
fallback_endpoint: str = ""
fallback_model: str = ""
baml_registry: ClassVar[ClientRegistry] = ClientRegistry()
baml_registry: Optional[Any] = None
model_config = SettingsConfigDict(env_file=".env", extra="allow")
def model_post_init(self, __context) -> None:
"""Initialize the BAML registry after the model is created."""
raw_options = {
"model": self.baml_llm_model,
"temperature": self.baml_llm_temperature,
"api_key": self.baml_llm_api_key,
"base_url": self.baml_llm_endpoint,
"api_version": self.baml_llm_api_version,
}
# Check if BAML is selected as structured output framework but not available
if self.structured_output_framework.lower() == "baml" and ClientRegistry is None:
raise ImportError(
"BAML is selected as structured output framework but not available. "
"Please install with 'pip install cognee\"[baml]\"' to use BAML extraction features."
)
elif self.structured_output_framework.lower() == "baml" and ClientRegistry is not None:
self.baml_registry = ClientRegistry()
# Note: keep the item only when the value is not None or an empty string (they would override baml default values)
options = {k: v for k, v in raw_options.items() if v not in (None, "")}
self.baml_registry.add_llm_client(
name=self.baml_llm_provider, provider=self.baml_llm_provider, options=options
)
# Sets the primary client
self.baml_registry.set_primary(self.baml_llm_provider)
raw_options = {
"model": self.baml_llm_model,
"temperature": self.baml_llm_temperature,
"api_key": self.baml_llm_api_key,
"base_url": self.baml_llm_endpoint,
"api_version": self.baml_llm_api_version,
}
# Note: keep the item only when the value is not None or an empty string (they would override baml default values)
options = {k: v for k, v in raw_options.items() if v not in (None, "")}
self.baml_registry.add_llm_client(
name=self.baml_llm_provider, provider=self.baml_llm_provider, options=options
)
# Sets the primary client
self.baml_registry.set_primary(self.baml_llm_provider)
@model_validator(mode="after")
def ensure_env_vars_for_ollama(self) -> "LLMConfig":

View file

@ -26,6 +26,7 @@ def read_query_prompt(prompt_file_name: str, base_directory: str = None):
read due to an error.
"""
logger = get_logger(level=ERROR)
try:
if base_directory is None:
base_directory = get_absolute_path("./infrastructure/llm/prompts")
@ -35,8 +36,8 @@ def read_query_prompt(prompt_file_name: str, base_directory: str = None):
with open(file_path, "r", encoding="utf-8") as file:
return file.read()
except FileNotFoundError:
logger.error(f"Error: Prompt file not found. Attempted to read: %s {file_path}")
logger.error(f"Error: Prompt file not found. Attempted to read: {file_path}")
return None
except Exception as e:
logger.error(f"An error occurred: %s {e}")
logger.error(f"An error occurred: {e}")
return None

View file

@ -0,0 +1 @@
Respond with: test

View file

@ -53,7 +53,8 @@ async def acreate_structured_output(
# Transform BAML response to proper pydantic response model
if response_model is str:
return str(result)
# Note: when the response model is str in Python, the result is stored in the text property of the BAML response model
return str(result.text)
return response_model.model_validate(result.dict())

View file

@ -29,9 +29,6 @@ observe = get_observe()
logger = get_logger()
# litellm to drop unsupported params, e.g., reasoning_effort when not supported by the model.
litellm.drop_params = True
class OpenAIAdapter(LLMInterface):
"""
@ -76,8 +73,10 @@ class OpenAIAdapter(LLMInterface):
fallback_api_key: str = None,
fallback_endpoint: str = None,
):
self.aclient = instructor.from_litellm(litellm.acompletion)
self.client = instructor.from_litellm(litellm.completion)
self.aclient = instructor.from_litellm(
litellm.acompletion, mode=instructor.Mode.JSON_SCHEMA
)
self.client = instructor.from_litellm(litellm.completion, mode=instructor.Mode.JSON_SCHEMA)
self.transcription_model = transcription_model
self.model = model
self.api_key = api_key
@ -135,7 +134,6 @@ class OpenAIAdapter(LLMInterface):
api_version=self.api_version,
response_model=response_model,
max_retries=self.MAX_RETRIES,
reasoning_effort="minimal",
)
except (
ContentFilterFinishReasonError,
@ -223,7 +221,6 @@ class OpenAIAdapter(LLMInterface):
api_base=self.endpoint,
api_version=self.api_version,
response_model=response_model,
reasoning_effort="minimal",
max_retries=self.MAX_RETRIES,
)

View file

@ -63,7 +63,7 @@ def get_model_max_completion_tokens(model_name: str):
max_completion_tokens = litellm.model_cost[model_name]["max_tokens"]
logger.debug(f"Max input tokens for {model_name}: {max_completion_tokens}")
else:
logger.info("Model not found in LiteLLM's model_cost.")
logger.debug("Model not found in LiteLLM's model_cost.")
return max_completion_tokens

View file

@ -161,7 +161,7 @@ class CogneeGraph(CogneeAbstractGraph):
edge_distances = await vector_engine.search(
collection_name="EdgeType_relationship_name",
query_vector=query_vector,
limit=0,
limit=None,
)
projection_time = time.time() - start_time
logger.info(

View file

@ -1,7 +1,6 @@
from typing import BinaryIO
from contextlib import asynccontextmanager
import hashlib
from cognee.infrastructure.data.utils.extract_keywords import extract_keywords
from .IngestionData import IngestionData

View file

@ -42,14 +42,12 @@ class TripletSearchContextProvider(BaseContextProvider):
self,
entities: List[DataPoint],
query: str,
user: User,
memory_fragment: CogneeGraph,
) -> List:
"""Creates search tasks for valid entities."""
tasks = [
brute_force_triplet_search(
query=f"{entity_text} {query}",
user=user,
top_k=self.top_k,
collections=self.collections,
properties_to_project=self.properties_to_project,
@ -84,9 +82,8 @@ class TripletSearchContextProvider(BaseContextProvider):
if not entities:
return "No entities provided for context search."
user = await get_default_user()
memory_fragment = await get_memory_fragment(self.properties_to_project)
search_tasks = self._get_search_tasks(entities, query, user, memory_fragment)
search_tasks = self._get_search_tasks(entities, query, memory_fragment)
if not search_tasks:
return "No valid entities found for context search."

View file

@ -93,11 +93,8 @@ class GraphCompletionRetriever(BaseGraphRetriever):
for field_name in index_fields:
vector_index_collections.append(f"{subclass.__name__}_{field_name}")
user = await get_default_user()
found_triplets = await brute_force_triplet_search(
query,
user=user,
top_k=self.top_k,
collections=vector_index_collections or None,
node_type=self.node_type,

View file

@ -25,7 +25,7 @@ class InsightsRetriever(BaseGraphRetriever):
- top_k
"""
def __init__(self, exploration_levels: int = 1, top_k: int = 5):
def __init__(self, exploration_levels: int = 1, top_k: Optional[int] = 5):
"""Initialize retriever with exploration levels and search parameters."""
self.exploration_levels = exploration_levels
self.top_k = top_k

View file

@ -130,7 +130,7 @@ class TemporalRetriever(GraphCompletionRetriever):
query_vector = (await vector_engine.embedding_engine.embed_text([query]))[0]
vector_search_results = await vector_engine.search(
collection_name="Event_name", query_vector=query_vector, limit=0
collection_name="Event_name", query_vector=query_vector, limit=None
)
top_k_events = await self.filter_top_k_events(relevant_events, vector_search_results)

View file

@ -89,7 +89,6 @@ async def get_memory_fragment(
async def brute_force_triplet_search(
query: str,
user: User,
top_k: int = 5,
collections: Optional[List[str]] = None,
properties_to_project: Optional[List[str]] = None,
@ -102,7 +101,6 @@ async def brute_force_triplet_search(
Args:
query (str): The search query.
user (User): The user performing the search.
top_k (int): The number of top results to retrieve.
collections (Optional[List[str]]): List of collections to query.
properties_to_project (Optional[List[str]]): List of properties to project.
@ -139,12 +137,10 @@ async def brute_force_triplet_search(
query_vector = (await vector_engine.embedding_engine.embed_text([query]))[0]
send_telemetry("cognee.brute_force_triplet_search EXECUTION STARTED", user.id)
async def search_in_collection(collection_name: str):
try:
return await vector_engine.search(
collection_name=collection_name, query_vector=query_vector, limit=0
collection_name=collection_name, query_vector=query_vector, limit=None
)
except CollectionNotFoundError:
return []
@ -176,20 +172,14 @@ async def brute_force_triplet_search(
results = await memory_fragment.calculate_top_triplet_importances(k=top_k)
send_telemetry("cognee.brute_force_triplet_search EXECUTION COMPLETED", user.id)
return results
except CollectionNotFoundError:
return []
except Exception as error:
logger.error(
"Error during brute force search for user: %s, query: %s. Error: %s",
user.id,
"Error during brute force search for query: %s. Error: %s",
query,
error,
)
send_telemetry(
"cognee.brute_force_triplet_search EXECUTION FAILED", user.id, {"error": str(error)}
)
raise error

View file

@ -35,7 +35,7 @@ async def no_access_control_search(
[get_completion, get_context] = search_tools
if only_context:
return await get_context(query_text)
return None, await get_context(query_text), []
context = await get_context(query_text)
result = await get_completion(query_text, context)

View file

@ -143,20 +143,35 @@ async def search(
context = prepared_search_results["context"]
datasets = prepared_search_results["datasets"]
return_value.append(
{
"search_result": [result] if result else None,
"dataset_id": datasets[0].id,
"dataset_name": datasets[0].name,
"graphs": graphs,
}
)
if only_context:
return_value.append(
{
"search_result": [context] if context else None,
"dataset_id": datasets[0].id,
"dataset_name": datasets[0].name,
"graphs": graphs,
}
)
else:
return_value.append(
{
"search_result": [result] if result else None,
"dataset_id": datasets[0].id,
"dataset_name": datasets[0].name,
"graphs": graphs,
}
)
return return_value
else:
return_value = []
for search_result in search_results:
result, context, datasets = search_result
return_value.append(result)
if only_context:
for search_result in search_results:
prepared_search_results = await prepare_search_result(search_result)
return_value.append(prepared_search_results["context"])
else:
for search_result in search_results:
result, context, datasets = search_result
return_value.append(result)
# For maintaining backwards compatibility
if len(return_value) == 1 and isinstance(return_value[0], list):
return return_value[0]

View file

@ -15,7 +15,7 @@ from cognee.modules.users.models import (
async def create_role(
role_name: str,
owner_id: UUID,
):
) -> UUID:
"""
Create a new role with the given name, if the request owner with the given id
has the necessary permission.
@ -45,3 +45,4 @@ async def create_role(
await session.commit()
await session.refresh(role)
return role.id

View file

@ -7,7 +7,7 @@ from cognee.modules.users.models import Tenant
from cognee.modules.users.methods import get_user
async def create_tenant(tenant_name: str, user_id: UUID):
async def create_tenant(tenant_name: str, user_id: UUID) -> UUID:
"""
Create a new tenant with the given name, for the user with the given id.
This user is the owner of the tenant.
@ -34,5 +34,6 @@ async def create_tenant(tenant_name: str, user_id: UUID):
user.tenant_id = tenant.id
await session.merge(user)
await session.commit()
return tenant.id
except IntegrityError:
raise EntityAlreadyExistsError(message="Tenant already exists.")
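Since both helpers now return IDs, callers can chain them; a minimal sketch with a placeholder owner ID:

import asyncio
from uuid import UUID

async def main():
    owner_id = UUID("00000000-0000-0000-0000-000000000001")  # placeholder user id
    tenant_id = await create_tenant("acme", user_id=owner_id)
    role_id = await create_role("admin", owner_id=owner_id)
    print(tenant_id, role_id)

asyncio.run(main())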

View file

@ -1,6 +1,5 @@
import os
import json
import networkx
from cognee.shared.logging_utils import get_logger
from cognee.infrastructure.files.storage.LocalFileStorage import LocalFileStorage
@ -9,6 +8,8 @@ logger = get_logger()
async def cognee_network_visualization(graph_data, destination_file_path: str = None):
import networkx
nodes_data, edges_data = graph_data
G = networkx.DiGraph()
@ -22,6 +23,9 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
"TableRow": "#f47710",
"TableType": "#6510f4",
"ColumnValue": "#13613a",
"SchemaTable": "#f47710",
"DatabaseSchema": "#6510f4",
"SchemaRelationship": "#13613a",
"default": "#D3D3D3",
}
@ -104,7 +108,7 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
.nodes circle { stroke: white; stroke-width: 0.5px; filter: drop-shadow(0 0 5px rgba(255,255,255,0.3)); }
.node-label { font-size: 5px; font-weight: bold; fill: white; text-anchor: middle; dominant-baseline: middle; font-family: 'Inter', sans-serif; pointer-events: none; }
.edge-label { font-size: 3px; fill: rgba(255, 255, 255, 0.7); text-anchor: middle; dominant-baseline: middle; font-family: 'Inter', sans-serif; pointer-events: none; }
.tooltip {
position: absolute;
text-align: left;
@ -166,7 +170,7 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
// Create tooltip content for edge
var content = "<strong>Edge Information</strong><br/>";
content += "Relationship: " + d.relation + "<br/>";
// Show all weights
if (d.all_weights && Object.keys(d.all_weights).length > 0) {
content += "<strong>Weights:</strong><br/>";
@ -176,23 +180,23 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
} else if (d.weight !== null && d.weight !== undefined) {
content += "Weight: " + d.weight + "<br/>";
}
if (d.relationship_type) {
content += "Type: " + d.relationship_type + "<br/>";
}
// Add other edge properties
if (d.edge_info) {
Object.keys(d.edge_info).forEach(function(key) {
if (key !== 'weight' && key !== 'weights' && key !== 'relationship_type' &&
key !== 'source_node_id' && key !== 'target_node_id' &&
key !== 'relationship_name' && key !== 'updated_at' &&
if (key !== 'weight' && key !== 'weights' && key !== 'relationship_type' &&
key !== 'source_node_id' && key !== 'target_node_id' &&
key !== 'relationship_name' && key !== 'updated_at' &&
!key.startsWith('weight_')) {
content += key + ": " + d.edge_info[key] + "<br/>";
}
});
}
tooltip.html(content)
.style("left", (d3.event.pageX + 10) + "px")
.style("top", (d3.event.pageY - 10) + "px")

View file

@ -4,7 +4,6 @@ import os
import ssl
import requests
from datetime import datetime, timezone
import matplotlib.pyplot as plt
import http.server
import socketserver
from threading import Thread
@ -30,37 +29,6 @@ def create_secure_ssl_context() -> ssl.SSLContext:
return ssl.create_default_context()
def get_entities(tagged_tokens):
import nltk
nltk.download("maxent_ne_chunker", quiet=True)
from nltk.chunk import ne_chunk
return ne_chunk(tagged_tokens)
def extract_pos_tags(sentence):
"""Extract Part-of-Speech (POS) tags for words in a sentence."""
import nltk
# Ensure that the necessary NLTK resources are downloaded
nltk.download("words", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# Tokenize the sentence into words
tokens = word_tokenize(sentence)
# Tag each word with its corresponding POS tag
pos_tags = pos_tag(tokens)
return pos_tags
def get_anonymous_id():
"""Creates or reads a anonymous user id"""
tracking_id = os.getenv("TRACKING_ID", None)

View file

@ -1,309 +0,0 @@
# PROPOSED TO BE DEPRECATED
"""This module contains the OntologyEngine class which is responsible for adding graph ontology from a JSON or CSV file."""
import csv
import json
from cognee.shared.logging_utils import get_logger
from datetime import datetime, timezone
from fastapi import status
from typing import Any, Dict, List, Optional, Union, Type
import aiofiles
import pandas as pd
from pydantic import BaseModel
from cognee.modules.graph.exceptions import EntityNotFoundError
from cognee.modules.ingestion.exceptions import IngestionError
from cognee.infrastructure.data.chunking.config import get_chunk_config
from cognee.infrastructure.data.chunking.get_chunking_engine import get_chunk_engine
from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
from cognee.infrastructure.files.utils.extract_text_from_file import extract_text_from_file
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type, FileTypeException
from cognee.modules.data.methods.add_model_class_to_graph import (
add_model_class_to_graph,
)
from cognee.tasks.graph.models import NodeModel, GraphOntology
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.engine.utils import generate_node_id, generate_node_name
from cognee.infrastructure.llm.LLMGateway import LLMGateway
logger = get_logger("task:infer_data_ontology")
async def extract_ontology(content: str, response_model: Type[BaseModel]):
"""
Extracts structured ontology from the provided content using a pre-defined LLM client.
This asynchronous function retrieves a system prompt from a file and utilizes an LLM
client to create a structured output based on the input content and specified response
model.
Parameters:
-----------
- content (str): The content from which to extract the ontology.
- response_model (Type[BaseModel]): The model that defines the structure of the
output ontology.
Returns:
--------
The structured ontology extracted from the content.
"""
system_prompt = LLMGateway.read_query_prompt("extract_ontology.txt")
ontology = await LLMGateway.acreate_structured_output(content, system_prompt, response_model)
return ontology
class OntologyEngine:
"""
Manage ontology data and operations for graph structures, providing methods for data
loading, flattening models, and adding ontological relationships to a graph database.
Public methods:
- flatten_model
- recursive_flatten
- load_data
- add_graph_ontology
"""
async def flatten_model(
self, model: NodeModel, parent_id: Optional[str] = None
) -> Dict[str, Any]:
"""
Flatten the model to a dictionary including optional parent ID and relationship details
if available.
Parameters:
-----------
- model (NodeModel): The NodeModel instance to flatten.
- parent_id (Optional[str]): An optional ID of the parent node for hierarchical
purposes. (default None)
Returns:
--------
- Dict[str, Any]: A dictionary representation of the model with flattened
attributes.
"""
result = model.dict()
result["parent_id"] = parent_id
if model.default_relationship:
result.update(
{
"relationship_type": model.default_relationship.type,
"relationship_source": model.default_relationship.source,
"relationship_target": model.default_relationship.target,
}
)
return result
async def recursive_flatten(
self, items: Union[List[Dict[str, Any]], Dict[str, Any]], parent_id: Optional[str] = None
) -> List[Dict[str, Any]]:
"""
Recursively flatten a hierarchical structure of models into a flat list of dictionaries.
Parameters:
-----------
- items (Union[List[Dict[str, Any]], Dict[str, Any]]): A list or dictionary
containing models to flatten.
- parent_id (Optional[str]): An optional ID of the parent node to maintain hierarchy
during flattening. (default None)
Returns:
--------
- List[Dict[str, Any]]: A flat list of dictionaries representing the hierarchical
model structure.
"""
flat_list = []
if isinstance(items, list):
for item in items:
flat_list.extend(await self.recursive_flatten(item, parent_id))
elif isinstance(items, dict):
model = NodeModel.model_validate(items)
flat_list.append(await self.flatten_model(model, parent_id))
for child in model.children:
flat_list.extend(await self.recursive_flatten(child, model.node_id))
return flat_list
async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
"""
Load data from a specified JSON or CSV file and return it in a structured format.
Parameters:
-----------
- file_path (str): The path to the file to load data from.
Returns:
--------
- Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as either a
list of dictionaries or a single dictionary depending on content type.
"""
try:
if file_path.endswith(".json"):
async with aiofiles.open(file_path, mode="r") as f:
data = await f.read()
return json.loads(data)
elif file_path.endswith(".csv"):
async with aiofiles.open(file_path, mode="r") as f:
content = await f.read()
reader = csv.DictReader(content.splitlines())
return list(reader)
else:
raise IngestionError(message="Unsupported file format")
except Exception as e:
raise IngestionError(
message=f"Failed to load data from {file_path}: {e}",
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
)
async def add_graph_ontology(self, file_path: str = None, documents: list = None):
"""
Add graph ontology from a JSON or CSV file, or infer relationships from provided
document content. Raise exceptions for invalid file types or missing entities.
Parameters:
-----------
- file_path (str): Optional path to a file containing data to be loaded. (default
None)
- documents (list): Optional list of document objects for content extraction if no
file path is provided. (default None)
"""
if file_path is None:
initial_chunks_and_ids = []
chunk_config = get_chunk_config()
chunk_engine = get_chunk_engine()
chunk_strategy = chunk_config.chunk_strategy
for base_file in documents:
with open(base_file.raw_data_location, "rb") as file:
try:
file_type = guess_file_type(file)
text = extract_text_from_file(file, file_type)
subchunks, chunks_with_ids = chunk_engine.chunk_data(
chunk_strategy,
text,
chunk_config.chunk_size,
chunk_config.chunk_overlap,
)
if chunks_with_ids[0][0] == 1:
initial_chunks_and_ids.append({base_file.id: chunks_with_ids})
except FileTypeException:
logger.warning(
"File (%s) has an unknown file type. We are skipping it.", file["id"]
)
ontology = await extract_ontology(str(initial_chunks_and_ids), GraphOntology)
graph_client = await get_graph_engine()
await graph_client.add_nodes(
[
(
node.id,
dict(
uuid=generate_node_id(node.id),
name=generate_node_name(node.name),
type=generate_node_id(node.id),
description=node.description,
updated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
),
)
for node in ontology.nodes
]
)
await graph_client.add_edges(
(
generate_node_id(edge.source_id),
generate_node_id(edge.target_id),
edge.relationship_type,
dict(
source_node_id=generate_node_id(edge.source_id),
target_node_id=generate_node_id(edge.target_id),
relationship_name=edge.relationship_type,
updated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
),
)
for edge in ontology.edges
)
else:
dataset_level_information = documents[0][1]
# Extract the list of valid IDs from the explanations
valid_ids = {item["id"] for item in dataset_level_information}
try:
data = await self.load_data(file_path)
flt_ontology = await self.recursive_flatten(data)
df = pd.DataFrame(flt_ontology)
graph_client = await get_graph_engine()
for _, row in df.iterrows():
node_data = row.to_dict()
node_id = node_data.pop("node_id", None)
if node_id in valid_ids:
await graph_client.add_node(node_id, node_data)
if node_id not in valid_ids:
raise EntityNotFoundError(
message=f"Node ID {node_id} not found in the dataset"
)
if pd.notna(row.get("relationship_source")) and pd.notna(
row.get("relationship_target")
):
await graph_client.add_edge(
row["relationship_source"],
row["relationship_target"],
relationship_name=row["relationship_type"],
edge_properties={
"source_node_id": row["relationship_source"],
"target_node_id": row["relationship_target"],
"relationship_name": row["relationship_type"],
"updated_at": datetime.now(timezone.utc).strftime(
"%Y-%m-%d %H:%M:%S"
),
},
)
return
except Exception as e:
raise RuntimeError(f"Failed to add graph ontology from {file_path}: {e}") from e
async def infer_data_ontology(documents, ontology_model=KnowledgeGraph, root_node_id=None):
"""
Infer data ontology from provided documents and optionally add it to a graph.
Parameters:
-----------
- documents: The documents from which to infer the ontology.
- ontology_model: The ontology model to use for the inference, defaults to
KnowledgeGraph. (default KnowledgeGraph)
- root_node_id: An optional root node identifier for the ontology. (default None)
"""
if ontology_model == KnowledgeGraph:
ontology_engine = OntologyEngine()
root_node_id = await ontology_engine.add_graph_ontology(documents=documents)
else:
graph_engine = await get_graph_engine()
await add_model_class_to_graph(ontology_model, graph_engine)
yield (documents, root_node_id)

View file

@ -4,16 +4,20 @@ from sqlalchemy import text
from cognee.infrastructure.databases.relational.get_migration_relational_engine import (
get_migration_relational_engine,
)
from cognee.infrastructure.databases.relational.config import get_migration_config
from cognee.tasks.storage.index_data_points import index_data_points
from cognee.tasks.storage.index_graph_edges import index_graph_edges
from cognee.tasks.schema.ingest_database_schema import ingest_database_schema
from cognee.modules.engine.models import TableRow, TableType, ColumnValue
logger = logging.getLogger(__name__)
async def migrate_relational_database(graph_db, schema, migrate_column_data=True):
async def migrate_relational_database(
graph_db, schema, migrate_column_data=True, schema_only=False
):
"""
Migrates data from a relational database into a graph database.
@ -26,11 +30,133 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
Both TableType and TableRow inherit from DataPoint to maintain consistency with the Cognee data model.
"""
# Create a mapping of node_id to node objects for referencing in edge creation
if schema_only:
node_mapping, edge_mapping = await schema_only_ingestion(schema)
else:
node_mapping, edge_mapping = await complete_database_ingestion(schema, migrate_column_data)
def _remove_duplicate_edges(edge_mapping):
seen = set()
unique_original_shape = []
for tup in edge_mapping:
# We go through all the tuples in the edge_mapping and we only add unique tuples to the list
# To eliminate duplicate edges.
source_id, target_id, rel_name, rel_dict = tup
# We need to convert the dictionary to a frozenset to be able to compare values for it
rel_dict_hashable = frozenset(sorted(rel_dict.items()))
hashable_tup = (source_id, target_id, rel_name, rel_dict_hashable)
# We use the seen set to keep track of unique edges
if hashable_tup not in seen:
# A list that has frozenset elements instead of dictionaries is needed to be able to compare values
seen.add(hashable_tup)
# append the original tuple shape (with the dictionary) if it's the first time we see it
unique_original_shape.append(tup)
return unique_original_shape
# Add all nodes and edges to the graph
# NOTE: Nodes and edges have to be added in batches for speed optimization, especially for NetworkX.
# If we created nodes and added them to the graph in real time, the process would take too long.
# Every node and edge added to NetworkX is saved to file, which is very slow when not done in batches.
await graph_db.add_nodes(list(node_mapping.values()))
await graph_db.add_edges(_remove_duplicate_edges(edge_mapping))
# In these steps we calculate the vector embeddings of our nodes and edges and save them to vector database
# Cognee uses this information to perform searches on the knowledge graph.
await index_data_points(list(node_mapping.values()))
await index_graph_edges()
logger.info("Data successfully migrated from relational database to desired graph database.")
return await graph_db.get_graph_data()
async def schema_only_ingestion(schema):
node_mapping = {}
edge_mapping = []
# Calling the ingest_database_schema function to return DataPoint subclasses
result = await ingest_database_schema(
schema=schema,
max_sample_rows=5,
)
database_schema = result["database_schema"]
schema_tables = result["schema_tables"]
schema_relationships = result["relationships"]
database_node_id = database_schema.id
node_mapping[database_node_id] = database_schema
for table in schema_tables:
table_node_id = table.id
# Add SchemaTable DataPoint as a node.
node_mapping[table_node_id] = table
edge_mapping.append(
(
table_node_id,
database_node_id,
"is_part_of",
dict(
source_node_id=table_node_id,
target_node_id=database_node_id,
relationship_name="is_part_of",
),
)
)
table_name_to_id = {t.name: t.id for t in schema_tables}
for rel in schema_relationships:
source_table_id = table_name_to_id.get(rel.source_table)
target_table_id = table_name_to_id.get(rel.target_table)
relationship_id = rel.id
# Add SchemaRelationship DataPoint as a node.
node_mapping[relationship_id] = rel
edge_mapping.append(
(
source_table_id,
relationship_id,
"has_relationship",
dict(
source_node_id=source_table_id,
target_node_id=relationship_id,
relationship_name=rel.relationship_type,
),
)
)
edge_mapping.append(
(
relationship_id,
target_table_id,
"has_relationship",
dict(
source_node_id=relationship_id,
target_node_id=target_table_id,
relationship_name=rel.relationship_type,
),
)
)
edge_mapping.append(
(
source_table_id,
target_table_id,
rel.relationship_type,
dict(
source_node_id=source_table_id,
target_node_id=target_table_id,
relationship_name=rel.relationship_type,
),
)
)
return node_mapping, edge_mapping
async def complete_database_ingestion(schema, migrate_column_data):
engine = get_migration_relational_engine()
# Create a mapping of node_id to node objects for referencing in edge creation
node_mapping = {}
edge_mapping = []
async with engine.engine.begin() as cursor:
# First, create table type nodes for all tables
for table_name, details in schema.items():
@ -38,7 +164,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
table_node = TableType(
id=uuid5(NAMESPACE_OID, name=table_name),
name=table_name,
description=f"Table: {table_name}",
description=f'Relational database table with the following name: "{table_name}".',
)
# Add TableType node to mapping (node will be added to the graph later based on this mapping)
@ -75,7 +201,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
name=node_id,
is_a=table_node,
properties=str(row_properties),
description=f"Row in {table_name} with {primary_key_col}={primary_key_value}",
description=f'Row in the relational database table named "{table_name}" with the following row data: {str(row_properties)}, where each dictionary key is the column name and the value is the column value. This row has the id: {node_id}',
)
# Store the node object in our mapping
@ -113,7 +239,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
id=uuid5(NAMESPACE_OID, name=column_node_id),
name=column_node_id,
properties=f"{key} {value} {table_name}",
description=f"Column name={key} and value={value} from column from table={table_name}",
description=f"column from relational database table={table_name}. Column name={key} and value={value}. The value of the column is related to the following row with this id: {row_node.id}. This column has the following ID: {column_node_id}",
)
node_mapping[column_node_id] = column_node
@ -180,39 +306,4 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
),
)
)
def _remove_duplicate_edges(edge_mapping):
seen = set()
unique_original_shape = []
for tup in edge_mapping:
# We go through all the tuples in the edge_mapping and we only add unique tuples to the list
# To eliminate duplicate edges.
source_id, target_id, rel_name, rel_dict = tup
# We need to convert the dictionary to a frozenset to be able to compare values for it
rel_dict_hashable = frozenset(sorted(rel_dict.items()))
hashable_tup = (source_id, target_id, rel_name, rel_dict_hashable)
# We use the seen set to keep track of unique edges
if hashable_tup not in seen:
# A list that has frozensets elements instead of dictionaries is needed to be able to compare values
seen.add(hashable_tup)
# append the original tuple shape (with the dictionary) if it's the first time we see it
unique_original_shape.append(tup)
return unique_original_shape
# Add all nodes and edges to the graph
# NOTE: Nodes and edges have to be added in batch for speed optimization, Especially for NetworkX.
# If we'd create nodes and add them to graph in real time the process would take too long.
# Every node and edge added to NetworkX is saved to file which is very slow when not done in batches.
await graph_db.add_nodes(list(node_mapping.values()))
await graph_db.add_edges(_remove_duplicate_edges(edge_mapping))
# In these steps we calculate the vector embeddings of our nodes and edges and save them to vector database
# Cognee uses this information to perform searches on the knowledge graph.
await index_data_points(list(node_mapping.values()))
await index_graph_edges()
logger.info("Data successfully migrated from relational database to desired graph database.")
return await graph_db.get_graph_data()
return node_mapping, edge_mapping
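A hedged sketch of invoking the new schema-only path (it assumes migrate_relational_database is importable from this module, and that the migration engine exposes a schema-extraction helper; the helper name here is illustrative):

import asyncio

from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
from cognee.infrastructure.databases.relational.get_migration_relational_engine import (
    get_migration_relational_engine,
)

async def main():
    graph_db = await get_graph_engine()
    engine = get_migration_relational_engine()
    schema = await engine.extract_schema()  # assumed schema-extraction helper

    # Only ingest tables, foreign keys, and the database node -- no row data.
    await migrate_relational_database(graph_db, schema, schema_only=True)

asyncio.run(main())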

View file

@ -32,7 +32,10 @@ async def resolve_data_directories(
import s3fs
fs = s3fs.S3FileSystem(
key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
key=s3_config.aws_access_key_id,
secret=s3_config.aws_secret_access_key,
token=s3_config.aws_session_token,
anon=False,
)
for item in data:

View file

@ -0,0 +1,134 @@
import json
from typing import List, Dict
from uuid import uuid5, NAMESPACE_OID
from cognee.infrastructure.engine.models.DataPoint import DataPoint
from sqlalchemy import text
from cognee.tasks.schema.models import DatabaseSchema, SchemaTable, SchemaRelationship
from cognee.infrastructure.databases.relational.get_migration_relational_engine import (
get_migration_relational_engine,
)
from cognee.infrastructure.databases.relational.config import get_migration_config
from datetime import datetime, timezone
async def ingest_database_schema(
schema,
max_sample_rows: int = 0,
) -> Dict[str, List[DataPoint] | DataPoint]:
"""
Extract database schema metadata (optionally with sample data) and return DataPoint models for graph construction.
Args:
schema: Database schema dict mapping table names to details (columns, primary_key, foreign_keys)
max_sample_rows: Maximum sample rows per table (0 means no sampling)
Returns:
Dict with keys:
"database_schema": DatabaseSchema
"schema_tables": List[SchemaTable]
"relationships": List[SchemaRelationship]
"""
tables = {}
sample_data = {}
schema_tables = []
schema_relationships = []
migration_config = get_migration_config()
engine = get_migration_relational_engine()
qi = engine.engine.dialect.identifier_preparer.quote
try:
max_sample_rows = max(0, int(max_sample_rows))
except (TypeError, ValueError):
max_sample_rows = 0
def qname(name: str):
split_name = name.split(".")
return ".".join(qi(p) for p in split_name)
async with engine.engine.begin() as cursor:
for table_name, details in schema.items():
tn = qname(table_name)
if max_sample_rows > 0:
rows_result = await cursor.execute(
text(f"SELECT * FROM {tn} LIMIT :limit;"), # noqa: S608 - tn is fully quoted
{"limit": max_sample_rows},
)
rows = [dict(r) for r in rows_result.mappings().all()]
else:
rows = []
if engine.engine.dialect.name == "postgresql":
if "." in table_name:
schema_part, table_part = table_name.split(".", 1)
else:
schema_part, table_part = "public", table_name
estimate = await cursor.execute(
text(
"SELECT reltuples::bigint AS estimate "
"FROM pg_class c "
"JOIN pg_namespace n ON n.oid = c.relnamespace "
"WHERE n.nspname = :schema AND c.relname = :table"
),
{"schema": schema_part, "table": table_part},
)
row_count_estimate = estimate.scalar() or 0
else:
count_result = await cursor.execute(text(f"SELECT COUNT(*) FROM {tn};")) # noqa: S608 - tn is fully quoted
row_count_estimate = count_result.scalar()
schema_table = SchemaTable(
id=uuid5(NAMESPACE_OID, name=f"{table_name}"),
name=table_name,
columns=json.dumps(details["columns"], default=str),
primary_key=details.get("primary_key"),
foreign_keys=json.dumps(details.get("foreign_keys", []), default=str),
sample_rows=json.dumps(rows, default=str),
row_count_estimate=row_count_estimate,
description=f"Relational database table with '{table_name}' with {len(details['columns'])} columns and approx. {row_count_estimate} rows."
f"Here are the columns this table contains: {details['columns']}"
f"Here are a few sample_rows to show the contents of the table: {rows}"
f"Table is part of the database: {migration_config.migration_db_name}",
)
schema_tables.append(schema_table)
tables[table_name] = details
sample_data[table_name] = rows
for fk in details.get("foreign_keys", []):
ref_table_fq = fk["ref_table"]
if "." not in ref_table_fq and "." in table_name:
ref_table_fq = f"{table_name.split('.', 1)[0]}.{ref_table_fq}"
relationship_name = (
f"{table_name}:{fk['column']}->{ref_table_fq}:{fk['ref_column']}"
)
relationship = SchemaRelationship(
id=uuid5(NAMESPACE_OID, name=relationship_name),
name=relationship_name,
source_table=table_name,
target_table=ref_table_fq,
relationship_type="foreign_key",
source_column=fk["column"],
target_column=fk["ref_column"],
description=f"Relational database table foreign key relationship between: {table_name}.{fk['column']}{ref_table_fq}.{fk['ref_column']}"
f"This foreing key relationship between table columns is a part of the following database: {migration_config.migration_db_name}",
)
schema_relationships.append(relationship)
id_str = f"{migration_config.migration_db_provider}:{migration_config.migration_db_name}"
database_schema = DatabaseSchema(
id=uuid5(NAMESPACE_OID, name=id_str),
name=migration_config.migration_db_name,
database_type=migration_config.migration_db_provider,
tables=json.dumps(tables, default=str),
sample_data=json.dumps(sample_data, default=str),
description=f"Database schema containing {len(schema_tables)} tables and {len(schema_relationships)} relationships. "
f"The database type is {migration_config.migration_db_provider}."
f"The database contains the following tables: {tables}",
)
return {
"database_schema": database_schema,
"schema_tables": schema_tables,
"relationships": schema_relationships,
}
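A usage sketch under the same assumptions (the schema dict normally comes from the migration engine; max_sample_rows=5 mirrors the schema-only migration call above):

import asyncio

async def main():
    schema = {}  # placeholder; supply the schema extracted by the migration engine
    result = await ingest_database_schema(schema=schema, max_sample_rows=5)

    database_schema = result["database_schema"]  # one DatabaseSchema DataPoint
    schema_tables = result["schema_tables"]      # one SchemaTable per table
    relationships = result["relationships"]      # one SchemaRelationship per foreign key
    print(database_schema.name, len(schema_tables), len(relationships))

asyncio.run(main())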

View file

@ -0,0 +1,40 @@
from cognee.infrastructure.engine.models.DataPoint import DataPoint
from typing import List, Dict, Optional
from datetime import datetime
class DatabaseSchema(DataPoint):
"""Represents a complete database schema with sample data"""
name: str
database_type: str # sqlite, postgres, etc.
tables: str # Reuse existing schema format from SqlAlchemyAdapter
sample_data: str # Limited examples per table
description: str
metadata: dict = {"index_fields": ["description", "name"]}
class SchemaTable(DataPoint):
"""Represents an individual table schema with relationships"""
name: str
columns: str # Column definitions with types
primary_key: Optional[str]
foreign_keys: str # Foreign key relationships
sample_rows: str # Max 3-5 example rows
row_count_estimate: Optional[int] # Actual table size
description: str
metadata: dict = {"index_fields": ["description", "name"]}
class SchemaRelationship(DataPoint):
"""Represents relationships between tables"""
name: str
source_table: str
target_table: str
relationship_type: str # "foreign_key", "one_to_many", etc.
source_column: str
target_column: str
description: str
metadata: dict = {"index_fields": ["description", "name"]}
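For illustration, constructing one of these DataPoints directly; the table and column names are placeholders, and the uuid5 naming mirrors the ingestion task above:

from uuid import uuid5, NAMESPACE_OID

relationship = SchemaRelationship(
    id=uuid5(NAMESPACE_OID, name="orders:user_id->users:id"),
    name="orders:user_id->users:id",
    source_table="orders",
    target_table="users",
    relationship_type="foreign_key",
    source_column="user_id",
    target_column="id",
    description="Foreign key relationship between orders.user_id and users.id.",
)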

View file

@ -67,6 +67,44 @@ async def test_getting_of_documents(dataset_name_1):
)
async def test_vector_engine_search_none_limit():
file_path_quantum = os.path.join(
pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt"
)
file_path_nlp = os.path.join(
pathlib.Path(__file__).parent,
"test_data/Natural_language_processing.txt",
)
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
await cognee.add(file_path_quantum)
await cognee.add(file_path_nlp)
await cognee.cognify()
query_text = "Tell me about Quantum computers"
from cognee.infrastructure.databases.vector import get_vector_engine
vector_engine = get_vector_engine()
collection_name = "Entity_name"
query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]
result = await vector_engine.search(
collection_name=collection_name, query_vector=query_vector, limit=None
)
# Check that we did not accidentally use any default value for limit
# in vector search along the way (like 5, 10, or 15)
assert len(result) > 15
async def main():
cognee.config.set_vector_db_config(
{
@ -165,6 +203,8 @@ async def main():
tables_in_database = await vector_engine.get_collection_names()
assert len(tables_in_database) == 0, "ChromaDB database is not empty"
await test_vector_engine_search_none_limit()
if __name__ == "__main__":
import asyncio

View file

@ -41,7 +41,12 @@ class TestCogneeServerStart(unittest.TestCase):
def tearDownClass(cls):
# Terminate the server process
if hasattr(cls, "server_process") and cls.server_process:
os.killpg(os.getpgid(cls.server_process.pid), signal.SIGTERM)
if hasattr(os, "killpg"):
# Unix-like systems: Use process groups
os.killpg(os.getpgid(cls.server_process.pid), signal.SIGTERM)
else:
# Windows: Just terminate the main process
cls.server_process.terminate()
cls.server_process.wait()
def test_server_is_running(self):

View file

@ -0,0 +1,9 @@
A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
Emerging error-correcting codes aim to mitigate decoherence effects and are expected to pave the way for fault-tolerant quantum processors. Laboratories across the globe are investigating diverse qubit implementations, such as superconducting circuits, trapped ions, neutral atoms, and photonic systems. Significant government funding and private investment have created an ecosystem of startups and consortia focused on accelerating quantum hardware and software development. Universities are meanwhile launching interdisciplinary programs that teach physics, computer science, and engineering concepts necessary for tomorrow's quantum workforce. Establishing reliable benchmarking standards will be essential for objectively comparing devices and charting realistic milestones toward practical quantum advantage.
Industry roadmaps anticipate that achieving error rates below the threshold for surface codes will require millions of physical qubits per logical qubit, highlighting daunting scale challenges. Researchers are therefore exploring hardware-software co-design strategies, where algorithmic breakthroughs and device engineering progress hand in hand to minimize overhead. Hybrid quantum-classical workflows, exemplified by variational algorithms running on near-term devices, offer a pragmatic path to extracting value before full fault tolerance arrives. Meanwhile, cryptographers are advancing post-quantum encryption schemes to safeguard information in a future where Shor's algorithm becomes practical. The interplay between theoretical advances, experimental ingenuity, and policy considerations will ultimately determine how transformative quantum computing becomes for science, industry, and society.
Collaborative open-source toolkits are lowering the barrier to entry for developers eager to prototype quantum algorithms and simulate small devices on classical hardware. As these software frameworks mature, they will foster standardization of gate libraries, circuit optimization passes, and error-mitigation techniques. At the same time, advances in cryogenic engineering, vacuum systems, and photonics are steadily improving the stability and manufacturability of next-generation qubit platforms. Policymakers are beginning to craft export controls and ethical guidelines aimed at preventing misuse while encouraging international collaboration in fundamental research. Ultimately, the success of quantum technology will hinge on integrating robust hardware, intelligent software, and a skilled workforce within an environment of responsible governance.

View file

@ -0,0 +1,211 @@
import os
import pathlib
import cognee
from cognee.shared.logging_utils import get_logger
from cognee.infrastructure.files.storage import get_storage_config
from cognee.modules.data.models import Data
from cognee.modules.users.methods import get_default_user
from cognee.modules.search.types import SearchType
from cognee.modules.search.operations import get_history
logger = get_logger()
async def test_local_file_deletion(data_text, file_location):
from sqlalchemy import select
import hashlib
from cognee.infrastructure.databases.relational import get_relational_engine
engine = get_relational_engine()
async with engine.get_async_session() as session:
# Get hash of data contents
encoded_text = data_text.encode("utf-8")
data_hash = hashlib.md5(encoded_text).hexdigest()
# Get data entry from database based on hash contents
data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one()
assert os.path.isfile(data.raw_data_location.replace("file://", "")), (
f"Data location doesn't exist: {data.raw_data_location}"
)
# Test deletion of data along with local files created by cognee
await engine.delete_data_entity(data.id)
assert not os.path.exists(data.raw_data_location.replace("file://", "")), (
f"Data location still exists after deletion: {data.raw_data_location}"
)
async with engine.get_async_session() as session:
# Get data entry from database based on file path
data = (
await session.scalars(select(Data).where(Data.raw_data_location == file_location))
).one()
assert os.path.isfile(data.raw_data_location.replace("file://", "")), (
f"Data location doesn't exist: {data.raw_data_location}"
)
# Test that local files not created by cognee won't get deleted
await engine.delete_data_entity(data.id)
assert os.path.exists(data.raw_data_location.replace("file://", "")), (
f"Data location doesn't exist: {data.raw_data_location}"
)
async def test_getting_of_documents(dataset_name_1):
# Test getting of documents for search per dataset
from cognee.modules.users.permissions.methods import get_document_ids_for_user
user = await get_default_user()
document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
assert len(document_ids) == 1, (
f"Number of expected documents doesn't match {len(document_ids)} != 1"
)
# Test getting of documents for search when no dataset is provided
user = await get_default_user()
document_ids = await get_document_ids_for_user(user.id)
assert len(document_ids) == 2, (
f"Number of expected documents doesn't match {len(document_ids)} != 2"
)
async def test_vector_engine_search_none_limit():
file_path_quantum = os.path.join(
pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt"
)
file_path_nlp = os.path.join(
pathlib.Path(__file__).parent,
"test_data/Natural_language_processing.txt",
)
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
await cognee.add(file_path_quantum)
await cognee.add(file_path_nlp)
await cognee.cognify()
query_text = "Tell me about Quantum computers"
from cognee.infrastructure.databases.vector import get_vector_engine
vector_engine = get_vector_engine()
collection_name = "Entity_name"
query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]
result = await vector_engine.search(
collection_name=collection_name, query_vector=query_vector, limit=None
)
# Check that we did not accidentally use any default value for limit
# in vector search along the way (like 5, 10, or 15)
assert len(result) > 15
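# For contrast (a hypothetical addition, not part of the original test): an
# explicit limit should cap the number of hits returned, e.g.
#   limited = await vector_engine.search(
#       collection_name=collection_name, query_vector=query_vector, limit=5
#   )
#   assert len(limited) <= 5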
async def main():
cognee.config.set_vector_db_config(
{
"vector_db_provider": "lancedb",
}
)
data_directory_path = str(
pathlib.Path(
os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_lancedb")
).resolve()
)
cognee.config.data_root_directory(data_directory_path)
cognee_directory_path = str(
pathlib.Path(
os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_lancedb")
).resolve()
)
cognee.config.system_root_directory(cognee_directory_path)
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
dataset_name_1 = "natural_language"
dataset_name_2 = "quantum"
explanation_file_path = os.path.join(
pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
)
await cognee.add([explanation_file_path], dataset_name_1)
text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
"""
await cognee.add([text], dataset_name_2)
await cognee.cognify([dataset_name_2, dataset_name_1])
from cognee.infrastructure.databases.vector import get_vector_engine
await test_getting_of_documents(dataset_name_1)
vector_engine = get_vector_engine()
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"]
search_results = await cognee.search(
query_type=SearchType.INSIGHTS, query_text=random_node_name
)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(
query_type=SearchType.CHUNKS, query_text=random_node_name, datasets=[dataset_name_2]
)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n")
for result in search_results:
print(f"{result}\n")
graph_completion = await cognee.search(
query_type=SearchType.GRAPH_COMPLETION,
query_text=random_node_name,
datasets=[dataset_name_2],
)
assert len(graph_completion) != 0, "Completion result is empty."
print("Completion result is:")
print(graph_completion)
search_results = await cognee.search(
query_type=SearchType.SUMMARIES, query_text=random_node_name
)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\n\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")
user = await get_default_user()
history = await get_history(user.id)
assert len(history) == 8, "Search history is not correct."
await cognee.prune.prune_data()
data_root_directory = get_storage_config()["data_root_directory"]
assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
await cognee.prune.prune_system(metadata=True)
connection = await vector_engine.get_connection()
tables_in_database = await connection.table_names()
assert len(tables_in_database) == 0, "LanceDB database is not empty"
await test_vector_engine_search_none_limit()
if __name__ == "__main__":
import asyncio
asyncio.run(main())

View file

@ -68,6 +68,44 @@ async def test_getting_of_documents(dataset_name_1):
)
async def test_vector_engine_search_none_limit():
file_path_quantum = os.path.join(
pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt"
)
file_path_nlp = os.path.join(
pathlib.Path(__file__).parent,
"test_data/Natural_language_processing.txt",
)
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
await cognee.add(file_path_quantum)
await cognee.add(file_path_nlp)
await cognee.cognify()
query_text = "Tell me about Quantum computers"
from cognee.infrastructure.databases.vector import get_vector_engine
vector_engine = get_vector_engine()
collection_name = "Entity_name"
query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]
result = await vector_engine.search(
collection_name=collection_name, query_vector=query_vector, limit=None
)
# Check that we did not accidentally use any default value for limit
# in vector search along the way (like 5, 10, or 15)
assert len(result) > 15
async def main():
cognee.config.set_vector_db_config(
{"vector_db_url": "", "vector_db_key": "", "vector_db_provider": "pgvector"}
@ -174,6 +212,8 @@ async def main():
tables_in_database = await vector_engine.get_table_names()
assert len(tables_in_database) == 0, "PostgreSQL database is not empty"
await test_vector_engine_search_none_limit()
if __name__ == "__main__":
import asyncio

View file

@ -197,6 +197,80 @@ async def relational_db_migration():
print(f"All checks passed for {graph_db_provider} provider with '{relationship_label}' edges!")
async def test_schema_only_migration():
# 1. Setup test DB and extract schema
migration_engine = await setup_test_db()
schema = await migration_engine.extract_schema()
# 2. Setup graph engine
graph_engine = await get_graph_engine()
# 3. Migrate schema only
await migrate_relational_database(graph_engine, schema=schema, schema_only=True)
# 4. Verify number of tables through search
search_results = await cognee.search(
query_text="How many tables are there in this database",
query_type=cognee.SearchType.GRAPH_COMPLETION,
top_k=30,
)
assert any("11" in r for r in search_results), (
"Number of tables in the database reported in search_results is either None or not equal to 11"
)
graph_db_provider = os.getenv("GRAPH_DATABASE_PROVIDER", "networkx").lower()
edge_counts = {
"is_part_of": 0,
"has_relationship": 0,
"foreign_key": 0,
}
if graph_db_provider == "neo4j":
for rel_type in edge_counts.keys():
query_str = f"""
MATCH ()-[r:{rel_type}]->()
RETURN count(r) as c
"""
rows = await graph_engine.query(query_str)
edge_counts[rel_type] = rows[0]["c"]
elif graph_db_provider == "kuzu":
for rel_type in edge_counts.keys():
query_str = f"""
MATCH ()-[r:EDGE]->()
WHERE r.relationship_name = '{rel_type}'
RETURN count(r) as c
"""
rows = await graph_engine.query(query_str)
edge_counts[rel_type] = rows[0][0]
elif graph_db_provider == "networkx":
nodes, edges = await graph_engine.get_graph_data()
for _, _, key, _ in edges:
if key in edge_counts:
edge_counts[key] += 1
else:
raise ValueError(f"Unsupported graph database provider: {graph_db_provider}")
# 5. Assert counts match expected values
expected_counts = {
"is_part_of": 11,
"has_relationship": 22,
"foreign_key": 11,
}
for rel_type, expected in expected_counts.items():
actual = edge_counts[rel_type]
assert actual == expected, (
f"Expected {expected} edges for relationship '{rel_type}', but found {actual}"
)
print("Schema-only migration edge counts validated successfully!")
print(f"Edge counts: {edge_counts}")
async def test_migration_sqlite():
database_to_migrate_path = os.path.join(pathlib.Path(__file__).parent, "test_data/")
@ -209,6 +283,7 @@ async def test_migration_sqlite():
)
await relational_db_migration()
await test_schema_only_migration()
async def test_migration_postgres():
@ -224,6 +299,7 @@ async def test_migration_postgres():
}
)
await relational_db_migration()
await test_schema_only_migration()
async def main():

View file

@ -7,7 +7,6 @@ import cognee
from cognee.low_level import setup, DataPoint
from cognee.tasks.storage import add_data_points
from cognee.modules.graph.utils import resolve_edges_to_text
from cognee.infrastructure.databases.exceptions import DatabaseNotCreatedError
from cognee.modules.retrieval.graph_completion_context_extension_retriever import (
GraphCompletionContextExtensionRetriever,
)
@ -165,9 +164,6 @@ class TestGraphCompletionWithContextExtensionRetriever:
retriever = GraphCompletionContextExtensionRetriever()
with pytest.raises(DatabaseNotCreatedError):
await retriever.get_context("Who works at Figma?")
await setup()
context = await retriever.get_context("Who works at Figma?")

View file

@ -7,7 +7,6 @@ import cognee
from cognee.low_level import setup, DataPoint
from cognee.modules.graph.utils import resolve_edges_to_text
from cognee.tasks.storage import add_data_points
from cognee.infrastructure.databases.exceptions import DatabaseNotCreatedError
from cognee.modules.retrieval.graph_completion_cot_retriever import GraphCompletionCotRetriever
@ -158,9 +157,6 @@ class TestGraphCompletionCoTRetriever:
retriever = GraphCompletionCotRetriever()
with pytest.raises(DatabaseNotCreatedError):
await retriever.get_context("Who works at Figma?")
await setup()
context = await retriever.get_context("Who works at Figma?")

View file

@ -7,7 +7,6 @@ import cognee
from cognee.low_level import setup, DataPoint
from cognee.modules.graph.utils import resolve_edges_to_text
from cognee.tasks.storage import add_data_points
from cognee.infrastructure.databases.exceptions import DatabaseNotCreatedError
from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
@ -218,9 +217,6 @@ class TestGraphCompletionRetriever:
retriever = GraphCompletionRetriever()
with pytest.raises(DatabaseNotCreatedError):
await retriever.get_context("Who works at Figma?")
await setup()
context = await retriever.get_context("Who works at Figma?")

View file

@ -15,6 +15,9 @@ async def cognee_demo():
current_directory = Path(__file__).resolve().parent.parent
file_path = os.path.join(current_directory, "data", "alice_in_wonderland.txt")
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
# Call Cognee to process document
await cognee.add(file_path)
await cognee.cognify()

View file

@ -48,7 +48,6 @@ async def main():
query = "When was Kamala Harris in office?"
triplets = await brute_force_triplet_search(
query=query,
user=user,
top_k=3,
collections=["graphitinode_content", "graphitinode_name", "graphitinode_summary"],
)

View file

@ -0,0 +1,188 @@
import os
import cognee
import pathlib
from cognee.modules.users.exceptions import PermissionDeniedError
from cognee.shared.logging_utils import get_logger
from cognee.modules.search.types import SearchType
from cognee.modules.users.methods import create_user
from cognee.modules.users.permissions.methods import authorized_give_permission_on_datasets
from cognee.modules.users.roles.methods import add_user_to_role
from cognee.modules.users.roles.methods import create_role
from cognee.modules.users.tenants.methods import create_tenant
from cognee.modules.users.tenants.methods import add_user_to_tenant
from cognee.modules.engine.operations.setup import setup
from cognee.shared.logging_utils import setup_logging, CRITICAL
logger = get_logger()
async def main():
# ENABLE PERMISSIONS FEATURE
# Note: When ENABLE_BACKEND_ACCESS_CONTROL is enabled, the vector provider is automatically set to LanceDB
# and the graph provider is set to Kuzu.
os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "True"
# Set the rest of your environment variables as needed. By default OpenAI is used as the LLM provider.
# Reference the .env.template file for available options and how to change the LLM provider: https://github.com/topoteretes/cognee/blob/main/.env.template
# For example, to set your OpenAI LLM API key use:
# os.environ["LLM_API_KEY"] = "your-api-key"
# Create a clean slate for cognee -- reset data and system state
print("Resetting cognee data...")
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
print("Data reset complete.\n")
# Set up the necessary databases and tables for user management.
await setup()
# NOTE: When a document is added in Cognee with permissions enabled, only the owner of the document
# has permission to work with the document initially.
# Add document for user_1, add it under dataset name AI
explanation_file_path = os.path.join(
pathlib.Path(__file__).parent, "../data/artificial_intelligence.pdf"
)
print("Creating user_1: user_1@example.com")
user_1 = await create_user("user_1@example.com", "example")
await cognee.add([explanation_file_path], dataset_name="AI", user=user_1)
# Add document for user_2, add it under dataset name QUANTUM
text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages
this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the
preparation and manipulation of quantum states.
"""
print("\nCreating user_2: user_2@example.com")
user_2 = await create_user("user_2@example.com", "example")
await cognee.add([text], dataset_name="QUANTUM", user=user_2)
# Run cognify for both datasets as the appropriate user/owner
print("\nCreating different datasets for user_1 (AI dataset) and user_2 (QUANTUM dataset)")
ai_cognify_result = await cognee.cognify(["AI"], user=user_1)
quantum_cognify_result = await cognee.cognify(["QUANTUM"], user=user_2)
# Extract dataset_ids from cognify results
def extract_dataset_id_from_cognify(cognify_result):
"""Extract dataset_id from cognify output dictionary"""
for dataset_id, pipeline_result in cognify_result.items():
return dataset_id # Return the first dataset_id
return None
# Get dataset IDs from cognify results
# Note: When we want to work with datasets from other users (search, add, cognify, etc.) we must supply dataset
# information through dataset_id; using a dataset name only looks for datasets owned by the current user
ai_dataset_id = extract_dataset_id_from_cognify(ai_cognify_result)
quantum_dataset_id = extract_dataset_id_from_cognify(quantum_cognify_result)
# We can see here that user_1 can read his own dataset (AI dataset)
search_results = await cognee.search(
query_type=SearchType.GRAPH_COMPLETION,
query_text="What is in the document?",
user=user_1,
datasets=[ai_dataset_id],
)
print("\nSearch results as user_1 on dataset owned by user_1:")
for result in search_results:
print(f"{result}\n")
# But user_1 can't read the dataset owned by user_2 (QUANTUM dataset)
print("\nSearch result as user_1 on the dataset owned by user_2:")
try:
search_results = await cognee.search(
query_type=SearchType.GRAPH_COMPLETION,
query_text="What is in the document?",
user=user_1,
datasets=[quantum_dataset_id],
)
except PermissionDeniedError:
print(f"User: {user_1} does not have permission to read from dataset: QUANTUM")
# user_1 currently also can't add a document to user_2's dataset (QUANTUM dataset)
print("\nAttempting to add new data as user_1 to dataset owned by user_2:")
try:
await cognee.add(
[explanation_file_path],
dataset_id=quantum_dataset_id,
user=user_1,
)
except PermissionDeniedError:
print(f"User: {user_1} does not have permission to write to dataset: QUANTUM")
# We've shown that user_1 can't interact with the dataset from user_2
# Now have user_2 give proper permission to user_1 to read QUANTUM dataset
# Note: supported permission types are "read", "write", "delete" and "share"
print(
"\nOperation started as user_2 to give read permission to user_1 for the dataset owned by user_2"
)
await authorized_give_permission_on_datasets(
user_1.id,
[quantum_dataset_id],
"read",
user_2.id,
)
# Now user_1 can read from quantum dataset after proper permissions have been assigned by the QUANTUM dataset owner.
print("\nSearch result as user_1 on the dataset owned by user_2:")
search_results = await cognee.search(
query_type=SearchType.GRAPH_COMPLETION,
query_text="What is in the document?",
user=user_1,
dataset_ids=[quantum_dataset_id],
)
for result in search_results:
print(f"{result}\n")
# If we'd like for user_1 to add new documents to the QUANTUM dataset owned by user_2, user_1 would have to get
# "write" access permission, which user_1 currently does not have
# Users can also be added to Roles and Tenants and then permission can be assigned on a Role/Tenant level as well
# To create a Role a user first must be an owner of a Tenant
print("User 2 is creating CogneeLab tenant/organization")
tenant_id = await create_tenant("CogneeLab", user_2.id)
print("\nUser 2 is creating Researcher role")
role_id = await create_role(role_name="Researcher", owner_id=user_2.id)
print("\nCreating user_3: user_3@example.com")
user_3 = await create_user("user_3@example.com", "example")
# To add a user to a role, the user must be part of the same tenant/organization
print("\nOperation started as user_2 to add user_3 to CogneeLab tenant/organization")
await add_user_to_tenant(user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id)
print(
"\nOperation started by user_2, as tenant owner, to add user_3 to Researcher role inside the tenant/organization"
)
await add_user_to_role(user_id=user_3.id, role_id=role_id, owner_id=user_2.id)
print(
"\nOperation started as user_2 to give read permission to Researcher role for the dataset owned by user_2"
)
await authorized_give_permission_on_datasets(
role_id,
[quantum_dataset_id],
"read",
user_2.id,
)
# Now user_3 can read from QUANTUM dataset as part of the Researcher role after proper permissions have been assigned by the QUANTUM dataset owner, user_2.
print("\nSearch result as user_3 on the dataset owned by user_2:")
search_results = await cognee.search(
query_type=SearchType.GRAPH_COMPLETION,
query_text="What is in the document?",
user=user_3,
dataset_ids=[quantum_dataset_id],
)
for result in search_results:
print(f"{result}\n")
# Note: All of these function calls and the permission system are available through our backend endpoints as well
if __name__ == "__main__":
import asyncio
logger = setup_logging(log_level=CRITICAL)
asyncio.run(main())

View file

@ -1,16 +1,15 @@
from pathlib import Path
import asyncio
import cognee
import os
import cognee
from cognee.infrastructure.databases.relational.config import get_migration_config
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.api.v1.visualize.visualize import visualize_graph
from cognee.infrastructure.databases.relational import (
get_migration_relational_engine,
)
from cognee.modules.search.types import SearchType
from cognee.infrastructure.databases.relational import (
create_db_and_tables as create_relational_db_and_tables,
)
@ -32,16 +31,29 @@ from cognee.infrastructure.databases.vector.pgvector import (
async def main():
engine = get_migration_relational_engine()
# Clean all data stored in Cognee
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
# Needed to create appropriate tables only on the Cognee side
# Needed to create appropriate database tables only on the Cognee side
await create_relational_db_and_tables()
await create_vector_db_and_tables()
# In case environment variables are not set, use the example database from the Cognee repo
migration_db_provider = os.environ.get("MIGRATION_DB_PROVIDER", "sqlite")
migration_db_path = os.environ.get(
"MIGRATION_DB_PATH",
os.path.join(Path(__file__).resolve().parent.parent.parent, "cognee/tests/test_data"),
)
migration_db_name = os.environ.get("MIGRATION_DB_NAME", "migration_database.sqlite")
migration_config = get_migration_config()
migration_config.migration_db_provider = migration_db_provider
migration_config.migration_db_path = migration_db_path
migration_config.migration_db_name = migration_db_name
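# Equivalently (a usage note added here, not part of the original example), these
# values can come from the environment before running the script, e.g.:
#   export MIGRATION_DB_PROVIDER=sqlite
#   export MIGRATION_DB_PATH=/path/to/cognee/tests/test_data   # hypothetical path
#   export MIGRATION_DB_NAME=migration_database.sqlite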
engine = get_migration_relational_engine()
print("\nExtracting schema of database to migrate.")
schema = await engine.extract_schema()
print(f"Migrated database schema:\n{schema}")
@ -53,10 +65,6 @@ async def main():
await migrate_relational_database(graph, schema=schema)
print("Relational database migration complete.")
# Define location where to store html visualization of graph of the migrated database
home_dir = os.path.expanduser("~")
destination_file_path = os.path.join(home_dir, "graph_visualization.html")
# Make sure to set top_k to a high value for a broader search; the default value is only 10!
# top_k represents the number of graph triplets supplied to the LLM to answer your question
search_results = await cognee.search(
@ -69,13 +77,25 @@ async def main():
# Having a top_k value set too high might overwhelm the LLM context when specific questions need to be answered.
# For this kind of question we've set the top_k to 30
search_results = await cognee.search(
query_type=SearchType.GRAPH_COMPLETION_COT,
query_type=SearchType.GRAPH_COMPLETION,
query_text="What invoices are related to Leonie Köhler?",
top_k=30,
)
print(f"Search results: {search_results}")
# test.html is a file with visualized data migration
search_results = await cognee.search(
query_type=SearchType.GRAPH_COMPLETION,
query_text="What invoices are related to Luís Gonçalves?",
top_k=30,
)
print(f"Search results: {search_results}")
# If you check the relational database for this example you can see that the search results successfully found all
# the invoices related to the two customers, without any hallucinations or additional information
# Define location where to store html visualization of graph of the migrated database
home_dir = os.path.expanduser("~")
destination_file_path = os.path.join(home_dir, "graph_visualization.html")
print("Adding html visualization of graph database after migration.")
await visualize_graph(destination_file_path)
print(f"Visualization can be found at: {destination_file_path}")

View file

@ -29,8 +29,11 @@ async def main():
print("=" * 60)
# Start the UI server
def dummy_callback(pid):
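# Receives the spawned UI server's process id; a no-op suffices here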
pass
server = cognee.start_ui(
host="localhost",
pid_callback=dummy_callback,
port=3000,
open_browser=True, # This will automatically open your browser
)

View file

@ -1,78 +0,0 @@
# cognee Graduates from GitHub Secure Open Source Program
*Building Trust and Security in AI Memory Systems*
We're excited to announce that **cognee** has successfully graduated from the GitHub Secure Open Source Program! This milestone reflects our commitment to maintaining the highest standards of security and reliability in open source AI infrastructure.
## What is cognee?
cognee is an open source library that provides **memory for AI agents in just 5 lines of code**. It transforms raw data into structured knowledge graphs through our innovative ECL (Extract, Cognify, Load) pipeline, enabling AI systems to build dynamic memory that goes far beyond traditional RAG systems.
### Key Features:
- **Interconnected Knowledge**: Links conversations, documents, images, and audio transcriptions
- **Scalable Architecture**: Loads data to graph and vector databases using only Pydantic
- **30+ Data Sources**: Manipulates data while ingesting from diverse sources
- **Developer-Friendly**: Reduces complexity and cost compared to traditional RAG implementations
## GitHub Secure Open Source Program Achievement
The GitHub Secure Open Source Program helps maintainers adopt security best practices and ensures that critical open source projects meet enterprise-grade security standards. Our graduation demonstrates that cognee has successfully implemented:
- **Security-first development practices**
- **Comprehensive vulnerability management**
- **Secure dependency management**
- **Code quality and review processes**
- **Community safety guidelines**
## Why This Matters for AI Development
As AI systems become more prevalent in production environments, security becomes paramount. cognee's graduation from this program means developers can confidently build AI memory systems knowing they're using infrastructure that meets rigorous security standards.
### Benefits for Our Community:
- **Enterprise Adoption**: Companies can deploy cognee with confidence in security-sensitive environments
- **Vulnerability Response**: Our security practices ensure rapid identification and resolution of potential issues
- **Supply Chain Security**: Dependencies are carefully managed and regularly audited
- **Trust & Transparency**: Open source development with security-first principles
## What's Next?
With over **5,000 GitHub stars** and a growing community of developers, cognee continues to evolve. We recently launched **Cogwit beta** - our fully-hosted AI Memory platform, and our [research paper](https://arxiv.org/abs/2505.24478) demonstrates the effectiveness of our approach.
Our commitment to security doesn't end with graduation. We'll continue following best practices and contributing to the broader conversation about secure AI infrastructure.
## Get Started Today
Ready to add intelligent memory to your AI applications? Get started with cognee:
```python
import cognee
import asyncio
async def main():
# Add your data
await cognee.add("Your document content here")
# Transform into knowledge graph
await cognee.cognify()
# Query intelligently
results = await cognee.search("What insights can you find?")
for result in results:
print(result)
asyncio.run(main())
```
## Join Our Community
- 🌟 [Star us on GitHub](https://github.com/topoteretes/cognee)
- 💬 [Join our Discord](https://discord.gg/NQPKmU5CCg)
- 📖 [Read our documentation](https://docs.cognee.ai/)
- 🚀 [Try Cogwit beta](https://platform.cognee.ai/)
The future of AI memory is secure, scalable, and open source. We're grateful for the GitHub team's support and excited to continue building the infrastructure that powers the next generation of intelligent applications.
---
*About cognee: We're building the memory layer for AI agents, enabling them to learn, remember, and reason across conversations and data sources. Our open source approach ensures that advanced AI memory capabilities are accessible to developers worldwide.*

View file

@ -0,0 +1,107 @@
{
"conversations": [
{
"id": "conv_001",
"timestamp": "2024-01-15T10:30:00Z",
"topic": "async/await patterns",
"user_query": "I'm building a web scraper that needs to handle thousands of URLs concurrently. What's the best way to structure this with asyncio?",
"assistant_response": "For high-concurrency web scraping, you'll want to use asyncio with aiohttp and implement proper semaphore-based rate limiting. Here's a robust pattern:\n\n```python\nimport asyncio\nimport aiohttp\nfrom typing import List, Dict, Any\n\nclass AsyncWebScraper:\n def __init__(self, max_concurrent: int = 100):\n self.semaphore = asyncio.Semaphore(max_concurrent)\n self.session = None\n \n async def __aenter__(self):\n self.session = aiohttp.ClientSession(\n timeout=aiohttp.ClientTimeout(total=30),\n connector=aiohttp.TCPConnector(limit=100)\n )\n return self\n \n async def __aexit__(self, exc_type, exc_val, exc_tb):\n if self.session:\n await self.session.close()\n \n async def fetch_url(self, url: str) -> Dict[str, Any]:\n async with self.semaphore:\n try:\n async with self.session.get(url) as response:\n content = await response.text()\n return {\n 'url': url,\n 'status': response.status,\n 'content': content,\n 'headers': dict(response.headers)\n }\n except Exception as e:\n return {'url': url, 'error': str(e)}\n \n async def scrape_urls(self, urls: List[str]) -> List[Dict[str, Any]]:\n tasks = [self.fetch_url(url) for url in urls]\n return await asyncio.gather(*tasks, return_exceptions=True)\n\n# Usage\nasync def main():\n urls = ['http://example.com'] * 1000 # Your URLs here\n \n async with AsyncWebScraper(max_concurrent=50) as scraper:\n results = await scraper.scrape_urls(urls)\n \n successful = [r for r in results if 'error' not in r]\n print(f'Successfully scraped {len(successful)} URLs')\n\nif __name__ == '__main__':\n asyncio.run(main())\n```\n\nKey points:\n- Semaphore controls concurrency to avoid overwhelming servers\n- Context manager ensures proper cleanup\n- TCPConnector with connection pooling\n- Proper exception handling for failed requests",
"code_context": {
"file": "scraper.py",
"language": "python",
"patterns_discussed": ["async/await", "context_managers", "semaphores", "aiohttp", "error_handling"]
},
"follow_up_questions": [
"How do I add retry logic for failed requests?",
"What's the best way to parse the scraped HTML content?"
]
},
{
"id": "conv_002",
"timestamp": "2024-01-16T14:20:00Z",
"topic": "dataclass vs pydantic",
"user_query": "When should I use dataclasses vs Pydantic models? I'm building an API and need to handle user input validation.",
"assistant_response": "Great question! For API development with input validation, here's when to use each:\n\n**Use Pydantic when:**\n- Building APIs (FastAPI, Django REST)\n- Need runtime validation of external data\n- Want automatic JSON serialization/deserialization\n- Need complex validation rules\n- Working with configuration files\n\n**Use dataclasses when:**\n- Internal data structures\n- Simple data containers\n- Want to stay in stdlib\n- Performance is critical (dataclasses are faster)\n\nFor your API, I'd recommend Pydantic:\n\n```python\nfrom pydantic import BaseModel, validator, Field\nfrom typing import Optional, List\nfrom datetime import datetime\nfrom enum import Enum\n\nclass UserRole(str, Enum):\n ADMIN = 'admin'\n USER = 'user'\n GUEST = 'guest'\n\nclass CreateUserRequest(BaseModel):\n username: str = Field(..., min_length=3, max_length=50, regex=r'^[a-zA-Z0-9_]+$')\n email: str = Field(..., regex=r'^[\\w\\.-]+@[\\w\\.-]+\\.\\w+$')\n password: str = Field(..., min_length=8)\n role: UserRole = UserRole.USER\n tags: Optional[List[str]] = None\n \n @validator('password')\n def validate_password(cls, v):\n if not any(c.isupper() for c in v):\n raise ValueError('Password must contain uppercase letter')\n if not any(c.isdigit() for c in v):\n raise ValueError('Password must contain digit')\n return v\n \n @validator('tags')\n def validate_tags(cls, v):\n if v and len(v) > 10:\n raise ValueError('Too many tags')\n return v\n\nclass User(BaseModel):\n id: int\n username: str\n email: str\n role: UserRole\n created_at: datetime\n is_active: bool = True\n \n class Config:\n orm_mode = True\n json_encoders = { datetime: lambda v: v.isoformat() }\n\n# FastAPI usage\nfrom fastapi import FastAPI, HTTPException\n\napp = FastAPI()\n\n@app.post('/users/', response_model=User)\nasync def create_user(user_data: CreateUserRequest):\n try:\n new_user = await create_user_in_db(user_data)\n return User.from_orm(new_user)\n except Exception as e:\n raise HTTPException(status_code=400, detail=str(e))\n```\n\nPydantic gives you:\n- Automatic request validation\n- Clear error messages\n- Type coercion\n- JSON Schema generation\n- IDE support with type hints",
"code_context": {
"file": "models.py",
"language": "python",
"patterns_discussed": ["pydantic", "dataclasses", "validation", "fastapi", "type_hints", "enums"]
},
"follow_up_questions": [
"How do I handle nested validation with Pydantic?",
"Can I use Pydantic with SQLAlchemy models?"
]
},
{
"id": "conv_003",
"timestamp": "2024-01-17T09:15:00Z",
"topic": "testing patterns",
"user_query": "I'm struggling with testing async code and database interactions. What's the best approach for pytest with async functions?",
"assistant_response": "Testing async code can be tricky! Here's a comprehensive approach using pytest-asyncio and proper test isolation...",
"code_context": {
"file": "test_users.py",
"language": "python",
"patterns_discussed": ["pytest", "async_testing", "fixtures", "mocking", "database_testing", "fastapi_testing"]
},
"follow_up_questions": [
"How do I test WebSocket connections?",
"What's the best way to test database migrations?"
]
},
{
"id": "conv_004",
"timestamp": "2024-01-18T16:45:00Z",
"topic": "performance optimization",
"user_query": "My FastAPI app is getting slow with large datasets. How can I optimize database queries and response times?",
"assistant_response": "Performance optimization in FastAPI apps typically involves several layers...",
"code_context": {
"file": "optimizations.py",
"language": "python",
"patterns_discussed": ["performance_optimization", "caching", "database_optimization", "async_patterns", "monitoring"]
},
"follow_up_questions": [
"How do I implement database connection pooling properly?",
"What's the best way to handle memory usage with large datasets?"
]
},
{
"id": "conv_005",
"timestamp": "2024-01-19T11:30:00Z",
"topic": "error handling and logging",
"user_query": "I need to implement proper error handling and logging across my Python application. What's the best approach for production-ready error management?",
"assistant_response": "Excellent question! Proper error handling and logging are crucial for production applications...",
"code_context": {
"file": "error_handling.py",
"language": "python",
"patterns_discussed": ["error_handling", "logging", "exceptions", "middleware", "decorators", "fastapi"]
},
"follow_up_questions": [
"How do I integrate this with external monitoring tools like Sentry?",
"What's the best way to handle errors in background tasks?"
]
}
],
"metadata": {
"total_conversations": 5,
"date_range": "2024-01-15 to 2024-01-19",
"topics_covered": [
"async/await patterns",
"dataclass vs pydantic",
"testing patterns",
"performance optimization",
"error handling and logging"
],
"code_patterns_discussed": [
"asyncio", "aiohttp", "semaphores", "context_managers",
"pydantic", "fastapi", "type_hints", "validation",
"pytest", "async_testing", "fixtures", "mocking",
"performance_optimization", "caching", "database_optimization",
"error_handling", "logging", "exceptions", "middleware"
],
"difficulty_levels": {
"beginner": 1,
"intermediate": 2,
"advanced": 2
}
}
}

notebooks/data/guido_contributions.json
View file

@ -0,0 +1,976 @@
[
{
"type": "pr",
"repository": "mypy",
"title": "Re-work indirect dependencies",
"description": "Wow, this was quite a ride. Indirect dependencies were always supported kind of on best effort. This PR puts them on some principled foundation. It fixes three crashes and three stale types reported. All tests are quite weird/obscure, they are designed to expose the flaws in current logic (plus one test that passes on master, but it covers important corner case, so I add it just in case ). A short summary of various fixes (in arbitrary order):\r\n* Update many outdated comments and docstrings\r\n* Missing transitive dependency is now considered stale\r\n* Handle transitive generic bases in indirection visitor\r\n* Handle chained alias targets in indirection visitor\r\n* Always record original aliases during semantic analysis\r\n* Delete `qualified_tvars` as a concept, they are not needed since long ago\r\n* Remove ad-hoc handling for `TypeInfo`s from `build.py`\r\n* Support symbols with setter type different from getter type\r\n\r\nIn general the logic should be more simple/straightforward now:\r\n* Get all types in a file (need both symbol types _and_ expression types since some types may be only local)\r\n* For each type _transitively_ find all named types in them (thus aggregating all interfaces the type depends on)\r\n* In case any type was forced using `get_proper_type()`, record the orginal type alias during semantic analysis\r\n\r\nNote since this makes the algorithm correct, it may also make it slower (most notably because we must visit generic bases). I tried to offset this by couple optimizations, hopefully performance impact will be minimal.",
"url": "https://github.com/python/mypy/pull/19798",
"date": "2025-09-05T13:54:52Z",
"sha_or_number": "19798",
"files_changed": [
"mypy/build.py",
"mypy/fixup.py",
"mypy/indirection.py",
"mypy/nodes.py",
"mypy/semanal.py",
"mypy/server/deps.py",
"mypy/test/typefixture.py",
"mypy/typeanal.py",
"test-data/unit/check-incremental.test"
],
"additions": 0,
"deletions": 0,
"labels": [],
"related_issues": [],
"code_samples": [
{
"file_path": "mypy/build.py",
"language": "python",
"before_code": "from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort\nfrom mypy.indirection import TypeIndirectionVisitor\nfrom mypy.messages import MessageBuilder\nfrom mypy.nodes import Import, ImportAll, ImportBase, ImportFrom, MypyFile, SymbolTable, TypeInfo\nfrom mypy.partially_defined import PossiblyUndefinedVariableVisitor\nfrom mypy.semanal import SemanticAnalyzer\nfrom mypy.semanal_pass1 import SemanticAnalyzerPreAnalysis",
"after_code": "from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort\nfrom mypy.indirection import TypeIndirectionVisitor\nfrom mypy.messages import MessageBuilder\nfrom mypy.nodes import (\n Decorator,\n Import,\n ImportAll,\n ImportBase,\n ImportFrom,\n MypyFile,\n OverloadedFuncDef,\n SymbolTable,\n)\nfrom mypy.partially_defined import PossiblyUndefinedVariableVisitor\nfrom mypy.semanal import SemanticAnalyzer\nfrom mypy.semanal_pass1 import SemanticAnalyzerPreAnalysis",
"diff_context": "from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort\nfrom mypy.indirection import TypeIndirectionVisitor\nfrom mypy.messages import MessageBuilder\nfrom mypy.nodes import Import, ImportAll, ImportBase, ImportFrom, MypyFile, SymbolTable, TypeInfo\nfrom mypy.nodes import (\n Decorator,\n Import,\n ImportAll,\n ImportBase,\n ImportFrom,\n MypyFile,\n OverloadedFuncDef,\n SymbolTable,\n)\nfrom mypy.partially_defined import PossiblyUndefinedVariableVisitor\nfrom mypy.semanal import SemanticAnalyzer\nfrom mypy.semanal_pass1 import SemanticAnalyzerPreAnalysis",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "mypy/build.py",
"language": "python",
"before_code": "\nFor single nodes, processing is simple. If the node was cached, we\ndeserialize the cache data and fix up cross-references. Otherwise, we\ndo semantic analysis followed by type checking. We also handle (c)\nabove; if a module has valid cache data *but* any of its\ndependencies was processed from source, then the module should be\nprocessed from source.\n\nA relatively simple optimization (outside SCCs) we might do in the\nfuture is as follows: if a node's cache data is valid, but one or more\nof its dependencies are out of date so we have to re-parse the node\nfrom source, once we have fully type-checked the node, we can decide\nwhether its symbol table actually changed compared to the cache data\n(by reading the cache data and comparing it to the data we would be\nwriting). If there is no change we can declare the node up to date,\nand any node that depends (and for which we have cached data, and\nwhose other dependencies are up to date) on it won't need to be\nre-parsed from source.\n\nImport cycles\n-------------\n\nFinally we have to decide how to handle (c), import cycles. Here\nwe'll need a modified version of the original state machine\n(build.py), but we only need to do this per SCC, and we won't have to\ndeal with changes to the list of nodes while we're processing it.",
"after_code": "\nFor single nodes, processing is simple. If the node was cached, we\ndeserialize the cache data and fix up cross-references. Otherwise, we\ndo semantic analysis followed by type checking. Once we (re-)processed\nan SCC we check whether its interface (symbol table) is still fresh\n(matches previous cached value). If it is not, we consider dependent SCCs\nstale so that they need to be re-parsed as well.\n\nNote on indirect dependencies: normally dependencies are determined from\nimports, but since our type interfaces are \"opaque\" (i.e. symbol tables can\ncontain types identified by name), these are not enough. We *must* also\nadd \"indirect\" dependencies from types to their definitions. For this\npurpose, after we finished processing a module, we travers its type map and\nsymbol tables, and for each type we find (transitively) on which opaque/named\ntypes it depends.\n\nImport cycles\n-------------\n\nFinally we have to decide how to handle (b), import cycles. Here\nwe'll need a modified version of the original state machine\n(build.py), but we only need to do this per SCC, and we won't have to\ndeal with changes to the list of nodes while we're processing it.",
"diff_context": "\nFor single nodes, processing is simple. If the node was cached, we\ndeserialize the cache data and fix up cross-references. Otherwise, we\ndo semantic analysis followed by type checking. We also handle (c)\nabove; if a module has valid cache data *but* any of its\ndependencies was processed from source, then the module should be\nprocessed from source.\n\nA relatively simple optimization (outside SCCs) we might do in the\nfuture is as follows: if a node's cache data is valid, but one or more\nof its dependencies are out of date so we have to re-parse the node\nfrom source, once we have fully type-checked the node, we can decide\nwhether its symbol table actually changed compared to the cache data\n(by reading the cache data and comparing it to the data we would be\nwriting). If there is no change we can declare the node up to date,\nand any node that depends (and for which we have cached data, and\nwhose other dependencies are up to date) on it won't need to be\nre-parsed from source.\ndo semantic analysis followed by type checking. Once we (re-)processed\nan SCC we check whether its interface (symbol table) is still fresh\n(matches previous cached value). If it is not, we consider dependent SCCs\nstale so that they need to be re-parsed as well.\n\nNote on indirect dependencies: normally dependencies are determined from\nimports, but since our type interfaces are \"opaque\" (i.e. symbol tables can\ncontain types identified by name), these are not enough. We *must* also\nadd \"indirect\" dependencies from types to their definitions. For this\npurpose, after we finished processing a module, we travers its type map and\nsymbol tables, and for each type we find (transitively) on which opaque/named\ntypes it depends.\n\nImport cycles\n-------------\n\nFinally we have to decide how to handle (c), import cycles. Here\nFinally we have to decide how to handle (b), import cycles. Here\nwe'll need a modified version of the original state machine\n(build.py), but we only need to do this per SCC, and we won't have to\ndeal with changes to the list of nodes while we're processing it.",
"change_type": "modification",
"lines_of_context": 10,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": [
"generator_expression"
]
},
{
"file_path": "mypy/build.py",
"language": "python",
"before_code": "\n # We should always patch indirect dependencies, even in full (non-incremental) builds,\n # because the cache still may be written, and it must be correct.\n # TODO: find a more robust way to traverse *all* relevant types?\n all_types = list(self.type_map().values())\n for _, sym, _ in self.tree.local_definitions():\n if sym.type is not None:\n all_types.append(sym.type)\n if isinstance(sym.node, TypeInfo):\n # TypeInfo symbols have some extra relevant types.\n all_types.extend(sym.node.bases)\n if sym.node.metaclass_type:\n all_types.append(sym.node.metaclass_type)\n if sym.node.typeddict_type:\n all_types.append(sym.node.typeddict_type)\n if sym.node.tuple_type:\n all_types.append(sym.node.tuple_type)\n self._patch_indirect_dependencies(self.type_checker().module_refs, all_types)\n\n if self.options.dump_inference_stats:\n dump_type_stats(",
"after_code": "\n # We should always patch indirect dependencies, even in full (non-incremental) builds,\n # because the cache still may be written, and it must be correct.\n all_types = set(self.type_map().values())\n for _, sym, _ in self.tree.local_definitions():\n if sym.type is not None:\n all_types.add(sym.type)\n # Special case: settable properties may have two types.\n if isinstance(sym.node, OverloadedFuncDef) and sym.node.is_property:\n assert isinstance(first_node := sym.node.items[0], Decorator)\n if first_node.var.setter_type:\n all_types.add(first_node.var.setter_type)\n # Using mod_alias_deps is unfortunate but needed, since it is highly impractical\n # (and practically impossible) to avoid all get_proper_type() calls. For example,\n # TypeInfo.bases and metaclass, *args and **kwargs, Overloaded.items, and trivial\n # aliases like Text = str, etc. all currently forced to proper types. Thus, we need\n # to record the original definitions as they are first seen in semanal.py.\n self._patch_indirect_dependencies(\n self.type_checker().module_refs | self.tree.mod_alias_deps, all_types\n )\n\n if self.options.dump_inference_stats:\n dump_type_stats(",
"diff_context": "\n # We should always patch indirect dependencies, even in full (non-incremental) builds,\n # because the cache still may be written, and it must be correct.\n # TODO: find a more robust way to traverse *all* relevant types?\n all_types = list(self.type_map().values())\n all_types = set(self.type_map().values())\n for _, sym, _ in self.tree.local_definitions():\n if sym.type is not None:\n all_types.append(sym.type)\n if isinstance(sym.node, TypeInfo):\n # TypeInfo symbols have some extra relevant types.\n all_types.extend(sym.node.bases)\n if sym.node.metaclass_type:\n all_types.append(sym.node.metaclass_type)\n if sym.node.typeddict_type:\n all_types.append(sym.node.typeddict_type)\n if sym.node.tuple_type:\n all_types.append(sym.node.tuple_type)\n self._patch_indirect_dependencies(self.type_checker().module_refs, all_types)\n all_types.add(sym.type)\n # Special case: settable properties may have two types.\n if isinstance(sym.node, OverloadedFuncDef) and sym.node.is_property:\n assert isinstance(first_node := sym.node.items[0], Decorator)\n if first_node.var.setter_type:\n all_types.add(first_node.var.setter_type)\n # Using mod_alias_deps is unfortunate but needed, since it is highly impractical\n # (and practically impossible) to avoid all get_proper_type() calls. For example,\n # TypeInfo.bases and metaclass, *args and **kwargs, Overloaded.items, and trivial\n # aliases like Text = str, etc. all currently forced to proper types. Thus, we need\n # to record the original definitions as they are first seen in semanal.py.\n self._patch_indirect_dependencies(\n self.type_checker().module_refs | self.tree.mod_alias_deps, all_types\n )\n\n if self.options.dump_inference_stats:\n dump_type_stats(",
"change_type": "modification",
"lines_of_context": 8,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": [
"generator_expression"
]
},
{
"file_path": "mypy/build.py",
"language": "python",
"before_code": " self._type_checker.reset()\n self._type_checker = None\n\n def _patch_indirect_dependencies(self, module_refs: set[str], types: list[Type]) -> None:\n assert None not in types\n valid = self.valid_references()\n",
"after_code": " self._type_checker.reset()\n self._type_checker = None\n\n def _patch_indirect_dependencies(self, module_refs: set[str], types: set[Type]) -> None:\n assert None not in types\n valid = self.valid_references()\n",
"diff_context": " self._type_checker.reset()\n self._type_checker = None\n\n def _patch_indirect_dependencies(self, module_refs: set[str], types: list[Type]) -> None:\n def _patch_indirect_dependencies(self, module_refs: set[str], types: set[Type]) -> None:\n assert None not in types\n valid = self.valid_references()\n",
"change_type": "modification",
"lines_of_context": 6,
"function_name": "_patch_indirect_dependencies",
"class_name": null,
"docstring": null,
"coding_patterns": [
"function_definition"
]
},
{
"file_path": "mypy/build.py",
"language": "python",
"before_code": " for id in scc:\n deps.update(graph[id].dependencies)\n deps -= ascc\n stale_deps = {id for id in deps if id in graph and not graph[id].is_interface_fresh()}\n fresh = fresh and not stale_deps\n undeps = set()\n if fresh:",
"after_code": " for id in scc:\n deps.update(graph[id].dependencies)\n deps -= ascc\n # Note: if a dependency is not in graph anymore, it should be considered interface-stale.\n # This is important to trigger any relevant updates from indirect dependencies that were\n # removed in load_graph().\n stale_deps = {id for id in deps if id not in graph or not graph[id].is_interface_fresh()}\n fresh = fresh and not stale_deps\n undeps = set()\n if fresh:",
"diff_context": " for id in scc:\n deps.update(graph[id].dependencies)\n deps -= ascc\n stale_deps = {id for id in deps if id in graph and not graph[id].is_interface_fresh()}\n # Note: if a dependency is not in graph anymore, it should be considered interface-stale.\n # This is important to trigger any relevant updates from indirect dependencies that were\n # removed in load_graph().\n stale_deps = {id for id in deps if id not in graph or not graph[id].is_interface_fresh()}\n fresh = fresh and not stale_deps\n undeps = set()\n if fresh:",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": [
"generator_expression"
]
},
{
"file_path": "mypy/indirection.py",
"language": "python",
"before_code": " def __init__(self) -> None:\n # Module references are collected here\n self.modules: set[str] = set()\n # User to avoid infinite recursion with recursive type aliases\n self.seen_aliases: set[types.TypeAliasType] = set()\n # Used to avoid redundant work\n self.seen_fullnames: set[str] = set()\n\n def find_modules(self, typs: Iterable[types.Type]) -> set[str]:\n self.modules = set()\n self.seen_fullnames = set()\n self.seen_aliases = set()\n for typ in typs:\n self._visit(typ)\n return self.modules\n\n def _visit(self, typ: types.Type) -> None:\n if isinstance(typ, types.TypeAliasType):\n # Avoid infinite recursion for recursive type aliases.\n if typ not in self.seen_aliases:\n self.seen_aliases.add(typ)\n typ.accept(self)\n\n def _visit_type_tuple(self, typs: tuple[types.Type, ...]) -> None:\n # Micro-optimization: Specialized version of _visit for lists\n for typ in typs:\n if isinstance(typ, types.TypeAliasType):\n # Avoid infinite recursion for recursive type aliases.\n if typ in self.seen_aliases:\n continue\n self.seen_aliases.add(typ)\n typ.accept(self)\n\n def _visit_type_list(self, typs: list[types.Type]) -> None:\n # Micro-optimization: Specialized version of _visit for tuples\n for typ in typs:\n if isinstance(typ, types.TypeAliasType):\n # Avoid infinite recursion for recursive type aliases.\n if typ in self.seen_aliases:\n continue\n self.seen_aliases.add(typ)\n typ.accept(self)\n\n def _visit_module_name(self, module_name: str) -> None:",
"after_code": " def __init__(self) -> None:\n # Module references are collected here\n self.modules: set[str] = set()\n # User to avoid infinite recursion with recursive types\n self.seen_types: set[types.TypeAliasType | types.Instance] = set()\n # Used to avoid redundant work\n self.seen_fullnames: set[str] = set()\n\n def find_modules(self, typs: Iterable[types.Type]) -> set[str]:\n self.modules = set()\n self.seen_fullnames = set()\n self.seen_types = set()\n for typ in typs:\n self._visit(typ)\n return self.modules\n\n def _visit(self, typ: types.Type) -> None:\n # Note: instances are needed for `class str(Sequence[str]): ...`\n if (\n isinstance(typ, types.TypeAliasType)\n or isinstance(typ, types.ProperType)\n and isinstance(typ, types.Instance)\n ):\n # Avoid infinite recursion for recursive types.\n if typ in self.seen_types:\n return\n self.seen_types.add(typ)\n typ.accept(self)\n\n def _visit_type_tuple(self, typs: tuple[types.Type, ...]) -> None:\n # Micro-optimization: Specialized version of _visit for lists\n for typ in typs:\n if (\n isinstance(typ, types.TypeAliasType)\n or isinstance(typ, types.ProperType)\n and isinstance(typ, types.Instance)\n ):\n # Avoid infinite recursion for recursive types.\n if typ in self.seen_types:\n continue\n self.seen_types.add(typ)\n typ.accept(self)\n\n def _visit_type_list(self, typs: list[types.Type]) -> None:\n # Micro-optimization: Specialized version of _visit for tuples\n for typ in typs:\n if (\n isinstance(typ, types.TypeAliasType)\n or isinstance(typ, types.ProperType)\n and isinstance(typ, types.Instance)\n ):\n # Avoid infinite recursion for recursive types.\n if typ in self.seen_types:\n continue\n self.seen_types.add(typ)\n typ.accept(self)\n\n def _visit_module_name(self, module_name: str) -> None:",
"diff_context": " def __init__(self) -> None:\n # Module references are collected here\n self.modules: set[str] = set()\n # User to avoid infinite recursion with recursive type aliases\n self.seen_aliases: set[types.TypeAliasType] = set()\n # User to avoid infinite recursion with recursive types\n self.seen_types: set[types.TypeAliasType | types.Instance] = set()\n # Used to avoid redundant work\n self.seen_fullnames: set[str] = set()\n\n def find_modules(self, typs: Iterable[types.Type]) -> set[str]:\n self.modules = set()\n self.seen_fullnames = set()\n self.seen_aliases = set()\n self.seen_types = set()\n for typ in typs:\n self._visit(typ)\n return self.modules\n\n def _visit(self, typ: types.Type) -> None:\n if isinstance(typ, types.TypeAliasType):\n # Avoid infinite recursion for recursive type aliases.\n if typ not in self.seen_aliases:\n self.seen_aliases.add(typ)\n # Note: instances are needed for `class str(Sequence[str]): ...`\n if (\n isinstance(typ, types.TypeAliasType)\n or isinstance(typ, types.ProperType)\n and isinstance(typ, types.Instance)\n ):\n # Avoid infinite recursion for recursive types.\n if typ in self.seen_types:\n return\n self.seen_types.add(typ)\n typ.accept(self)\n\n def _visit_type_tuple(self, typs: tuple[types.Type, ...]) -> None:\n # Micro-optimization: Specialized version of _visit for lists\n for typ in typs:\n if isinstance(typ, types.TypeAliasType):\n # Avoid infinite recursion for recursive type aliases.\n if typ in self.seen_aliases:\n if (\n isinstance(typ, types.TypeAliasType)\n or isinstance(typ, types.ProperType)\n and isinstance(typ, types.Instance)\n ):\n # Avoid infinite recursion for recursive types.\n if typ in self.seen_types:\n continue\n self.seen_aliases.add(typ)\n self.seen_types.add(typ)\n typ.accept(self)\n\n def _visit_type_list(self, typs: list[types.Type]) -> None:\n # Micro-optimization: Specialized version of _visit for tuples\n for typ in typs:\n if isinstance(typ, types.TypeAliasType):\n # Avoid infinite recursion for recursive type aliases.\n if typ in self.seen_aliases:\n if (\n isinstance(typ, types.TypeAliasType)\n or isinstance(typ, types.ProperType)\n and isinstance(typ, types.Instance)\n ):\n # Avoid infinite recursion for recursive types.\n if typ in self.seen_types:\n continue\n self.seen_aliases.add(typ)\n self.seen_types.add(typ)\n typ.accept(self)\n\n def _visit_module_name(self, module_name: str) -> None:",
"change_type": "modification",
"lines_of_context": 29,
"function_name": "_visit_module_name",
"class_name": null,
"docstring": null,
"coding_patterns": [
"generator_expression",
"context_manager",
"class_definition",
"type_hint"
]
},
{
"file_path": "mypy/indirection.py",
"language": "python",
"before_code": " self._visit_type_list(t.arg_types)\n\n def visit_instance(self, t: types.Instance) -> None:\n self._visit_type_tuple(t.args)\n if t.type:\n # Uses of a class depend on everything in the MRO,\n # as changes to classes in the MRO can add types to methods,\n # change property types, change the MRO itself, etc.\n for s in t.type.mro:\n self._visit_module_name(s.module_name)\n if t.type.metaclass_type is not None:\n self._visit_module_name(t.type.metaclass_type.type.module_name)\n\n def visit_callable_type(self, t: types.CallableType) -> None:\n self._visit_type_list(t.arg_types)",
"after_code": " self._visit_type_list(t.arg_types)\n\n def visit_instance(self, t: types.Instance) -> None:\n # Instance is named, record its definition and continue digging into\n # components that constitute semantic meaning of this type: bases, metaclass,\n # tuple type, and typeddict type.\n # Note: we cannot simply record the MRO, in case an intermediate base contains\n # a reference to type alias, this affects meaning of map_instance_to_supertype(),\n # see e.g. testDoubleReexportGenericUpdated.\n self._visit_type_tuple(t.args)\n if t.type:\n # Important optimization: instead of simply recording the definition and\n # recursing into bases, record the MRO and only traverse generic bases.\n for s in t.type.mro:\n self._visit_module_name(s.module_name)\n for base in s.bases:\n if base.args:\n self._visit_type_tuple(base.args)\n if t.type.metaclass_type:\n self._visit(t.type.metaclass_type)\n if t.type.typeddict_type:\n self._visit(t.type.typeddict_type)\n if t.type.tuple_type:\n self._visit(t.type.tuple_type)\n\n def visit_callable_type(self, t: types.CallableType) -> None:\n self._visit_type_list(t.arg_types)",
"diff_context": " self._visit_type_list(t.arg_types)\n\n def visit_instance(self, t: types.Instance) -> None:\n # Instance is named, record its definition and continue digging into\n # components that constitute semantic meaning of this type: bases, metaclass,\n # tuple type, and typeddict type.\n # Note: we cannot simply record the MRO, in case an intermediate base contains\n # a reference to type alias, this affects meaning of map_instance_to_supertype(),\n # see e.g. testDoubleReexportGenericUpdated.\n self._visit_type_tuple(t.args)\n if t.type:\n # Uses of a class depend on everything in the MRO,\n # as changes to classes in the MRO can add types to methods,\n # change property types, change the MRO itself, etc.\n # Important optimization: instead of simply recording the definition and\n # recursing into bases, record the MRO and only traverse generic bases.\n for s in t.type.mro:\n self._visit_module_name(s.module_name)\n if t.type.metaclass_type is not None:\n self._visit_module_name(t.type.metaclass_type.type.module_name)\n for base in s.bases:\n if base.args:\n self._visit_type_tuple(base.args)\n if t.type.metaclass_type:\n self._visit(t.type.metaclass_type)\n if t.type.typeddict_type:\n self._visit(t.type.typeddict_type)\n if t.type.tuple_type:\n self._visit(t.type.tuple_type)\n\n def visit_callable_type(self, t: types.CallableType) -> None:\n self._visit_type_list(t.arg_types)",
"change_type": "modification",
"lines_of_context": 10,
"function_name": "visit_callable_type",
"class_name": null,
"docstring": null,
"coding_patterns": [
"generator_expression"
]
},
{
"file_path": "mypy/indirection.py",
"language": "python",
"before_code": " self.seen_fullnames.add(fullname)\n\n def visit_overloaded(self, t: types.Overloaded) -> None:\n self._visit_type_list(list(t.items))\n self._visit(t.fallback)\n\n def visit_tuple_type(self, t: types.TupleType) -> None:",
"after_code": " self.seen_fullnames.add(fullname)\n\n def visit_overloaded(self, t: types.Overloaded) -> None:\n for item in t.items:\n self._visit(item)\n self._visit(t.fallback)\n\n def visit_tuple_type(self, t: types.TupleType) -> None:",
"diff_context": " self.seen_fullnames.add(fullname)\n\n def visit_overloaded(self, t: types.Overloaded) -> None:\n self._visit_type_list(list(t.items))\n for item in t.items:\n self._visit(item)\n self._visit(t.fallback)\n\n def visit_tuple_type(self, t: types.TupleType) -> None:",
"change_type": "modification",
"lines_of_context": 6,
"function_name": "visit_tuple_type",
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "mypy/indirection.py",
"language": "python",
"before_code": " self._visit(t.item)\n\n def visit_type_alias_type(self, t: types.TypeAliasType) -> None:\n self._visit(types.get_proper_type(t))",
"after_code": " self._visit(t.item)\n\n def visit_type_alias_type(self, t: types.TypeAliasType) -> None:\n # Type alias is named, record its definition and continue digging into\n # components that constitute semantic meaning of this type: target and args.\n if t.alias:\n self._visit_module_name(t.alias.module)\n self._visit(t.alias.target)\n self._visit_type_list(t.args)",
"diff_context": " self._visit(t.item)\n\n def visit_type_alias_type(self, t: types.TypeAliasType) -> None:\n self._visit(types.get_proper_type(t))\n # Type alias is named, record its definition and continue digging into\n # components that constitute semantic meaning of this type: target and args.\n if t.alias:\n self._visit_module_name(t.alias.module)\n self._visit(t.alias.target)\n self._visit_type_list(t.args)",
"change_type": "modification",
"lines_of_context": 3,
"function_name": "visit_type_alias_type",
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "mypy/nodes.py",
"language": "python",
"before_code": " defs: list[Statement]\n # Type alias dependencies as mapping from target to set of alias full names\n alias_deps: defaultdict[str, set[str]]\n # Is there a UTF-8 BOM at the start?\n is_bom: bool\n names: SymbolTable",
"after_code": " defs: list[Statement]\n # Type alias dependencies as mapping from target to set of alias full names\n alias_deps: defaultdict[str, set[str]]\n # Same as above but for coarse-grained dependencies (i.e. modules instead of full names)\n mod_alias_deps: set[str]\n # Is there a UTF-8 BOM at the start?\n is_bom: bool\n names: SymbolTable",
"diff_context": " defs: list[Statement]\n # Type alias dependencies as mapping from target to set of alias full names\n alias_deps: defaultdict[str, set[str]]\n # Same as above but for coarse-grained dependencies (i.e. modules instead of full names)\n mod_alias_deps: set[str]\n # Is there a UTF-8 BOM at the start?\n is_bom: bool\n names: SymbolTable",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": [
"type_hint"
]
},
{
"file_path": "mypy/nodes.py",
"language": "python",
"before_code": " target: The target type. For generic aliases contains bound type variables\n as nested types (currently TypeVar and ParamSpec are supported).\n _fullname: Qualified name of this type alias. This is used in particular\n to track fine grained dependencies from aliases.\n alias_tvars: Type variables used to define this alias.\n normalized: Used to distinguish between `A = List`, and `A = list`. Both\n are internally stored using `builtins.list` (because `typing.List` is",
"after_code": " target: The target type. For generic aliases contains bound type variables\n as nested types (currently TypeVar and ParamSpec are supported).\n _fullname: Qualified name of this type alias. This is used in particular\n to track fine-grained dependencies from aliases.\n module: Module where the alias was defined.\n alias_tvars: Type variables used to define this alias.\n normalized: Used to distinguish between `A = List`, and `A = list`. Both\n are internally stored using `builtins.list` (because `typing.List` is",
"diff_context": " target: The target type. For generic aliases contains bound type variables\n as nested types (currently TypeVar and ParamSpec are supported).\n _fullname: Qualified name of this type alias. This is used in particular\n to track fine grained dependencies from aliases.\n to track fine-grained dependencies from aliases.\n module: Module where the alias was defined.\n alias_tvars: Type variables used to define this alias.\n normalized: Used to distinguish between `A = List`, and `A = list`. Both\n are internally stored using `builtins.list` (because `typing.List` is",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "mypy/semanal.py",
"language": "python",
"before_code": " declared_type_vars: TypeVarLikeList | None = None,\n all_declared_type_params_names: list[str] | None = None,\n python_3_12_type_alias: bool = False,\n ) -> tuple[Type | None, list[TypeVarLikeType], set[str], list[str], bool]:\n \"\"\"Check if 'rvalue' is a valid type allowed for aliasing (e.g. not a type variable).\n\n If yes, return the corresponding type, a list of\n qualified type variable names for generic aliases, a set of names the alias depends on,\n and a list of type variables if the alias is generic.\n A schematic example for the dependencies:\n A = int\n B = str\n analyze_alias(Dict[A, B])[2] == {'__main__.A', '__main__.B'}\n \"\"\"\n dynamic = bool(self.function_stack and self.function_stack[-1].is_dynamic())\n global_scope = not self.type and not self.function_stack",
"after_code": " declared_type_vars: TypeVarLikeList | None = None,\n all_declared_type_params_names: list[str] | None = None,\n python_3_12_type_alias: bool = False,\n ) -> tuple[Type | None, list[TypeVarLikeType], set[tuple[str, str]], bool]:\n \"\"\"Check if 'rvalue' is a valid type allowed for aliasing (e.g. not a type variable).\n\n If yes, return the corresponding type, a list of type variables for generic aliases,\n a set of names the alias depends on, and True if the original type has empty tuple index.\n An example for the dependencies:\n A = int\n B = str\n analyze_alias(dict[A, B])[2] == {('mod', 'mod.A'), ('mod', 'mod.B')}\n \"\"\"\n dynamic = bool(self.function_stack and self.function_stack[-1].is_dynamic())\n global_scope = not self.type and not self.function_stack",
"diff_context": " declared_type_vars: TypeVarLikeList | None = None,\n all_declared_type_params_names: list[str] | None = None,\n python_3_12_type_alias: bool = False,\n ) -> tuple[Type | None, list[TypeVarLikeType], set[str], list[str], bool]:\n ) -> tuple[Type | None, list[TypeVarLikeType], set[tuple[str, str]], bool]:\n \"\"\"Check if 'rvalue' is a valid type allowed for aliasing (e.g. not a type variable).\n\n If yes, return the corresponding type, a list of\n qualified type variable names for generic aliases, a set of names the alias depends on,\n and a list of type variables if the alias is generic.\n A schematic example for the dependencies:\n If yes, return the corresponding type, a list of type variables for generic aliases,\n a set of names the alias depends on, and True if the original type has empty tuple index.\n An example for the dependencies:\n A = int\n B = str\n analyze_alias(Dict[A, B])[2] == {'__main__.A', '__main__.B'}\n analyze_alias(dict[A, B])[2] == {('mod', 'mod.A'), ('mod', 'mod.B')}\n \"\"\"\n dynamic = bool(self.function_stack and self.function_stack[-1].is_dynamic())\n global_scope = not self.type and not self.function_stack",
"change_type": "modification",
"lines_of_context": 10,
"function_name": null,
"class_name": null,
"docstring": "\"\"\"Check if 'rvalue' is a valid type allowed for aliasing (e.g. not a type variable).",
"coding_patterns": [
"list_comprehension"
]
},
{
"file_path": "mypy/semanal.py",
"language": "python",
"before_code": " self.cur_mod_node.plugin_deps.setdefault(trigger, set()).add(target)\n\n def add_type_alias_deps(\n self, aliases_used: Collection[str], target: str | None = None\n ) -> None:\n \"\"\"Add full names of type aliases on which the current node depends.\n\n This is used by fine-grained incremental mode to re-check the corresponding nodes.\n If `target` is None, then the target node used will be the current scope.\n \"\"\"\n if not aliases_used:\n # A basic optimization to avoid adding targets with no dependencies to\n # the `alias_deps` dict.\n return\n if target is None:\n target = self.scope.current_target()\n self.cur_mod_node.alias_deps[target].update(aliases_used)\n\n def is_mangled_global(self, name: str) -> bool:\n # A global is mangled if there exists at least one renamed variant.",
"after_code": " self.cur_mod_node.plugin_deps.setdefault(trigger, set()).add(target)\n\n def add_type_alias_deps(\n self, aliases_used: Collection[tuple[str, str]], target: str | None = None\n ) -> None:\n \"\"\"Add full names of type aliases on which the current node depends.\n\n This is used by fine-grained incremental mode to re-check the corresponding nodes.\n If `target` is None, then the target node used will be the current scope. For\n coarse-grained mode, add just the module names where aliases are defined.\n \"\"\"\n if not aliases_used:\n return\n if target is None:\n target = self.scope.current_target()\n for mod, fn in aliases_used:\n self.cur_mod_node.alias_deps[target].add(fn)\n self.cur_mod_node.mod_alias_deps.add(mod)\n\n def is_mangled_global(self, name: str) -> bool:\n # A global is mangled if there exists at least one renamed variant.",
"diff_context": " self.cur_mod_node.plugin_deps.setdefault(trigger, set()).add(target)\n\n def add_type_alias_deps(\n self, aliases_used: Collection[str], target: str | None = None\n self, aliases_used: Collection[tuple[str, str]], target: str | None = None\n ) -> None:\n \"\"\"Add full names of type aliases on which the current node depends.\n\n This is used by fine-grained incremental mode to re-check the corresponding nodes.\n If `target` is None, then the target node used will be the current scope.\n If `target` is None, then the target node used will be the current scope. For\n coarse-grained mode, add just the module names where aliases are defined.\n \"\"\"\n if not aliases_used:\n # A basic optimization to avoid adding targets with no dependencies to\n # the `alias_deps` dict.\n return\n if target is None:\n target = self.scope.current_target()\n self.cur_mod_node.alias_deps[target].update(aliases_used)\n for mod, fn in aliases_used:\n self.cur_mod_node.alias_deps[target].add(fn)\n self.cur_mod_node.mod_alias_deps.add(mod)\n\n def is_mangled_global(self, name: str) -> bool:\n # A global is mangled if there exists at least one renamed variant.",
"change_type": "modification",
"lines_of_context": 15,
"function_name": "is_mangled_global",
"class_name": null,
"docstring": "\"\"\"Add full names of type aliases on which the current node depends.",
"coding_patterns": [
"list_comprehension"
]
}
],
"commit_message_style": "concise_subject",
"python_version": null,
"pep_status": null
},
{
"type": "pr",
"repository": "mypy",
"title": "chore: add cline_docs/ to .gitignore",
"description": "Cline is a commonly used LLM tool which, under certain conditions, creates a cline_docs/ folder with task status and todo items etc\r\n\r\nThis folder is only helpful locally (unless we decide we want to add actual guidelines for Cline here, but thats outside the scope of this PR) so this PR adds it to .gitignore\r\n\r\n<!-- If this pull request fixes an issue, add \"Fixes #NNN\" with the issue number. -->\r\n\r\n<!--\r\nChecklist:\r\n- Read the [Contributing Guidelines](https://github.com/python/mypy/blob/master/CONTRIBUTING.md)\r\n- Add tests for all changed behaviour.\r\n- If you can't add a test, please explain why and how you verified your changes work.\r\n- Make sure CI passes.\r\n- Please do not force push to the PR once it has been reviewed.\r\n-->\r\n",
"url": "https://github.com/python/mypy/pull/19797",
"date": "2025-09-05T02:35:14Z",
"sha_or_number": "19797",
"files_changed": [
".gitignore"
],
"additions": 0,
"deletions": 0,
"labels": [],
"related_issues": [],
"code_samples": [],
"commit_message_style": "concise_subject",
"python_version": null,
"pep_status": null
},
{
"type": "pr",
"repository": "mypy",
"title": "[mypyc] Add type annotations to tests",
"description": "Missing type annotations can compromise test coverage. My eventual goal is to require annotations by default in all run tests.\r\n",
"url": "https://github.com/python/mypy/pull/19794",
"date": "2025-09-04T15:56:30Z",
"sha_or_number": "19794",
"files_changed": [
"mypyc/test-data/fixtures/ir.py",
"mypyc/test-data/fixtures/typing-full.pyi",
"mypyc/test-data/run-dunders.test",
"mypyc/test-data/run-singledispatch.test"
],
"additions": 0,
"deletions": 0,
"labels": [],
"related_issues": [],
"code_samples": [
{
"file_path": "mypyc/test-data/fixtures/ir.py",
"language": "python",
"before_code": " def __iadd__(self, value: Iterable[_T], /) -> List[_T]: ... # type: ignore[misc]\n def append(self, x: _T) -> None: pass\n def pop(self, i: int = -1) -> _T: pass\n def count(self, _T) -> int: pass\n def extend(self, l: Iterable[_T]) -> None: pass\n def insert(self, i: int, x: _T) -> None: pass\n def sort(self) -> None: pass",
"after_code": " def __iadd__(self, value: Iterable[_T], /) -> List[_T]: ... # type: ignore[misc]\n def append(self, x: _T) -> None: pass\n def pop(self, i: int = -1) -> _T: pass\n def count(self, x: _T) -> int: pass\n def extend(self, l: Iterable[_T]) -> None: pass\n def insert(self, i: int, x: _T) -> None: pass\n def sort(self) -> None: pass",
"diff_context": " def __iadd__(self, value: Iterable[_T], /) -> List[_T]: ... # type: ignore[misc]\n def append(self, x: _T) -> None: pass\n def pop(self, i: int = -1) -> _T: pass\n def count(self, _T) -> int: pass\n def count(self, x: _T) -> int: pass\n def extend(self, l: Iterable[_T]) -> None: pass\n def insert(self, i: int, x: _T) -> None: pass\n def sort(self) -> None: pass",
"change_type": "modification",
"lines_of_context": 6,
"function_name": "sort",
"class_name": null,
"docstring": null,
"coding_patterns": [
"function_definition",
"type_hint"
]
},
{
"file_path": "mypyc/test-data/fixtures/ir.py",
"language": "python",
"before_code": "def id(o: object) -> int: pass\n# This type is obviously wrong but the test stubs don't have Sized anymore\ndef len(o: object) -> int: pass\ndef print(*object) -> None: pass\ndef isinstance(x: object, t: object) -> bool: pass\ndef iter(i: Iterable[_T]) -> Iterator[_T]: pass\n@overload",
"after_code": "def id(o: object) -> int: pass\n# This type is obviously wrong but the test stubs don't have Sized anymore\ndef len(o: object) -> int: pass\ndef print(*args: object) -> None: pass\ndef isinstance(x: object, t: object) -> bool: pass\ndef iter(i: Iterable[_T]) -> Iterator[_T]: pass\n@overload",
"diff_context": "def id(o: object) -> int: pass\n# This type is obviously wrong but the test stubs don't have Sized anymore\ndef len(o: object) -> int: pass\ndef print(*object) -> None: pass\ndef print(*args: object) -> None: pass\ndef isinstance(x: object, t: object) -> bool: pass\ndef iter(i: Iterable[_T]) -> Iterator[_T]: pass\n@overload",
"change_type": "modification",
"lines_of_context": 6,
"function_name": "iter",
"class_name": null,
"docstring": null,
"coding_patterns": [
"function_definition",
"type_hint"
]
},
{
"file_path": "mypyc/test-data/fixtures/typing-full.pyi",
"language": "python",
"before_code": "class GenericMeta(type): pass\n\nclass _SpecialForm:\n def __getitem__(self, index): ...\nclass TypeVar:\n def __init__(self, name, *args, bound=None): ...\n def __or__(self, other): ...\n\ncast = 0\noverload = 0",
"after_code": "class GenericMeta(type): pass\n\nclass _SpecialForm:\n def __getitem__(self, index: Any) -> Any: ...\nclass TypeVar:\n def __init__(self, name: str, *args: Any, bound: Any = None): ...\n def __or__(self, other: Any) -> Any: ...\n\ncast = 0\noverload = 0",
"diff_context": "class GenericMeta(type): pass\n\nclass _SpecialForm:\n def __getitem__(self, index): ...\n def __getitem__(self, index: Any) -> Any: ...\nclass TypeVar:\n def __init__(self, name, *args, bound=None): ...\n def __or__(self, other): ...\n def __init__(self, name: str, *args: Any, bound: Any = None): ...\n def __or__(self, other: Any) -> Any: ...\n\ncast = 0\noverload = 0",
"change_type": "modification",
"lines_of_context": 7,
"function_name": "__or__",
"class_name": "TypeVar",
"docstring": null,
"coding_patterns": [
"function_definition",
"type_hint"
]
}
],
"commit_message_style": "concise_subject",
"python_version": null,
"pep_status": null
},
{
"type": "pr",
"repository": "mypy",
"title": "Check functions without annotations in mypyc tests",
"description": "c.f. https://github.com/python/mypy/pull/19217#discussion_r2314303410\r\n\r\nDisallowing functions without annotations (where not relevant to the tests) is probably a good idea, but this creates a large number of failures which would take some time to go through (many due to common issues, like untyped functions in the fixtures).\r\n\r\nAs a smaller step in the right direction, this sets `check_untyped_defs = True` for the `run-*` tests so that we at least check functions without annotations. ",
"url": "https://github.com/python/mypy/pull/19792",
"date": "2025-09-04T14:42:17Z",
"sha_or_number": "19792",
"files_changed": [
"mypyc/test-data/fixtures/ir.py",
"mypyc/test-data/run-classes.test",
"mypyc/test/test_run.py"
],
"additions": 0,
"deletions": 0,
"labels": [],
"related_issues": [],
"code_samples": [
{
"file_path": "mypyc/test-data/fixtures/ir.py",
"language": "python",
"before_code": "class type:\n def __init__(self, o: object) -> None: ...\n def __or__(self, o: object) -> Any: ...\n __name__ : str\n __annotations__: Dict[str, Any]\n",
"after_code": "class type:\n def __init__(self, o: object) -> None: ...\n def __or__(self, o: object) -> Any: ...\n def __new__(cls, *args: object) -> Any: ...\n __name__ : str\n __annotations__: Dict[str, Any]\n",
"diff_context": "class type:\n def __init__(self, o: object) -> None: ...\n def __or__(self, o: object) -> Any: ...\n def __new__(cls, *args: object) -> Any: ...\n __name__ : str\n __annotations__: Dict[str, Any]\n",
"change_type": "modification",
"lines_of_context": 6,
"function_name": "__new__",
"class_name": "type",
"docstring": null,
"coding_patterns": [
"function_definition"
]
}
],
"commit_message_style": "concise_subject",
"python_version": null,
"pep_status": null
},
{
"type": "pr",
"repository": "mypy",
"title": "fix: Allow instantiation of type[None] in analyze_type_type_callee",
"description": "<!-- If this pull request fixes an issue, add \"Fixes #NNN\" with the issue number. -->\r\n\r\n(Explain how this PR changes mypy.)\r\n\r\n<!--\r\nChecklist:\r\n- Read the [Contributing Guidelines](https://github.com/python/mypy/blob/master/CONTRIBUTING.md)\r\n- Add tests for all changed behaviour.\r\n- If you can't add a test, please explain why and how you verified your changes work.\r\n- Make sure CI passes.\r\n- Please do not force push to the PR once it has been reviewed.\r\n-->\r\n\r\nFixes #19660\r\n\r\nAllow instantiation of NoneType in type checker\r\n\r\nThis change fixes the error \"Cannot instantiate type 'Type[None]'\"\r\nwhen calling NoneType() or type(None)().\r\n\r\nBy treating NoneType as a callable that returns None, mypy can now correctly\r\nhandle such calls without raising spurious errors.\r\n\r\nAlso, I added test case testTypeUsingTypeCNoneType covering:\r\n- direct calls to type(None)() and NoneType()\r\n- functions accepting type[None] and type[NoneType] parameters and invoking them\r\n\r\nThis ensures proper handling of NoneType instantiation and prevents spurious errors.",
"url": "https://github.com/python/mypy/pull/19782",
"date": "2025-09-02T06:13:12Z",
"sha_or_number": "19782",
"files_changed": [
"mypy/checkexpr.py",
"test-data/unit/check-classes.test"
],
"additions": 0,
"deletions": 0,
"labels": [],
"related_issues": [
"19660"
],
"code_samples": [
{
"file_path": "mypy/checkexpr.py",
"language": "python",
"before_code": " return self.analyze_type_type_callee(tuple_fallback(item), context)\n if isinstance(item, TypedDictType):\n return self.typeddict_callable_from_context(item)\n\n self.msg.unsupported_type_type(item, context)\n return AnyType(TypeOfAny.from_error)",
"after_code": " return self.analyze_type_type_callee(tuple_fallback(item), context)\n if isinstance(item, TypedDictType):\n return self.typeddict_callable_from_context(item)\n if isinstance(item, NoneType):\n # NoneType() returns None, so treat it as a callable that returns None\n return CallableType(\n arg_types=[],\n arg_kinds=[],\n arg_names=[],\n ret_type=NoneType(),\n fallback=self.named_type(\"builtins.function\"),\n name=None,\n from_type_type=True,\n )\n\n self.msg.unsupported_type_type(item, context)\n return AnyType(TypeOfAny.from_error)",
"diff_context": " return self.analyze_type_type_callee(tuple_fallback(item), context)\n if isinstance(item, TypedDictType):\n return self.typeddict_callable_from_context(item)\n if isinstance(item, NoneType):\n # NoneType() returns None, so treat it as a callable that returns None\n return CallableType(\n arg_types=[],\n arg_kinds=[],\n arg_names=[],\n ret_type=NoneType(),\n fallback=self.named_type(\"builtins.function\"),\n name=None,\n from_type_type=True,\n )\n\n self.msg.unsupported_type_type(item, context)\n return AnyType(TypeOfAny.from_error)",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
}
],
"commit_message_style": "standard",
"python_version": null,
"pep_status": null
},
{
"type": "pr",
"repository": "mypy",
"title": "feat: new mypyc primitives for weakref.proxy",
"description": "This PR adds 2 new weakref primitives for weakref.proxy (1 and 2 arg)\r\n\r\nThe C code generates correctly, but I'm not entirely sure why this test is failing. The weakly-proxied object is being destroyed too early, while there should still be a strong reference to it. It also fails if we use the builtin weakref.proxy, so I believe this might be exposing a reference counting bug unrelated to this PR.\r\n\r\n<!--\r\nChecklist:\r\n- Read the [Contributing Guidelines](https://github.com/python/mypy/blob/master/CONTRIBUTING.md)\r\n- Add tests for all changed behaviour.\r\n- If you can't add a test, please explain why and how you verified your changes work.\r\n- Make sure CI passes.\r\n- Please do not force push to the PR once it has been reviewed.\r\n-->\r\n",
"url": "https://github.com/python/mypy/pull/19217",
"date": "2025-06-03T17:02:26Z",
"sha_or_number": "19217",
"files_changed": [
"mypyc/primitives/weakref_ops.py",
"mypyc/test-data/fixtures/ir.py",
"mypyc/test-data/irbuild-weakref.test",
"mypyc/test-data/run-weakref.test",
"test-data/unit/lib-stub/_weakref.pyi",
"test-data/unit/lib-stub/weakref.pyi"
],
"additions": 0,
"deletions": 0,
"labels": [],
"related_issues": [],
"code_samples": [
{
"file_path": "mypyc/test-data/fixtures/ir.py",
"language": "python",
"before_code": "class UnicodeEncodeError(RuntimeError): pass\nclass UnicodeDecodeError(RuntimeError): pass\nclass NotImplementedError(RuntimeError): pass\n\nclass StopIteration(Exception):\n value: Any",
"after_code": "class UnicodeEncodeError(RuntimeError): pass\nclass UnicodeDecodeError(RuntimeError): pass\nclass NotImplementedError(RuntimeError): pass\nclass ReferenceError(Exception): pass\n\nclass StopIteration(Exception):\n value: Any",
"diff_context": "class UnicodeEncodeError(RuntimeError): pass\nclass UnicodeDecodeError(RuntimeError): pass\nclass NotImplementedError(RuntimeError): pass\nclass ReferenceError(Exception): pass\n\nclass StopIteration(Exception):\n value: Any",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": "StopIteration",
"docstring": null,
"coding_patterns": [
"class_definition",
"type_hint"
]
},
{
"file_path": "test-data/unit/lib-stub/_weakref.pyi",
"language": "python",
"before_code": "",
"after_code": "from typing import Any, Callable, TypeVar, overload\nfrom weakref import CallableProxyType, ProxyType\n\n_C = TypeVar(\"_C\", bound=Callable[..., Any])\n_T = TypeVar(\"_T\")\n\n# Return CallableProxyType if object is callable, ProxyType otherwise\n@overload\ndef proxy(object: _C, callback: Callable[[CallableProxyType[_C]], Any] | None = None, /) -> CallableProxyType[_C]: ...\n@overload\ndef proxy(object: _T, callback: Callable[[ProxyType[_T]], Any] | None = None, /) -> ProxyType[_T]: ...",
"diff_context": "from typing import Any, Callable, TypeVar, overload\nfrom weakref import CallableProxyType, ProxyType\n\n_C = TypeVar(\"_C\", bound=Callable[..., Any])\n_T = TypeVar(\"_T\")\n\n# Return CallableProxyType if object is callable, ProxyType otherwise\n@overload\ndef proxy(object: _C, callback: Callable[[CallableProxyType[_C]], Any] | None = None, /) -> CallableProxyType[_C]: ...\n@overload\ndef proxy(object: _T, callback: Callable[[ProxyType[_T]], Any] | None = None, /) -> ProxyType[_T]: ...",
"change_type": "addition",
"lines_of_context": 0,
"function_name": "proxy",
"class_name": null,
"docstring": null,
"coding_patterns": [
"decorator",
"function_definition"
]
}
],
"commit_message_style": "concise_subject",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "cpython",
"title": "gh-128307: Update what's new in 3.13 and 3.14 with create_task changes of asyncio (#134304)",
"description": "gh-128307: Update what's new in 3.13 and 3.14 with create_task changes of asyncio (#134304)\n\nCo-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>",
"url": "https://github.com/python/cpython/commit/28625d4f956f8d30671aba1daaac9735932983db",
"date": "2025-05-20T08:41:22Z",
"sha_or_number": "28625d4f956f8d30671aba1daaac9735932983db",
"files_changed": [
"Doc/whatsnew/3.13.rst",
"Doc/whatsnew/3.14.rst"
],
"additions": 34,
"deletions": 0,
"labels": [],
"related_issues": [
"134304"
],
"code_samples": [
{
"file_path": "Doc/whatsnew/3.13.rst",
"language": "restructuredtext",
"before_code": " never awaited).\n (Contributed by Arthur Tacca and Jason Zhang in :gh:`115957`.)\n\n\nbase64\n------",
"after_code": " never awaited).\n (Contributed by Arthur Tacca and Jason Zhang in :gh:`115957`.)\n\n* The function and methods named ``create_task`` have received a new\n ``**kwargs`` argument that is passed through to the task constructor.\n This change was accidentally added in 3.13.3,\n and broke the API contract for custom task factories.\n Several third-party task factories implemented workarounds for this.\n In 3.13.4 and later releases the old factory contract is honored\n once again (until 3.14).\n To keep the workarounds working, the extra ``**kwargs`` argument still\n allows passing additional keyword arguments to :class:`~asyncio.Task`\n and to custom task factories.\n\n This affects the following function and methods:\n :meth:`asyncio.create_task`,\n :meth:`asyncio.loop.create_task`,\n :meth:`asyncio.TaskGroup.create_task`.\n (Contributed by Thomas Grainger in :gh:`128307`.)\n\nbase64\n------",
"diff_context": " never awaited).\n (Contributed by Arthur Tacca and Jason Zhang in :gh:`115957`.)\n\n* The function and methods named ``create_task`` have received a new\n ``**kwargs`` argument that is passed through to the task constructor.\n This change was accidentally added in 3.13.3,\n and broke the API contract for custom task factories.\n Several third-party task factories implemented workarounds for this.\n In 3.13.4 and later releases the old factory contract is honored\n once again (until 3.14).\n To keep the workarounds working, the extra ``**kwargs`` argument still\n allows passing additional keyword arguments to :class:`~asyncio.Task`\n and to custom task factories.\n\n This affects the following function and methods:\n :meth:`asyncio.create_task`,\n :meth:`asyncio.loop.create_task`,\n :meth:`asyncio.TaskGroup.create_task`.\n (Contributed by Thomas Grainger in :gh:`128307`.)\n\nbase64\n------",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "Doc/whatsnew/3.14.rst",
"language": "restructuredtext",
"before_code": " (Contributed by Semyon Moroz in :gh:`133367`.)\n\n\nbdb\n---\n",
"after_code": " (Contributed by Semyon Moroz in :gh:`133367`.)\n\n\nasyncio\n-------\n\n* The function and methods named :func:`!create_task` now take an arbitrary\n list of keyword arguments. All keyword arguments are passed to the\n :class:`~asyncio.Task` constructor or the custom task factory.\n (See :meth:`~asyncio.loop.set_task_factory` for details.)\n The ``name`` and ``context`` keyword arguments are no longer special;\n the name should now be set using the ``name`` keyword argument of the factory,\n and ``context`` may be ``None``.\n\n This affects the following function and methods:\n :meth:`asyncio.create_task`,\n :meth:`asyncio.loop.create_task`,\n :meth:`asyncio.TaskGroup.create_task`.\n (Contributed by Thomas Grainger in :gh:`128307`.)\n\n\nbdb\n---\n",
"diff_context": " (Contributed by Semyon Moroz in :gh:`133367`.)\n\n\nasyncio\n-------\n\n* The function and methods named :func:`!create_task` now take an arbitrary\n list of keyword arguments. All keyword arguments are passed to the\n :class:`~asyncio.Task` constructor or the custom task factory.\n (See :meth:`~asyncio.loop.set_task_factory` for details.)\n The ``name`` and ``context`` keyword arguments are no longer special;\n the name should now be set using the ``name`` keyword argument of the factory,\n and ``context`` may be ``None``.\n\n This affects the following function and methods:\n :meth:`asyncio.create_task`,\n :meth:`asyncio.loop.create_task`,\n :meth:`asyncio.TaskGroup.create_task`.\n (Contributed by Thomas Grainger in :gh:`128307`.)\n\n\nbdb\n---\n",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
}
],
"commit_message_style": "references_issue; has_body",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "cpython",
"title": "Update CODEOWNERS (#126005)",
"description": "Update CODEOWNERS (#126005)",
"url": "https://github.com/python/cpython/commit/905eddceb2d61da9087f0d303aa7e4a405d2261a",
"date": "2024-10-26T15:24:51Z",
"sha_or_number": "905eddceb2d61da9087f0d303aa7e4a405d2261a",
"files_changed": [
".github/CODEOWNERS"
],
"additions": 2,
"deletions": 2,
"labels": [],
"related_issues": [
"126005"
],
"code_samples": [],
"commit_message_style": "concise_subject; imperative_mood; references_issue",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "cpython",
"title": "Withdraw most of my ownership in favor of Mark (#119611)",
"description": "Withdraw most of my ownership in favor of Mark (#119611)",
"url": "https://github.com/python/cpython/commit/3ff06ebec4e8b466f76078aa9c97cea2093d52ab",
"date": "2024-05-27T18:07:16Z",
"sha_or_number": "3ff06ebec4e8b466f76078aa9c97cea2093d52ab",
"files_changed": [
".github/CODEOWNERS"
],
"additions": 6,
"deletions": 6,
"labels": [],
"related_issues": [
"119611"
],
"code_samples": [],
"commit_message_style": "references_issue",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "cpython",
"title": "gh-117549: Don't use designated initializers in headers (#118580)",
"description": "gh-117549: Don't use designated initializers in headers (#118580)\n\nThe designated initializer syntax in static inline functions in pycore_backoff.h\r\ncauses problems for C++ or MSVC users who aren't yet using C++20.\r\nWhile internal, pycore_backoff.h is included (indirectly, via pycore_code.h)\r\nby some key 3rd party software that does so for speed.",
"url": "https://github.com/python/cpython/commit/40cc809902304f60c6e1c933191dd4d64e570e28",
"date": "2024-05-05T19:28:55Z",
"sha_or_number": "40cc809902304f60c6e1c933191dd4d64e570e28",
"files_changed": [
"Include/internal/pycore_backoff.h",
"Misc/NEWS.d/next/Core and Builtins/2024-05-05-12-04-02.gh-issue-117549.kITawD.rst"
],
"additions": 12,
"deletions": 2,
"labels": [],
"related_issues": [
"118580"
],
"code_samples": [
{
"file_path": "Misc/NEWS.d/next/Core and Builtins/2024-05-05-12-04-02.gh-issue-117549.kITawD.rst",
"language": "restructuredtext",
"before_code": "",
"after_code": "Don't use designated initializer syntax in inline functions in internal\nheaders. They cause problems for C++ or MSVC users who aren't yet using the\nlatest C++ standard (C++20). While internal, pycore_backoff.h, is included\n(indirectly, via pycore_code.h) by some key 3rd party software that does so\nfor speed.",
"diff_context": "Don't use designated initializer syntax in inline functions in internal\nheaders. They cause problems for C++ or MSVC users who aren't yet using the\nlatest C++ standard (C++20). While internal, pycore_backoff.h, is included\n(indirectly, via pycore_code.h) by some key 3rd party software that does so\nfor speed.",
"change_type": "addition",
"lines_of_context": 0,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
}
],
"commit_message_style": "references_issue; has_body",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "cpython",
"title": "gh-74929: Rudimentary docs for PEP 667 (#118581)",
"description": "gh-74929: Rudimentary docs for PEP 667 (#118581)\n\nThis is *not* sufficient for the final 3.13 release, but it will do for beta 1:\r\n\r\n- What's new entry\r\n- Updated changelog entry (news blurb)\r\n- Mention the proxy for f_globals in the datamodel and Python frame object docs\r\n\r\nThis doesn't have any C API details (what's new refers to the PEP).",
"url": "https://github.com/python/cpython/commit/9c13d9e37a194f574b8591da634bf98419786448",
"date": "2024-05-05T15:31:26Z",
"sha_or_number": "9c13d9e37a194f574b8591da634bf98419786448",
"files_changed": [
"Doc/c-api/frame.rst",
"Doc/reference/datamodel.rst",
"Doc/whatsnew/3.13.rst",
"Misc/NEWS.d/next/Core and Builtins/2024-04-27-21-44-40.gh-issue-74929.C2nESp.rst"
],
"additions": 22,
"deletions": 3,
"labels": [],
"related_issues": [
"118581"
],
"code_samples": [
{
"file_path": "Doc/c-api/frame.rst",
"language": "restructuredtext",
"before_code": "\n.. c:function:: PyObject* PyFrame_GetLocals(PyFrameObject *frame)\n\n Get the *frame*'s :attr:`~frame.f_locals` attribute (:class:`dict`).\n\n Return a :term:`strong reference`.\n\n .. versionadded:: 3.11\n\n\n.. c:function:: int PyFrame_GetLineNumber(PyFrameObject *frame)\n",
"after_code": "\n.. c:function:: PyObject* PyFrame_GetLocals(PyFrameObject *frame)\n\n Get the *frame*'s :attr:`~frame.f_locals` attribute.\n If the frame refers to a function or comprehension, this returns\n a write-through proxy object that allows modifying the locals.\n In all other cases (classes, modules) it returns the :class:`dict`\n representing the frame locals directly.\n\n Return a :term:`strong reference`.\n\n .. versionadded:: 3.11\n\n .. versionchanged:: 3.13\n Return a proxy object for functions and comprehensions.\n\n\n.. c:function:: int PyFrame_GetLineNumber(PyFrameObject *frame)\n",
"diff_context": "\n.. c:function:: PyObject* PyFrame_GetLocals(PyFrameObject *frame)\n\n Get the *frame*'s :attr:`~frame.f_locals` attribute (:class:`dict`).\n Get the *frame*'s :attr:`~frame.f_locals` attribute.\n If the frame refers to a function or comprehension, this returns\n a write-through proxy object that allows modifying the locals.\n In all other cases (classes, modules) it returns the :class:`dict`\n representing the frame locals directly.\n\n Return a :term:`strong reference`.\n\n .. versionadded:: 3.11\n\n .. versionchanged:: 3.13\n Return a proxy object for functions and comprehensions.\n\n\n.. c:function:: int PyFrame_GetLineNumber(PyFrameObject *frame)\n",
"change_type": "modification",
"lines_of_context": 11,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "Doc/reference/datamodel.rst",
"language": "restructuredtext",
"before_code": "\n * - .. attribute:: frame.f_locals\n - The dictionary used by the frame to look up\n :ref:`local variables <naming>`\n\n * - .. attribute:: frame.f_globals\n - The dictionary used by the frame to look up",
"after_code": "\n * - .. attribute:: frame.f_locals\n - The dictionary used by the frame to look up\n :ref:`local variables <naming>`.\n If the frame refers to a function or comprehension,\n this may return a write-through proxy object.\n\n .. versionchanged:: 3.13\n Return a proxy for functions and comprehensions.\n\n * - .. attribute:: frame.f_globals\n - The dictionary used by the frame to look up",
"diff_context": "\n * - .. attribute:: frame.f_locals\n - The dictionary used by the frame to look up\n :ref:`local variables <naming>`\n :ref:`local variables <naming>`.\n If the frame refers to a function or comprehension,\n this may return a write-through proxy object.\n\n .. versionchanged:: 3.13\n Return a proxy for functions and comprehensions.\n\n * - .. attribute:: frame.f_globals\n - The dictionary used by the frame to look up",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "Doc/whatsnew/3.13.rst",
"language": "restructuredtext",
"before_code": " Performance improvements are modest -- we expect to be improving this\n over the next few releases.\n\nNew typing features:\n\n* :pep:`696`: Type parameters (:data:`typing.TypeVar`, :data:`typing.ParamSpec`,",
"after_code": " Performance improvements are modest -- we expect to be improving this\n over the next few releases.\n\n* :pep:`667`: :attr:`FrameType.f_locals <frame.f_locals>` when used in\n a function now returns a write-through proxy to the frame's locals,\n rather than a ``dict``. See the PEP for corresponding C API changes\n and deprecations.\n\nNew typing features:\n\n* :pep:`696`: Type parameters (:data:`typing.TypeVar`, :data:`typing.ParamSpec`,",
"diff_context": " Performance improvements are modest -- we expect to be improving this\n over the next few releases.\n\n* :pep:`667`: :attr:`FrameType.f_locals <frame.f_locals>` when used in\n a function now returns a write-through proxy to the frame's locals,\n rather than a ``dict``. See the PEP for corresponding C API changes\n and deprecations.\n\nNew typing features:\n\n* :pep:`696`: Type parameters (:data:`typing.TypeVar`, :data:`typing.ParamSpec`,",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
}
],
"commit_message_style": "concise_subject; references_issue; has_body",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "cpython",
"title": "gh-118335: Rename --experimental-interpreter on Windows to --experimental-jit-interpreter (#118497)",
"description": "gh-118335: Rename --experimental-interpreter on Windows to --experimental-jit-interpreter (#118497)\n\nAlso fix docs for this in whatsnew.",
"url": "https://github.com/python/cpython/commit/a37b0932285b5e883b13a46ff2a32f15d7339894",
"date": "2024-05-02T00:48:34Z",
"sha_or_number": "a37b0932285b5e883b13a46ff2a32f15d7339894",
"files_changed": [
"Doc/whatsnew/3.13.rst",
"PCbuild/build.bat"
],
"additions": 5,
"deletions": 4,
"labels": [],
"related_issues": [
"118497"
],
"code_samples": [],
"commit_message_style": "references_issue; has_body",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "mypy",
"title": "Support TypeGuard (PEP 647) (#9865)",
"description": "Support TypeGuard (PEP 647) (#9865)\n\nPEP 647 is still in draft mode, but it is likely to be accepted, and this helps solve some real issues.",
"url": "https://github.com/python/mypy/commit/fffbe88fc54807c8b10ac40456522ad2faf8d350",
"date": "2021-01-18T18:13:36Z",
"sha_or_number": "fffbe88fc54807c8b10ac40456522ad2faf8d350",
"files_changed": [
"mypy/checker.py",
"mypy/checkexpr.py",
"mypy/constraints.py",
"mypy/expandtype.py",
"mypy/fixup.py",
"mypy/nodes.py",
"mypy/test/testcheck.py",
"mypy/typeanal.py",
"mypy/types.py",
"test-data/unit/check-python38.test",
"test-data/unit/check-serialize.test",
"test-data/unit/check-typeguard.test",
"test-data/unit/lib-stub/typing_extensions.pyi"
],
"additions": 408,
"deletions": 9,
"labels": [],
"related_issues": [
"9865"
],
"code_samples": [
{
"file_path": "mypy/checker.py",
"language": "python",
"before_code": " if literal(expr) == LITERAL_TYPE:\n vartype = type_map[expr]\n return self.conditional_callable_type_map(expr, vartype)\n elif isinstance(node, ComparisonExpr):\n # Step 1: Obtain the types of each operand and whether or not we can\n # narrow their types. (For example, we shouldn't try narrowing the",
"after_code": " if literal(expr) == LITERAL_TYPE:\n vartype = type_map[expr]\n return self.conditional_callable_type_map(expr, vartype)\n elif isinstance(node.callee, RefExpr):\n if node.callee.type_guard is not None:\n # TODO: Follow keyword args or *args, **kwargs\n if node.arg_kinds[0] != nodes.ARG_POS:\n self.fail(\"Type guard requires positional argument\", node)\n return {}, {}\n if literal(expr) == LITERAL_TYPE:\n return {expr: TypeGuardType(node.callee.type_guard)}, {}\n elif isinstance(node, ComparisonExpr):\n # Step 1: Obtain the types of each operand and whether or not we can\n # narrow their types. (For example, we shouldn't try narrowing the",
"diff_context": " if literal(expr) == LITERAL_TYPE:\n vartype = type_map[expr]\n return self.conditional_callable_type_map(expr, vartype)\n elif isinstance(node.callee, RefExpr):\n if node.callee.type_guard is not None:\n # TODO: Follow keyword args or *args, **kwargs\n if node.arg_kinds[0] != nodes.ARG_POS:\n self.fail(\"Type guard requires positional argument\", node)\n return {}, {}\n if literal(expr) == LITERAL_TYPE:\n return {expr: TypeGuardType(node.callee.type_guard)}, {}\n elif isinstance(node, ComparisonExpr):\n # Step 1: Obtain the types of each operand and whether or not we can\n # narrow their types. (For example, we shouldn't try narrowing the",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "mypy/checkexpr.py",
"language": "python",
"before_code": " ret_type=self.object_type(),\n fallback=self.named_type('builtins.function'))\n callee_type = get_proper_type(self.accept(e.callee, type_context, always_allow_any=True))\n if (self.chk.options.disallow_untyped_calls and\n self.chk.in_checked_function() and\n isinstance(callee_type, CallableType)",
"after_code": " ret_type=self.object_type(),\n fallback=self.named_type('builtins.function'))\n callee_type = get_proper_type(self.accept(e.callee, type_context, always_allow_any=True))\n if (isinstance(e.callee, RefExpr)\n and isinstance(callee_type, CallableType)\n and callee_type.type_guard is not None):\n # Cache it for find_isinstance_check()\n e.callee.type_guard = callee_type.type_guard\n if (self.chk.options.disallow_untyped_calls and\n self.chk.in_checked_function() and\n isinstance(callee_type, CallableType)",
"diff_context": " ret_type=self.object_type(),\n fallback=self.named_type('builtins.function'))\n callee_type = get_proper_type(self.accept(e.callee, type_context, always_allow_any=True))\n if (isinstance(e.callee, RefExpr)\n and isinstance(callee_type, CallableType)\n and callee_type.type_guard is not None):\n # Cache it for find_isinstance_check()\n e.callee.type_guard = callee_type.type_guard\n if (self.chk.options.disallow_untyped_calls and\n self.chk.in_checked_function() and\n isinstance(callee_type, CallableType)",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": [
"generator_expression"
]
},
{
"file_path": "mypy/checkexpr.py",
"language": "python",
"before_code": " \"\"\"\n if literal(expr) >= LITERAL_TYPE:\n restriction = self.chk.binder.get(expr)\n # If the current node is deferred, some variables may get Any types that they\n # otherwise wouldn't have. We don't want to narrow down these since it may\n # produce invalid inferred Optional[Any] types, at least.",
"after_code": " \"\"\"\n if literal(expr) >= LITERAL_TYPE:\n restriction = self.chk.binder.get(expr)\n # Ignore the error about using get_proper_type().\n if isinstance(restriction, TypeGuardType): # type: ignore[misc]\n # A type guard forces the new type even if it doesn't overlap the old.\n return restriction.type_guard\n # If the current node is deferred, some variables may get Any types that they\n # otherwise wouldn't have. We don't want to narrow down these since it may\n # produce invalid inferred Optional[Any] types, at least.",
"diff_context": " \"\"\"\n if literal(expr) >= LITERAL_TYPE:\n restriction = self.chk.binder.get(expr)\n # Ignore the error about using get_proper_type().\n if isinstance(restriction, TypeGuardType): # type: ignore[misc]\n # A type guard forces the new type even if it doesn't overlap the old.\n return restriction.type_guard\n # If the current node is deferred, some variables may get Any types that they\n # otherwise wouldn't have. We don't want to narrow down these since it may\n # produce invalid inferred Optional[Any] types, at least.",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": "\"\"\"",
"coding_patterns": [
"type_hint"
]
},
{
"file_path": "mypy/constraints.py",
"language": "python",
"before_code": " for t, a in zip(template.arg_types, cactual.arg_types):\n # Negate direction due to function argument type contravariance.\n res.extend(infer_constraints(t, a, neg_op(self.direction)))\n res.extend(infer_constraints(template.ret_type, cactual.ret_type,\n self.direction))\n return res\n elif isinstance(self.actual, AnyType):",
"after_code": " for t, a in zip(template.arg_types, cactual.arg_types):\n # Negate direction due to function argument type contravariance.\n res.extend(infer_constraints(t, a, neg_op(self.direction)))\n template_ret_type, cactual_ret_type = template.ret_type, cactual.ret_type\n if template.type_guard is not None:\n template_ret_type = template.type_guard\n if cactual.type_guard is not None:\n cactual_ret_type = cactual.type_guard\n res.extend(infer_constraints(template_ret_type, cactual_ret_type,\n self.direction))\n return res\n elif isinstance(self.actual, AnyType):",
"diff_context": " for t, a in zip(template.arg_types, cactual.arg_types):\n # Negate direction due to function argument type contravariance.\n res.extend(infer_constraints(t, a, neg_op(self.direction)))\n res.extend(infer_constraints(template.ret_type, cactual.ret_type,\n template_ret_type, cactual_ret_type = template.ret_type, cactual.ret_type\n if template.type_guard is not None:\n template_ret_type = template.type_guard\n if cactual.type_guard is not None:\n cactual_ret_type = cactual.type_guard\n res.extend(infer_constraints(template_ret_type, cactual_ret_type,\n self.direction))\n return res\n elif isinstance(self.actual, AnyType):",
"change_type": "modification",
"lines_of_context": 6,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": [
"type_hint"
]
},
{
"file_path": "mypy/expandtype.py",
"language": "python",
"before_code": "\n def visit_callable_type(self, t: CallableType) -> Type:\n return t.copy_modified(arg_types=self.expand_types(t.arg_types),\n ret_type=t.ret_type.accept(self))\n\n def visit_overloaded(self, t: Overloaded) -> Type:\n items = [] # type: List[CallableType]",
"after_code": "\n def visit_callable_type(self, t: CallableType) -> Type:\n return t.copy_modified(arg_types=self.expand_types(t.arg_types),\n ret_type=t.ret_type.accept(self),\n type_guard=(t.type_guard.accept(self)\n if t.type_guard is not None else None))\n\n def visit_overloaded(self, t: Overloaded) -> Type:\n items = [] # type: List[CallableType]",
"diff_context": "\n def visit_callable_type(self, t: CallableType) -> Type:\n return t.copy_modified(arg_types=self.expand_types(t.arg_types),\n ret_type=t.ret_type.accept(self))\n ret_type=t.ret_type.accept(self),\n type_guard=(t.type_guard.accept(self)\n if t.type_guard is not None else None))\n\n def visit_overloaded(self, t: Overloaded) -> Type:\n items = [] # type: List[CallableType]",
"change_type": "modification",
"lines_of_context": 6,
"function_name": "visit_overloaded",
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": "mypy/fixup.py",
"language": "python",
"before_code": " for arg in ct.bound_args:\n if arg:\n arg.accept(self)\n\n def visit_overloaded(self, t: Overloaded) -> None:\n for ct in t.items():",
"after_code": " for arg in ct.bound_args:\n if arg:\n arg.accept(self)\n if ct.type_guard is not None:\n ct.type_guard.accept(self)\n\n def visit_overloaded(self, t: Overloaded) -> None:\n for ct in t.items():",
"diff_context": " for arg in ct.bound_args:\n if arg:\n arg.accept(self)\n if ct.type_guard is not None:\n ct.type_guard.accept(self)\n\n def visit_overloaded(self, t: Overloaded) -> None:\n for ct in t.items():",
"change_type": "modification",
"lines_of_context": 6,
"function_name": "visit_overloaded",
"class_name": null,
"docstring": null,
"coding_patterns": []
}
],
"commit_message_style": "concise_subject; references_issue; has_body",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "mypy",
"title": "Add a separate issue form to report crashes (#9549)",
"description": "Add a separate issue form to report crashes (#9549)",
"url": "https://github.com/python/mypy/commit/cca6e2fdc874b7538bd1d2ef70daab687b2a0363",
"date": "2020-10-08T22:30:06Z",
"sha_or_number": "cca6e2fdc874b7538bd1d2ef70daab687b2a0363",
"files_changed": [
".github/ISSUE_TEMPLATE/crash.md"
],
"additions": 41,
"deletions": 0,
"labels": [],
"related_issues": [
"9549"
],
"code_samples": [
{
"file_path": ".github/ISSUE_TEMPLATE/crash.md",
"language": "markdown",
"before_code": "",
"after_code": "---\nname: Crash Report\nabout: Crash (traceback or \"INTERNAL ERROR\")\nlabels: \"crash\"\n---\n\n<!--\n Use this form only if mypy reports an \"INTERNAL ERROR\" and/or gives a traceback.\n Please include the traceback and all other messages below (use `mypy --show-traceback`).\n-->\n\n**Crash Report**\n\n(Tell us what happened.)\n\n**Traceback**\n\n```\n(Insert traceback and other messages from mypy here -- use `--show-traceback`.)\n```\n\n**To Reproduce**\n\n(Write what you did to reproduce the crash. Full source code is\nappreciated. We also very much appreciate it if you try to narrow the\nsource down to a small stand-alone example.)\n\n**Your Environment**\n\n<!-- Include as many relevant details about the environment you experienced the bug in -->\n\n- Mypy version used:\n- Mypy command-line flags:\n- Mypy configuration options from `mypy.ini` (and other config files):\n- Python version used:\n- Operating system and version:\n\n<!--\nYou can freely edit this text, please remove all the lines\nyou believe are unnecessary.\n-->",
"diff_context": "---\nname: Crash Report\nabout: Crash (traceback or \"INTERNAL ERROR\")\nlabels: \"crash\"\n---\n\n<!--\n Use this form only if mypy reports an \"INTERNAL ERROR\" and/or gives a traceback.\n Please include the traceback and all other messages below (use `mypy --show-traceback`).\n-->\n\n**Crash Report**\n\n(Tell us what happened.)\n\n**Traceback**\n\n```\n(Insert traceback and other messages from mypy here -- use `--show-traceback`.)\n```\n\n**To Reproduce**\n\n(Write what you did to reproduce the crash. Full source code is\nappreciated. We also very much appreciate it if you try to narrow the\nsource down to a small stand-alone example.)\n\n**Your Environment**\n\n<!-- Include as many relevant details about the environment you experienced the bug in -->\n\n- Mypy version used:\n- Mypy command-line flags:\n- Mypy configuration options from `mypy.ini` (and other config files):\n- Python version used:\n- Operating system and version:\n\n<!--\nYou can freely edit this text, please remove all the lines\nyou believe are unnecessary.\n-->",
"change_type": "addition",
"lines_of_context": 0,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
}
],
"commit_message_style": "imperative_mood; references_issue",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "mypy",
"title": "Make the new bug templates less markup-heavy (#9438)",
"description": "Make the new bug templates less markup-heavy (#9438)\n\n- Remove emoji\r\n- Instead of `## H2 headings` just use `**bold**`\r\n- Add link to docs\r\n- Add suggestion for new users not to file a bug",
"url": "https://github.com/python/mypy/commit/6f07cb6a2e02446b909846f99817f674675e826e",
"date": "2020-09-11T18:35:59Z",
"sha_or_number": "6f07cb6a2e02446b909846f99817f674675e826e",
"files_changed": [
".github/ISSUE_TEMPLATE/bug.md",
".github/ISSUE_TEMPLATE/documentation.md",
".github/ISSUE_TEMPLATE/feature.md",
".github/ISSUE_TEMPLATE/question.md"
],
"additions": 24,
"deletions": 18,
"labels": [],
"related_issues": [
"9438"
],
"code_samples": [
{
"file_path": ".github/ISSUE_TEMPLATE/bug.md",
"language": "markdown",
"before_code": "---\nname: 🐛 Bug Report\nabout: Submit a bug report\nlabels: \"bug\"\n---\n\n<!--\nNote: If the problem you are reporting is about a specific library function, then the typeshed tracker is better suited\nfor this report: https://github.com/python/typeshed/issues\n-->\n\n## 🐛 Bug Report\n\n(A clear and concise description of what the bug is.)\n\n## To Reproduce\n\n(Write your steps here:)\n\n1. Step 1...\n1. Step 2...\n1. Step 3...\n\n## Expected Behavior\n\n<!--\n How did you expect your project to behave?",
"after_code": "---\nname: Bug Report\nabout: Submit a bug report\nlabels: \"bug\"\n---\n\n<!--\n If you're new to mypy and you're not sure whether what you're experiencing is a mypy bug, please see the \"Question and Help\" form\n instead.\n-->\n\n**Bug Report**\n\n<!--\nNote: If the problem you are reporting is about a specific library function, then the typeshed tracker is better suited\nfor this report: https://github.com/python/typeshed/issues\n-->\n\n(A clear and concise description of what the bug is.)\n\n**To Reproduce**\n\n(Write your steps here:)\n\n1. Step 1...\n2. Step 2...\n3. Step 3...\n\n**Expected Behavior**\n\n<!--\n How did you expect your project to behave?",
"diff_context": "---\nname: 🐛 Bug Report\nname: Bug Report\nabout: Submit a bug report\nlabels: \"bug\"\n---\n\n<!--\n If you're new to mypy and you're not sure whether what you're experiencing is a mypy bug, please see the \"Question and Help\" form\n instead.\n-->\n\n**Bug Report**\n\n<!--\nNote: If the problem you are reporting is about a specific library function, then the typeshed tracker is better suited\nfor this report: https://github.com/python/typeshed/issues\n-->\n\n## 🐛 Bug Report\n\n(A clear and concise description of what the bug is.)\n\n## To Reproduce\n**To Reproduce**\n\n(Write your steps here:)\n\n1. Step 1...\n1. Step 2...\n1. Step 3...\n2. Step 2...\n3. Step 3...\n\n## Expected Behavior\n**Expected Behavior**\n\n<!--\n How did you expect your project to behave?",
"change_type": "modification",
"lines_of_context": 20,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
}
],
"commit_message_style": "references_issue; has_body",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "mypy",
"title": "Add MYPY_CONFIG_FILE_DIR to environment when config file is read (2nd try) (#9414)",
"description": "Add MYPY_CONFIG_FILE_DIR to environment when config file is read (2nd try) (#9414)\n\n(This fixes the mistake I introduced in the previous version.)\r\n\r\nResubmit of #9403.\r\n\r\nFixes #7968.\r\n\r\nCo-authored-by: aghast <aghast@aghast.dev>",
"url": "https://github.com/python/mypy/commit/9d038469d80e36057c77e0a8a18831f829778f9d",
"date": "2020-09-04T20:55:14Z",
"sha_or_number": "9d038469d80e36057c77e0a8a18831f829778f9d",
"files_changed": [
"mypy/config_parser.py",
"mypy/test/testcmdline.py",
"test-data/unit/envvars.test"
],
"additions": 15,
"deletions": 0,
"labels": [],
"related_issues": [
"9403",
"7968",
"9414"
],
"code_samples": [],
"commit_message_style": "imperative_mood; references_issue; has_body",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "mypy",
"title": "Revert \"Add MYPY_CONFIG_FILE_DIR to environment when config file is read (#9403)\"",
"description": "Revert \"Add MYPY_CONFIG_FILE_DIR to environment when config file is read (#9403)\"\n\nReason: This broke CI.\n\nThis reverts commit 652aca96609c876c47ca7eaa68d67ac1e36f4215.",
"url": "https://github.com/python/mypy/commit/57d3473ae906fe945953b874d3dcb66efb2710ca",
"date": "2020-09-04T02:45:27Z",
"sha_or_number": "57d3473ae906fe945953b874d3dcb66efb2710ca",
"files_changed": [
"mypy/config_parser.py",
"mypy/test/testcmdline.py",
"test-data/unit/envvars.test"
],
"additions": 0,
"deletions": 15,
"labels": [],
"related_issues": [
"9403"
],
"code_samples": [],
"commit_message_style": "references_issue; has_body",
"python_version": null,
"pep_status": null
},
{
"type": "commit",
"repository": "mypy",
"title": "Revert issue template (#9345) -- it doesn't work",
"description": "Revert issue template (#9345) -- it doesn't work\n\nThis reverts commit 18c84e0f6906cfb315c367aa35550a4727cb57f8.",
"url": "https://github.com/python/mypy/commit/42a522089c6b418727e143c181128e902acf0908",
"date": "2020-08-27T22:21:28Z",
"sha_or_number": "42a522089c6b418727e143c181128e902acf0908",
"files_changed": [
".github/ISSUE_TEMPLATE/bug.md",
".github/ISSUE_TEMPLATE/documentation.md",
".github/ISSUE_TEMPLATE/feature.md",
".github/ISSUE_TEMPLATE/question.md",
".github/PULL_REQUEST_TEMPLATE.md",
"ISSUE_TEMPLATE.md"
],
"additions": 20,
"deletions": 110,
"labels": [],
"related_issues": [
"9345"
],
"code_samples": [
{
"file_path": ".github/ISSUE_TEMPLATE/bug.md",
"language": "markdown",
"before_code": "---\nname: 🐛 Bug Report\nlabels: \"bug\"\n---\n\n<!--\nNote: If the problem you are reporting is about a specific library function, then the typeshed tracker is better suited\nfor this report: https://github.com/python/typeshed/issues\n-->\n\n## 🐛 Bug Report\n\n(A clear and concise description of what the bug is.)\n\n## To Reproduce\n\n(Write your steps here:)\n\n1. Step 1...\n1. Step 2...\n1. Step 3...\n\n## Expected Behavior\n\n<!--\n How did you expect your project to behave?\n Its fine if youre not sure your understanding is correct.\n Write down what you thought would happen. If you just expected no errors, you can delete this section.\n-->\n\n(Write what you thought would happen.)\n\n## Actual Behavior\n\n<!--\n Did something go wrong?\n Is something broken, or not behaving as you expected?\n-->\n\n(Write what happened.)\n\n## Your Environment\n\n<!-- Include as many relevant details about the environment you experienced the bug in -->\n\n- Mypy version used:\n- Mypy command-line flags:\n- Mypy configuration options from `mypy.ini` (and other config files):\n- Python version used:\n- Operating system and version:\n\n<!--\nYou can freely edit this text, please remove all the lines\nyou believe are unnecessary.\n-->",
"after_code": "",
"diff_context": "---\nname: 🐛 Bug Report\nlabels: \"bug\"\n---\n\n<!--\nNote: If the problem you are reporting is about a specific library function, then the typeshed tracker is better suited\nfor this report: https://github.com/python/typeshed/issues\n-->\n\n## 🐛 Bug Report\n\n(A clear and concise description of what the bug is.)\n\n## To Reproduce\n\n(Write your steps here:)\n\n1. Step 1...\n1. Step 2...\n1. Step 3...\n\n## Expected Behavior\n\n<!--\n How did you expect your project to behave?\n Its fine if youre not sure your understanding is correct.\n Write down what you thought would happen. If you just expected no errors, you can delete this section.\n-->\n\n(Write what you thought would happen.)\n\n## Actual Behavior\n\n<!--\n Did something go wrong?\n Is something broken, or not behaving as you expected?\n-->\n\n(Write what happened.)\n\n## Your Environment\n\n<!-- Include as many relevant details about the environment you experienced the bug in -->\n\n- Mypy version used:\n- Mypy command-line flags:\n- Mypy configuration options from `mypy.ini` (and other config files):\n- Python version used:\n- Operating system and version:\n\n<!--\nYou can freely edit this text, please remove all the lines\nyou believe are unnecessary.\n-->",
"change_type": "deletion",
"lines_of_context": 0,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": ".github/ISSUE_TEMPLATE/feature.md",
"language": "markdown",
"before_code": "---\nname: 🚀 Feature\nlabels: \"feature\"\n---\n\n## 🚀 Feature\n\n(A clear and concise description of your feature proposal.)\n\n## Pitch\n\n(Please explain why this feature should be implemented and how it would be used. Add examples, if applicable.)",
"after_code": "",
"diff_context": "---\nname: 🚀 Feature\nlabels: \"feature\"\n---\n\n## 🚀 Feature\n\n(A clear and concise description of your feature proposal.)\n\n## Pitch\n\n(Please explain why this feature should be implemented and how it would be used. Add examples, if applicable.)",
"change_type": "deletion",
"lines_of_context": 0,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
},
{
"file_path": ".github/PULL_REQUEST_TEMPLATE.md",
"language": "markdown",
"before_code": "### Have you read the [Contributing Guidelines](https://github.com/python/mypy/blob/master/CONTRIBUTING.md)?\n\n(Once you have, delete this section. If you leave it in, your PR may be closed without action.)\n\n### Description\n\n<!--\nIf this pull request closes or fixes an issue, write Closes #NNN\" or \"Fixes #NNN\" in that exact\nformat.\n-->\n\n(Explain how this PR changes mypy.)\n\n## Test Plan\n\n<!--\nIf this is a documentation change, rebuild the docs (link to instructions) and review the changed pages for markup errors.\nIf this is a code change, include new tests (link to the testing docs). Be sure to run the tests locally and fix any errors before submitting the PR (more instructions).\nIf this change cannot be tested by the CI, please explain how to verify it manually.\n-->\n\n(Write your test plan here. If you changed any code, please provide us with clear instructions on how you verified your changes work.)",
"after_code": "",
"diff_context": "### Have you read the [Contributing Guidelines](https://github.com/python/mypy/blob/master/CONTRIBUTING.md)?\n\n(Once you have, delete this section. If you leave it in, your PR may be closed without action.)\n\n### Description\n\n<!--\nIf this pull request closes or fixes an issue, write Closes #NNN\" or \"Fixes #NNN\" in that exact\nformat.\n-->\n\n(Explain how this PR changes mypy.)\n\n## Test Plan\n\n<!--\nIf this is a documentation change, rebuild the docs (link to instructions) and review the changed pages for markup errors.\nIf this is a code change, include new tests (link to the testing docs). Be sure to run the tests locally and fix any errors before submitting the PR (more instructions).\nIf this change cannot be tested by the CI, please explain how to verify it manually.\n-->\n\n(Write your test plan here. If you changed any code, please provide us with clear instructions on how you verified your changes work.)",
"change_type": "deletion",
"lines_of_context": 0,
"function_name": null,
"class_name": null,
"docstring": null,
"coding_patterns": []
}
],
"commit_message_style": "concise_subject; references_issue; has_body",
"python_version": null,
"pep_status": null
}
]

notebooks/data/my_developer_rules.md vendored Normal file
View file

@ -0,0 +1,79 @@
# Assistant Guidelines
These rules are absolutely imperative to adhere to. Comply with them precisely as they are outlined.
The agent must use the sequential thinking MCP tool to work out problems.
## Core Behavior Guidelines
1. Respond only to explicit requests. Do not add files, code, tests, or comments unless asked.
2. Follow instructions precisely. No assumptions or speculative additions.
3. Use provided context accurately.
4. Avoid extra output. No debugging logs or test harnesses unless requested.
5. Produce clean, optimized code when code is requested. Respect existing style.
6. Deliver complete, standalone solutions. No placeholders.
7. Limit file creation. Only create new files when necessary.
8. If you modify the model in a user's code, you must confirm with the user and never be sneaky. Always tell the user exactly what you are doing.
## Communication & Delivery
9. Don't explain unless asked. Do not expose reasoning in outputs.
10. If unsure, say "I don't know." Avoid hallucinated content.
11. Maintain consistency across sessions. Refer to project memory and documentation.
12. Respect privacy and permissions. Never leak or infer secure data.
13. Prioritize targeted edits over full rewrites.
14. Optimize incrementally. Avoid unnecessary overhauls.
## Spec.md Requirement
You must maintain a file named Spec.md. This file acts as the single source of truth for the project.
Rules:
- Before starting any implementation, check if Spec.md already exists.
- If it does not exist, create one using the template provided below.
- Always update Spec.md before and after any major change.
- Use the contents of Spec.md to guide logic, structure, and implementation decisions.
- When updating a section, condense previous content to keep the document concise.
## Spec.md Starter Template (Plain Text Format)
Title: Spec.md Project Specification
Section: Purpose
Describe the main goal of this feature, tool, or system.
Section: Core Functionality
List the key features, expected behaviors, and common use cases.
Section: Architecture Overview
Summarize the technical setup, frameworks used, and main modules or services.
Section: Input and Output Contracts
List all inputs and outputs in a table-like format:
Input: describe the input data, its format, and where it comes from.
Output: describe the output data, its format, and its destination.
Section: Edge Cases and Constraints
List known limitations, special scenarios, and fallback behaviors.
Section: File and Module Map
List all important files or modules and describe what each one is responsible for.
Section: Open Questions or TODOs
Create a checklist of unresolved decisions, logic that needs clarification, or tasks that are still pending.
Section: Last Updated
Include the most recent update date and who made the update.

notebooks/data/pep_style_guide.md vendored Normal file
View file

@ -0,0 +1,74 @@
# PEP 8 Style Guide: Essentials
## Code Layout
- Indentation: 4 spaces per level
- Line length: 79 for code (88/100 acceptable by team), 72 for comments/docstrings
- Blank lines: 2 around top-level defs/classes, 1 between methods
```python
# Hanging indent for long calls
foo = long_function_name(
    var_one, var_two,
    var_three, var_four,
)
```
## Imports
- One import per line
- Group: stdlib, third-party, local
- Prefer absolute imports; avoid wildcard imports
```python
import os
import sys
from subprocess import Popen, PIPE
import requests
from myproject.models import User
```
## Whitespace
- No space inside brackets or before commas/semicolons
- Spaces around binary operators
```python
x = 1
hypot2 = x * x + y * y
```
## Naming
- snake_case: functions, variables
- PascalCase: classes
- SCREAMING_SNAKE_CASE: constants
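For example (illustrative names):
```python
MAX_RETRIES = 3  # constant


class HttpClient:  # class
    def fetch_page(self, url: str) -> str:  # function/method
        ...


request_count = 0  # variable
```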
## Comments & Docstrings
- Use complete sentences; keep up to date
- Triple-double quotes for public modules, classes, functions
```python
def f(x: int) -> int:
    """Return x doubled."""
    return x * 2
```
## Type Hints
- Space after colon; arrow for returns
```python
def munge(s: str) -> str: ...
```
## Tooling
- Black, isort, Flake8 (or Ruff) to automate style
- Example pyproject.toml excerpt:
```toml
[tool.black]
line-length = 88
[tool.isort]
profile = "black"
```
## Common Violations
- E501: line too long -> break with parentheses
- E225: missing whitespace around operator
- E402: module import not at top of file
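Illustrative fixes for the first two (hypothetical names):
```python
# E501: break the long line with parentheses
total_price = (
    base_price + shipping_cost + handling_fee + tax
)

# E225: add whitespace around the operator
count = count + 1
```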

notebooks/data/zen_principles.md vendored Normal file
View file

@ -0,0 +1,74 @@
# The Zen of Python: Practical Guide
## Overview
The Zen of Python (Tim Peters, `import this`) captures Python's philosophy. Use these principles as a checklist during design, coding, and reviews.
## Key Principles With Guidance
### 1. Beautiful is better than ugly
Prefer descriptive names, clear structure, and consistent formatting.
### 2. Explicit is better than implicit
Be clear about behavior, imports, and types.
```python
from datetime import datetime, timedelta
def get_future_date(days_ahead: int) -> datetime:
    return datetime.now() + timedelta(days=days_ahead)
```
### 3. Simple is better than complex
Choose straightforward solutions first.
### 4. Complex is better than complicated
When complexity is needed, organize it with clear abstractions.
### 5. Flat is better than nested
Use early returns to reduce indentation.
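A minimal sketch (hypothetical `order` object) of flattening with early returns:
```python
def summarize(order):
    if order is None:
        return "no order"  # early return avoids nested else blocks
    if not order.items:
        return "empty order"
    return f"{len(order.items)} items"
```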
### 6. Sparse is better than dense
Give code room to breathe with whitespace.
### 7. Readability counts
Optimize for human readers; add docstrings for nontrivial code.
### 8. Special cases aren't special enough to break the rules
Stay consistent; exceptions should be rare and justified.
### 9. Although practicality beats purity
Prefer practical solutions that teams can maintain.
### 10. Errors should never pass silently
Handle exceptions explicitly; log with context.
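A sketch of explicit, logged handling (assumes a hypothetical `load_config` helper):
```python
import logging

logger = logging.getLogger(__name__)

try:
    config = load_config("app.toml")
except FileNotFoundError:
    logger.warning("Config file app.toml missing; using defaults")
    config = {}
```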
### 11. Unless explicitly silenced
Silence only specific, acceptable errors and document why.
### 12. In the face of ambiguity, refuse the temptation to guess
Require explicit inputs and behavior.
### 13. There should be one obvious way to do it
Prefer standard library patterns and idioms.
### 14. Although that way may not be obvious at first
Learn Python idioms; embrace clarity over novelty.
### 15. Now is better than never; 16. Never is often better than right now
Iterate, but don't rush broken code.
### 17/18. Hard to explain is bad; easy to explain is good
Prefer designs you can explain simply.
### 19. Namespaces are one honking great idea
Use modules/packages to separate concerns; avoid wildcard imports.
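For example (hypothetical `myproject.parsers` module):
```python
import json
from myproject import parsers

data = json.loads('{"ok": true}')
result = parsers.parse(data)  # the namespace makes the origin of parse() obvious
```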
## Modern Python Tie-ins
- Type hints reinforce explicitness
- Context managers enforce safe resource handling
- Dataclasses improve readability for data containers
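A compact illustration of these tie-ins (illustrative names):
```python
from dataclasses import dataclass


@dataclass
class User:
    name: str
    age: int


# The context manager releases the file handle even if an error occurs
with open("users.csv", encoding="utf-8") as f:
    header = f.readline()
```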
## Quick Review Checklist
- Is it readable and explicit?
- Is this the simplest working solution?
- Are errors explicit and logged?
- Are modules/namespaces used appropriately?

notebooks/tutorial.ipynb vendored Normal file
View file

@ -0,0 +1,584 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6f22c8fe6d92cfcc",
"metadata": {},
"source": [
"# Using Cognee with Python Development Data\n",
"\n",
"Unite authoritative Python practice (Guido van Rossum's own contributions!), normative guidance (Zen/PEP8), and your lived context (rules + conversations) into one *AI memory* that produces answers that are relevant, explainable, and consistent."
]
},
{
"cell_type": "markdown",
"id": "fe69acbf9ab1a22b",
"metadata": {},
"source": [
"## What You'll Learn\n",
"\n",
"In this comprehensive tutorial, you'll discover how to transform scattered development data into an intelligent knowledge system that enhances your coding workflow. By the end, you'll have:\n",
"\n",
"- **Connected disparate data sources** (Guido's CPython contributions, mypy development, PEP discussions, your Python projects) into a unified AI memory graph\n",
"- **Built an memory layer** that understands Python design philosophy, best practice coding patterns, and your preferences and experience\n",
"- **Learn how to use intelligent search capabilities** that combine the diverse context\n",
"\n",
"This tutorial demonstrates the power of **knowledge graphs** and **retrieval-augmented generation (RAG)** for software development, showing you how to build systems that learn from Python's creator and improve your own Python development."
]
},
{
"cell_type": "markdown",
"id": "b03b59c064213dd4",
"metadata": {},
"source": [
"## Cognee and its core operations\n",
"\n",
"Before we dive in, let's understand the core Cognee operations we'll be working with:\n",
"\n",
"- **`cognee.add()`** - Ingests raw data (files, text, APIs) into the system\n",
"- **`cognee.cognify()`** - Processes and structures data into a knowledge graph using AI\n",
"- **`cognee.search()`** - Queries the knowledge graph with natural language or Cypher\n",
"- **`cognee.memify()`** - Cognee's \"secret sauce\" that infers implicit connections and rules from your data"
]
},
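{
"cell_type": "markdown",
"id": "core-operations-sketch",
"metadata": {},
"source": [
"As a minimal sketch (hypothetical file name and query; the full, runnable walkthrough follows below), `add`, `cognify`, and `search` chain together like this:\n",
"\n",
"```python\n",
"import cognee\n",
"\n",
"await cognee.add(\"my_notes.md\")  # ingest raw data\n",
"await cognee.cognify()  # structure it into a knowledge graph\n",
"results = await cognee.search(\"What do my notes say about type hints?\")\n",
"```"
]
},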
{
"cell_type": "markdown",
"id": "6a7669fbb6a3e6c7",
"metadata": {},
"source": [
"## Data used in this tutorial\n",
"\n",
"Cognee can ingest many types of sources. In this tutorial, we use a small, concrete set of files that cover different perspectives:\n",
"\n",
"- **`guido_contributions.json` — Authoritative exemplars.** Real PRs and commits from Guido van Rossum (mypy, CPython). These show how Pythons creator solved problems and provide concrete anchors for patterns.\n",
"- **`pep_style_guide.md` — Norms.** Encodes community style and typing conventions (PEP8 and related). Ensures that search results and inferred rules align with widely accepted standards.\n",
"- **`zen_principles.md` — Philosophy.** The Zen of Python. Grounds design tradeoffs (simplicity, explicitness, readability) beyond syntax or mechanics.\n",
"- **`my_developer_rules.md` — Local constraints.** Your house rules, conventions, and projectspecific requirements (scope, privacy, Spec.md). Keeps recommendations relevant to your actual workflow.\n",
"- **`copilot_conversations.json` — Personal history.** Transcripts of real assistant conversations, including your questions, code snippets, and discussion topics. Captures “how you code” and connects it to “how Guido codes.”"
]
},
{
"cell_type": "markdown",
"id": "2a5dac2c6fdc7ca7",
"metadata": {},
"source": [
"# Preliminaries\n",
"\n",
"Cognee relies heavily on async functions.\n",
"We need `nest_asyncio` so `await` works in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20cb02b49e3c53e2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:35:00.836706Z",
"start_time": "2025-09-07T14:35:00.832646Z"
}
},
"outputs": [],
"source": [
"import nest_asyncio\n",
"nest_asyncio.apply()"
]
},
{
"cell_type": "markdown",
"id": "30e66c894fb4cfd5",
"metadata": {},
"source": [
"To strike the balanace between speed, cost, anc quality, we recommend using OpenAI's `4o-mini` model; make sure your `.env` file contains this line:\n",
"\n",
"```LLM_MODEL=\"gpt-4o-mini\"```"
]
},
{
"cell_type": "markdown",
"id": "45e1caaec20c9518",
"metadata": {},
"source": [
"We will do a quick import check."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9386ecb596860399",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:35:03.910260Z",
"start_time": "2025-09-07T14:35:00.938966Z"
}
},
"outputs": [],
"source": [
"import cognee\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"print('🔍 Quick Cognee Import Check')\n",
"print('=' * 30)\n",
"print(f'📍 Cognee location: {cognee.__file__}')\n",
"print(f'📁 Package directory: {os.path.dirname(cognee.__file__)}')\n",
"\n",
"# Check if it's local or installed\n",
"current_dir = Path.cwd()\n",
"cognee_path = Path(cognee.__file__)\n",
"if current_dir in cognee_path.parents:\n",
" print('🏠 Status: LOCAL DEVELOPMENT VERSION')\n",
"else:\n",
" print('📦 Status: INSTALLED PACKAGE')"
]
},
{
"cell_type": "markdown",
"id": "76895c6570d1a4dc",
"metadata": {},
"source": [
"And just to be safe, we will make sure that the path contains the root directory, so Python can find everything it needs to run the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19e74e6b691020db",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:35:03.921217Z",
"start_time": "2025-09-07T14:35:03.918659Z"
}
},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"notebook_dir = Path.cwd()\n",
"if notebook_dir.name == 'notebooks':\n",
" project_root = notebook_dir.parent\n",
"else:\n",
" project_root = Path.cwd()\n",
"\n",
"# Add project root to the beginning of sys.path\n",
"project_root_str = str(project_root.absolute())\n",
"if project_root_str not in sys.path:\n",
" sys.path.insert(0, project_root_str)\n",
"\n",
"print(f\"📁 Project root: {project_root_str}\")"
]
},
{
"cell_type": "markdown",
"id": "af584b935cbdc8d",
"metadata": {},
"source": [
"Finally, we will begin with a clean slate, by removing any previous Cognee data:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd47383aa9519465",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:35:06.194073Z",
"start_time": "2025-09-07T14:35:03.929446Z"
}
},
"outputs": [],
"source": [
"await cognee.prune.prune_data()\n",
"await cognee.prune.prune_system(metadata=True)"
]
},
{
"cell_type": "markdown",
"id": "93c9783037715026",
"metadata": {},
"source": [
"### First data ingestion: Exploring Guido's Python Contributions\n",
"\n",
"We'll begin with a document that contains detailed PRs and commits from Guido van Rossum's work on mypy and CPython, showing real-world examples of Python's creator solving type system and language design challenges.\n",
"\n",
"We'll use Cognee's `add()` and `cognify()` functions to ingest this data and build a knowledge graph that connects Guido's development patterns with Python best practices."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8743ed520b4de37",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:39:53.511862Z",
"start_time": "2025-09-07T14:35:06.228778Z"
}
},
"outputs": [],
"source": [
"import cognee\n",
"\n",
"result = await cognee.add(\n",
" os.path.abspath(\"data/guido_contributions.json\"),\n",
" node_set=[\"guido_data\"]\n",
")\n",
"await cognee.cognify(temporal_cognify=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f08b362cbf12b398",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:39:53.561679Z",
"start_time": "2025-09-07T14:39:53.559528Z"
}
},
"outputs": [],
"source": [
"results = await cognee.search(\"Show me commits\")\n",
"print(results[0])"
]
},
{
"cell_type": "markdown",
"id": "10d582d02ead905e",
"metadata": {},
"source": [
"### What's just happened?\n",
"The `search()` function uses natural language to query a knowledge graph containing Guido's development history.\n",
"Unlike traditional databases, Cognee understands the relationships between commits, language features, design decisions, and evolution over time.\n",
"\n",
"Cognee also allows you to visualize the graphs created:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fb068f422bda6cf",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:39:53.688017Z",
"start_time": "2025-09-07T14:39:53.598467Z"
}
},
"outputs": [],
"source": [
"from cognee import visualize_graph\n",
"await visualize_graph('./guido_contributions.html')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f24341c97d6eaccb",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:39:53.733197Z",
"start_time": "2025-09-07T14:39:53.729922Z"
}
},
"outputs": [],
"source": [
"from IPython.display import IFrame, HTML, display\n",
"display(IFrame(\"./guido_contributions.html\", width=\"100%\", height=\"500\"))"
]
},
{
"cell_type": "markdown",
"id": "3418aa17bf35e3bb",
"metadata": {},
"source": [
"**Why visualization matters:** Knowledge graphs reveal hidden patterns in data. In this case, patterins in Guido's contributions to Python's development. The interactive visualization shows how different projects (CPython, mypy, PEPs), features, and time periods connect - insights that show Python's thoughtful evolution.\n",
"\n",
"Take a moment to explore the graph. Notice how:\n",
"\n",
"- CPython core development clusters around 2020\n",
"- Mypy contributions focus on fixtures and run classes\n",
"- PEP discussions mention Thomas Grainiger and Adam Turner\n",
"- Time-based connections show how ideas evolved into features\n",
"\n",
"*Note: You can open the visualization on your browser using the html file from path provided above.*"
]
},
{
"cell_type": "markdown",
"id": "5e8d9094a09ae05d",
"metadata": {},
"source": [
"### Ingesting more data\n",
"\n",
"Now we'll add the remaining data and see how they connections emerge between Guido's contributions, Python best practices, and user conversations."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5315318324968f0f",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:40:48.589875Z",
"start_time": "2025-09-07T14:39:53.785054Z"
}
},
"outputs": [],
"source": [
"import os\n",
"\n",
"await cognee.add(os.path.abspath(\"data/copilot_conversations.json\"), node_set=[\"developer_data\"])\n",
"await cognee.add(os.path.abspath(\"data/my_developer_rules.md\"), node_set=[\"developer_data\"])\n",
"await cognee.add(os.path.abspath(\"data/zen_principles.md\"), node_set=[\"principles_data\"])\n",
"await cognee.add(os.path.abspath(\"data/pep_style_guide.md\"), node_set=[\"principles_data\"])\n",
"\n",
"await cognee.cognify(temporal_cognify=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98b69c45db2fca3",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:40:51.305617Z",
"start_time": "2025-09-07T14:40:48.605622Z"
}
},
"outputs": [],
"source": [
"results = await cognee.search(\n",
" \"What Python type hinting challenges did I face, and how does Guido approach similar problems in mypy?\",\n",
" query_type=cognee.SearchType.GRAPH_COMPLETION\n",
")\n",
"print(results)"
]
},
{
"cell_type": "markdown",
"id": "6c49c4c252036fa1",
"metadata": {},
"source": [
"You'll see that cognee has connected your Python development challenges with Guido's approaches, revealing patterns like:\n",
"\n",
"- \"Type hint implementation failed due to circular imports - similar to issue Guido solved in mypy PR #1234\"\n",
"- \"Performance bottleneck in list comprehension matches pattern Guido optimized in CPython commit abc123\""
]
},
{
"cell_type": "markdown",
"id": "a1f4606bfed8fc45",
"metadata": {},
"source": [
"### Memify\n",
"\n",
"Let's now introduce the memory functions. These algorithms run on top of your semantic layer, connecting the dots and improving the search.\n",
"\n",
"Memify is customizable and can use any transformation you'd like to write."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20234960f7566b15",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:43:42.390990Z",
"start_time": "2025-09-07T14:40:51.321301Z"
}
},
"outputs": [],
"source": [
"await cognee.memify()"
]
},
{
"cell_type": "markdown",
"id": "58d3ccec16f67c24",
"metadata": {},
"source": [
"**What `memify()` does for Python:** This advanced function uses AI to:\n",
"\n",
"- **Infer rule patterns** from your code (e.g., \"When implementing iterators, always follow the protocol Guido established\")\n",
"- **Connect design philosophy to practice** (e.g., linking \"explicit is better than implicit\" to your type hinting decisions)\n"
]
},
{
"cell_type": "markdown",
"id": "a304033f9f0f5dcf",
"metadata": {},
"source": [
"Now let's see how the system has connected your Python development patterns with established best practices:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "518fa9b17a604657",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:43:44.837614Z",
"start_time": "2025-09-07T14:43:42.465351Z"
}
},
"outputs": [],
"source": [
"# Search for connections between your async patterns and Python philosophy\n",
"results = await cognee.search(\n",
" query_text= \"How does my AsyncWebScraper implementation align with Python's design principles?\",\n",
" query_type=cognee.SearchType.GRAPH_COMPLETION\n",
")\n",
"print(\"Python Pattern Analysis:\", results)"
]
},
{
"cell_type": "markdown",
"id": "2c77c1582ab9fc32",
"metadata": {},
"source": [
"### Nodeset filtering\n",
"\n",
"You may have noticed that we added different documents to different datasets. This allows us to narrow our retrieval at search time.\n",
"\n",
"By constraining searches to particular node sets, you can ensure that responses draw from appropriate sources. Questions about style guidelines can be directed specifically to PEP documents and design principles, while implementation questions can focus on actual code examples and developer experiences."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c64036c03abe41e7",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:43:49.542281Z",
"start_time": "2025-09-07T14:43:44.852455Z"
}
},
"outputs": [],
"source": [
"from cognee.modules.engine.models.node_set import NodeSet\n",
"results = await cognee.search(\n",
" query_text= \"How should variables be named?\",\n",
" query_type=cognee.SearchType.GRAPH_COMPLETION,\n",
" node_type=NodeSet,\n",
" node_name=['principles_data']\n",
")\n",
"\n",
"print(results)"
]
},
{
"cell_type": "markdown",
"id": "c641b8b7e50dd2ae",
"metadata": {},
"source": [
"### Temporal graphs\n",
"\n",
"The `temporal_cognify` option enabled during graph construction provides powerful capabilities for understanding how Python development has evolved over time. This temporal awareness allows queries that explore trends, identify periods of intense development activity, or understand how specific features emerged and matured.\n",
"\n",
"Temporal queries can reveal insights about development velocity, the relationship between different features introduced in similar timeframes, or how implementation approaches have changed as the language has evolved. This historical perspective provides valuable context for understanding current best practices and anticipating future directions.\n",
"\n",
"For example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "28e7d5a75e076b8f",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:43:55.278031Z",
"start_time": "2025-09-07T14:43:49.555704Z"
}
},
"outputs": [],
"source": [
"result = await cognee.search(\n",
" query_text = \"What can we learn from Guido's contributions in 2025?\",\n",
" query_type=cognee.SearchType.TEMPORAL\n",
")\n",
"\n",
"print(result)"
]
},
{
"cell_type": "markdown",
"id": "ec6cf074a6c272ab",
"metadata": {},
"source": [
"### Feedback loops\n",
"\n",
"cognee supports continuous improvement through a feedback mechanism that captures the utility and relevance of search results based on actualy user messages. This creates a learning system that adapts to your specific needs and preferences over time.\n",
"\n",
"When search interactions are saved, you can store the results use SearchType.FEEDBACK to provide feedback about the last_k answer from the system. This feedback becomes part of the graph itself, helping the system provide increasingly relevant results."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67dec85a658aad76",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:43:58.472950Z",
"start_time": "2025-09-07T14:43:55.288993Z"
}
},
"outputs": [],
"source": [
"answer = await cognee.search(\n",
" query_type=cognee.SearchType.GRAPH_COMPLETION,\n",
" query_text=\"What is the most zen thing about Python?\",\n",
" save_interaction=True, # This enables feedback later\n",
")\n",
"\n",
"print(answer)"
]
},
{
"cell_type": "markdown",
"id": "2f64d90e6dadaefb",
"metadata": {},
"source": [
"This enables giving feedback, that itself can also be stored in the graph and will be included in future searches:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "870149ea58fd109c",
"metadata": {
"ExecuteTime": {
"end_time": "2025-09-07T14:44:00.503091Z",
"start_time": "2025-09-07T14:43:58.480893Z"
}
},
"outputs": [],
"source": [
"feedback = await cognee.search(\n",
" query_type=cognee.SearchType.FEEDBACK,\n",
" query_text=\"Last result was useful, I like code that complies with best practices.\",\n",
" last_k=1,\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

poetry.lock generated

File diff suppressed because it is too large

pyproject.toml
View file

@ -20,49 +20,38 @@ classifiers = [
"Operating System :: Microsoft :: Windows",
]
dependencies = [
"openai>=1.80.1,<2.0.0",
"openai>=1.80.1",
"python-dotenv>=1.0.1,<2.0.0",
"pydantic>=2.10.5,<3.0.0",
"pydantic-settings>=2.2.1,<3",
"typing_extensions>=4.12.2,<5.0.0",
"nltk>=3.9.1,<4.0.0",
"numpy>=1.26.4, <=4.0.0",
"pandas>=2.2.2,<3.0.0",
# Note: New s3fs and boto3 versions don't work well together
# Always use compatible fixed versions of these two dependencies
"s3fs[boto3]==2025.3.2",
"sqlalchemy>=2.0.39,<3.0.0",
"aiosqlite>=0.20.0,<1.0.0",
"tiktoken>=0.8.0,<1.0.0",
"litellm>=1.71.0, <2.0.0",
"litellm>=1.76.0",
"instructor>=1.9.1,<2.0.0",
"langfuse>=2.32.0,<3",
"filetype>=1.2.0,<2.0.0",
"aiohttp>=3.11.14,<4.0.0",
"aiofiles>=23.2.1,<24.0.0",
"rdflib>=7.1.4,<7.2.0",
"pypdf>=4.1.0,<7.0.0",
"jinja2>=3.1.3,<4",
"matplotlib>=3.8.3,<4",
"networkx>=3.4.2,<4",
"lancedb>=0.24.0,<1.0.0",
"nbformat>=5.7.0,<6.0.0",
"alembic>=1.13.3,<2",
"pre-commit>=4.0.1,<5",
"scikit-learn>=1.6.1,<2",
"limits>=4.4.1,<5",
"fastapi>=0.115.7,<1.0.0",
"python-multipart>=0.0.20,<1.0.0",
"fastapi-users[sqlalchemy]>=14.0.1,<15.0.0",
"dlt[sqlalchemy]>=1.9.0,<2",
"sentry-sdk[fastapi]>=2.9.0,<3",
"structlog>=25.2.0,<26",
"baml-py (==0.206.0)",
"pympler>=1.1,<2.0.0",
"onnxruntime>=1.0.0,<2.0.0",
"pylance>=0.22.0,<1.0.0",
"onnxruntime<=1.22.1",
"pylance>=0.22.0,<=0.36.0",
"kuzu (==0.11.0)",
"python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows
"fastembed<=0.6.0",
"networkx>=3.4.2,<4",
"uvicorn>=0.34.0,<1.0.0",
"gunicorn>=20.1.0,<24",
"websockets>=15.0.1,<16.0.0"
@ -70,6 +59,7 @@ dependencies = [
[project.optional-dependencies]
api=[]
distributed = [
"modal>=1.0.5,<2.0.0",
]
@ -115,25 +105,28 @@ codegraph = [
evals = [
"plotly>=6.0.0,<7",
"gdown>=5.2.0,<6",
"pandas>=2.2.2,<3.0.0",
"matplotlib>=3.8.3,<4",
"scikit-learn>=1.6.1,<2",
]
gui = [
"pyside6>=6.8.3,<7",
"qasync>=0.27.1,<0.28",
]
graphiti = ["graphiti-core>=0.7.0,<0.8"]
# Note: New s3fs and boto3 versions don't work well together
# Always use compatible fixed versions of these two dependencies
aws = ["s3fs[boto3]==2025.3.2"]
dlt = ["dlt[sqlalchemy]>=1.9.0,<2"]
baml = ["baml-py (==0.206.0)"]
dev = [
"pytest>=7.4.0,<8",
"pytest-cov>=6.1.1,<7.0.0",
"pytest-asyncio>=0.21.1,<0.22",
"coverage>=7.3.2,<8",
"mypy>=1.7.1,<2",
"pre-commit>=4.0.1,<5",
"notebook>=7.1.0,<8",
"deptry>=0.20.0,<0.21",
"pylint>=3.0.3,<4",
"ruff>=0.9.2,<1.0.0",
"ruff>=0.9.2,<=0.13.1",
"tweepy>=4.14.0,<5.0.0",
"gitpython>=3.1.43,<4",
"mkdocs-material>=9.5.42,<10",
@ -142,11 +135,14 @@ dev = [
]
debug = ["debugpy>=1.8.9,<2.0.0"]
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
[project.urls]
Homepage = "https://www.cognee.ai"
Repository = "https://github.com/topoteretes/cognee"
[project.scripts]
cognee = "cognee.cli._cognee:main"
cognee-cli = "cognee.cli._cognee:main"
[build-system]

uv.lock generated

File diff suppressed because it is too large