backport: Adds lance-namespace version fix to toml (fixes lancedb issue with 0.2.0 lance-namespace version) + crawler integration test url fix (#1842)

<!-- .github/pull_request_template.md -->

## Description
Implements a quick fix for the issue that the lance-namespace 0.0.21 to 0.2.0 release
causes with lancedb. This should be revisited later if it is fixed on
their side; for now we have pinned lance-namespace to the previous
version.


**If Lancedb fixes the issue on their side this can be closed**


Additionally, this cherry-picks the crawler integration test fixes from dev.

## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the
issue/feature**
- [ ] My code follows the project's coding standards and style
guidelines
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been
submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
Vasilije 2025-11-27 10:47:00 -08:00 committed by GitHub
commit 00b60aed6c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 5983 additions and 5167 deletions

View file

@@ -5,7 +5,7 @@ from cognee.tasks.web_scraper import DefaultUrlCrawler
@pytest.mark.asyncio
async def test_fetch():
crawler = DefaultUrlCrawler()
url = "https://en.wikipedia.org/wiki/Large_language_model"
url = "http://example.com/"
results = await crawler.fetch_urls(url)
assert len(results) == 1
assert isinstance(results, dict)

View file

@@ -11,7 +11,7 @@ skip_in_ci = pytest.mark.skipif(
@skip_in_ci
@pytest.mark.asyncio
async def test_fetch():
url = "https://en.wikipedia.org/wiki/Large_language_model"
url = "http://example.com/"
results = await fetch_with_tavily(url)
assert isinstance(results, dict)
assert len(results) == 1

View file

@@ -14,9 +14,7 @@ async def test_url_saves_as_html_file():
await cognee.prune.prune_system(metadata=True)
try:
original_file_path = await save_data_item_to_storage(
"https://en.wikipedia.org/wiki/Large_language_model"
)
original_file_path = await save_data_item_to_storage("http://example.com/")
file_path = get_data_file_path(original_file_path)
assert file_path.endswith(".html")
file = Path(file_path)
@@ -44,9 +42,7 @@ async def test_saved_html_is_valid():
await cognee.prune.prune_system(metadata=True)
try:
original_file_path = await save_data_item_to_storage(
"https://en.wikipedia.org/wiki/Large_language_model"
)
original_file_path = await save_data_item_to_storage("http://example.com/")
file_path = get_data_file_path(original_file_path)
content = Path(file_path).read_text()
@@ -72,7 +68,7 @@ async def test_add_url():
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
await cognee.add("http://example.com/")
skip_in_ci = pytest.mark.skipif(
@@ -88,7 +84,7 @@ async def test_add_url_with_tavily():
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
await cognee.add("https://en.wikipedia.org/wiki/Large_language_model")
await cognee.add("http://example.com/")
@pytest.mark.asyncio
@@ -98,7 +94,7 @@ async def test_add_url_without_incremental_loading():
try:
await cognee.add(
"https://en.wikipedia.org/wiki/Large_language_model",
"http://example.com/",
incremental_loading=False,
)
except Exception as e:
@@ -112,7 +108,7 @@ async def test_add_url_with_incremental_loading():
try:
await cognee.add(
"https://en.wikipedia.org/wiki/Large_language_model",
"http://example.com/",
incremental_loading=True,
)
except Exception as e:
@@ -125,7 +121,7 @@ async def test_add_url_can_define_preferred_loader_as_list_of_str():
await cognee.prune.prune_system(metadata=True)
await cognee.add(
"https://en.wikipedia.org/wiki/Large_language_model",
"http://example.com/",
preferred_loaders=["beautiful_soup_loader"],
)
@@ -144,7 +140,7 @@ async def test_add_url_with_extraction_rules():
try:
await cognee.add(
"https://en.wikipedia.org/wiki/Large_language_model",
"http://example.com/",
preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}},
)
except Exception as e:
@@ -163,9 +159,7 @@ async def test_loader_is_none_by_default():
}
try:
original_file_path = await save_data_item_to_storage(
"https://en.wikipedia.org/wiki/Large_language_model"
)
original_file_path = await save_data_item_to_storage("http://example.com/")
file_path = get_data_file_path(original_file_path)
assert file_path.endswith(".html")
file = Path(file_path)
@@ -196,9 +190,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov
}
try:
original_file_path = await save_data_item_to_storage(
"https://en.wikipedia.org/wiki/Large_language_model"
)
original_file_path = await save_data_item_to_storage("http://example.com/")
file_path = get_data_file_path(original_file_path)
assert file_path.endswith(".html")
file = Path(file_path)
@@ -225,9 +217,7 @@ async def test_beautiful_soup_loader_works_with_and_without_arguments():
await cognee.prune.prune_system(metadata=True)
try:
original_file_path = await save_data_item_to_storage(
"https://en.wikipedia.org/wiki/Large_language_model"
)
original_file_path = await save_data_item_to_storage("http://example.com/")
file_path = get_data_file_path(original_file_path)
assert file_path.endswith(".html")
file = Path(file_path)
@@ -263,9 +253,7 @@ async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_pr
await cognee.prune.prune_system(metadata=True)
try:
original_file_path = await save_data_item_to_storage(
"https://en.wikipedia.org/wiki/Large_language_model"
)
original_file_path = await save_data_item_to_storage("http://example.com/")
file_path = get_data_file_path(original_file_path)
assert file_path.endswith(".html")
file = Path(file_path)
@@ -302,9 +290,7 @@ async def test_beautiful_soup_loads_file_successfully():
}
try:
original_file_path = await save_data_item_to_storage(
"https://en.wikipedia.org/wiki/Large_language_model"
)
original_file_path = await save_data_item_to_storage("http://example.com/")
file_path = get_data_file_path(original_file_path)
assert file_path.endswith(".html")
original_file = Path(file_path)

7780
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@@ -1,7 +1,7 @@
[project]
name = "cognee"
version = "0.4.0"
version = "0.4.1"
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
authors = [
{ name = "Vasilije Markovic" },
@@ -37,7 +37,8 @@ dependencies = [
"rdflib>=7.1.4,<7.2.0",
"pypdf>=4.1.0,<7.0.0",
"jinja2>=3.1.3,<4",
"lancedb>=0.24.0,<1.0.0",
"lancedb>=0.24.0,<=0.25.3", # Quick fix for lancedb lance-namespace 0.2.0 issue
"lance-namespace<=0.0.21", # Quick fix for lancedb lance-namespace 0.2.0 issue
"nbformat>=5.7.0,<6.0.0",
"alembic>=1.13.3,<2",
"limits>=4.4.1,<5",

3321
uv.lock generated

File diff suppressed because it is too large Load diff