From 2176ec16b8e440087f96410fed979528e8159ca2 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:03:36 +0100 Subject: [PATCH] chore: changes url for crawler tests (#1816) Updates the crawler test URL to avoid blocked and unavailable sites in CI. ## Description ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--- .../test_default_url_crawler.py | 2 +- .../web_url_crawler/test_tavily_crawler.py | 2 +- .../web_url_crawler/test_url_adding_e2e.py | 26 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py index f48c1cedc..af2595b14 100644 --- a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py @@ -5,7 +5,7 @@ from cognee.tasks.web_scraper import DefaultUrlCrawler @pytest.mark.asyncio async def test_fetch(): crawler = DefaultUrlCrawler() - url = "https://httpbin.org/html" + url = "http://example.com/" results = await crawler.fetch_urls(url) assert len(results) == 1 assert isinstance(results, dict) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 19ffdc4ea..5db9b58ce 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -11,7 +11,7 @@ skip_in_ci = pytest.mark.skipif( @skip_in_ci @pytest.mark.asyncio async def test_fetch(): - url = "https://httpbin.org/html" + url = "http://example.com/" results = await fetch_with_tavily(url) assert isinstance(results, dict) assert len(results) == 1 diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index cc8ae24d0..200f40a94 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -14,7 +14,7 @@ async def test_url_saves_as_html_file(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await 
save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -42,7 +42,7 @@ async def test_saved_html_is_valid(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) content = Path(file_path).read_text() @@ -68,7 +68,7 @@ async def test_add_url(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://httpbin.org/html") + await cognee.add("http://example.com/") skip_in_ci = pytest.mark.skipif( @@ -84,7 +84,7 @@ async def test_add_url_with_tavily(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://httpbin.org/html") + await cognee.add("http://example.com/") @pytest.mark.asyncio @@ -94,7 +94,7 @@ async def test_add_url_without_incremental_loading(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", incremental_loading=False, ) except Exception as e: @@ -108,7 +108,7 @@ async def test_add_url_with_incremental_loading(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", incremental_loading=True, ) except Exception as e: @@ -121,7 +121,7 @@ async def test_add_url_can_define_preferred_loader_as_list_of_str(): await cognee.prune.prune_system(metadata=True) await cognee.add( - "https://httpbin.org/html", + "http://example.com/", preferred_loaders=["beautiful_soup_loader"], ) @@ -140,7 +140,7 @@ async def test_add_url_with_extraction_rules(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) except Exception as e: @@ -159,7 +159,7 @@ async def test_loader_is_none_by_default(): } try: - 
original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -190,7 +190,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov } try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -217,7 +217,7 @@ async def test_beautiful_soup_loader_works_with_and_without_arguments(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -253,7 +253,7 @@ async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_pr await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -290,7 +290,7 @@ async def test_beautiful_soup_loads_file_successfully(): } try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") original_file = Path(file_path)