From 9395539868155cbb1402f1cd77a8f5f7c5f478cd Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 11:52:16 +0100 Subject: [PATCH 01/44] feat: interface for WebLoader --- .../loaders/external/WebLoader.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 cognee/infrastructure/loaders/external/WebLoader.py diff --git a/cognee/infrastructure/loaders/external/WebLoader.py b/cognee/infrastructure/loaders/external/WebLoader.py new file mode 100644 index 000000000..609ade2e0 --- /dev/null +++ b/cognee/infrastructure/loaders/external/WebLoader.py @@ -0,0 +1,61 @@ +from cognee.infrastructure.loaders import LoaderInterface +from typing import List + + +class WebLoader(LoaderInterface): + @property + def supported_extensions(self) -> List[str]: + """ + List of file extensions this loader supports. + + Returns: + List of extensions including the dot (e.g., ['.txt', '.md']) + """ + raise NotImplementedError + + @property + def supported_mime_types(self) -> List[str]: + """ + List of MIME types this loader supports. + + Returns: + List of MIME type strings (e.g., ['text/plain', 'application/pdf']) + """ + raise NotImplementedError + + @property + def loader_name(self) -> str: + """ + Unique name identifier for this loader. + + Returns: + String identifier used for registration and configuration + """ + raise NotImplementedError + + def can_handle(self, extension: str, mime_type: str) -> bool: + """ + Check if this loader can handle the given file. + + Args: + extension: File extension + mime_type: MIME type of the file + + Returns: + True if this loader can process the file, False otherwise + """ + raise NotImplementedError + + async def load(self, file_path: str, **kwargs): + """ + Load and process the file, returning standardized result. 
+ + Args: + file_path: Path to the file to be processed + file_stream: If file stream is provided it will be used to process file instead + **kwargs: Additional loader-specific configuration + + Raises: + Exception: If file cannot be processed + """ + raise NotImplementedError From 95106d5914a2fe47f93b67896e404e6a41f39430 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 12:16:37 +0100 Subject: [PATCH 02/44] fix: ensure web urls correctly go through ingest_data and reach loaders --- .../files/utils/get_data_file_path.py | 3 ++ .../loaders/external/WebLoader.py | 4 +-- .../tasks/ingestion/data_item_to_text_file.py | 5 +++ .../ingestion/save_data_item_to_storage.py | 35 +------------------ 4 files changed, 11 insertions(+), 36 deletions(-) diff --git a/cognee/infrastructure/files/utils/get_data_file_path.py b/cognee/infrastructure/files/utils/get_data_file_path.py index 7ffda79bd..242d130a9 100644 --- a/cognee/infrastructure/files/utils/get_data_file_path.py +++ b/cognee/infrastructure/files/utils/get_data_file_path.py @@ -38,6 +38,9 @@ def get_data_file_path(file_path: str): return normalized_url + elif file_path.startswith(("http://", "https://")): + return file_path + else: # Regular file path - normalize separators normalized_path = os.path.normpath(file_path) diff --git a/cognee/infrastructure/loaders/external/WebLoader.py b/cognee/infrastructure/loaders/external/WebLoader.py index 609ade2e0..db24c86e6 100644 --- a/cognee/infrastructure/loaders/external/WebLoader.py +++ b/cognee/infrastructure/loaders/external/WebLoader.py @@ -11,7 +11,7 @@ class WebLoader(LoaderInterface): Returns: List of extensions including the dot (e.g., ['.txt', '.md']) """ - raise NotImplementedError + return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality @property def supported_mime_types(self) -> List[str]: @@ -21,7 +21,7 @@ class WebLoader(LoaderInterface): Returns: List of MIME type strings 
(e.g., ['text/plain', 'application/pdf']) """ - raise NotImplementedError + return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality @property def loader_name(self) -> str: diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 9fcafca57..cd722bd76 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -75,5 +75,10 @@ async def data_item_to_text_file( else: raise IngestionError(message="Local files are not accepted.") + elif data_item_path.startswith(("http://", "https://")): + loader = get_loader_engine() + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) # data is not a supported type raise IngestionError(message=f"Data type not supported: {type(data_item_path)}") diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index b6e1f7d00..d9f1beae7 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -63,40 +63,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - # Validate URL by sending a HEAD request - try: - from cognee.context_global_variables import tavily_config, soup_crawler_config - from cognee.tasks.web_scraper import fetch_page_content - - tavily = tavily_config.get() - soup_crawler = soup_crawler_config.get() - preferred_tool = "beautifulsoup" if soup_crawler else "tavily" - if preferred_tool == "tavily" and tavily is None: - raise IngestionError( - message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." 
- ) - if preferred_tool == "beautifulsoup" and soup_crawler is None: - raise IngestionError( - message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." - ) - - data = await fetch_page_content( - data_item, - preferred_tool=preferred_tool, - tavily_config=tavily, - soup_crawler_config=soup_crawler, - ) - content = "" - for key, value in data.items(): - content += f"{key}:\n{value}\n\n" - return await save_data_to_file(content) - except IngestionError: - raise - except Exception as e: - raise IngestionError( - message=f"Error ingesting webpage results of url {data_item}: {str(e)}" - ) - + return data_item # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: From 305969c61b08bdaabb4ed554c57833f89b61a005 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 12:17:43 +0100 Subject: [PATCH 03/44] refactor web_url_loader filename --- .../loaders/external/{WebLoader.py => web_url_loader.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cognee/infrastructure/loaders/external/{WebLoader.py => web_url_loader.py} (100%) diff --git a/cognee/infrastructure/loaders/external/WebLoader.py b/cognee/infrastructure/loaders/external/web_url_loader.py similarity index 100% rename from cognee/infrastructure/loaders/external/WebLoader.py rename to cognee/infrastructure/loaders/external/web_url_loader.py From d884867d2c3953d92d55da5132f29c4893c0b176 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 12:40:07 +0100 Subject: [PATCH 04/44] extend LoaderInterface to support web_url_loader, implement `load()` --- cognee/infrastructure/loaders/LoaderEngine.py | 14 ++++-- .../infrastructure/loaders/LoaderInterface.py | 4 +- .../loaders/external/__init__.py | 8 ++++ .../loaders/external/web_url_loader.py | 47 +++++++++++++++++-- .../loaders/supported_loaders.py | 7 +++ 5 files changed, 71 insertions(+), 9 deletions(-) diff --git 
a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 6b62f7641..af6b53e93 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -64,7 +64,7 @@ class LoaderEngine: return True def get_loader( - self, file_path: str, preferred_loaders: List[str] = None + self, data_item_path: str, preferred_loaders: List[str] = None ) -> Optional[LoaderInterface]: """ Get appropriate loader for a file. @@ -77,20 +77,26 @@ class LoaderEngine: LoaderInterface that can handle the file, or None if not found """ - file_info = filetype.guess(file_path) + file_info = filetype.guess(data_item_path) # Try preferred loaders first if preferred_loaders: for loader_name in preferred_loaders: if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): + if loader.can_handle( + extension=file_info.extension, + mime_type=file_info.mime, + data_item_path=data_item_path, + ): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time return loader else: logger.info(f"Skipping {loader_name}: Preferred Loader not registered") # Try default priority order - for loader_name in self.default_loader_priority: + for loader_name in ( + self.default_loader_priority + ): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review if loader_name in self._loaders: loader = self._loaders[loader_name] if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): diff --git a/cognee/infrastructure/loaders/LoaderInterface.py b/cognee/infrastructure/loaders/LoaderInterface.py index 3a1c9bf3e..fb309304b 100644 --- a/cognee/infrastructure/loaders/LoaderInterface.py +++ b/cognee/infrastructure/loaders/LoaderInterface.py @@ -44,7 +44,9 @@ class LoaderInterface(ABC): pass @abstractmethod - def can_handle(self, 
extension: str, mime_type: str) -> bool: + def can_handle( + self, extension: str, mime_type: str, data_item_path: str = None + ) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py """ Check if this loader can handle the given file. diff --git a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index 6bf9f9200..b92d9e7f0 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -10,6 +10,7 @@ These loaders are optional and only available if their dependencies are installe """ from .pypdf_loader import PyPdfLoader +from .web_url_loader import WebUrlLoader __all__ = ["PyPdfLoader"] @@ -27,3 +28,10 @@ try: __all__.append("AdvancedPdfLoader") except ImportError: pass + +try: + from .web_url_loader import WebUrlLoader + + __all__.append("WebUrlLoader") +except ImportError: + pass diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index db24c86e6..4d519d443 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -1,8 +1,11 @@ from cognee.infrastructure.loaders import LoaderInterface from typing import List +from cognee.modules.ingestion.exceptions.exceptions import IngestionError +from cognee.modules.ingestion import save_data_to_file -class WebLoader(LoaderInterface): + +class WebUrlLoader(LoaderInterface): @property def supported_extensions(self) -> List[str]: """ @@ -31,9 +34,9 @@ class WebLoader(LoaderInterface): Returns: String identifier used for registration and configuration """ - raise NotImplementedError + return "web_url_loader" - def can_handle(self, extension: str, mime_type: str) -> bool: + def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool: """ Check if this 
loader can handle the given file. @@ -44,7 +47,9 @@ class WebLoader(LoaderInterface): Returns: True if this loader can process the file, False otherwise """ - raise NotImplementedError + if data_item_path is None: + raise # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py + return data_item_path.startswith(("http://", "https://")) async def load(self, file_path: str, **kwargs): """ @@ -58,4 +63,38 @@ class WebLoader(LoaderInterface): Raises: Exception: If file cannot be processed """ + try: + from cognee.context_global_variables import tavily_config, soup_crawler_config + from cognee.tasks.web_scraper import fetch_page_content + + tavily = tavily_config.get() + soup_crawler = soup_crawler_config.get() + preferred_tool = "beautifulsoup" if soup_crawler else "tavily" + if preferred_tool == "tavily" and tavily is None: + raise IngestionError( + message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." + ) + if preferred_tool == "beautifulsoup" and soup_crawler is None: + raise IngestionError( + message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." 
+ ) + + data = await fetch_page_content( + file_path, + preferred_tool=preferred_tool, + tavily_config=tavily, + soup_crawler_config=soup_crawler, + ) + content = "" + for key, value in data.items(): + content += f"{key}:\n{value}\n\n" + await save_data_to_file(content) + + return content + except IngestionError: + raise + except Exception as e: + raise IngestionError( + message=f"Error ingesting webpage results of url {file_path}: {str(e)}" + ) raise NotImplementedError diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py index d103babe3..7f92aa36a 100644 --- a/cognee/infrastructure/loaders/supported_loaders.py +++ b/cognee/infrastructure/loaders/supported_loaders.py @@ -23,3 +23,10 @@ try: supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader except ImportError: pass + +try: + from cognee.infrastructure.loaders.external import WebUrlLoader + + supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader +except ImportError: + pass From 185600fe177783e5816a98798746dd1d658a82dc Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 13:58:02 +0100 Subject: [PATCH 05/44] revert url_crawler changes to `cognee.add()`, and update `web_url_loader.load()` --- cognee/api/v1/add/add.py | 39 ++----------------- .../loaders/external/web_url_loader.py | 36 +++++++++++------ .../tasks/ingestion/data_item_to_text_file.py | 13 +++++-- cognee/tasks/ingestion/ingest_data.py | 6 ++- 4 files changed, 42 insertions(+), 52 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 0f14683f9..3c4d7b696 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -3,6 +3,7 @@ import os from typing import Union, BinaryIO, List, Optional, Dict, Any from pydantic import BaseModel from urllib.parse import urlparse +from cognee.infrastructure.loaders import LoaderInterface from cognee.modules.users.models import User from cognee.modules.pipelines import Task, 
run_pipeline from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import ( @@ -17,16 +18,6 @@ from cognee.shared.logging_utils import get_logger logger = get_logger() -try: - from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig - from cognee.context_global_variables import ( - tavily_config as tavily, - soup_crawler_config as soup_crawler, - ) -except ImportError: - logger.debug(f"Unable to import {str(ImportError)}") - pass - async def add( data: Union[BinaryIO, list[BinaryIO], str, list[str]], @@ -38,10 +29,8 @@ async def add( dataset_id: Optional[UUID] = None, preferred_loaders: List[str] = None, incremental_loading: bool = True, - extraction_rules: Optional[Dict[str, Any]] = None, - tavily_config: Optional[BaseModel] = None, - soup_crawler_config: Optional[BaseModel] = None, data_per_batch: Optional[int] = 20, + loaders_config: dict[LoaderInterface, dict] = {}, ): """ Add data to Cognee for knowledge graph processing. @@ -180,29 +169,6 @@ async def add( - TAVILY_API_KEY: YOUR_TAVILY_API_KEY """ - - try: - if not soup_crawler_config and extraction_rules: - soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules) - if not tavily_config and os.getenv("TAVILY_API_KEY"): - tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY")) - - soup_crawler.set(soup_crawler_config) - tavily.set(tavily_config) - - http_schemes = {"http", "https"} - - def _is_http_url(item: Union[str, BinaryIO]) -> bool: - return isinstance(item, str) and urlparse(item).scheme in http_schemes - - if _is_http_url(data): - node_set = ["web_content"] if not node_set else node_set + ["web_content"] - elif isinstance(data, list) and any(_is_http_url(item) for item in data): - node_set = ["web_content"] if not node_set else node_set + ["web_content"] - except NameError: - logger.debug(f"Unable to import {str(ImportError)}") - pass - tasks = [ Task(resolve_data_directories, include_subdirectories=True), Task( @@ -212,6 +178,7 @@ async def 
add( node_set, dataset_id, preferred_loaders, + loaders_config, ), ] diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 4d519d443..5e0cf07f1 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -48,7 +48,9 @@ class WebUrlLoader(LoaderInterface): True if this loader can process the file, False otherwise """ if data_item_path is None: - raise # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py + raise IngestionError( + "data_item_path should not be None" + ) # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py return data_item_path.startswith(("http://", "https://")) async def load(self, file_path: str, **kwargs): @@ -63,18 +65,31 @@ class WebUrlLoader(LoaderInterface): Raises: Exception: If file cannot be processed """ + loaders_config = kwargs.get("loaders_config") + if not isinstance(loaders_config, dict): + raise IngestionError("loaders_config must be a valid dictionary") + + web_url_loader_config = loaders_config.get(self.loader_name) + if not isinstance(web_url_loader_config, dict): + raise IngestionError(f"{self.loader_name} configuration must be a valid dictionary") + try: from cognee.context_global_variables import tavily_config, soup_crawler_config from cognee.tasks.web_scraper import fetch_page_content - tavily = tavily_config.get() - soup_crawler = soup_crawler_config.get() - preferred_tool = "beautifulsoup" if soup_crawler else "tavily" - if preferred_tool == "tavily" and tavily is None: + _tavily_config = web_url_loader_config.get("tavily_config") + _soup_config = web_url_loader_config.get("soup_config") + + # Set global configs for downstream access + tavily_config.set(_tavily_config) + soup_crawler_config.set(_soup_config) + + 
preferred_tool = "beautifulsoup" if _soup_config else "tavily" + if preferred_tool == "tavily" and _tavily_config is None: raise IngestionError( message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." ) - if preferred_tool == "beautifulsoup" and soup_crawler is None: + if preferred_tool == "beautifulsoup" and _soup_config is None: raise IngestionError( message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." ) @@ -82,8 +97,8 @@ class WebUrlLoader(LoaderInterface): data = await fetch_page_content( file_path, preferred_tool=preferred_tool, - tavily_config=tavily, - soup_crawler_config=soup_crawler, + tavily_config=_tavily_config, + soup_crawler_config=_soup_config, ) content = "" for key, value in data.items(): @@ -94,7 +109,4 @@ class WebUrlLoader(LoaderInterface): except IngestionError: raise except Exception as e: - raise IngestionError( - message=f"Error ingesting webpage results of url {file_path}: {str(e)}" - ) - raise NotImplementedError + raise IngestionError(message=f"Error ingesting webpage from URL {file_path}: {str(e)}") diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index cd722bd76..91d09059a 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -34,7 +34,9 @@ async def pull_from_s3(file_path, destination_file) -> None: async def data_item_to_text_file( - data_item_path: str, preferred_loaders: List[str] + data_item_path: str, + preferred_loaders: List[str], + loaders_config: dict[LoaderInterface, dict], ) -> Tuple[str, LoaderInterface]: if isinstance(data_item_path, str): parsed_url = urlparse(data_item_path) @@ -77,8 +79,13 @@ async def data_item_to_text_file( elif data_item_path.startswith(("http://", "https://")): loader = get_loader_engine() - return await loader.load_file(data_item_path, preferred_loaders), 
loader.get_loader( - data_item_path, preferred_loaders + return ( + await loader.load_file( + data_item_path, + preferred_loaders, + loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal + ), + loader.get_loader(data_item_path, preferred_loaders), ) # data is not a supported type raise IngestionError(message=f"Data type not supported: {type(data_item_path)}") diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 3c20a2b13..3fb161181 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -3,6 +3,7 @@ import inspect from uuid import UUID from typing import Union, BinaryIO, Any, List, Optional +from cognee.infrastructure.loaders import LoaderInterface import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data @@ -28,6 +29,7 @@ async def ingest_data( node_set: Optional[List[str]] = None, dataset_id: UUID = None, preferred_loaders: List[str] = None, + loaders_config: dict[LoaderInterface, dict] = {}, ): if not user: user = await get_default_user() @@ -85,7 +87,9 @@ async def ingest_data( # Store all input data as text files in Cognee data storage cognee_storage_file_path, loader_engine = await data_item_to_text_file( - actual_file_path, preferred_loaders + actual_file_path, + preferred_loaders, + loaders_config, ) # Find metadata from original file From 9a9f9f6836859db629024ffedce34cefba8700a9 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 13:58:38 +0100 Subject: [PATCH 06/44] tests: add some tests to assert behaviour is as expected --- .../integration/web_url_crawler/test_add.py | 26 +++++++++++++++++++ .../web_url_crawler/test_loader_engine.py | 20 ++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 cognee/tests/integration/web_url_crawler/test_add.py create mode 100644 
cognee/tests/integration/web_url_crawler/test_loader_engine.py diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py new file mode 100644 index 000000000..b9840df3d --- /dev/null +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -0,0 +1,26 @@ +import pytest +import cognee + + +@pytest.mark.asyncio +async def test_add_fails_when_preferred_loader_not_specified(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + with pytest.raises: + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + ) + + +@pytest.mark.asyncio +async def test_add_succesfully_adds_url_when_preferred_loader_specified(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + try: + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + ) + except Exception as e: + pytest.fail(f"Failed to add url: {e}") diff --git a/cognee/tests/integration/web_url_crawler/test_loader_engine.py b/cognee/tests/integration/web_url_crawler/test_loader_engine.py new file mode 100644 index 000000000..018c034e1 --- /dev/null +++ b/cognee/tests/integration/web_url_crawler/test_loader_engine.py @@ -0,0 +1,20 @@ +import pytest + +from cognee.infrastructure.loaders import get_loader_engine +from cognee.infrastructure.loaders.external.web_url_loader import WebUrlLoader + + +def test_get_loader_returns_none_by_default_for_web_urls(): + loader_engine = get_loader_engine() + urls = ["https://cognee.ai", "http://cognee.ai"] + for url in urls: + loader = loader_engine.get_loader(url) + assert loader is None + + +def test_get_loader_returns_valid_loader_when_preferred_loaders_specified(): + loader_engine = get_loader_engine() + urls = ["https://cognee.ai", "http://cognee.ai"] + for url in urls: + loader = loader_engine.get_loader(url, preferred_loaders=["web_url_loader"]) 
+ assert isinstance(loader, WebUrlLoader) From 36364285b27d2305c4e88ed6d323a6e934d6c59d Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 15:53:49 +0100 Subject: [PATCH 07/44] tests: fix failing tests --- cognee/infrastructure/loaders/LoaderEngine.py | 15 +++++++++---- .../loaders/external/web_url_loader.py | 2 +- .../tasks/ingestion/data_item_to_text_file.py | 21 ++++++++++--------- .../integration/web_url_crawler/test_add.py | 16 ++++++++++++-- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index af6b53e93..d6c4d4d8c 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -76,8 +76,15 @@ class LoaderEngine: Returns: LoaderInterface that can handle the file, or None if not found """ + is_url = data_item_path.startswith(("http://", "https://")) - file_info = filetype.guess(data_item_path) + if is_url: + extension = None + mime_type = None + else: + file_info = filetype.guess(data_item_path) + extension = file_info.extension if file_info else None + mime_type = file_info.mime if file_info else None # Try preferred loaders first if preferred_loaders: @@ -85,8 +92,8 @@ class LoaderEngine: if loader_name in self._loaders: loader = self._loaders[loader_name] if loader.can_handle( - extension=file_info.extension, - mime_type=file_info.mime, + extension=extension, + mime_type=mime_type, data_item_path=data_item_path, ): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time return loader @@ -99,7 +106,7 @@ class LoaderEngine: ): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): + if loader.can_handle(extension=extension, 
mime_type=mime_type): return loader else: logger.info( diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 5e0cf07f1..38bca2523 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -1,4 +1,4 @@ -from cognee.infrastructure.loaders import LoaderInterface +from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 91d09059a..f82d9a0dc 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -50,17 +50,17 @@ async def data_item_to_text_file( await pull_from_s3(data_item_path, temp_file) temp_file.flush() # Data needs to be saved to local storage loader = get_loader_engine() - return await loader.load_file(temp_file.name, preferred_loaders), loader.get_loader( - temp_file.name, preferred_loaders - ) + return await loader.load_file( + temp_file.name, None, preferred_loaders + ), loader.get_loader(temp_file.name, preferred_loaders) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( - data_item_path, preferred_loaders - ) + return await loader.load_file( + data_item_path, None, preferred_loaders + ), loader.get_loader(data_item_path, preferred_loaders) else: raise IngestionError(message="Local files are not accepted.") @@ -71,9 +71,9 @@ async def data_item_to_text_file( # Handle both Unix absolute paths (/path) and Windows absolute paths (C:\path) if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file(data_item_path, 
preferred_loaders), loader.get_loader( - data_item_path, preferred_loaders - ) + return await loader.load_file( + data_item_path, None, preferred_loaders + ), loader.get_loader(data_item_path, preferred_loaders) else: raise IngestionError(message="Local files are not accepted.") @@ -82,8 +82,9 @@ async def data_item_to_text_file( return ( await loader.load_file( data_item_path, + None, preferred_loaders, - loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal + loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal ), loader.get_loader(data_item_path, preferred_loaders), ) diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index b9840df3d..0c4332c6d 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -6,10 +6,10 @@ import cognee async def test_add_fails_when_preferred_loader_not_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - with pytest.raises: + with pytest.raises(ValueError): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], + incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. 
Will fix ) @@ -17,10 +17,22 @@ async def test_add_fails_when_preferred_loader_not_specified(): async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) + + loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + } + } + } + try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", preferred_loaders=["web_url_loader"], + incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix + loaders_config=loaders_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") From 572c8ebce745d4cd675eb522c6e500cf04102591 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 18:26:50 +0100 Subject: [PATCH 08/44] refactor: use pydantic models for tavily and beautifulsoup configs instead of dicts --- .../loaders/external/web_url_loader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 38bca2523..f9fce47a9 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -3,6 +3,7 @@ from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError from cognee.modules.ingestion import save_data_to_file +from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig class WebUrlLoader(LoaderInterface): @@ -77,8 +78,11 @@ class WebUrlLoader(LoaderInterface): from cognee.context_global_variables import tavily_config, soup_crawler_config from cognee.tasks.web_scraper import fetch_page_content - _tavily_config = web_url_loader_config.get("tavily_config") - _soup_config = web_url_loader_config.get("soup_config") + tavily_dict = web_url_loader_config.get("tavily_config") + 
_tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None + + soup_dict = web_url_loader_config.get("soup_config") + _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None # Set global configs for downstream access tavily_config.set(_tavily_config) @@ -109,4 +113,6 @@ class WebUrlLoader(LoaderInterface): except IngestionError: raise except Exception as e: - raise IngestionError(message=f"Error ingesting webpage from URL {file_path}: {str(e)}") + raise IngestionError( + message=f"Error ingesting webpage from URL {file_path}: {str(e)}" + ) from e From c0d450b165a0d19ff28bc3e7be7ef30c66926795 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 18:27:03 +0100 Subject: [PATCH 09/44] tests: fix test_add - add missing required parameter --- cognee/tests/integration/web_url_crawler/test_add.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index 0c4332c6d..e0dda94a9 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -18,11 +18,19 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) + extraction_rules = { + "title": {"selector": "title", "attr": "text"}, + "headings": {"selector": "h1, h2, h3", "attr": "text", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "attr": "text", "all": True}, + } + loaders_config = { "web_url_loader": { "soup_config": { "max_depth": 1, "follow_links": False, + "extraction_rules": extraction_rules, } } } From 2e7ff0b01ba3e25e4c50b4ea9bb02fdd3adf9c8d Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Thu, 16 Oct 2025 18:28:51 +0100 Subject: [PATCH 10/44] remove reduntant HtmlContent class in save_data_item_to_storage --- 
cognee/tasks/ingestion/save_data_item_to_storage.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index d9f1beae7..5761b19ba 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -18,13 +18,6 @@ class SaveDataSettings(BaseSettings): model_config = SettingsConfigDict(env_file=".env", extra="allow") -class HTMLContent(str): - def __new__(cls, value: str): - if not ("<" in value and ">" in value): - raise ValueError("Not valid HTML-like content") - return super().__new__(cls, value) - - settings = SaveDataSettings() From d0f3e224cb07c958658082cd0dfaf6012c82b397 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 11:12:09 +0100 Subject: [PATCH 11/44] refactor ingest_data to accomodate non-FS data items --- cognee/infrastructure/files/exceptions.py | 12 +++++ .../files/utils/get_data_file_path.py | 6 ++- cognee/tasks/ingestion/ingest_data.py | 45 ++++++++++++------- .../ingestion/save_data_item_to_storage.py | 5 ++- 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/cognee/infrastructure/files/exceptions.py b/cognee/infrastructure/files/exceptions.py index 351eaee9c..eb6efdbce 100644 --- a/cognee/infrastructure/files/exceptions.py +++ b/cognee/infrastructure/files/exceptions.py @@ -11,3 +11,15 @@ class FileContentHashingError(Exception): status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, ): super().__init__(message, name, status_code) + + +class UnsupportedPathSchemeError(Exception): + """Raised when a non-filesystem path scheme (like http://, https://) is passed to a function expecting filesystem paths.""" + + def __init__( + self, + message: str = "This function only supports filesystem paths (file:// or local paths), not HTTP/HTTPS URLs.", + name: str = "UnsupportedPathSchemeError", + status_code=status.HTTP_400_BAD_REQUEST, + ): + 
super().__init__(message, name, status_code) diff --git a/cognee/infrastructure/files/utils/get_data_file_path.py b/cognee/infrastructure/files/utils/get_data_file_path.py index 242d130a9..d67fc95a0 100644 --- a/cognee/infrastructure/files/utils/get_data_file_path.py +++ b/cognee/infrastructure/files/utils/get_data_file_path.py @@ -1,6 +1,8 @@ import os from urllib.parse import urlparse +from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError + def get_data_file_path(file_path: str): # Check if this is a file URI BEFORE normalizing (which corrupts URIs) @@ -39,7 +41,9 @@ def get_data_file_path(file_path: str): return normalized_url elif file_path.startswith(("http://", "https://")): - return file_path + raise UnsupportedPathSchemeError( + message=f"HTTP/HTTPS URLs are not supported by get_data_file_path(). Received: {file_path}" + ) else: # Regular file path - normalize separators diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 3fb161181..b742e474e 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -3,6 +3,7 @@ import inspect from uuid import UUID from typing import Union, BinaryIO, Any, List, Optional +from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError from cognee.infrastructure.loaders import LoaderInterface import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine @@ -79,11 +80,16 @@ async def ingest_data( dataset_data_map = {str(data.id): True for data in dataset_data} for data_item in data: - # Get file path of data item or create a file it doesn't exist - original_file_path = await save_data_item_to_storage(data_item) - - # Transform file path to be OS usable - actual_file_path = get_data_file_path(original_file_path) + try: + # Get file path of data item or create a file if it doesn't exist + original_file_path = await 
save_data_item_to_storage(data_item) + # Transform file path to be OS usable + actual_file_path = get_data_file_path(original_file_path) + except UnsupportedPathSchemeError: + # This data_item (e.g., HTTP/HTTPS URL) should be passed directly to the loader + # skip save_data_item_to_storage and get_data_file_path + actual_file_path = data_item + original_file_path = None # we don't have an original file path # Store all input data as text files in Cognee data storage cognee_storage_file_path, loader_engine = await data_item_to_text_file( @@ -93,17 +99,26 @@ async def ingest_data( ) # Find metadata from original file - async with open_data_file(original_file_path) as file: - classified_data = ingestion.classify(file) + if original_file_path is not None: + # Standard flow: extract metadata from both original and stored files + async with open_data_file(original_file_path) as file: + classified_data = ingestion.classify(file) + data_id = ingestion.identify(classified_data, user) + original_file_metadata = classified_data.get_metadata() - # data_id is the hash of original file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) - original_file_metadata = classified_data.get_metadata() - - # Find metadata from Cognee data storage text file - async with open_data_file(cognee_storage_file_path) as file: - classified_data = ingestion.classify(file) - storage_file_metadata = classified_data.get_metadata() + async with open_data_file(cognee_storage_file_path) as file: + classified_data = ingestion.classify(file) + storage_file_metadata = classified_data.get_metadata() + else: + # Alternative flow (e.g., URLs): extract metadata once from stored file + async with open_data_file(cognee_storage_file_path) as file: + classified_data = ingestion.classify(file) + data_id = ingestion.identify(classified_data, user) + original_file_metadata = classified_data.get_metadata() + # Override file_path to be the actual data_item (e.g., URL) ? 
+ # original_file_metadata["file_path"] = actual_file_path + # Storage metadata is the same as original + # storage_file_metadata = original_file_metadata.copy() from sqlalchemy import select diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index 5761b19ba..cf32477cb 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -3,6 +3,7 @@ from pathlib import Path from urllib.parse import urlparse from typing import Union, BinaryIO, Any +from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError from cognee.modules.ingestion.exceptions import IngestionError from cognee.modules.ingestion import save_data_to_file from cognee.shared.logging_utils import get_logger @@ -56,7 +57,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - return data_item + raise UnsupportedPathSchemeError( + message=f"HTTP/HTTPS URLs should be handled by loader, not by save_data_item_to_storage. 
Received: {data_item}" + ) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: From 9b802f651bc642f318ecec07af5fb3e1f46a5146 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 12:34:50 +0100 Subject: [PATCH 12/44] fix: web_url_loader load_data should yield stored_path --- cognee/infrastructure/loaders/external/web_url_loader.py | 6 ++++-- cognee/tasks/ingestion/ingest_data.py | 4 ++-- cognee/tests/integration/web_url_crawler/test_add.py | 3 +++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index f9fce47a9..491428c82 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -63,6 +63,8 @@ class WebUrlLoader(LoaderInterface): file_stream: If file stream is provided it will be used to process file instead **kwargs: Additional loader-specific configuration + Returns: + file path to the stored file Raises: Exception: If file cannot be processed """ @@ -107,9 +109,9 @@ class WebUrlLoader(LoaderInterface): content = "" for key, value in data.items(): content += f"{key}:\n{value}\n\n" - await save_data_to_file(content) + stored_path = await save_data_to_file(content) - return content + return stored_path except IngestionError: raise except Exception as e: diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index b742e474e..233bb5f1c 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -116,9 +116,9 @@ async def ingest_data( data_id = ingestion.identify(classified_data, user) original_file_metadata = classified_data.get_metadata() # Override file_path to be the actual data_item (e.g., URL) ? 
- # original_file_metadata["file_path"] = actual_file_path + original_file_metadata["file_path"] = actual_file_path # Storage metadata is the same as original - # storage_file_metadata = original_file_metadata.copy() + storage_file_metadata = original_file_metadata.copy() from sqlalchemy import select diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index e0dda94a9..2a75b5054 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -4,6 +4,9 @@ import cognee @pytest.mark.asyncio async def test_add_fails_when_preferred_loader_not_specified(): + from cognee.shared.logging_utils import setup_logging, ERROR + + setup_logging(log_level=ERROR) await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) with pytest.raises(ValueError): From b9877f9e876de87e1cea051b764669922290e87e Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 13:16:30 +0100 Subject: [PATCH 13/44] create web_url_loader_example.py --- examples/python/web_url_loader_example.py | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 examples/python/web_url_loader_example.py diff --git a/examples/python/web_url_loader_example.py b/examples/python/web_url_loader_example.py new file mode 100644 index 000000000..7845d4001 --- /dev/null +++ b/examples/python/web_url_loader_example.py @@ -0,0 +1,46 @@ +import asyncio + +import cognee +from cognee.shared.logging_utils import setup_logging, ERROR + + +async def main(): + await cognee.prune.prune_data() + print("Data pruned.") + + await cognee.prune.prune_system(metadata=True) + + extraction_rules = { + "title": {"selector": "title", "attr": "text"}, + "headings": {"selector": "h1, h2, h3", "attr": "text", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "attr": "text", "all": True}, + } + + 
loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + "extraction_rules": extraction_rules, + } + } + } + + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix + loaders_config=loaders_config, + ) + + await cognee.cognify() + print("Knowledge graph created.") + + await cognee.visualize_graph() + print("Data visualized") + + +if __name__ == "__main__": + logger = setup_logging(log_level=ERROR) + asyncio.run(main()) From b5190c90f1efc8256ae7405728e766c06b97e963 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 16:51:36 +0100 Subject: [PATCH 14/44] add logging for crawling status; add cap to the crawl_delay from robots.txt - Not advising to use the cap, but giving an option to be able to configure it --- .../loaders/external/web_url_loader.py | 13 ++++ cognee/tasks/web_scraper/bs4_crawler.py | 66 ++++++++++++++++++- cognee/tasks/web_scraper/config.py | 3 + cognee/tasks/web_scraper/utils.py | 41 ++++++++++-- 4 files changed, 116 insertions(+), 7 deletions(-) diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 491428c82..1ecf82171 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -4,6 +4,9 @@ from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError from cognee.modules.ingestion import save_data_to_file from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig +from cognee.shared.logging_utils import get_logger + +logger = get_logger() class WebUrlLoader(LoaderInterface): @@ -100,16 +103,26 @@ class WebUrlLoader(LoaderInterface): message="SoupCrawlerConfig must be set on the ingestion context when using 
the BeautifulSoup scraper." ) + logger.info(f"Starting web URL crawling for: {file_path}") + logger.info(f"Using scraping tool: {preferred_tool}") + data = await fetch_page_content( file_path, preferred_tool=preferred_tool, tavily_config=_tavily_config, soup_crawler_config=_soup_config, ) + + logger.info(f"Successfully fetched content from {len(data)} URL(s)") + logger.info("Processing and concatenating fetched content") + content = "" for key, value in data.items(): content += f"{key}:\n{value}\n\n" + + logger.info(f"Saving content to file (total size: {len(content)} characters)") stored_path = await save_data_to_file(content) + logger.info(f"Successfully saved content to: {stored_path}") return stored_path except IngestionError: diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py index 0fbff4808..400287e08 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/tasks/web_scraper/bs4_crawler.py @@ -75,6 +75,7 @@ class BeautifulSoupCrawler: Attributes: concurrency: Number of concurrent requests allowed. crawl_delay: Minimum seconds between requests to the same domain. + max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). timeout: Per-request timeout in seconds. max_retries: Number of retries for failed requests. retry_delay_factor: Multiplier for exponential backoff on retries. @@ -87,6 +88,7 @@ class BeautifulSoupCrawler: *, concurrency: int = 5, crawl_delay: float = 0.5, + max_crawl_delay: Optional[float] = 10.0, timeout: float = 15.0, max_retries: int = 2, retry_delay_factor: float = 0.5, @@ -98,6 +100,7 @@ class BeautifulSoupCrawler: Args: concurrency: Number of concurrent requests allowed. crawl_delay: Minimum seconds between requests to the same domain. + max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). timeout: Per-request timeout in seconds. max_retries: Number of retries for failed requests. 
retry_delay_factor: Multiplier for exponential backoff on retries. @@ -107,6 +110,7 @@ class BeautifulSoupCrawler: self.concurrency = concurrency self._sem = asyncio.Semaphore(concurrency) self.crawl_delay = crawl_delay + self.max_crawl_delay = max_crawl_delay self.timeout = timeout self.max_retries = max_retries self.retry_delay_factor = retry_delay_factor @@ -183,7 +187,11 @@ class BeautifulSoupCrawler: elapsed = time.time() - last wait_for = delay - elapsed if wait_for > 0: + logger.info( + f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)" + ) await asyncio.sleep(wait_for) + logger.info(f"Rate limit wait completed for {url}") self._last_request_time_per_domain[domain] = time.time() async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]: @@ -236,7 +244,16 @@ class BeautifulSoupCrawler: crawl_delay = self.crawl_delay if protego: delay = protego.crawl_delay(agent) or protego.crawl_delay("*") - crawl_delay = delay if delay else self.crawl_delay + if delay: + # Apply max_crawl_delay cap if configured + if self.max_crawl_delay is not None and delay > self.max_crawl_delay: + logger.warning( + f"robots.txt specifies crawl_delay={delay}s for {domain_root}, " + f"capping to max_crawl_delay={self.max_crawl_delay}s" + ) + crawl_delay = self.max_crawl_delay + else: + crawl_delay = delay cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay) self._robots_cache[domain_root] = cache_entry @@ -307,12 +324,16 @@ class BeautifulSoupCrawler: attempt = 0 crawl_delay = await self._get_crawl_delay(url) + logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}") while True: try: await self._respect_rate_limit(url, crawl_delay) resp = await self._client.get(url) resp.raise_for_status() + logger.info( + f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)" + ) return resp.text except Exception as exc: attempt += 1 @@ -347,22 +368,35 @@ class 
BeautifulSoupCrawler: raise RuntimeError( "Playwright is not installed. Install with `pip install playwright` and run `playwright install`." ) + + timeout_val = timeout or self.timeout + logger.info( + f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}" + ) + attempt = 0 while True: try: async with async_playwright() as p: + logger.info(f"Launching headless Chromium browser for {url}") browser = await p.chromium.launch(headless=True) try: context = await browser.new_context() page = await context.new_page() + logger.info(f"Navigating to {url} and waiting for network idle") await page.goto( url, wait_until="networkidle", - timeout=int((timeout or self.timeout) * 1000), + timeout=int(timeout_val * 1000), ) if js_wait: + logger.info(f"Waiting {js_wait}s for JavaScript to execute") await asyncio.sleep(js_wait) - return await page.content() + content = await page.content() + logger.info( + f"Successfully rendered {url} with Playwright (size={len(content)} bytes)" + ) + return content finally: await browser.close() except Exception as exc: @@ -498,6 +532,10 @@ class BeautifulSoupCrawler: else: raise ValueError(f"Invalid urls type: {type(urls)}") + logger.info( + f"Preparing to fetch {len(url_rules_map)} URL(s) with {len(extraction_rules) if extraction_rules else 0} extraction rule(s)" + ) + normalized_url_rules: Dict[str, List[ExtractionRule]] = {} for url, rules in url_rules_map.items(): normalized_rules = [] @@ -508,21 +546,36 @@ class BeautifulSoupCrawler: normalized_rules.append(r) normalized_url_rules[url] = normalized_rules + logger.info(f"Normalized extraction rules for {len(normalized_url_rules)} URL(s)") + async def _task(url: str): async with self._sem: try: + logger.info(f"Processing URL: {url}") + + # Check robots.txt allowed = await self._is_url_allowed(url) if not allowed: logger.warning(f"URL disallowed by robots.txt: {url}") return url, "" + logger.info(f"Robots.txt check passed for {url}") + + # Fetch HTML if 
use_playwright: + logger.info( + f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)" + ) html = await self._render_with_playwright( url, js_wait=playwright_js_wait, timeout=self.timeout ) else: + logger.info(f"Fetching {url} with httpx") html = await self._fetch_httpx(url) + logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") + + # Extract content pieces = [] for rule in normalized_url_rules[url]: text = self._extract_with_bs4(html, rule) @@ -530,17 +583,24 @@ class BeautifulSoupCrawler: pieces.append(text) concatenated = " ".join(pieces).strip() + logger.info(f"Extracted {len(concatenated)} characters from {url}") return url, concatenated except Exception as e: logger.error(f"Error processing {url}: {e}") return url, "" + logger.info(f"Creating {len(url_rules_map)} async tasks for concurrent fetching") tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()] results = {} + completed = 0 + total = len(tasks) for coro in asyncio.as_completed(tasks): url, text = await coro results[url] = text + completed += 1 + logger.info(f"Progress: {completed}/{total} URLs processed") + logger.info(f"Completed fetching all {len(results)} URL(s)") return results diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index 2ee43ed32..ac470daa9 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -13,6 +13,9 @@ class TavilyConfig(BaseModel): class SoupCrawlerConfig(BaseModel): concurrency: int = 5 crawl_delay: float = 0.5 + max_crawl_delay: Optional[float] = ( + 10.0 # Maximum crawl delay to respect from robots.txt (None = no limit) + ) timeout: float = 15.0 max_retries: int = 2 retry_delay_factor: float = 0.5 diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 6d094f423..a32b6848c 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -45,9 +45,13 @@ async def fetch_page_content( ImportError: If 
required dependencies (beautifulsoup4 or tavily-python) are not installed. """ + url_list = [urls] if isinstance(urls, str) else urls + logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}") + if preferred_tool == "tavily": if not tavily_config or tavily_config.api_key is None: raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily") + logger.info("Using Tavily API for content extraction") return await fetch_with_tavily(urls, tavily_config) if preferred_tool == "beautifulsoup": @@ -60,10 +64,17 @@ async def fetch_page_content( raise ImportError if not soup_crawler_config or soup_crawler_config.extraction_rules is None: raise ValueError("extraction_rules must be provided when not using Tavily") + + logger.info("Using BeautifulSoup for content extraction") extraction_rules = soup_crawler_config.extraction_rules + logger.info( + f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s" + ) + crawler = BeautifulSoupCrawler( concurrency=soup_crawler_config.concurrency, crawl_delay=soup_crawler_config.crawl_delay, + max_crawl_delay=soup_crawler_config.max_crawl_delay, timeout=soup_crawler_config.timeout, max_retries=soup_crawler_config.max_retries, retry_delay_factor=soup_crawler_config.retry_delay_factor, @@ -71,6 +82,9 @@ async def fetch_page_content( robots_cache_ttl=soup_crawler_config.robots_cache_ttl, ) try: + logger.info( + f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})" + ) results = await crawler.fetch_with_bs4( urls, extraction_rules, @@ -78,11 +92,13 @@ async def fetch_page_content( playwright_js_wait=soup_crawler_config.playwright_js_wait, join_all_matches=soup_crawler_config.join_all_matches, ) + logger.info(f"Successfully fetched content from {len(results)} URL(s)") return results except Exception as e: 
logger.error(f"Error fetching page content: {str(e)}") raise finally: + logger.info("Closing BeautifulSoup crawler") await crawler.close() @@ -108,19 +124,36 @@ async def fetch_with_tavily( "Failed to import tavily, make sure to install using pip install tavily-python>=0.7.0" ) raise + + url_list = [urls] if isinstance(urls, str) else urls + extract_depth = tavily_config.extract_depth if tavily_config else "basic" + timeout = tavily_config.timeout if tavily_config else 10 + + logger.info( + f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s" + ) client = AsyncTavilyClient( api_key=tavily_config.api_key if tavily_config else None, proxies=tavily_config.proxies if tavily_config else None, ) + + logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)") results = await client.extract( urls, format="text", - extract_depth=tavily_config.extract_depth if tavily_config else "basic", - timeout=tavily_config.timeout if tavily_config else 10, + extract_depth=extract_depth, + timeout=timeout, ) - for failed_result in results.get("failed_results", []): - logger.warning(f"Failed to fetch {failed_result}") + + failed_count = len(results.get("failed_results", [])) + if failed_count > 0: + logger.warning(f"Tavily API failed to fetch {failed_count} URL(s)") + for failed_result in results.get("failed_results", []): + logger.warning(f"Failed to fetch {failed_result}") + return_results = {} for result in results.get("results", []): return_results[result["url"]] = result["raw_content"] + + logger.info(f"Successfully fetched content from {len(return_results)} URL(s) via Tavily") return return_results From a69a7e5fc46b5c42c29abd6cbf0f21c911dacde5 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 16:52:37 +0100 Subject: [PATCH 15/44] tests: remove redundant bs4 configs from tests --- .../tests/integration/web_url_crawler/test_add.py | 6 +++--- examples/python/web_url_loader_example.py | 14 ++++++++------ 2 files 
changed, 11 insertions(+), 9 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index 2a75b5054..926c25a94 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -22,10 +22,10 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.prune.prune_system(metadata=True) extraction_rules = { - "title": {"selector": "title", "attr": "text"}, - "headings": {"selector": "h1, h2, h3", "attr": "text", "all": True}, + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, "links": {"selector": "a", "attr": "href", "all": True}, - "paragraphs": {"selector": "p", "attr": "text", "all": True}, + "paragraphs": {"selector": "p", "all": True}, } loaders_config = { diff --git a/examples/python/web_url_loader_example.py b/examples/python/web_url_loader_example.py index 7845d4001..37dd2258c 100644 --- a/examples/python/web_url_loader_example.py +++ b/examples/python/web_url_loader_example.py @@ -1,7 +1,6 @@ import asyncio import cognee -from cognee.shared.logging_utils import setup_logging, ERROR async def main(): @@ -11,10 +10,14 @@ async def main(): await cognee.prune.prune_system(metadata=True) extraction_rules = { - "title": {"selector": "title", "attr": "text"}, - "headings": {"selector": "h1, h2, h3", "attr": "text", "all": True}, - "links": {"selector": "a", "attr": "href", "all": True}, - "paragraphs": {"selector": "p", "attr": "text", "all": True}, + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": { + "selector": "a", + "attr": "href", + "all": True, + }, + "paragraphs": {"selector": "p", "all": True}, } loaders_config = { @@ -42,5 +45,4 @@ async def main(): if __name__ == "__main__": - logger = setup_logging(log_level=ERROR) asyncio.run(main()) From a0f760a3d101bae4446a62e5fec2cfeba4c73b50 Mon Sep 17 
00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 16:54:27 +0100 Subject: [PATCH 16/44] refactor: remove redundant `filestream` arg from `LoaderEngine.load_file(...)` --- .../tasks/ingestion/data_item_to_text_file.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index f82d9a0dc..211b918ae 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -50,17 +50,17 @@ async def data_item_to_text_file( await pull_from_s3(data_item_path, temp_file) temp_file.flush() # Data needs to be saved to local storage loader = get_loader_engine() - return await loader.load_file( - temp_file.name, None, preferred_loaders - ), loader.get_loader(temp_file.name, preferred_loaders) + return await loader.load_file(temp_file.name, preferred_loaders), loader.get_loader( + temp_file.name, preferred_loaders + ) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file( - data_item_path, None, preferred_loaders - ), loader.get_loader(data_item_path, preferred_loaders) + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) else: raise IngestionError(message="Local files are not accepted.") @@ -71,9 +71,9 @@ async def data_item_to_text_file( # Handle both Unix absolute paths (/path) and Windows absolute paths (C:\path) if settings.accept_local_file_path: loader = get_loader_engine() - return await loader.load_file( - data_item_path, None, preferred_loaders - ), loader.get_loader(data_item_path, preferred_loaders) + return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader( + data_item_path, preferred_loaders + ) else: raise IngestionError(message="Local files are not accepted.") @@ -82,7 +82,6 
@@ async def data_item_to_text_file( return ( await loader.load_file( data_item_path, - None, preferred_loaders, loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal ), From 1a0978fb3764fb47619a0cf5a6881b7c1c70ae7e Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Fri, 17 Oct 2025 22:38:36 +0100 Subject: [PATCH 17/44] incremental loading - fallback to regular, update test cases --- .../operations/run_tasks_data_item.py | 74 +++++++++++-------- .../integration/web_url_crawler/test_add.py | 66 ++++++++++++++++- 2 files changed, 109 insertions(+), 31 deletions(-) diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 152e72d7f..0118e7976 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -9,6 +9,7 @@ import os from typing import Any, Dict, AsyncGenerator, Optional from sqlalchemy import select +from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.files.utils.open_data_file import open_data_file @@ -63,36 +64,51 @@ async def run_tasks_data_item_incremental( # If incremental_loading of data is set to True don't process documents already processed by pipeline # If data is being added to Cognee for the first time calculate the id of the data - if not isinstance(data_item, Data): - file_path = await save_data_item_to_storage(data_item) - # Ingest data and add metadata - async with open_data_file(file_path) as file: - classified_data = ingestion.classify(file) - # data_id is the hash of file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) - else: - # If data was already processed by Cognee get data id - 
data_id = data_item.id + try: + if not isinstance(data_item, Data): + file_path = await save_data_item_to_storage(data_item) + # Ingest data and add metadata + async with open_data_file(file_path) as file: + classified_data = ingestion.classify(file) + # data_id is the hash of file contents + owner id to avoid duplicate data + data_id = ingestion.identify(classified_data, user) + else: + # If data was already processed by Cognee get data id + data_id = data_item.id - # Check pipeline status, if Data already processed for pipeline before skip current processing - async with db_engine.get_async_session() as session: - data_point = ( - await session.execute(select(Data).filter(Data.id == data_id)) - ).scalar_one_or_none() - if data_point: - if ( - data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id)) - == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED - ): - yield { - "run_info": PipelineRunAlreadyCompleted( - pipeline_run_id=pipeline_run_id, - dataset_id=dataset.id, - dataset_name=dataset.name, - ), - "data_id": data_id, - } - return + # Check pipeline status, if Data already processed for pipeline before skip current processing + async with db_engine.get_async_session() as session: + data_point = ( + await session.execute(select(Data).filter(Data.id == data_id)) + ).scalar_one_or_none() + if data_point: + if ( + data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id)) + == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED + ): + yield { + "run_info": PipelineRunAlreadyCompleted( + pipeline_run_id=pipeline_run_id, + dataset_id=dataset.id, + dataset_name=dataset.name, + ), + "data_id": data_id, + } + return + except UnsupportedPathSchemeError as e: + logger.warning(f"data_item does not support incremental loading: {str(e)}") + # Fall back to regular processing since incremental loading is not supported + async for result in run_tasks_data_item_regular( + data_item=data_item, + dataset=dataset, + tasks=tasks, + pipeline_id=pipeline_id, + 
pipeline_run_id=pipeline_run_id, + context=context, + user=user, + ): + yield result + return try: # Process data based on data_item and list of tasks diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index 926c25a94..abd0d77ba 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -12,7 +12,6 @@ async def test_add_fails_when_preferred_loader_not_specified(): with pytest.raises(ValueError): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. Will fix ) @@ -42,7 +41,70 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", preferred_loaders=["web_url_loader"], - incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. 
Will fix + loaders_config=loaders_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") + + +@pytest.mark.asyncio +async def test_add_with_incremental_loading_works(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + "extraction_rules": extraction_rules, + } + } + } + try: + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + incremental_loading=True, + loaders_config=loaders_config, + ) + except Exception as e: + pytest.fail(f"Failed to add url: {e}") + + +@pytest.mark.asyncio +async def test_add_without_incremental_loading_works(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + loaders_config = { + "web_url_loader": { + "soup_config": { + "max_depth": 1, + "follow_links": False, + "extraction_rules": extraction_rules, + } + } + } + try: + await cognee.add( + "https://en.wikipedia.org/wiki/Large_language_model", + preferred_loaders=["web_url_loader"], + incremental_loading=False, loaders_config=loaders_config, ) except Exception as e: From 8fe789ee9627cc8dbf6c707c3dde9c15e0fac893 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 19:24:40 +0100 Subject: [PATCH 18/44] nit: remove unnecessary import --- cognee/infrastructure/loaders/external/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index b92d9e7f0..2790a7ea0 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -10,7 +10,6 @@ These loaders are optional and only available if their dependencies are installe """ from .pypdf_loader import PyPdfLoader -from .web_url_loader import WebUrlLoader __all__ = ["PyPdfLoader"] From 17b33ab443f7d2beb407438cdd2cfe2a959d531d Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 20:54:39 +0100 Subject: [PATCH 19/44] feat: web_url_fetcher --- .../tasks/ingestion/data_fetchers/__init__.py | 8 +++ .../data_fetchers/data_fetcher_interface.py | 15 ++++ .../data_fetchers/web_url_fetcher.py | 70 +++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 cognee/tasks/ingestion/data_fetchers/__init__.py create mode 100644 cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py create mode 100644 cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py diff --git a/cognee/tasks/ingestion/data_fetchers/__init__.py b/cognee/tasks/ingestion/data_fetchers/__init__.py new file mode 100644 index 000000000..63530b427 --- /dev/null +++ b/cognee/tasks/ingestion/data_fetchers/__init__.py @@ -0,0 +1,8 @@ +__all__ = [] + +try: + from .web_url_fetcher import WebUrlFetcher + + __all__.append("WebUrlFetcher") +except ImportError: + pass diff --git a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py new file mode 100644 index 000000000..db8b8963b --- /dev/null +++ b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py @@ -0,0 +1,15 @@ +from abc import ABC, abstractmethod +from typing import Any + + +class DataFetcherInterface(ABC): + @abstractmethod + def fetcher_name(self) -> str: + pass + + @abstractmethod + async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]) -> 
str: + """ + args: data_item_path - path to the data item + """ + pass diff --git a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py new file mode 100644 index 000000000..f1e5dac91 --- /dev/null +++ b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py @@ -0,0 +1,70 @@ +from cognee.modules.ingestion import save_data_to_file +from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface +from typing import Any +from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig +from cognee.modules.ingestion.exceptions.exceptions import IngestionError +from cognee.shared.logging_utils import get_logger + +logger = get_logger() + + +class WebUrlFetcher(DataFetcherInterface): + def __init__(self): ... + + def fetcher_name(self): + return "web_url_fetcher" + + async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]): + from cognee.context_global_variables import tavily_config, soup_crawler_config + from cognee.tasks.web_scraper import fetch_page_content + + web_url_fetcher_config = fetchers_config.get(self.fetcher_name()) + if not isinstance(web_url_fetcher_config, dict): + raise IngestionError(f"{self.fetcher_name()} configuration must be a valid dictionary") + + tavily_dict = web_url_fetcher_config.get("tavily_config") + _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None + + soup_dict = web_url_fetcher_config.get("soup_config") + _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None + + # Set global configs for downstream access + tavily_config.set(_tavily_config) + soup_crawler_config.set(_soup_config) + + preferred_tool = "beautifulsoup" if _soup_config else "tavily" + if preferred_tool == "tavily" and _tavily_config is None: + raise IngestionError( + message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." 
+ ) + if preferred_tool == "beautifulsoup" and _soup_config is None: + raise IngestionError( + message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." + ) + + logger.info(f"Starting web URL crawling for: {data_item_path}") + logger.info(f"Using scraping tool: {preferred_tool}") + + data = await fetch_page_content( + data_item_path, + preferred_tool=preferred_tool, + soup_crawler_config=_soup_config, + tavily_config=_tavily_config, + ) + + logger.info(f"Successfully fetched content from URL {data_item_path}") + + # fetch_page_content returns a dict like {url: content} + # Extract the content string before saving + if isinstance(data, dict): + # Concatenate all URL contents (usually just one URL) + content = "" + for url, text in data.items(): + content += f"{url}:\n{text}\n\n" + logger.info( + f"Extracted content from {len(data)} URL(s), total size: {len(content)} characters" + ) + else: + content = data + + return await save_data_to_file(content) From d7417d9b06af7f912ff2c5f73a971a95466c2f9a Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 21:47:11 +0100 Subject: [PATCH 20/44] refactor: move url data fetching logic into `save_data_item_to_storage` --- cognee/api/v1/add/add.py | 3 + .../loaders/external/web_url_loader.py | 64 +--------------- .../modules/pipelines/operations/pipeline.py | 14 +++- .../modules/pipelines/operations/run_tasks.py | 2 + .../operations/run_tasks_data_item.py | 76 ++++++++----------- cognee/tasks/ingestion/ingest_data.py | 42 ++++------ .../ingestion/save_data_item_to_storage.py | 11 ++- .../integration/web_url_crawler/test_add.py | 34 +++++---- 8 files changed, 91 insertions(+), 155 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 3c4d7b696..1c76f7a52 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -31,6 +31,7 @@ async def add( incremental_loading: bool = True, data_per_batch: Optional[int] = 20, loaders_config: 
dict[LoaderInterface, dict] = {}, + fetchers_config: dict[str, Any] = {}, ): """ Add data to Cognee for knowledge graph processing. @@ -179,6 +180,7 @@ async def add( dataset_id, preferred_loaders, loaders_config, + fetchers_config, ), ] @@ -204,6 +206,7 @@ async def add( graph_db_config=graph_db_config, incremental_loading=incremental_loading, data_per_batch=data_per_batch, + fetchers_config=fetchers_config, ): pipeline_run_info = run_info diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py index 1ecf82171..996f7dae6 100644 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ b/cognee/infrastructure/loaders/external/web_url_loader.py @@ -2,8 +2,6 @@ from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface from typing import List from cognee.modules.ingestion.exceptions.exceptions import IngestionError -from cognee.modules.ingestion import save_data_to_file -from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig from cognee.shared.logging_utils import get_logger logger = get_logger() @@ -62,7 +60,7 @@ class WebUrlLoader(LoaderInterface): Load and process the file, returning standardized result. 
Args: - file_path: Path to the file to be processed + file_path: Path to the file to be processed (already saved by fetcher) file_stream: If file stream is provided it will be used to process file instead **kwargs: Additional loader-specific configuration @@ -71,63 +69,5 @@ class WebUrlLoader(LoaderInterface): Raises: Exception: If file cannot be processed """ - loaders_config = kwargs.get("loaders_config") - if not isinstance(loaders_config, dict): - raise IngestionError("loaders_config must be a valid dictionary") - web_url_loader_config = loaders_config.get(self.loader_name) - if not isinstance(web_url_loader_config, dict): - raise IngestionError(f"{self.loader_name} configuration must be a valid dictionary") - - try: - from cognee.context_global_variables import tavily_config, soup_crawler_config - from cognee.tasks.web_scraper import fetch_page_content - - tavily_dict = web_url_loader_config.get("tavily_config") - _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None - - soup_dict = web_url_loader_config.get("soup_config") - _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None - - # Set global configs for downstream access - tavily_config.set(_tavily_config) - soup_crawler_config.set(_soup_config) - - preferred_tool = "beautifulsoup" if _soup_config else "tavily" - if preferred_tool == "tavily" and _tavily_config is None: - raise IngestionError( - message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." - ) - if preferred_tool == "beautifulsoup" and _soup_config is None: - raise IngestionError( - message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." 
- ) - - logger.info(f"Starting web URL crawling for: {file_path}") - logger.info(f"Using scraping tool: {preferred_tool}") - - data = await fetch_page_content( - file_path, - preferred_tool=preferred_tool, - tavily_config=_tavily_config, - soup_crawler_config=_soup_config, - ) - - logger.info(f"Successfully fetched content from {len(data)} URL(s)") - logger.info("Processing and concatenating fetched content") - - content = "" - for key, value in data.items(): - content += f"{key}:\n{value}\n\n" - - logger.info(f"Saving content to file (total size: {len(content)} characters)") - stored_path = await save_data_to_file(content) - logger.info(f"Successfully saved content to: {stored_path}") - - return stored_path - except IngestionError: - raise - except Exception as e: - raise IngestionError( - message=f"Error ingesting webpage from URL {file_path}: {str(e)}" - ) from e + return file_path diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py index e15e9e505..1e2b3aca5 100644 --- a/cognee/modules/pipelines/operations/pipeline.py +++ b/cognee/modules/pipelines/operations/pipeline.py @@ -20,6 +20,7 @@ from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import ( from cognee.modules.pipelines.layers.check_pipeline_run_qualification import ( check_pipeline_run_qualification, ) +from typing import Any logger = get_logger("cognee.pipeline") @@ -36,6 +37,7 @@ async def run_pipeline( graph_db_config: dict = None, incremental_loading: bool = False, data_per_batch: int = 20, + fetchers_config: dict[str, Any] = {}, ): validate_pipeline_tasks(tasks) await setup_and_check_environment(vector_db_config, graph_db_config) @@ -52,6 +54,7 @@ async def run_pipeline( context={"dataset": dataset}, incremental_loading=incremental_loading, data_per_batch=data_per_batch, + fetchers_config=fetchers_config, ): yield run_info @@ -65,6 +68,7 @@ async def run_pipeline_per_dataset( context: dict = None, 
incremental_loading=False, data_per_batch: int = 20, + fetchers_config: dict[str, Any] = {}, ): # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True await set_database_global_context_variables(dataset.id, dataset.owner_id) @@ -80,7 +84,15 @@ async def run_pipeline_per_dataset( return pipeline_run = run_tasks( - tasks, dataset.id, data, user, pipeline_name, context, incremental_loading, data_per_batch + tasks, + dataset.id, + data, + user, + pipeline_name, + context, + incremental_loading, + data_per_batch, + fetchers_config, ) async for pipeline_run_info in pipeline_run: diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index ecc2f647b..d11d87ddf 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -60,6 +60,7 @@ async def run_tasks( context: dict = None, incremental_loading: bool = False, data_per_batch: int = 20, + fetchers_config: dict[str, Any] = {}, ): if not user: user = await get_default_user() @@ -106,6 +107,7 @@ async def run_tasks( context, user, incremental_loading, + fetchers_config, ) ) for data_item in data_batch diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 0118e7976..9ddadd855 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -39,6 +39,7 @@ async def run_tasks_data_item_incremental( pipeline_run_id: str, context: Optional[Dict[str, Any]], user: User, + fetchers_config: dict[str, Any], ) -> AsyncGenerator[Dict[str, Any], None]: """ Process a single data item with incremental loading support. 
@@ -64,51 +65,36 @@ async def run_tasks_data_item_incremental( # If incremental_loading of data is set to True don't process documents already processed by pipeline # If data is being added to Cognee for the first time calculate the id of the data - try: - if not isinstance(data_item, Data): - file_path = await save_data_item_to_storage(data_item) - # Ingest data and add metadata - async with open_data_file(file_path) as file: - classified_data = ingestion.classify(file) - # data_id is the hash of file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) - else: - # If data was already processed by Cognee get data id - data_id = data_item.id + if not isinstance(data_item, Data): + file_path = await save_data_item_to_storage(data_item, fetchers_config) + # Ingest data and add metadata + async with open_data_file(file_path) as file: + classified_data = ingestion.classify(file) + # data_id is the hash of file contents + owner id to avoid duplicate data + data_id = ingestion.identify(classified_data, user) + else: + # If data was already processed by Cognee get data id + data_id = data_item.id - # Check pipeline status, if Data already processed for pipeline before skip current processing - async with db_engine.get_async_session() as session: - data_point = ( - await session.execute(select(Data).filter(Data.id == data_id)) - ).scalar_one_or_none() - if data_point: - if ( - data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id)) - == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED - ): - yield { - "run_info": PipelineRunAlreadyCompleted( - pipeline_run_id=pipeline_run_id, - dataset_id=dataset.id, - dataset_name=dataset.name, - ), - "data_id": data_id, - } - return - except UnsupportedPathSchemeError as e: - logger.warning(f"data_item does not support incremental loading: {str(e)}") - # Fall back to regular processing since incremental loading is not supported - async for result in run_tasks_data_item_regular( - 
data_item=data_item, - dataset=dataset, - tasks=tasks, - pipeline_id=pipeline_id, - pipeline_run_id=pipeline_run_id, - context=context, - user=user, - ): - yield result - return + # Check pipeline status, if Data already processed for pipeline before skip current processing + async with db_engine.get_async_session() as session: + data_point = ( + await session.execute(select(Data).filter(Data.id == data_id)) + ).scalar_one_or_none() + if data_point: + if ( + data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id)) + == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED + ): + yield { + "run_info": PipelineRunAlreadyCompleted( + pipeline_run_id=pipeline_run_id, + dataset_id=dataset.id, + dataset_name=dataset.name, + ), + "data_id": data_id, + } + return try: # Process data based on data_item and list of tasks @@ -225,6 +211,7 @@ async def run_tasks_data_item( context: Optional[Dict[str, Any]], user: User, incremental_loading: bool, + fetchers_config: dict[str, Any] = {}, ) -> Optional[Dict[str, Any]]: """ Process a single data item, choosing between incremental and regular processing. 
@@ -259,6 +246,7 @@ async def run_tasks_data_item( pipeline_run_id=pipeline_run_id, context=context, user=user, + fetchers_config=fetchers_config, ): pass else: diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 233bb5f1c..84cd1f38b 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -31,6 +31,7 @@ async def ingest_data( dataset_id: UUID = None, preferred_loaders: List[str] = None, loaders_config: dict[LoaderInterface, dict] = {}, + fetchers_config: dict[str, Any] = {}, ): if not user: user = await get_default_user() @@ -80,16 +81,10 @@ async def ingest_data( dataset_data_map = {str(data.id): True for data in dataset_data} for data_item in data: - try: - # Get file path of data item or create a file if it doesn't exist - original_file_path = await save_data_item_to_storage(data_item) - # Transform file path to be OS usable - actual_file_path = get_data_file_path(original_file_path) - except UnsupportedPathSchemeError: - # This data_item (e.g., HTTP/HTTPS URL) should be passed directly to the loader - # skip save_data_item_to_storage and get_data_file_path - actual_file_path = data_item - original_file_path = None # we don't have an original file path + # Get file path of data item or create a file if it doesn't exist + original_file_path = await save_data_item_to_storage(data_item, fetchers_config) + # Transform file path to be OS usable + actual_file_path = get_data_file_path(original_file_path) # Store all input data as text files in Cognee data storage cognee_storage_file_path, loader_engine = await data_item_to_text_file( @@ -99,26 +94,15 @@ async def ingest_data( ) # Find metadata from original file - if original_file_path is not None: - # Standard flow: extract metadata from both original and stored files - async with open_data_file(original_file_path) as file: - classified_data = ingestion.classify(file) - data_id = ingestion.identify(classified_data, user) - 
original_file_metadata = classified_data.get_metadata() + # Standard flow: extract metadata from both original and stored files + async with open_data_file(original_file_path) as file: + classified_data = ingestion.classify(file) + data_id = ingestion.identify(classified_data, user) + original_file_metadata = classified_data.get_metadata() - async with open_data_file(cognee_storage_file_path) as file: - classified_data = ingestion.classify(file) - storage_file_metadata = classified_data.get_metadata() - else: - # Alternative flow (e.g., URLs): extract metadata once from stored file - async with open_data_file(cognee_storage_file_path) as file: - classified_data = ingestion.classify(file) - data_id = ingestion.identify(classified_data, user) - original_file_metadata = classified_data.get_metadata() - # Override file_path to be the actual data_item (e.g., URL) ? - original_file_metadata["file_path"] = actual_file_path - # Storage metadata is the same as original - storage_file_metadata = original_file_metadata.copy() + async with open_data_file(cognee_storage_file_path) as file: + classified_data = ingestion.classify(file) + storage_file_metadata = classified_data.get_metadata() from sqlalchemy import select diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index cf32477cb..d9b98268d 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -9,6 +9,8 @@ from cognee.modules.ingestion import save_data_to_file from cognee.shared.logging_utils import get_logger from pydantic_settings import BaseSettings, SettingsConfigDict +from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher + logger = get_logger() @@ -22,7 +24,9 @@ class SaveDataSettings(BaseSettings): settings = SaveDataSettings() -async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str: +async def save_data_item_to_storage( + 
data_item: Union[BinaryIO, str, Any], fetchers_config: dict[str, Any] = {} +) -> str: if "llama_index" in str(type(data_item)): # Dynamic import is used because the llama_index module is optional. from .transform_data import get_data_from_llama_index @@ -57,9 +61,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - raise UnsupportedPathSchemeError( - message=f"HTTP/HTTPS URLs should be handled by loader, not by save_data_item_to_storage. Received: {data_item}" - ) + fetcher = WebUrlFetcher() + return await fetcher.fetch(data_item, fetchers_config) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index abd0d77ba..b45ed9139 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -1,22 +1,28 @@ +from sys import exc_info import pytest import cognee +from cognee.modules.ingestion.exceptions.exceptions import IngestionError @pytest.mark.asyncio -async def test_add_fails_when_preferred_loader_not_specified(): +async def test_add_fails_when_web_url_fetcher_config_not_specified(): from cognee.shared.logging_utils import setup_logging, ERROR setup_logging(log_level=ERROR) await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - with pytest.raises(ValueError): + with pytest.raises(IngestionError) as excinfo: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", + incremental_loading=False, ) + assert excinfo.value.message.startswith( + "web_url_fetcher configuration must be a valid dictionary" + ) @pytest.mark.asyncio -async def test_add_succesfully_adds_url_when_preferred_loader_specified(): +async def 
test_add_succesfully_adds_url_when_fetcher_config_specified(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -27,8 +33,8 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { - "web_url_loader": { + fetchers_config = { + "web_url_fetcher": { "soup_config": { "max_depth": 1, "follow_links": False, @@ -40,8 +46,8 @@ async def test_add_succesfully_adds_url_when_preferred_loader_specified(): try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], - loaders_config=loaders_config, + incremental_loading=False, + fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") @@ -59,8 +65,8 @@ async def test_add_with_incremental_loading_works(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { - "web_url_loader": { + fetchers_config = { + "web_url_fetcher": { "soup_config": { "max_depth": 1, "follow_links": False, @@ -71,9 +77,8 @@ async def test_add_with_incremental_loading_works(): try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], incremental_loading=True, - loaders_config=loaders_config, + fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") @@ -91,8 +96,8 @@ async def test_add_without_incremental_loading_works(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { - "web_url_loader": { + fetchers_config = { + "web_url_fetcher": { "soup_config": { "max_depth": 1, "follow_links": False, @@ -103,9 +108,8 @@ async def test_add_without_incremental_loading_works(): try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], incremental_loading=False, - loaders_config=loaders_config, + fetchers_config=fetchers_config, ) except Exception as e: 
pytest.fail(f"Failed to add url: {e}") From fc660b46bb13fcaf901a830636a3aed73c4c9065 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 21:50:10 +0100 Subject: [PATCH 21/44] remove web_url_loader since there is no logic post fetching for loader --- .../loaders/external/__init__.py | 7 -- .../loaders/external/web_url_loader.py | 73 ------------------- .../tasks/ingestion/data_item_to_text_file.py | 11 --- .../web_url_crawler/test_loader_engine.py | 20 ----- ..._example.py => web_url_fetcher_example.py} | 7 +- 5 files changed, 3 insertions(+), 115 deletions(-) delete mode 100644 cognee/infrastructure/loaders/external/web_url_loader.py delete mode 100644 cognee/tests/integration/web_url_crawler/test_loader_engine.py rename examples/python/{web_url_loader_example.py => web_url_fetcher_example.py} (80%) diff --git a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index 2790a7ea0..6bf9f9200 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -27,10 +27,3 @@ try: __all__.append("AdvancedPdfLoader") except ImportError: pass - -try: - from .web_url_loader import WebUrlLoader - - __all__.append("WebUrlLoader") -except ImportError: - pass diff --git a/cognee/infrastructure/loaders/external/web_url_loader.py b/cognee/infrastructure/loaders/external/web_url_loader.py deleted file mode 100644 index 996f7dae6..000000000 --- a/cognee/infrastructure/loaders/external/web_url_loader.py +++ /dev/null @@ -1,73 +0,0 @@ -from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface -from typing import List - -from cognee.modules.ingestion.exceptions.exceptions import IngestionError -from cognee.shared.logging_utils import get_logger - -logger = get_logger() - - -class WebUrlLoader(LoaderInterface): - @property - def supported_extensions(self) -> List[str]: - """ - List of file extensions this loader supports. 
- - Returns: - List of extensions including the dot (e.g., ['.txt', '.md']) - """ - return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality - - @property - def supported_mime_types(self) -> List[str]: - """ - List of MIME types this loader supports. - - Returns: - List of MIME type strings (e.g., ['text/plain', 'application/pdf']) - """ - return [] # N/A, we can safely return empty since it's used in register and get_loader_info, doesn't reflect on functionality - - @property - def loader_name(self) -> str: - """ - Unique name identifier for this loader. - - Returns: - String identifier used for registration and configuration - """ - return "web_url_loader" - - def can_handle(self, extension: str, mime_type: str, data_item_path: str = None) -> bool: - """ - Check if this loader can handle the given file. - - Args: - extension: File extension - mime_type: MIME type of the file - - Returns: - True if this loader can process the file, False otherwise - """ - if data_item_path is None: - raise IngestionError( - "data_item_path should not be None" - ) # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily yet, see TODO in LoaderEngine.py - return data_item_path.startswith(("http://", "https://")) - - async def load(self, file_path: str, **kwargs): - """ - Load and process the file, returning standardized result. 
- - Args: - file_path: Path to the file to be processed (already saved by fetcher) - file_stream: If file stream is provided it will be used to process file instead - **kwargs: Additional loader-specific configuration - - Returns: - file path to the stored file - Raises: - Exception: If file cannot be processed - """ - - return file_path diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 211b918ae..8d2e915b0 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -76,16 +76,5 @@ async def data_item_to_text_file( ) else: raise IngestionError(message="Local files are not accepted.") - - elif data_item_path.startswith(("http://", "https://")): - loader = get_loader_engine() - return ( - await loader.load_file( - data_item_path, - preferred_loaders, - loaders_config=loaders_config, # TODO: right now loaders_config is only needed for web_url_loader, so keeping changes minimal - ), - loader.get_loader(data_item_path, preferred_loaders), - ) # data is not a supported type raise IngestionError(message=f"Data type not supported: {type(data_item_path)}") diff --git a/cognee/tests/integration/web_url_crawler/test_loader_engine.py b/cognee/tests/integration/web_url_crawler/test_loader_engine.py deleted file mode 100644 index 018c034e1..000000000 --- a/cognee/tests/integration/web_url_crawler/test_loader_engine.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from cognee.infrastructure.loaders import get_loader_engine -from cognee.infrastructure.loaders.external.web_url_loader import WebUrlLoader - - -def test_get_loader_returns_none_by_default_for_web_urls(): - loader_engine = get_loader_engine() - urls = ["https://cognee.ai", "http://cognee.ai"] - for url in urls: - loader = loader_engine.get_loader(url) - assert loader is None - - -def test_get_loader_returns_valid_loader_when_preferred_loaders_specified(): - loader_engine = 
get_loader_engine() - urls = ["https://cognee.ai", "http://cognee.ai"] - for url in urls: - loader = loader_engine.get_loader(url, preferred_loaders=["web_url_loader"]) - assert isinstance(loader, WebUrlLoader) diff --git a/examples/python/web_url_loader_example.py b/examples/python/web_url_fetcher_example.py similarity index 80% rename from examples/python/web_url_loader_example.py rename to examples/python/web_url_fetcher_example.py index 37dd2258c..9ac099e16 100644 --- a/examples/python/web_url_loader_example.py +++ b/examples/python/web_url_fetcher_example.py @@ -20,7 +20,7 @@ async def main(): "paragraphs": {"selector": "p", "all": True}, } - loaders_config = { + fetchers_config = { "web_url_loader": { "soup_config": { "max_depth": 1, @@ -32,9 +32,8 @@ async def main(): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - preferred_loaders=["web_url_loader"], - incremental_loading=False, # TODO: incremental loading bypasses regular data ingestion, which breaks. 
Will fix - loaders_config=loaders_config, + incremental_loading=False, + fetchers_config=fetchers_config, ) await cognee.cognify() From f7c2187ce7612c0ca4068bfe39a7895bf8823520 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 21:52:10 +0100 Subject: [PATCH 22/44] remove `loaders_config` as it's not in use --- cognee/api/v1/add/add.py | 2 -- cognee/tasks/ingestion/data_item_to_text_file.py | 1 - cognee/tasks/ingestion/ingest_data.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 1c76f7a52..44005d755 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -30,7 +30,6 @@ async def add( preferred_loaders: List[str] = None, incremental_loading: bool = True, data_per_batch: Optional[int] = 20, - loaders_config: dict[LoaderInterface, dict] = {}, fetchers_config: dict[str, Any] = {}, ): """ @@ -179,7 +178,6 @@ async def add( node_set, dataset_id, preferred_loaders, - loaders_config, fetchers_config, ), ] diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 8d2e915b0..4b9e4bb23 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -36,7 +36,6 @@ async def pull_from_s3(file_path, destination_file) -> None: async def data_item_to_text_file( data_item_path: str, preferred_loaders: List[str], - loaders_config: dict[LoaderInterface, dict], ) -> Tuple[str, LoaderInterface]: if isinstance(data_item_path, str): parsed_url = urlparse(data_item_path) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 84cd1f38b..648a34ace 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -30,7 +30,6 @@ async def ingest_data( node_set: Optional[List[str]] = None, dataset_id: UUID = None, preferred_loaders: List[str] = None, - loaders_config: dict[LoaderInterface, dict] = {}, 
fetchers_config: dict[str, Any] = {}, ): if not user: @@ -90,7 +89,6 @@ async def ingest_data( cognee_storage_file_path, loader_engine = await data_item_to_text_file( actual_file_path, preferred_loaders, - loaders_config, ) # Find metadata from original file From 1213a3a4cb529a7f394baa9f8ef83e9e2bd2d67f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 22:00:13 +0100 Subject: [PATCH 23/44] revert changes to `LoaderEngine` and `LoaderInterface` --- cognee/infrastructure/loaders/LoaderEngine.py | 23 ++++--------------- .../infrastructure/loaders/LoaderInterface.py | 4 +--- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index d6c4d4d8c..6b62f7641 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -64,7 +64,7 @@ class LoaderEngine: return True def get_loader( - self, data_item_path: str, preferred_loaders: List[str] = None + self, file_path: str, preferred_loaders: List[str] = None ) -> Optional[LoaderInterface]: """ Get appropriate loader for a file. 
@@ -76,37 +76,24 @@ class LoaderEngine: Returns: LoaderInterface that can handle the file, or None if not found """ - is_url = data_item_path.startswith(("http://", "https://")) - if is_url: - extension = None - mime_type = None - else: - file_info = filetype.guess(data_item_path) - extension = file_info.extension if file_info else None - mime_type = file_info.mime if file_info else None + file_info = filetype.guess(file_path) # Try preferred loaders first if preferred_loaders: for loader_name in preferred_loaders: if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle( - extension=extension, - mime_type=mime_type, - data_item_path=data_item_path, - ): # TODO: I'd like to refactor this to be just one argument and let loaders get file_info inside, but I'll keep that until review time + if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): return loader else: logger.info(f"Skipping {loader_name}: Preferred Loader not registered") # Try default priority order - for loader_name in ( - self.default_loader_priority - ): # TODO: I'm in favor of adding WebUrlLoader to defaults, but keeping it for review + for loader_name in self.default_loader_priority: if loader_name in self._loaders: loader = self._loaders[loader_name] - if loader.can_handle(extension=extension, mime_type=mime_type): + if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): return loader else: logger.info( diff --git a/cognee/infrastructure/loaders/LoaderInterface.py b/cognee/infrastructure/loaders/LoaderInterface.py index fb309304b..3a1c9bf3e 100644 --- a/cognee/infrastructure/loaders/LoaderInterface.py +++ b/cognee/infrastructure/loaders/LoaderInterface.py @@ -44,9 +44,7 @@ class LoaderInterface(ABC): pass @abstractmethod - def can_handle( - self, extension: str, mime_type: str, data_item_path: str = None - ) -> bool: # TODO: Temporarily set this to default to None so that I don't update other loaders unnecessarily 
yet, see TODO in LoaderEngine.py + def can_handle(self, extension: str, mime_type: str) -> bool: """ Check if this loader can handle the given file. From fdf7c27fec2762c3fa84fb949f1c63266eaee5a6 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 22:02:29 +0100 Subject: [PATCH 24/44] refactor: remove WebUrlLoader imports --- cognee/infrastructure/loaders/supported_loaders.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py index 7f92aa36a..d103babe3 100644 --- a/cognee/infrastructure/loaders/supported_loaders.py +++ b/cognee/infrastructure/loaders/supported_loaders.py @@ -23,10 +23,3 @@ try: supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader except ImportError: pass - -try: - from cognee.infrastructure.loaders.external import WebUrlLoader - - supported_loaders[WebUrlLoader.loader_name] = WebUrlLoader -except ImportError: - pass From 35d3c0877922624a5e9263d3efc5ebe4abcbe332 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 22:04:00 +0100 Subject: [PATCH 25/44] Clean up `add.py` imports --- cognee/api/v1/add/add.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 44005d755..67da3047b 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -1,9 +1,5 @@ from uuid import UUID -import os -from typing import Union, BinaryIO, List, Optional, Dict, Any -from pydantic import BaseModel -from urllib.parse import urlparse -from cognee.infrastructure.loaders import LoaderInterface +from typing import Union, BinaryIO, List, Optional, Any from cognee.modules.users.models import User from cognee.modules.pipelines import Task, run_pipeline from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import ( From 085e81c082e46f1a96c02d4443657baa3a5cb07f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 
20 Oct 2025 22:05:14 +0100 Subject: [PATCH 26/44] Clean up - remove `UnsupportedPathSchemeError` --- cognee/infrastructure/files/exceptions.py | 12 ------------ .../infrastructure/files/utils/get_data_file_path.py | 7 ------- .../pipelines/operations/run_tasks_data_item.py | 1 - cognee/tasks/ingestion/ingest_data.py | 2 -- cognee/tasks/ingestion/save_data_item_to_storage.py | 1 - 5 files changed, 23 deletions(-) diff --git a/cognee/infrastructure/files/exceptions.py b/cognee/infrastructure/files/exceptions.py index eb6efdbce..351eaee9c 100644 --- a/cognee/infrastructure/files/exceptions.py +++ b/cognee/infrastructure/files/exceptions.py @@ -11,15 +11,3 @@ class FileContentHashingError(Exception): status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, ): super().__init__(message, name, status_code) - - -class UnsupportedPathSchemeError(Exception): - """Raised when a non-filesystem path scheme (like http://, https://) is passed to a function expecting filesystem paths.""" - - def __init__( - self, - message: str = "This function only supports filesystem paths (file:// or local paths), not HTTP/HTTPS URLs.", - name: str = "UnsupportedPathSchemeError", - status_code=status.HTTP_400_BAD_REQUEST, - ): - super().__init__(message, name, status_code) diff --git a/cognee/infrastructure/files/utils/get_data_file_path.py b/cognee/infrastructure/files/utils/get_data_file_path.py index d67fc95a0..7ffda79bd 100644 --- a/cognee/infrastructure/files/utils/get_data_file_path.py +++ b/cognee/infrastructure/files/utils/get_data_file_path.py @@ -1,8 +1,6 @@ import os from urllib.parse import urlparse -from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError - def get_data_file_path(file_path: str): # Check if this is a file URI BEFORE normalizing (which corrupts URIs) @@ -40,11 +38,6 @@ def get_data_file_path(file_path: str): return normalized_url - elif file_path.startswith(("http://", "https://")): - raise UnsupportedPathSchemeError( - message=f"HTTP/HTTPS URLs are 
not supported by get_data_file_path(). Received: {file_path}" - ) - else: # Regular file path - normalize separators normalized_path = os.path.normpath(file_path) diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 9ddadd855..e445d323b 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -9,7 +9,6 @@ import os from typing import Any, Dict, AsyncGenerator, Optional from sqlalchemy import select -from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.files.utils.open_data_file import open_data_file diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 648a34ace..e707f4d92 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -3,8 +3,6 @@ import inspect from uuid import UUID from typing import Union, BinaryIO, Any, List, Optional -from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError -from cognee.infrastructure.loaders import LoaderInterface import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index d9b98268d..c70ddb2de 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -3,7 +3,6 @@ from pathlib import Path from urllib.parse import urlparse from typing import Union, BinaryIO, Any -from cognee.infrastructure.files.exceptions import UnsupportedPathSchemeError from cognee.modules.ingestion.exceptions import IngestionError 
from cognee.modules.ingestion import save_data_to_file from cognee.shared.logging_utils import get_logger From abbbf88ad342e70b0d1dab57100bf7821e0e25e5 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Mon, 20 Oct 2025 23:31:49 +0100 Subject: [PATCH 27/44] CI: use scraping dependenies for integration tests --- .github/workflows/basic_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/basic_tests.yml b/.github/workflows/basic_tests.yml index 3f3e644a2..f89d031a6 100644 --- a/.github/workflows/basic_tests.yml +++ b/.github/workflows/basic_tests.yml @@ -123,6 +123,7 @@ jobs: uses: ./.github/actions/cognee_setup with: python-version: ${{ inputs.python-version }} + extra-dependencies: "scraping" - name: Run Integration Tests run: uv run pytest cognee/tests/integration/ From 95e735d3979aba36f72c7cc353c8641756fe2359 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 12:04:35 +0100 Subject: [PATCH 28/44] remove `fetchers_config`, use default configs for Tavily and BeautifulSoup --- cognee/api/v1/add/add.py | 3 -- .../modules/pipelines/operations/pipeline.py | 4 --- .../modules/pipelines/operations/run_tasks.py | 2 -- .../operations/run_tasks_data_item.py | 5 +-- .../data_fetchers/data_fetcher_interface.py | 2 +- .../data_fetchers/web_url_fetcher.py | 33 ++++++------------- cognee/tasks/ingestion/ingest_data.py | 3 +- .../ingestion/save_data_item_to_storage.py | 6 ++-- .../integration/web_url_crawler/test_add.py | 31 ----------------- examples/python/web_url_fetcher_example.py | 11 ------- 10 files changed, 15 insertions(+), 85 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 67da3047b..216911ec0 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -26,7 +26,6 @@ async def add( preferred_loaders: List[str] = None, incremental_loading: bool = True, data_per_batch: Optional[int] = 20, - fetchers_config: dict[str, Any] = {}, ): """ Add data to Cognee for knowledge graph 
processing. @@ -174,7 +173,6 @@ async def add( node_set, dataset_id, preferred_loaders, - fetchers_config, ), ] @@ -200,7 +198,6 @@ async def add( graph_db_config=graph_db_config, incremental_loading=incremental_loading, data_per_batch=data_per_batch, - fetchers_config=fetchers_config, ): pipeline_run_info = run_info diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py index 1e2b3aca5..eb0ebe8bd 100644 --- a/cognee/modules/pipelines/operations/pipeline.py +++ b/cognee/modules/pipelines/operations/pipeline.py @@ -37,7 +37,6 @@ async def run_pipeline( graph_db_config: dict = None, incremental_loading: bool = False, data_per_batch: int = 20, - fetchers_config: dict[str, Any] = {}, ): validate_pipeline_tasks(tasks) await setup_and_check_environment(vector_db_config, graph_db_config) @@ -54,7 +53,6 @@ async def run_pipeline( context={"dataset": dataset}, incremental_loading=incremental_loading, data_per_batch=data_per_batch, - fetchers_config=fetchers_config, ): yield run_info @@ -68,7 +66,6 @@ async def run_pipeline_per_dataset( context: dict = None, incremental_loading=False, data_per_batch: int = 20, - fetchers_config: dict[str, Any] = {}, ): # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True await set_database_global_context_variables(dataset.id, dataset.owner_id) @@ -92,7 +89,6 @@ async def run_pipeline_per_dataset( context, incremental_loading, data_per_batch, - fetchers_config, ) async for pipeline_run_info in pipeline_run: diff --git a/cognee/modules/pipelines/operations/run_tasks.py b/cognee/modules/pipelines/operations/run_tasks.py index d11d87ddf..ecc2f647b 100644 --- a/cognee/modules/pipelines/operations/run_tasks.py +++ b/cognee/modules/pipelines/operations/run_tasks.py @@ -60,7 +60,6 @@ async def run_tasks( context: dict = None, incremental_loading: bool = False, data_per_batch: int = 20, - fetchers_config: dict[str, Any] = {}, ): if not user: user = await get_default_user() @@ 
-107,7 +106,6 @@ async def run_tasks( context, user, incremental_loading, - fetchers_config, ) ) for data_item in data_batch diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index e445d323b..152e72d7f 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -38,7 +38,6 @@ async def run_tasks_data_item_incremental( pipeline_run_id: str, context: Optional[Dict[str, Any]], user: User, - fetchers_config: dict[str, Any], ) -> AsyncGenerator[Dict[str, Any], None]: """ Process a single data item with incremental loading support. @@ -65,7 +64,7 @@ async def run_tasks_data_item_incremental( # If incremental_loading of data is set to True don't process documents already processed by pipeline # If data is being added to Cognee for the first time calculate the id of the data if not isinstance(data_item, Data): - file_path = await save_data_item_to_storage(data_item, fetchers_config) + file_path = await save_data_item_to_storage(data_item) # Ingest data and add metadata async with open_data_file(file_path) as file: classified_data = ingestion.classify(file) @@ -210,7 +209,6 @@ async def run_tasks_data_item( context: Optional[Dict[str, Any]], user: User, incremental_loading: bool, - fetchers_config: dict[str, Any] = {}, ) -> Optional[Dict[str, Any]]: """ Process a single data item, choosing between incremental and regular processing. 
@@ -245,7 +243,6 @@ async def run_tasks_data_item( pipeline_run_id=pipeline_run_id, context=context, user=user, - fetchers_config=fetchers_config, ): pass else: diff --git a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py index db8b8963b..9171e429d 100644 --- a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py +++ b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py @@ -8,7 +8,7 @@ class DataFetcherInterface(ABC): pass @abstractmethod - async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]) -> str: + async def fetch(self, data_item_path: str) -> str: """ args: data_item_path - path to the data item """ diff --git a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py index f1e5dac91..3b90b51b1 100644 --- a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py +++ b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py @@ -1,8 +1,7 @@ +import os from cognee.modules.ingestion import save_data_to_file from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface -from typing import Any from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig -from cognee.modules.ingestion.exceptions.exceptions import IngestionError from cognee.shared.logging_utils import get_logger logger = get_logger() @@ -14,34 +13,22 @@ class WebUrlFetcher(DataFetcherInterface): def fetcher_name(self): return "web_url_fetcher" - async def fetch(self, data_item_path: str, fetchers_config: dict[str, Any]): + async def fetch(self, data_item_path: str): from cognee.context_global_variables import tavily_config, soup_crawler_config from cognee.tasks.web_scraper import fetch_page_content - web_url_fetcher_config = fetchers_config.get(self.fetcher_name()) - if not isinstance(web_url_fetcher_config, dict): - raise IngestionError(f"{self.fetcher_name()} configuration 
must be a valid dictionary") + if os.getenv("TAVILY_API_KEY"): + _tavily_config = TavilyConfig() + _soup_config = None + preferred_tool = "tavily" + else: + _tavily_config = None + _soup_config = SoupCrawlerConfig() + preferred_tool = "beautifulsoup" - tavily_dict = web_url_fetcher_config.get("tavily_config") - _tavily_config = TavilyConfig(**tavily_dict) if tavily_dict else None - - soup_dict = web_url_fetcher_config.get("soup_config") - _soup_config = SoupCrawlerConfig(**soup_dict) if soup_dict else None - - # Set global configs for downstream access tavily_config.set(_tavily_config) soup_crawler_config.set(_soup_config) - preferred_tool = "beautifulsoup" if _soup_config else "tavily" - if preferred_tool == "tavily" and _tavily_config is None: - raise IngestionError( - message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig." - ) - if preferred_tool == "beautifulsoup" and _soup_config is None: - raise IngestionError( - message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper." 
- ) - logger.info(f"Starting web URL crawling for: {data_item_path}") logger.info(f"Using scraping tool: {preferred_tool}") diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index e707f4d92..02987b893 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -28,7 +28,6 @@ async def ingest_data( node_set: Optional[List[str]] = None, dataset_id: UUID = None, preferred_loaders: List[str] = None, - fetchers_config: dict[str, Any] = {}, ): if not user: user = await get_default_user() @@ -79,7 +78,7 @@ async def ingest_data( for data_item in data: # Get file path of data item or create a file if it doesn't exist - original_file_path = await save_data_item_to_storage(data_item, fetchers_config) + original_file_path = await save_data_item_to_storage(data_item) # Transform file path to be OS usable actual_file_path = get_data_file_path(original_file_path) diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index c70ddb2de..453219f15 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -23,9 +23,7 @@ class SaveDataSettings(BaseSettings): settings = SaveDataSettings() -async def save_data_item_to_storage( - data_item: Union[BinaryIO, str, Any], fetchers_config: dict[str, Any] = {} -) -> str: +async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str: if "llama_index" in str(type(data_item)): # Dynamic import is used because the llama_index module is optional. 
from .transform_data import get_data_from_llama_index @@ -61,7 +59,7 @@ async def save_data_item_to_storage( return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": fetcher = WebUrlFetcher() - return await fetcher.fetch(data_item, fetchers_config) + return await fetcher.fetch(data_item) # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index b45ed9139..a00ca9e0d 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -33,21 +33,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified(): "paragraphs": {"selector": "p", "all": True}, } - fetchers_config = { - "web_url_fetcher": { - "soup_config": { - "max_depth": 1, - "follow_links": False, - "extraction_rules": extraction_rules, - } - } - } - try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=False, - fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") @@ -65,20 +54,10 @@ async def test_add_with_incremental_loading_works(): "paragraphs": {"selector": "p", "all": True}, } - fetchers_config = { - "web_url_fetcher": { - "soup_config": { - "max_depth": 1, - "follow_links": False, - "extraction_rules": extraction_rules, - } - } - } try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=True, - fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") @@ -96,20 +75,10 @@ async def test_add_without_incremental_loading_works(): "paragraphs": {"selector": "p", "all": True}, } - fetchers_config = { - "web_url_fetcher": { - "soup_config": { - "max_depth": 1, - "follow_links": False, - "extraction_rules": extraction_rules, - } - } - } try: await cognee.add( 
"https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=False, - fetchers_config=fetchers_config, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") diff --git a/examples/python/web_url_fetcher_example.py b/examples/python/web_url_fetcher_example.py index 9ac099e16..2195a62c0 100644 --- a/examples/python/web_url_fetcher_example.py +++ b/examples/python/web_url_fetcher_example.py @@ -20,20 +20,9 @@ async def main(): "paragraphs": {"selector": "p", "all": True}, } - fetchers_config = { - "web_url_loader": { - "soup_config": { - "max_depth": 1, - "follow_links": False, - "extraction_rules": extraction_rules, - } - } - } - await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=False, - fetchers_config=fetchers_config, ) await cognee.cognify() From 5035c872a71ce77dc8bd564ebb16400ec5a5f3dd Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 15:20:09 +0100 Subject: [PATCH 29/44] refactor: update web scraper configurations and simplify fetch logic --- cognee/context_global_variables.py | 7 ++- .../data_fetchers/web_url_fetcher.py | 2 - cognee/tasks/web_scraper/bs4_crawler.py | 55 +++---------------- cognee/tasks/web_scraper/utils.py | 24 ++++---- 4 files changed, 25 insertions(+), 63 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 2ecf9b8d3..388316359 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -7,14 +7,17 @@ from cognee.base_config import get_base_config from cognee.infrastructure.databases.utils import get_or_create_dataset_database from cognee.infrastructure.files.storage.config import file_storage_config from cognee.modules.users.methods import get_user +from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig # Note: ContextVar allows us to use different graph db configurations in Cognee # for different async tasks, threads and processes vector_db_config = 
ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) -soup_crawler_config = ContextVar("soup_crawler_config", default=None) -tavily_config = ContextVar("tavily_config", default=None) +soup_crawler_config: ContextVar[SoupCrawlerConfig | None] = ContextVar( + "soup_crawler_config", default=None +) +tavily_config: ContextVar[TavilyConfig | None] = ContextVar("tavily_config", default=None) async def set_session_user_context_variable(user): diff --git a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py index 3b90b51b1..949cb9b0a 100644 --- a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py +++ b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py @@ -35,8 +35,6 @@ class WebUrlFetcher(DataFetcherInterface): data = await fetch_page_content( data_item_path, preferred_tool=preferred_tool, - soup_crawler_config=_soup_config, - tavily_config=_tavily_config, ) logger.info(f"Successfully fetched content from URL {data_item_path}") diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py index 400287e08..969058466 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/tasks/web_scraper/bs4_crawler.py @@ -66,6 +66,7 @@ class RobotsTxtCache: timestamp: float = field(default_factory=time.time) +# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler class BeautifulSoupCrawler: """Crawler for fetching and extracting web content using BeautifulSoup. 
@@ -491,14 +492,12 @@ class BeautifulSoupCrawler: return (val or "").strip() return el.get_text(strip=True) - async def fetch_with_bs4( + async def fetch_urls( self, - urls: Union[str, List[str], Dict[str, Dict[str, Any]]], - extraction_rules: Optional[Dict[str, Any]] = None, + urls: Union[str, List[str]], *, use_playwright: bool = False, playwright_js_wait: float = 0.8, - join_all_matches: bool = False, ) -> Dict[str, str]: """Fetch and extract content from URLs using BeautifulSoup or Playwright. @@ -516,38 +515,11 @@ class BeautifulSoupCrawler: ValueError: If extraction_rules are missing when required or if urls is invalid. Exception: If fetching or extraction fails. """ - url_rules_map: Dict[str, Dict[str, Any]] = {} - if isinstance(urls, str): - if not extraction_rules: - raise ValueError("extraction_rules required when urls is a string") - url_rules_map[urls] = extraction_rules - elif isinstance(urls, list): - if not extraction_rules: - raise ValueError("extraction_rules required when urls is a list") - for url in urls: - url_rules_map[url] = extraction_rules - elif isinstance(urls, dict): - url_rules_map = urls + urls = [urls] else: raise ValueError(f"Invalid urls type: {type(urls)}") - logger.info( - f"Preparing to fetch {len(url_rules_map)} URL(s) with {len(extraction_rules) if extraction_rules else 0} extraction rule(s)" - ) - - normalized_url_rules: Dict[str, List[ExtractionRule]] = {} - for url, rules in url_rules_map.items(): - normalized_rules = [] - for _, rule in rules.items(): - r = self._normalize_rule(rule) - if join_all_matches: - r.all = True - normalized_rules.append(r) - normalized_url_rules[url] = normalized_rules - - logger.info(f"Normalized extraction rules for {len(normalized_url_rules)} URL(s)") - async def _task(url: str): async with self._sem: try: @@ -575,30 +547,21 @@ class BeautifulSoupCrawler: logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") - # Extract content - pieces = [] - for rule in 
normalized_url_rules[url]: - text = self._extract_with_bs4(html, rule) - if text: - pieces.append(text) - - concatenated = " ".join(pieces).strip() - logger.info(f"Extracted {len(concatenated)} characters from {url}") - return url, concatenated + return url, html except Exception as e: logger.error(f"Error processing {url}: {e}") return url, "" - logger.info(f"Creating {len(url_rules_map)} async tasks for concurrent fetching") - tasks = [asyncio.create_task(_task(u)) for u in url_rules_map.keys()] + logger.info(f"Creating {len(urls)} async tasks for concurrent fetching") + tasks = [asyncio.create_task(_task(u)) for u in urls] results = {} completed = 0 total = len(tasks) for coro in asyncio.as_completed(tasks): - url, text = await coro - results[url] = text + url, html = await coro + results[url] = html completed += 1 logger.info(f"Progress: {completed}/{total} URLs processed") diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index a32b6848c..8b8bcc11f 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -5,19 +5,17 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping """ from typing import Dict, List, Union, Optional, Literal +from cognee.context_global_variables import soup_crawler_config, tavily_config from cognee.shared.logging_utils import get_logger from .bs4_crawler import BeautifulSoupCrawler -from .config import TavilyConfig, SoupCrawlerConfig +from .config import TavilyConfig logger = get_logger(__name__) async def fetch_page_content( urls: Union[str, List[str]], - *, preferred_tool: Optional[Literal["tavily", "beautifulsoup"]] = "beautifulsoup", - tavily_config: Optional[TavilyConfig] = None, - soup_crawler_config: Optional[SoupCrawlerConfig] = None, ) -> Dict[str, str]: """Fetch content from one or more URLs using the specified tool. 
@@ -48,6 +46,9 @@ async def fetch_page_content( url_list = [urls] if isinstance(urls, str) else urls logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}") + _tavily_config = tavily_config.get() + _soup_crawler_config = soup_crawler_config.get() + if preferred_tool == "tavily": if not tavily_config or tavily_config.api_key is None: raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily") @@ -62,11 +63,10 @@ async def fetch_page_content( "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1" ) raise ImportError - if not soup_crawler_config or soup_crawler_config.extraction_rules is None: - raise ValueError("extraction_rules must be provided when not using Tavily") + if soup_crawler_config is None or soup_crawler_config.extraction_rules is None: + raise ValueError("soup_crawler_config must be provided when not using Tavily") logger.info("Using BeautifulSoup for content extraction") - extraction_rules = soup_crawler_config.extraction_rules logger.info( f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s" ) @@ -85,12 +85,10 @@ async def fetch_page_content( logger.info( f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})" ) - results = await crawler.fetch_with_bs4( + results = await crawler.fetch_urls( urls, - extraction_rules, use_playwright=soup_crawler_config.use_playwright, playwright_js_wait=soup_crawler_config.playwright_js_wait, - join_all_matches=soup_crawler_config.join_all_matches, ) logger.info(f"Successfully fetched content from {len(results)} URL(s)") return results @@ -103,7 +101,7 @@ async def fetch_page_content( async def fetch_with_tavily( - urls: Union[str, List[str]], tavily_config: Optional[TavilyConfig] = None + urls: Union[str, List[str]], tavily_config: TavilyConfig ) 
-> Dict[str, str]: """Fetch content from URLs using the Tavily API. @@ -133,8 +131,8 @@ async def fetch_with_tavily( f"Initializing Tavily client with extract_depth={extract_depth}, timeout={timeout}s" ) client = AsyncTavilyClient( - api_key=tavily_config.api_key if tavily_config else None, - proxies=tavily_config.proxies if tavily_config else None, + api_key=tavily_config.api_key, + proxies=tavily_config.proxies, ) logger.info(f"Sending extract request to Tavily API for {len(url_list)} URL(s)") From a7ff18801866def587d028c7258749d4d9e6d80f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 15:22:40 +0100 Subject: [PATCH 30/44] add crawler tests --- .../web_url_crawler/test_bs4_crawler.py | 13 +++++++++++++ .../web_url_crawler/test_tavily_crawler.py | 15 +++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 cognee/tests/integration/web_url_crawler/test_bs4_crawler.py create mode 100644 cognee/tests/integration/web_url_crawler/test_tavily_crawler.py diff --git a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py new file mode 100644 index 000000000..0e7637d86 --- /dev/null +++ b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py @@ -0,0 +1,13 @@ +import pytest +from cognee.tasks.web_scraper import BeautifulSoupCrawler + + +@pytest.mark.asyncio +async def test_fetch(): + crawler = BeautifulSoupCrawler() + url = "https://en.wikipedia.org/wiki/Large_language_model" + results = await crawler.fetch_urls(url) + assert len(results) == 1 + assert isinstance(results, dict) + html = results[url] + assert isinstance(html, str) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py new file mode 100644 index 000000000..7edb9b8d3 --- /dev/null +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -0,0 +1,15 @@ +import os +import pytest +from 
cognee.tasks.web_scraper.config import TavilyConfig +from cognee.tasks.web_scraper.utils import fetch_with_tavily + + +@pytest.mark.asyncio +async def test_fetch(): + url = "https://en.wikipedia.org/wiki/Large_language_model" + tavily_config = TavilyConfig() + results = await fetch_with_tavily(url, tavily_config) + assert len(results) == 1 + assert isinstance(results, dict) + html = results[url] + assert isinstance(html, str) From 9d9969676f105d60e46c6bdf7d0b75a4b5f3c8bb Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 15:49:12 +0100 Subject: [PATCH 31/44] Separate BeautifulSoup crawling from fetching --- cognee/context_global_variables.py | 6 - .../tasks/ingestion/data_fetchers/__init__.py | 8 - .../data_fetchers/data_fetcher_interface.py | 15 - .../data_fetchers/web_url_fetcher.py | 55 --- cognee/tasks/web_scraper/__init__.py | 2 + cognee/tasks/web_scraper/bs4_crawler.py | 441 +---------------- cognee/tasks/web_scraper/config.py | 2 +- .../tasks/web_scraper/default_url_crawler.py | 446 ++++++++++++++++++ cognee/tasks/web_scraper/utils.py | 71 ++- cognee/tasks/web_scraper/web_scraper_task.py | 10 +- .../tasks/web_scraping/web_scraping_test.py | 6 +- 11 files changed, 489 insertions(+), 573 deletions(-) delete mode 100644 cognee/tasks/ingestion/data_fetchers/__init__.py delete mode 100644 cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py delete mode 100644 cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py create mode 100644 cognee/tasks/web_scraper/default_url_crawler.py diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 388316359..aad53341a 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -7,18 +7,12 @@ from cognee.base_config import get_base_config from cognee.infrastructure.databases.utils import get_or_create_dataset_database from cognee.infrastructure.files.storage.config import file_storage_config from cognee.modules.users.methods 
import get_user -from cognee.tasks.web_scraper.config import SoupCrawlerConfig, TavilyConfig # Note: ContextVar allows us to use different graph db configurations in Cognee # for different async tasks, threads and processes vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) -soup_crawler_config: ContextVar[SoupCrawlerConfig | None] = ContextVar( - "soup_crawler_config", default=None -) -tavily_config: ContextVar[TavilyConfig | None] = ContextVar("tavily_config", default=None) - async def set_session_user_context_variable(user): session_user.set(user) diff --git a/cognee/tasks/ingestion/data_fetchers/__init__.py b/cognee/tasks/ingestion/data_fetchers/__init__.py deleted file mode 100644 index 63530b427..000000000 --- a/cognee/tasks/ingestion/data_fetchers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -__all__ = [] - -try: - from .web_url_fetcher import WebUrlFetcher - - __all__.append("WebUrlFetcher") -except ImportError: - pass diff --git a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py b/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py deleted file mode 100644 index 9171e429d..000000000 --- a/cognee/tasks/ingestion/data_fetchers/data_fetcher_interface.py +++ /dev/null @@ -1,15 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any - - -class DataFetcherInterface(ABC): - @abstractmethod - def fetcher_name(self) -> str: - pass - - @abstractmethod - async def fetch(self, data_item_path: str) -> str: - """ - args: data_item_path - path to the data item - """ - pass diff --git a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py b/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py deleted file mode 100644 index 949cb9b0a..000000000 --- a/cognee/tasks/ingestion/data_fetchers/web_url_fetcher.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -from cognee.modules.ingestion import save_data_to_file 
-from cognee.tasks.ingestion.data_fetchers.data_fetcher_interface import DataFetcherInterface -from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig -from cognee.shared.logging_utils import get_logger - -logger = get_logger() - - -class WebUrlFetcher(DataFetcherInterface): - def __init__(self): ... - - def fetcher_name(self): - return "web_url_fetcher" - - async def fetch(self, data_item_path: str): - from cognee.context_global_variables import tavily_config, soup_crawler_config - from cognee.tasks.web_scraper import fetch_page_content - - if os.getenv("TAVILY_API_KEY"): - _tavily_config = TavilyConfig() - _soup_config = None - preferred_tool = "tavily" - else: - _tavily_config = None - _soup_config = SoupCrawlerConfig() - preferred_tool = "beautifulsoup" - - tavily_config.set(_tavily_config) - soup_crawler_config.set(_soup_config) - - logger.info(f"Starting web URL crawling for: {data_item_path}") - logger.info(f"Using scraping tool: {preferred_tool}") - - data = await fetch_page_content( - data_item_path, - preferred_tool=preferred_tool, - ) - - logger.info(f"Successfully fetched content from URL {data_item_path}") - - # fetch_page_content returns a dict like {url: content} - # Extract the content string before saving - if isinstance(data, dict): - # Concatenate all URL contents (usually just one URL) - content = "" - for url, text in data.items(): - content += f"{url}:\n{text}\n\n" - logger.info( - f"Extracted content from {len(data)} URL(s), total size: {len(content)} characters" - ) - else: - content = data - - return await save_data_to_file(content) diff --git a/cognee/tasks/web_scraper/__init__.py b/cognee/tasks/web_scraper/__init__.py index d8e580fad..f4d6677c7 100644 --- a/cognee/tasks/web_scraper/__init__.py +++ b/cognee/tasks/web_scraper/__init__.py @@ -8,6 +8,7 @@ BeautifulSoup or Tavily, defining data models, and handling scraping configurati from .bs4_crawler import BeautifulSoupCrawler from .utils import fetch_page_content from 
.web_scraper_task import cron_web_scraper_task, web_scraper_task +from .default_url_crawler import DefaultUrlCrawler __all__ = [ @@ -15,4 +16,5 @@ __all__ = [ "fetch_page_content", "cron_web_scraper_task", "web_scraper_task", + "DefaultUrlCrawler", ] diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/tasks/web_scraper/bs4_crawler.py index 969058466..171a76633 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/tasks/web_scraper/bs4_crawler.py @@ -5,32 +5,13 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. supports robots.txt handling, rate limiting, and custom extraction rules. """ -import asyncio -import time from typing import Union, List, Dict, Any, Optional -from urllib.parse import urlparse -from dataclasses import dataclass, field -from functools import lru_cache -import httpx +from dataclasses import dataclass from bs4 import BeautifulSoup from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) -try: - from playwright.async_api import async_playwright -except ImportError: - logger.warning( - "Failed to import playwright, make sure to install using pip install playwright>=1.9.0" - ) - async_playwright = None - -try: - from protego import Protego -except ImportError: - logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1") - Protego = None - @dataclass class ExtractionRule: @@ -51,21 +32,6 @@ class ExtractionRule: join_with: str = " " -@dataclass -class RobotsTxtCache: - """Cache for robots.txt data. - - Attributes: - protego: Parsed robots.txt object (Protego instance). - crawl_delay: Delay between requests (in seconds). - timestamp: Time when the cache entry was created. 
- """ - - protego: Any - crawl_delay: float - timestamp: float = field(default_factory=time.time) - - # TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler class BeautifulSoupCrawler: """Crawler for fetching and extracting web content using BeautifulSoup. @@ -84,333 +50,6 @@ class BeautifulSoupCrawler: robots_cache_ttl: Time-to-live for robots.txt cache in seconds. """ - def __init__( - self, - *, - concurrency: int = 5, - crawl_delay: float = 0.5, - max_crawl_delay: Optional[float] = 10.0, - timeout: float = 15.0, - max_retries: int = 2, - retry_delay_factor: float = 0.5, - headers: Optional[Dict[str, str]] = None, - robots_cache_ttl: float = 3600.0, - ): - """Initialize the BeautifulSoupCrawler. - - Args: - concurrency: Number of concurrent requests allowed. - crawl_delay: Minimum seconds between requests to the same domain. - max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). - timeout: Per-request timeout in seconds. - max_retries: Number of retries for failed requests. - retry_delay_factor: Multiplier for exponential backoff on retries. - headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0). - robots_cache_ttl: Time-to-live for robots.txt cache in seconds. 
- """ - self.concurrency = concurrency - self._sem = asyncio.Semaphore(concurrency) - self.crawl_delay = crawl_delay - self.max_crawl_delay = max_crawl_delay - self.timeout = timeout - self.max_retries = max_retries - self.retry_delay_factor = retry_delay_factor - self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"} - self.robots_cache_ttl = robots_cache_ttl - self._last_request_time_per_domain: Dict[str, float] = {} - self._robots_cache: Dict[str, RobotsTxtCache] = {} - self._client: Optional[httpx.AsyncClient] = None - self._robots_lock = asyncio.Lock() - - async def _ensure_client(self): - """Initialize the HTTP client if not already created.""" - if self._client is None: - self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers) - - async def close(self): - """Close the HTTP client.""" - if self._client: - await self._client.aclose() - self._client = None - - async def __aenter__(self): - """Enter the context manager, initializing the HTTP client.""" - await self._ensure_client() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """Exit the context manager, closing the HTTP client.""" - await self.close() - - @lru_cache(maxsize=1024) - def _domain_from_url(self, url: str) -> str: - """Extract the domain (netloc) from a URL. - - Args: - url: The URL to parse. - - Returns: - str: The domain (netloc) of the URL. - """ - try: - return urlparse(url).netloc - except Exception: - return url - - @lru_cache(maxsize=1024) - def _get_domain_root(self, url: str) -> str: - """Get the root URL (scheme and netloc) from a URL. - - Args: - url: The URL to parse. - - Returns: - str: The root URL (e.g., "https://example.com"). - """ - parsed = urlparse(url) - return f"{parsed.scheme}://{parsed.netloc}" - - async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None): - """Enforce rate limiting for requests to the same domain. - - Args: - url: The URL to check. 
- crawl_delay: Custom crawl delay in seconds (if any). - """ - domain = self._domain_from_url(url) - last = self._last_request_time_per_domain.get(domain) - delay = crawl_delay if crawl_delay is not None else self.crawl_delay - - if last is None: - self._last_request_time_per_domain[domain] = time.time() - return - - elapsed = time.time() - last - wait_for = delay - elapsed - if wait_for > 0: - logger.info( - f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)" - ) - await asyncio.sleep(wait_for) - logger.info(f"Rate limit wait completed for {url}") - self._last_request_time_per_domain[domain] = time.time() - - async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]: - """Get cached robots.txt data if valid. - - Args: - domain_root: The root URL (e.g., "https://example.com"). - - Returns: - Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found. - """ - if Protego is None: - return None - - cached = self._robots_cache.get(domain_root) - if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl: - return cached - return None - - async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache: - """Fetch and cache robots.txt data. - - Args: - domain_root: The root URL (e.g., "https://example.com"). - - Returns: - RobotsTxtCache: Cached robots.txt data with crawl delay. - - Raises: - Exception: If fetching robots.txt fails. 
- """ - async with self._robots_lock: - cached = await self._get_robots_cache(domain_root) - if cached: - return cached - - robots_url = f"{domain_root}/robots.txt" - try: - await self._ensure_client() - await self._respect_rate_limit(robots_url, self.crawl_delay) - resp = await self._client.get(robots_url, timeout=5.0) - content = resp.text if resp.status_code == 200 else "" - except Exception as e: - logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}") - content = "" - - protego = Protego.parse(content) if content.strip() else None - agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*") - - crawl_delay = self.crawl_delay - if protego: - delay = protego.crawl_delay(agent) or protego.crawl_delay("*") - if delay: - # Apply max_crawl_delay cap if configured - if self.max_crawl_delay is not None and delay > self.max_crawl_delay: - logger.warning( - f"robots.txt specifies crawl_delay={delay}s for {domain_root}, " - f"capping to max_crawl_delay={self.max_crawl_delay}s" - ) - crawl_delay = self.max_crawl_delay - else: - crawl_delay = delay - - cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay) - self._robots_cache[domain_root] = cache_entry - return cache_entry - - async def _is_url_allowed(self, url: str) -> bool: - """Check if a URL is allowed by robots.txt. - - Args: - url: The URL to check. - - Returns: - bool: True if the URL is allowed, False otherwise. 
- """ - if Protego is None: - return True - - try: - domain_root = self._get_domain_root(url) - cache = await self._get_robots_cache(domain_root) - if cache is None: - cache = await self._fetch_and_cache_robots(domain_root) - - if cache.protego is None: - return True - - agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*") - return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url) - except Exception as e: - logger.debug(f"Error checking robots.txt for {url}: {e}") - return True - - async def _get_crawl_delay(self, url: str) -> float: - """Get the crawl delay for a URL from robots.txt. - - Args: - url: The URL to check. - - Returns: - float: Crawl delay in seconds. - """ - if Protego is None: - return self.crawl_delay - - try: - domain_root = self._get_domain_root(url) - cache = await self._get_robots_cache(domain_root) - if cache is None: - cache = await self._fetch_and_cache_robots(domain_root) - return cache.crawl_delay - except Exception: - return self.crawl_delay - - async def _fetch_httpx(self, url: str) -> str: - """Fetch a URL using HTTPX with retries. - - Args: - url: The URL to fetch. - - Returns: - str: The HTML content of the page. - - Raises: - Exception: If all retry attempts fail. 
- """ - await self._ensure_client() - assert self._client is not None, "HTTP client not initialized" - - attempt = 0 - crawl_delay = await self._get_crawl_delay(url) - logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}") - - while True: - try: - await self._respect_rate_limit(url, crawl_delay) - resp = await self._client.get(url) - resp.raise_for_status() - logger.info( - f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)" - ) - return resp.text - except Exception as exc: - attempt += 1 - if attempt > self.max_retries: - logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}") - raise - - delay = self.retry_delay_factor * (2 ** (attempt - 1)) - logger.warning( - f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}" - ) - await asyncio.sleep(delay) - - async def _render_with_playwright( - self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None - ) -> str: - """Fetch and render a URL using Playwright for JavaScript content. - - Args: - url: The URL to fetch. - js_wait: Seconds to wait for JavaScript to load. - timeout: Timeout for the request (in seconds, defaults to instance timeout). - - Returns: - str: The rendered HTML content. - - Raises: - RuntimeError: If Playwright is not installed. - Exception: If all retry attempts fail. - """ - if async_playwright is None: - raise RuntimeError( - "Playwright is not installed. Install with `pip install playwright` and run `playwright install`." 
- ) - - timeout_val = timeout or self.timeout - logger.info( - f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}" - ) - - attempt = 0 - while True: - try: - async with async_playwright() as p: - logger.info(f"Launching headless Chromium browser for {url}") - browser = await p.chromium.launch(headless=True) - try: - context = await browser.new_context() - page = await context.new_page() - logger.info(f"Navigating to {url} and waiting for network idle") - await page.goto( - url, - wait_until="networkidle", - timeout=int(timeout_val * 1000), - ) - if js_wait: - logger.info(f"Waiting {js_wait}s for JavaScript to execute") - await asyncio.sleep(js_wait) - content = await page.content() - logger.info( - f"Successfully rendered {url} with Playwright (size={len(content)} bytes)" - ) - return content - finally: - await browser.close() - except Exception as exc: - attempt += 1 - if attempt > self.max_retries: - logger.error(f"Playwright fetch failed for {url}: {exc}") - raise - backoff = self.retry_delay_factor * (2 ** (attempt - 1)) - logger.warning( - f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})" - ) - await asyncio.sleep(backoff) - def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. @@ -435,7 +74,7 @@ class BeautifulSoupCrawler: ) raise ValueError(f"Invalid extraction rule: {rule}") - def _extract_with_bs4(self, html: str, rule: ExtractionRule) -> str: + def extract(self, html: str, rule: ExtractionRule) -> str: """Extract content from HTML using BeautifulSoup or lxml XPath. 
Args: @@ -491,79 +130,3 @@ class BeautifulSoupCrawler: val = el.get(rule.attr) return (val or "").strip() return el.get_text(strip=True) - - async def fetch_urls( - self, - urls: Union[str, List[str]], - *, - use_playwright: bool = False, - playwright_js_wait: float = 0.8, - ) -> Dict[str, str]: - """Fetch and extract content from URLs using BeautifulSoup or Playwright. - - Args: - urls: A single URL, list of URLs, or dict mapping URLs to extraction rules. - extraction_rules: Default extraction rules for string or list URLs. - use_playwright: If True, use Playwright for JavaScript rendering. - playwright_js_wait: Seconds to wait for JavaScript to load. - join_all_matches: If True, extract all matching elements for each rule. - - Returns: - Dict[str, str]: A dictionary mapping URLs to their extracted content. - - Raises: - ValueError: If extraction_rules are missing when required or if urls is invalid. - Exception: If fetching or extraction fails. - """ - if isinstance(urls, str): - urls = [urls] - else: - raise ValueError(f"Invalid urls type: {type(urls)}") - - async def _task(url: str): - async with self._sem: - try: - logger.info(f"Processing URL: {url}") - - # Check robots.txt - allowed = await self._is_url_allowed(url) - if not allowed: - logger.warning(f"URL disallowed by robots.txt: {url}") - return url, "" - - logger.info(f"Robots.txt check passed for {url}") - - # Fetch HTML - if use_playwright: - logger.info( - f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)" - ) - html = await self._render_with_playwright( - url, js_wait=playwright_js_wait, timeout=self.timeout - ) - else: - logger.info(f"Fetching {url} with httpx") - html = await self._fetch_httpx(url) - - logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") - - return url, html - - except Exception as e: - logger.error(f"Error processing {url}: {e}") - return url, "" - - logger.info(f"Creating {len(urls)} async tasks for concurrent fetching") - tasks = 
[asyncio.create_task(_task(u)) for u in urls] - results = {} - completed = 0 - total = len(tasks) - - for coro in asyncio.as_completed(tasks): - url, html = await coro - results[url] = html - completed += 1 - logger.info(f"Progress: {completed}/{total} URLs processed") - - logger.info(f"Completed fetching all {len(results)} URL(s)") - return results diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index ac470daa9..fcf22ab33 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -10,7 +10,7 @@ class TavilyConfig(BaseModel): timeout: Optional[int] = Field(default=10, ge=1, le=60) -class SoupCrawlerConfig(BaseModel): +class DefaultCrawlerConfig(BaseModel): concurrency: int = 5 crawl_delay: float = 0.5 max_crawl_delay: Optional[float] = ( diff --git a/cognee/tasks/web_scraper/default_url_crawler.py b/cognee/tasks/web_scraper/default_url_crawler.py new file mode 100644 index 000000000..d9d2ee922 --- /dev/null +++ b/cognee/tasks/web_scraper/default_url_crawler.py @@ -0,0 +1,446 @@ +import asyncio +from dataclasses import dataclass, field +from functools import lru_cache +import time +from typing import Any, Union, List, Dict, Optional +from urllib.parse import urlparse +import httpx + +from cognee.shared.logging_utils import get_logger +from cognee.tasks.web_scraper.utils import UrlsToHtmls + +logger = get_logger() + +try: + from protego import Protego +except ImportError: + logger.warning("Failed to import protego, make sure to install using pip install protego>=0.1") + Protego = None + +try: + from playwright.async_api import async_playwright +except ImportError: + logger.warning( + "Failed to import playwright, make sure to install using pip install playwright>=1.9.0" + ) + async_playwright = None + + +@dataclass +class RobotsTxtCache: + """Cache for robots.txt data. + + Attributes: + protego: Parsed robots.txt object (Protego instance). + crawl_delay: Delay between requests (in seconds). 
+ timestamp: Time when the cache entry was created. + """ + + protego: Any + crawl_delay: float + timestamp: float = field(default_factory=time.time) + + +class DefaultUrlCrawler: + def __init__( + self, + *, + concurrency: int = 5, + crawl_delay: float = 0.5, + max_crawl_delay: Optional[float] = 10.0, + timeout: float = 15.0, + max_retries: int = 2, + retry_delay_factor: float = 0.5, + headers: Optional[Dict[str, str]] = None, + robots_cache_ttl: float = 3600.0, + ): + """Initialize the DefaultUrlCrawler. + + Args: + concurrency: Number of concurrent requests allowed. + crawl_delay: Minimum seconds between requests to the same domain. + max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit). + timeout: Per-request timeout in seconds. + max_retries: Number of retries for failed requests. + retry_delay_factor: Multiplier for exponential backoff on retries. + headers: HTTP headers for requests (defaults to User-Agent: Cognee-Scraper/1.0). + robots_cache_ttl: Time-to-live for robots.txt cache in seconds. 
+ """ + self.concurrency = concurrency + self._sem = asyncio.Semaphore(concurrency) + self.crawl_delay = crawl_delay + self.max_crawl_delay = max_crawl_delay + self.timeout = timeout + self.max_retries = max_retries + self.retry_delay_factor = retry_delay_factor + self.headers = headers or {"User-Agent": "Cognee-Scraper/1.0"} + self.robots_cache_ttl = robots_cache_ttl + self._last_request_time_per_domain: Dict[str, float] = {} + self._robots_cache: Dict[str, RobotsTxtCache] = {} + self._client: Optional[httpx.AsyncClient] = None + self._robots_lock = asyncio.Lock() + + async def _ensure_client(self): + """Initialize the HTTP client if not already created.""" + if self._client is None: + self._client = httpx.AsyncClient(timeout=self.timeout, headers=self.headers) + + async def close(self): + """Close the HTTP client.""" + if self._client: + await self._client.aclose() + self._client = None + + async def __aenter__(self): + """Enter the context manager, initializing the HTTP client.""" + await self._ensure_client() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Exit the context manager, closing the HTTP client.""" + await self.close() + + @lru_cache(maxsize=1024) + def _domain_from_url(self, url: str) -> str: + """Extract the domain (netloc) from a URL. + + Args: + url: The URL to parse. + + Returns: + str: The domain (netloc) of the URL. + """ + try: + return urlparse(url).netloc + except Exception: + return url + + @lru_cache(maxsize=1024) + def _get_domain_root(self, url: str) -> str: + """Get the root URL (scheme and netloc) from a URL. + + Args: + url: The URL to parse. + + Returns: + str: The root URL (e.g., "https://example.com"). + """ + parsed = urlparse(url) + return f"{parsed.scheme}://{parsed.netloc}" + + async def _respect_rate_limit(self, url: str, crawl_delay: Optional[float] = None): + """Enforce rate limiting for requests to the same domain. + + Args: + url: The URL to check. 
+ crawl_delay: Custom crawl delay in seconds (if any). + """ + domain = self._domain_from_url(url) + last = self._last_request_time_per_domain.get(domain) + delay = crawl_delay if crawl_delay is not None else self.crawl_delay + + if last is None: + self._last_request_time_per_domain[domain] = time.time() + return + + elapsed = time.time() - last + wait_for = delay - elapsed + if wait_for > 0: + logger.info( + f"Rate limiting: waiting {wait_for:.2f}s before requesting {url} (crawl_delay={delay}s from robots.txt)" + ) + await asyncio.sleep(wait_for) + logger.info(f"Rate limit wait completed for {url}") + self._last_request_time_per_domain[domain] = time.time() + + async def _get_robots_cache(self, domain_root: str) -> Optional[RobotsTxtCache]: + """Get cached robots.txt data if valid. + + Args: + domain_root: The root URL (e.g., "https://example.com"). + + Returns: + Optional[RobotsTxtCache]: Cached robots.txt data or None if expired or not found. + """ + if Protego is None: + return None + + cached = self._robots_cache.get(domain_root) + if cached and (time.time() - cached.timestamp) < self.robots_cache_ttl: + return cached + return None + + async def _fetch_and_cache_robots(self, domain_root: str) -> RobotsTxtCache: + """Fetch and cache robots.txt data. + + Args: + domain_root: The root URL (e.g., "https://example.com"). + + Returns: + RobotsTxtCache: Cached robots.txt data with crawl delay. + + Raises: + Exception: If fetching robots.txt fails. 
+ """ + async with self._robots_lock: + cached = await self._get_robots_cache(domain_root) + if cached: + return cached + + robots_url = f"{domain_root}/robots.txt" + try: + await self._ensure_client() + await self._respect_rate_limit(robots_url, self.crawl_delay) + resp = await self._client.get(robots_url, timeout=5.0) + content = resp.text if resp.status_code == 200 else "" + except Exception as e: + logger.debug(f"Failed to fetch robots.txt from {domain_root}: {e}") + content = "" + + protego = Protego.parse(content) if content.strip() else None + agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*") + + crawl_delay = self.crawl_delay + if protego: + delay = protego.crawl_delay(agent) or protego.crawl_delay("*") + if delay: + # Apply max_crawl_delay cap if configured + if self.max_crawl_delay is not None and delay > self.max_crawl_delay: + logger.warning( + f"robots.txt specifies crawl_delay={delay}s for {domain_root}, " + f"capping to max_crawl_delay={self.max_crawl_delay}s" + ) + crawl_delay = self.max_crawl_delay + else: + crawl_delay = delay + + cache_entry = RobotsTxtCache(protego=protego, crawl_delay=crawl_delay) + self._robots_cache[domain_root] = cache_entry + return cache_entry + + async def _is_url_allowed(self, url: str) -> bool: + """Check if a URL is allowed by robots.txt. + + Args: + url: The URL to check. + + Returns: + bool: True if the URL is allowed, False otherwise. 
+ """ + if Protego is None: + return True + + try: + domain_root = self._get_domain_root(url) + cache = await self._get_robots_cache(domain_root) + if cache is None: + cache = await self._fetch_and_cache_robots(domain_root) + + if cache.protego is None: + return True + + agent = next((v for k, v in self.headers.items() if k.lower() == "user-agent"), "*") + return cache.protego.can_fetch(agent, url) or cache.protego.can_fetch("*", url) + except Exception as e: + logger.debug(f"Error checking robots.txt for {url}: {e}") + return True + + async def _get_crawl_delay(self, url: str) -> float: + """Get the crawl delay for a URL from robots.txt. + + Args: + url: The URL to check. + + Returns: + float: Crawl delay in seconds. + """ + if Protego is None: + return self.crawl_delay + + try: + domain_root = self._get_domain_root(url) + cache = await self._get_robots_cache(domain_root) + if cache is None: + cache = await self._fetch_and_cache_robots(domain_root) + return cache.crawl_delay + except Exception: + return self.crawl_delay + + async def _fetch_httpx(self, url: str) -> str: + """Fetch a URL using HTTPX with retries. + + Args: + url: The URL to fetch. + + Returns: + str: The HTML content of the page. + + Raises: + Exception: If all retry attempts fail. 
+ """ + await self._ensure_client() + assert self._client is not None, "HTTP client not initialized" + + attempt = 0 + crawl_delay = await self._get_crawl_delay(url) + logger.info(f"Fetching URL with httpx (crawl_delay={crawl_delay}s): {url}") + + while True: + try: + await self._respect_rate_limit(url, crawl_delay) + resp = await self._client.get(url) + resp.raise_for_status() + logger.info( + f"Successfully fetched {url} (status={resp.status_code}, size={len(resp.text)} bytes)" + ) + return resp.text + except Exception as exc: + attempt += 1 + if attempt > self.max_retries: + logger.error(f"Fetch failed for {url} after {attempt} attempts: {exc}") + raise + + delay = self.retry_delay_factor * (2 ** (attempt - 1)) + logger.warning( + f"Retrying {url} after {delay:.2f}s (attempt {attempt}) due to {exc}" + ) + await asyncio.sleep(delay) + + async def _render_with_playwright( + self, url: str, js_wait: float = 1.0, timeout: Optional[float] = None + ) -> str: + """Fetch and render a URL using Playwright for JavaScript content. + + Args: + url: The URL to fetch. + js_wait: Seconds to wait for JavaScript to load. + timeout: Timeout for the request (in seconds, defaults to instance timeout). + + Returns: + str: The rendered HTML content. + + Raises: + RuntimeError: If Playwright is not installed. + Exception: If all retry attempts fail. + """ + if async_playwright is None: + raise RuntimeError( + "Playwright is not installed. Install with `pip install playwright` and run `playwright install`." 
+ ) + + timeout_val = timeout or self.timeout + logger.info( + f"Rendering URL with Playwright (js_wait={js_wait}s, timeout={timeout_val}s): {url}" + ) + + attempt = 0 + while True: + try: + async with async_playwright() as p: + logger.info(f"Launching headless Chromium browser for {url}") + browser = await p.chromium.launch(headless=True) + try: + context = await browser.new_context() + page = await context.new_page() + logger.info(f"Navigating to {url} and waiting for network idle") + await page.goto( + url, + wait_until="networkidle", + timeout=int(timeout_val * 1000), + ) + if js_wait: + logger.info(f"Waiting {js_wait}s for JavaScript to execute") + await asyncio.sleep(js_wait) + content = await page.content() + logger.info( + f"Successfully rendered {url} with Playwright (size={len(content)} bytes)" + ) + return content + finally: + await browser.close() + except Exception as exc: + attempt += 1 + if attempt > self.max_retries: + logger.error(f"Playwright fetch failed for {url}: {exc}") + raise + backoff = self.retry_delay_factor * (2 ** (attempt - 1)) + logger.warning( + f"Retrying playwright fetch {url} after {backoff:.2f}s (attempt {attempt})" + ) + await asyncio.sleep(backoff) + + async def fetch_urls( + self, + urls: Union[str, List[str]], + *, + use_playwright: bool = False, + playwright_js_wait: float = 0.8, + ) -> UrlsToHtmls: + """Fetch and extract content from URLs using BeautifulSoup or Playwright. + + Args: + urls: A single URL, list of URLs, or dict mapping URLs to extraction rules. + extraction_rules: Default extraction rules for string or list URLs. + use_playwright: If True, use Playwright for JavaScript rendering. + playwright_js_wait: Seconds to wait for JavaScript to load. + join_all_matches: If True, extract all matching elements for each rule. + + Returns: + Dict[str, str]: A dictionary mapping URLs to their extracted content. + + Raises: + ValueError: If extraction_rules are missing when required or if urls is invalid. 
+ Exception: If fetching or extraction fails. + """ + if isinstance(urls, str): + urls = [urls] + else: + raise ValueError(f"Invalid urls type: {type(urls)}") + + async def _task(url: str): + async with self._sem: + try: + logger.info(f"Processing URL: {url}") + + # Check robots.txt + allowed = await self._is_url_allowed(url) + if not allowed: + logger.warning(f"URL disallowed by robots.txt: {url}") + return url, "" + + logger.info(f"Robots.txt check passed for {url}") + + # Fetch HTML + if use_playwright: + logger.info( + f"Rendering {url} with Playwright (JS wait: {playwright_js_wait}s)" + ) + html = await self._render_with_playwright( + url, js_wait=playwright_js_wait, timeout=self.timeout + ) + else: + logger.info(f"Fetching {url} with httpx") + html = await self._fetch_httpx(url) + + logger.info(f"Successfully fetched HTML from {url} ({len(html)} bytes)") + + return url, html + + except Exception as e: + logger.error(f"Error processing {url}: {e}") + return url, "" + + logger.info(f"Creating {len(urls)} async tasks for concurrent fetching") + tasks = [asyncio.create_task(_task(u)) for u in urls] + results = {} + completed = 0 + total = len(tasks) + + for coro in asyncio.as_completed(tasks): + url, html = await coro + results[url] = html + completed += 1 + logger.info(f"Progress: {completed}/{total} URLs processed") + + logger.info(f"Completed fetching all {len(results)} URL(s)") + return results diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 8b8bcc11f..0cbd355a3 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -4,19 +4,20 @@ This module provides functions to fetch and extract content from web pages, supp both BeautifulSoup for custom extraction rules and Tavily for API-based scraping. 
""" -from typing import Dict, List, Union, Optional, Literal -from cognee.context_global_variables import soup_crawler_config, tavily_config +import os +from re import L +from typing import List, Union, TypeAlias from cognee.shared.logging_utils import get_logger +from .default_url_crawler import DefaultUrlCrawler from .bs4_crawler import BeautifulSoupCrawler -from .config import TavilyConfig +from .config import DefaultCrawlerConfig, TavilyConfig logger = get_logger(__name__) +UrlsToHtmls: TypeAlias = dict[str, str] -async def fetch_page_content( - urls: Union[str, List[str]], - preferred_tool: Optional[Literal["tavily", "beautifulsoup"]] = "beautifulsoup", -) -> Dict[str, str]: + +async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: """Fetch content from one or more URLs using the specified tool. This function retrieves web page content using either BeautifulSoup (with custom @@ -29,7 +30,7 @@ async def fetch_page_content( Defaults to "beautifulsoup". tavily_config: Configuration for Tavily API, including API key. Required if preferred_tool is "tavily". - soup_crawler_config: Configuration for BeautifulSoup crawler, including + default_crawler_config: Configuration for BeautifulSoup crawler, including extraction rules. Required if preferred_tool is "beautifulsoup" and extraction_rules are needed. @@ -44,51 +45,39 @@ async def fetch_page_content( installed. 
""" url_list = [urls] if isinstance(urls, str) else urls - logger.info(f"Starting to fetch content from {len(url_list)} URL(s) using {preferred_tool}") - _tavily_config = tavily_config.get() - _soup_crawler_config = soup_crawler_config.get() - - if preferred_tool == "tavily": - if not tavily_config or tavily_config.api_key is None: - raise ValueError("TAVILY_API_KEY must be set in TavilyConfig to use Tavily") - logger.info("Using Tavily API for content extraction") + if os.getenv("TAVILY_API_KEY"): + logger.info("Using Tavily API for url fetching") return await fetch_with_tavily(urls, tavily_config) + else: + logger.info("Using default crawler for content extraction") - if preferred_tool == "beautifulsoup": - try: - from bs4 import BeautifulSoup as _ # noqa: F401 - except ImportError: - logger.error( - "Failed to import bs4, make sure to install using pip install beautifulsoup4>=4.13.1" - ) - raise ImportError - if soup_crawler_config is None or soup_crawler_config.extraction_rules is None: - raise ValueError("soup_crawler_config must be provided when not using Tavily") + default_crawler_config = ( + DefaultCrawlerConfig() + ) # We've decided to use defaults, and configure through env vars as needed - logger.info("Using BeautifulSoup for content extraction") logger.info( - f"Initializing BeautifulSoup crawler with concurrency={soup_crawler_config.concurrency}, timeout={soup_crawler_config.timeout}s, max_crawl_delay={soup_crawler_config.max_crawl_delay}s" + f"Initializing BeautifulSoup crawler with concurrency={default_crawler_config.concurrency}, timeout={default_crawler_config.timeout}s, max_crawl_delay={default_crawler_config.max_crawl_delay}s" ) - crawler = BeautifulSoupCrawler( - concurrency=soup_crawler_config.concurrency, - crawl_delay=soup_crawler_config.crawl_delay, - max_crawl_delay=soup_crawler_config.max_crawl_delay, - timeout=soup_crawler_config.timeout, - max_retries=soup_crawler_config.max_retries, - 
retry_delay_factor=soup_crawler_config.retry_delay_factor, - headers=soup_crawler_config.headers, - robots_cache_ttl=soup_crawler_config.robots_cache_ttl, + crawler = DefaultUrlCrawler( + concurrency=default_crawler_config.concurrency, + crawl_delay=default_crawler_config.crawl_delay, + max_crawl_delay=default_crawler_config.max_crawl_delay, + timeout=default_crawler_config.timeout, + max_retries=default_crawler_config.max_retries, + retry_delay_factor=default_crawler_config.retry_delay_factor, + headers=default_crawler_config.headers, + robots_cache_ttl=default_crawler_config.robots_cache_ttl, ) try: logger.info( - f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={soup_crawler_config.use_playwright})" + f"Starting to crawl {len(url_list)} URL(s) with BeautifulSoup (use_playwright={default_crawler_config.use_playwright})" ) results = await crawler.fetch_urls( urls, - use_playwright=soup_crawler_config.use_playwright, - playwright_js_wait=soup_crawler_config.playwright_js_wait, + use_playwright=default_crawler_config.use_playwright, + playwright_js_wait=default_crawler_config.playwright_js_wait, ) logger.info(f"Successfully fetched content from {len(results)} URL(s)") return results @@ -102,7 +91,7 @@ async def fetch_page_content( async def fetch_with_tavily( urls: Union[str, List[str]], tavily_config: TavilyConfig -) -> Dict[str, str]: +) -> UrlsToHtmls: """Fetch content from URLs using the Tavily API. 
Args: diff --git a/cognee/tasks/web_scraper/web_scraper_task.py b/cognee/tasks/web_scraper/web_scraper_task.py index 52154c6ef..2bade3719 100644 --- a/cognee/tasks/web_scraper/web_scraper_task.py +++ b/cognee/tasks/web_scraper/web_scraper_task.py @@ -19,7 +19,7 @@ from cognee.tasks.storage.index_graph_edges import index_graph_edges from cognee.modules.engine.operations.setup import setup from .models import WebPage, WebSite, ScrapingJob -from .config import SoupCrawlerConfig, TavilyConfig +from .config import DefaultCrawlerConfig, TavilyConfig from .utils import fetch_page_content try: @@ -47,7 +47,7 @@ async def cron_web_scraper_task( schedule: str = None, extraction_rules: dict = None, tavily_api_key: str = os.getenv("TAVILY_API_KEY"), - soup_crawler_config: SoupCrawlerConfig = None, + soup_crawler_config: DefaultCrawlerConfig = None, tavily_config: TavilyConfig = None, job_name: str = "scraping", ): @@ -121,7 +121,7 @@ async def web_scraper_task( schedule: str = None, extraction_rules: dict = None, tavily_api_key: str = os.getenv("TAVILY_API_KEY"), - soup_crawler_config: SoupCrawlerConfig = None, + soup_crawler_config: DefaultCrawlerConfig = None, tavily_config: TavilyConfig = None, job_name: str = None, ): @@ -341,7 +341,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle soup_crawler_config: Configuration for BeautifulSoup crawler. Returns: - Tuple[SoupCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config, + Tuple[DefaultCrawlerConfig, TavilyConfig, str]: Configured soup_crawler_config, tavily_config, and preferred_tool ("tavily" or "beautifulsoup"). 
Raises: @@ -350,7 +350,7 @@ def check_arguments(tavily_api_key, extraction_rules, tavily_config, soup_crawle preferred_tool = "beautifulsoup" if extraction_rules and not soup_crawler_config: - soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules) + soup_crawler_config = DefaultCrawlerConfig(extraction_rules=extraction_rules) if tavily_api_key: if not tavily_config: diff --git a/cognee/tests/tasks/web_scraping/web_scraping_test.py b/cognee/tests/tasks/web_scraping/web_scraping_test.py index bf66b5155..81c58ac8d 100644 --- a/cognee/tests/tasks/web_scraping/web_scraping_test.py +++ b/cognee/tests/tasks/web_scraping/web_scraping_test.py @@ -1,6 +1,6 @@ import asyncio import cognee -from cognee.tasks.web_scraper.config import SoupCrawlerConfig +from cognee.tasks.web_scraper.config import DefaultCrawlerConfig from cognee.tasks.web_scraper import cron_web_scraper_task @@ -14,7 +14,7 @@ async def test_web_scraping_using_bs4(): "authors": {"selector": ".quote small", "all": True}, } - soup_config = SoupCrawlerConfig( + soup_config = DefaultCrawlerConfig( concurrency=5, crawl_delay=0.5, timeout=15.0, @@ -47,7 +47,7 @@ async def test_web_scraping_using_bs4_and_incremental_loading(): url = "https://books.toscrape.com/" rules = {"titles": "article.product_pod h3 a", "prices": "article.product_pod p.price_color"} - soup_config = SoupCrawlerConfig( + soup_config = DefaultCrawlerConfig( concurrency=1, crawl_delay=0.1, timeout=10.0, From 16e1c609253f74a36061b49e3ef533e9b5490272 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 16:43:56 +0100 Subject: [PATCH 32/44] move bs4 html parsing into `bs4_loader` --- .../loaders/external/bs4_loader.py} | 24 ++++++++++++++++--- cognee/tasks/web_scraper/__init__.py | 1 - cognee/tasks/web_scraper/utils.py | 8 +++---- .../web_url_crawler/test_bs4_crawler.py | 4 ++-- 4 files changed, 26 insertions(+), 11 deletions(-) rename cognee/{tasks/web_scraper/bs4_crawler.py => 
infrastructure/loaders/external/bs4_loader.py} (89%) diff --git a/cognee/tasks/web_scraper/bs4_crawler.py b/cognee/infrastructure/loaders/external/bs4_loader.py similarity index 89% rename from cognee/tasks/web_scraper/bs4_crawler.py rename to cognee/infrastructure/loaders/external/bs4_loader.py index 171a76633..8022de04f 100644 --- a/cognee/tasks/web_scraper/bs4_crawler.py +++ b/cognee/infrastructure/loaders/external/bs4_loader.py @@ -5,9 +5,10 @@ from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. supports robots.txt handling, rate limiting, and custom extraction rules. """ -from typing import Union, List, Dict, Any, Optional +from typing import Union, Dict, Any, Optional, List from dataclasses import dataclass from bs4 import BeautifulSoup +from cognee.infrastructure.loaders import LoaderInterface from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) @@ -32,8 +33,7 @@ class ExtractionRule: join_with: str = " " -# TODO(daulet) refactor: This is no longer BeautifulSoup, rather just a crawler -class BeautifulSoupCrawler: +class BeautifulSoupLoader(LoaderInterface): """Crawler for fetching and extracting web content using BeautifulSoup. Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt @@ -50,6 +50,24 @@ class BeautifulSoupCrawler: robots_cache_ttl: Time-to-live for robots.txt cache in seconds. """ + @property + def supported_extensions(self) -> List[str]: + return ["html"] + + @property + def supported_mime_types(self) -> List[str]: + pass + + @property + def loader_name(self) -> str: + return "beautiful_soup_loader" + + def can_handle(self, extension: str, mime_type: str) -> bool: + pass + + async def load(self, file_path: str, **kwargs): + pass + def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. 
diff --git a/cognee/tasks/web_scraper/__init__.py b/cognee/tasks/web_scraper/__init__.py index f4d6677c7..26c3e68cf 100644 --- a/cognee/tasks/web_scraper/__init__.py +++ b/cognee/tasks/web_scraper/__init__.py @@ -5,7 +5,6 @@ data in a graph database. It includes classes and functions for crawling web pag BeautifulSoup or Tavily, defining data models, and handling scraping configurations. """ -from .bs4_crawler import BeautifulSoupCrawler from .utils import fetch_page_content from .web_scraper_task import cron_web_scraper_task, web_scraper_task from .default_url_crawler import DefaultUrlCrawler diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index 0cbd355a3..b1cbf82e9 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -9,7 +9,6 @@ from re import L from typing import List, Union, TypeAlias from cognee.shared.logging_utils import get_logger from .default_url_crawler import DefaultUrlCrawler -from .bs4_crawler import BeautifulSoupCrawler from .config import DefaultCrawlerConfig, TavilyConfig logger = get_logger(__name__) @@ -48,7 +47,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: if os.getenv("TAVILY_API_KEY"): logger.info("Using Tavily API for url fetching") - return await fetch_with_tavily(urls, tavily_config) + return await fetch_with_tavily(urls) else: logger.info("Using default crawler for content extraction") @@ -89,9 +88,7 @@ async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: await crawler.close() -async def fetch_with_tavily( - urls: Union[str, List[str]], tavily_config: TavilyConfig -) -> UrlsToHtmls: +async def fetch_with_tavily(urls: Union[str, List[str]]) -> UrlsToHtmls: """Fetch content from URLs using the Tavily API. 
Args: @@ -112,6 +109,7 @@ async def fetch_with_tavily( ) raise + tavily_config = TavilyConfig() url_list = [urls] if isinstance(urls, str) else urls extract_depth = tavily_config.extract_depth if tavily_config else "basic" timeout = tavily_config.timeout if tavily_config else 10 diff --git a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py index 0e7637d86..156cc87a4 100644 --- a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py @@ -1,10 +1,10 @@ import pytest -from cognee.tasks.web_scraper import BeautifulSoupCrawler +from cognee.tasks.web_scraper import DefaultUrlCrawler @pytest.mark.asyncio async def test_fetch(): - crawler = BeautifulSoupCrawler() + crawler = DefaultUrlCrawler() url = "https://en.wikipedia.org/wiki/Large_language_model" results = await crawler.fetch_urls(url) assert len(results) == 1 From 7210198f2ef4950cb40f6204e2355e597e3dd6ac Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 16:54:33 +0100 Subject: [PATCH 33/44] implement `bs4_loader.py` methods aside `load` yet --- cognee/infrastructure/loaders/external/bs4_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/infrastructure/loaders/external/bs4_loader.py b/cognee/infrastructure/loaders/external/bs4_loader.py index 8022de04f..ceea3f9de 100644 --- a/cognee/infrastructure/loaders/external/bs4_loader.py +++ b/cognee/infrastructure/loaders/external/bs4_loader.py @@ -56,14 +56,14 @@ class BeautifulSoupLoader(LoaderInterface): @property def supported_mime_types(self) -> List[str]: - pass + return ["text/html"] @property def loader_name(self) -> str: return "beautiful_soup_loader" def can_handle(self, extension: str, mime_type: str) -> bool: - pass + return extension in self.supported_extensions() and mime_type in self.supported_mime_types() async def load(self, file_path: str, **kwargs): pass 
From 322ef156cb5efce2d75bc7e0df0ebec9484903c9 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 17:10:45 +0100 Subject: [PATCH 34/44] redefine `preferred_loaders` param to allow for args per loader --- cognee/api/v1/add/add.py | 2 +- cognee/api/v1/update/update.py | 4 ++-- cognee/infrastructure/loaders/LoaderEngine.py | 6 ++++-- cognee/tasks/ingestion/data_item_to_text_file.py | 4 ++-- cognee/tasks/ingestion/ingest_data.py | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 216911ec0..73a3081be 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -23,7 +23,7 @@ async def add( vector_db_config: dict = None, graph_db_config: dict = None, dataset_id: Optional[UUID] = None, - preferred_loaders: List[str] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, incremental_loading: bool = True, data_per_batch: Optional[int] = 20, ): diff --git a/cognee/api/v1/update/update.py b/cognee/api/v1/update/update.py index a421b3dc0..83b92c50f 100644 --- a/cognee/api/v1/update/update.py +++ b/cognee/api/v1/update/update.py @@ -1,5 +1,5 @@ from uuid import UUID -from typing import Union, BinaryIO, List, Optional +from typing import Union, BinaryIO, List, Optional, Any from cognee.modules.users.models import User from cognee.api.v1.delete import delete @@ -15,7 +15,7 @@ async def update( node_set: Optional[List[str]] = None, vector_db_config: dict = None, graph_db_config: dict = None, - preferred_loaders: List[str] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, incremental_loading: bool = True, ): """ diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 6b62f7641..84ecee0de 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -64,7 +64,9 @@ class LoaderEngine: return True def get_loader( - self, file_path: str, 
preferred_loaders: List[str] = None + self, + file_path: str, + preferred_loaders: dict[str, dict[str, Any]], ) -> Optional[LoaderInterface]: """ Get appropriate loader for a file. @@ -105,7 +107,7 @@ class LoaderEngine: async def load_file( self, file_path: str, - preferred_loaders: Optional[List[str]] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, **kwargs, ): """ diff --git a/cognee/tasks/ingestion/data_item_to_text_file.py b/cognee/tasks/ingestion/data_item_to_text_file.py index 4b9e4bb23..0303f6c92 100644 --- a/cognee/tasks/ingestion/data_item_to_text_file.py +++ b/cognee/tasks/ingestion/data_item_to_text_file.py @@ -1,6 +1,6 @@ import os from urllib.parse import urlparse -from typing import List, Tuple +from typing import Any, List, Tuple from pathlib import Path import tempfile @@ -35,7 +35,7 @@ async def pull_from_s3(file_path, destination_file) -> None: async def data_item_to_text_file( data_item_path: str, - preferred_loaders: List[str], + preferred_loaders: dict[str, dict[str, Any]] = None, ) -> Tuple[str, LoaderInterface]: if isinstance(data_item_path, str): parsed_url = urlparse(data_item_path) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 02987b893..7b081cc34 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -27,7 +27,7 @@ async def ingest_data( user: User, node_set: Optional[List[str]] = None, dataset_id: UUID = None, - preferred_loaders: List[str] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, ): if not user: user = await get_default_user() @@ -44,7 +44,7 @@ async def ingest_data( user: User, node_set: Optional[List[str]] = None, dataset_id: UUID = None, - preferred_loaders: List[str] = None, + preferred_loaders: dict[str, dict[str, Any]] = None, ): new_datapoints = [] existing_data_points = [] From f84e31c626d74e00e1b9a1265997f5a6bac44d10 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 17:11:27 
+0100 Subject: [PATCH 35/44] `bs4_loader.py` -> `beautiful_soup_loader.py`, add to supported loaders --- cognee/infrastructure/loaders/external/__init__.py | 7 +++++++ .../external/{bs4_loader.py => beautiful_soup_loader.py} | 0 cognee/infrastructure/loaders/supported_loaders.py | 7 +++++++ 3 files changed, 14 insertions(+) rename cognee/infrastructure/loaders/external/{bs4_loader.py => beautiful_soup_loader.py} (100%) diff --git a/cognee/infrastructure/loaders/external/__init__.py b/cognee/infrastructure/loaders/external/__init__.py index 6bf9f9200..785338c09 100644 --- a/cognee/infrastructure/loaders/external/__init__.py +++ b/cognee/infrastructure/loaders/external/__init__.py @@ -27,3 +27,10 @@ try: __all__.append("AdvancedPdfLoader") except ImportError: pass + +try: + from .beautiful_soup_loader import BeautifulSoupLoader + + __all__.append("BeautifulSoupLoader") +except ImportError: + pass diff --git a/cognee/infrastructure/loaders/external/bs4_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py similarity index 100% rename from cognee/infrastructure/loaders/external/bs4_loader.py rename to cognee/infrastructure/loaders/external/beautiful_soup_loader.py diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py index d103babe3..156253b53 100644 --- a/cognee/infrastructure/loaders/supported_loaders.py +++ b/cognee/infrastructure/loaders/supported_loaders.py @@ -23,3 +23,10 @@ try: supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader except ImportError: pass + +try: + from cognee.infrastructure.loaders.external import BeautifulSoupLoader + + supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader +except ImportError: + pass From 03b4547b7f4e067c8dfed7259e7deff56049a170 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 20:10:43 +0100 Subject: [PATCH 36/44] validate e2e - urls are saved as htmls, and loaders are selected 
correctly --- cognee/infrastructure/loaders/LoaderEngine.py | 11 ++ .../loaders/external/beautiful_soup_loader.py | 21 ++- cognee/modules/ingestion/save_data_to_file.py | 11 +- cognee/tasks/ingestion/ingest_data.py | 4 + .../ingestion/save_data_item_to_storage.py | 6 +- cognee/tasks/web_scraper/config.py | 1 - .../tasks/web_scraper/default_url_crawler.py | 2 +- cognee/tasks/web_scraper/types.py | 4 + cognee/tasks/web_scraper/utils.py | 6 +- .../integration/web_url_crawler/test_add.py | 160 ++++++++++++++---- examples/python/web_url_fetcher_example.py | 1 + 11 files changed, 182 insertions(+), 45 deletions(-) create mode 100644 cognee/tasks/web_scraper/types.py diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 84ecee0de..1a47eea56 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -78,14 +78,21 @@ class LoaderEngine: Returns: LoaderInterface that can handle the file, or None if not found """ + from pathlib import Path file_info = filetype.guess(file_path) + path_extension = Path(file_path).suffix.lstrip(".") + # Try preferred loaders first if preferred_loaders: for loader_name in preferred_loaders: if loader_name in self._loaders: loader = self._loaders[loader_name] + # Try with path extension first (for text formats like html) + if loader.can_handle(extension=path_extension, mime_type=file_info.mime): + return loader + # Fall back to content-detected extension if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): return loader else: @@ -95,6 +102,10 @@ class LoaderEngine: for loader_name in self.default_loader_priority: if loader_name in self._loaders: loader = self._loaders[loader_name] + # Try with path extension first (for text formats like html) + if loader.can_handle(extension=path_extension, mime_type=file_info.mime): + return loader + # Fall back to content-detected extension if 
loader.can_handle(extension=file_info.extension, mime_type=file_info.mime): return loader else: diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py index ceea3f9de..05330a095 100644 --- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py +++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py @@ -8,7 +8,7 @@ supports robots.txt handling, rate limiting, and custom extraction rules. from typing import Union, Dict, Any, Optional, List from dataclasses import dataclass from bs4 import BeautifulSoup -from cognee.infrastructure.loaders import LoaderInterface +from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) @@ -56,17 +56,30 @@ class BeautifulSoupLoader(LoaderInterface): @property def supported_mime_types(self) -> List[str]: - return ["text/html"] + return ["text/html", "text/plain"] @property def loader_name(self) -> str: return "beautiful_soup_loader" def can_handle(self, extension: str, mime_type: str) -> bool: - return extension in self.supported_extensions() and mime_type in self.supported_mime_types() + can = extension in self.supported_extensions and mime_type in self.supported_mime_types + return can async def load(self, file_path: str, **kwargs): - pass + """Load an HTML file and return its path. + + For HTML files stored on disk, we simply return the file path + since the content is already in text format and can be processed directly. + + Args: + file_path: Path to the HTML file + **kwargs: Additional arguments + + Returns: + The file path to the HTML file + """ + raise NotImplementedError def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. 
diff --git a/cognee/modules/ingestion/save_data_to_file.py b/cognee/modules/ingestion/save_data_to_file.py index 0ba0b2983..42e8d45ba 100644 --- a/cognee/modules/ingestion/save_data_to_file.py +++ b/cognee/modules/ingestion/save_data_to_file.py @@ -1,10 +1,12 @@ -from typing import BinaryIO, Union +from typing import BinaryIO, Union, Optional from cognee.infrastructure.files.storage import get_file_storage, get_storage_config from .classify import classify import hashlib -async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None): +async def save_data_to_file( + data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None +): storage_config = get_storage_config() data_root_directory = storage_config["data_root_directory"] @@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None): file_name = file_metadata["name"] + if file_extension is not None: + extension = file_extension.lstrip(".") + file_name_without_ext = file_name.rsplit(".", 1)[0] + file_name = f"{file_name_without_ext}.{extension}" + storage = get_file_storage(data_root_directory) full_file_path = await storage.store(file_name, data) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 7b081cc34..25b2aa6ae 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional import cognee.modules.ingestion as ingestion from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.data.models import Data +from cognee.modules.ingestion.exceptions import IngestionError from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets @@ -88,6 +89,9 @@ async def ingest_data( preferred_loaders, ) + if loader_engine is None: + 
raise IngestionError("Loader cannot be None") + # Find metadata from original file # Standard flow: extract metadata from both original and stored files async with open_data_file(original_file_path) as file: diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py index 453219f15..05d21e617 100644 --- a/cognee/tasks/ingestion/save_data_item_to_storage.py +++ b/cognee/tasks/ingestion/save_data_item_to_storage.py @@ -8,7 +8,7 @@ from cognee.modules.ingestion import save_data_to_file from cognee.shared.logging_utils import get_logger from pydantic_settings import BaseSettings, SettingsConfigDict -from cognee.tasks.ingestion.data_fetchers.web_url_fetcher import WebUrlFetcher +from cognee.tasks.web_scraper.utils import fetch_page_content logger = get_logger() @@ -58,8 +58,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str if parsed_url.scheme == "s3": return data_item elif parsed_url.scheme == "http" or parsed_url.scheme == "https": - fetcher = WebUrlFetcher() - return await fetcher.fetch(data_item) + urls_to_page_contents = await fetch_page_content(data_item) + return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html") # data is local file path elif parsed_url.scheme == "file": if settings.accept_local_file_path: diff --git a/cognee/tasks/web_scraper/config.py b/cognee/tasks/web_scraper/config.py index fcf22ab33..f23156f95 100644 --- a/cognee/tasks/web_scraper/config.py +++ b/cognee/tasks/web_scraper/config.py @@ -20,7 +20,6 @@ class DefaultCrawlerConfig(BaseModel): max_retries: int = 2 retry_delay_factor: float = 0.5 headers: Optional[Dict[str, str]] = None - extraction_rules: Dict[str, Any] use_playwright: bool = False playwright_js_wait: float = 0.8 robots_cache_ttl: float = 3600.0 diff --git a/cognee/tasks/web_scraper/default_url_crawler.py b/cognee/tasks/web_scraper/default_url_crawler.py index d9d2ee922..d09bf3e80 100644 --- 
a/cognee/tasks/web_scraper/default_url_crawler.py +++ b/cognee/tasks/web_scraper/default_url_crawler.py @@ -7,7 +7,7 @@ from urllib.parse import urlparse import httpx from cognee.shared.logging_utils import get_logger -from cognee.tasks.web_scraper.utils import UrlsToHtmls +from cognee.tasks.web_scraper.types import UrlsToHtmls logger = get_logger() diff --git a/cognee/tasks/web_scraper/types.py b/cognee/tasks/web_scraper/types.py new file mode 100644 index 000000000..54a3f5d42 --- /dev/null +++ b/cognee/tasks/web_scraper/types.py @@ -0,0 +1,4 @@ +from typing import TypeAlias + + +UrlsToHtmls: TypeAlias = dict[str, str] diff --git a/cognee/tasks/web_scraper/utils.py b/cognee/tasks/web_scraper/utils.py index b1cbf82e9..1f51bf98d 100644 --- a/cognee/tasks/web_scraper/utils.py +++ b/cognee/tasks/web_scraper/utils.py @@ -5,16 +5,14 @@ both BeautifulSoup for custom extraction rules and Tavily for API-based scraping """ import os -from re import L -from typing import List, Union, TypeAlias +from typing import List, Union from cognee.shared.logging_utils import get_logger +from cognee.tasks.web_scraper.types import UrlsToHtmls from .default_url_crawler import DefaultUrlCrawler from .config import DefaultCrawlerConfig, TavilyConfig logger = get_logger(__name__) -UrlsToHtmls: TypeAlias = dict[str, str] - async def fetch_page_content(urls: Union[str, List[str]]) -> UrlsToHtmls: """Fetch content from one or more URLs using the specified tool. 
diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_add.py index a00ca9e0d..27a627680 100644 --- a/cognee/tests/integration/web_url_crawler/test_add.py +++ b/cognee/tests/integration/web_url_crawler/test_add.py @@ -1,37 +1,76 @@ -from sys import exc_info import pytest import cognee -from cognee.modules.ingestion.exceptions.exceptions import IngestionError +from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path +from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine +from cognee.infrastructure.loaders.external.beautiful_soup_loader import BeautifulSoupLoader +from cognee.tasks.ingestion import save_data_item_to_storage +from pathlib import Path @pytest.mark.asyncio -async def test_add_fails_when_web_url_fetcher_config_not_specified(): - from cognee.shared.logging_utils import setup_logging, ERROR - - setup_logging(log_level=ERROR) +async def test_url_saves_as_html_file(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - with pytest.raises(IngestionError) as excinfo: - await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", - incremental_loading=False, + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" ) - assert excinfo.value.message.startswith( - "web_url_fetcher configuration must be a valid dictionary" - ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") @pytest.mark.asyncio -async def test_add_succesfully_adds_url_when_fetcher_config_specified(): +async def test_saved_html_is_valid(): + try: + from bs4 import BeautifulSoup + except ImportError: + pytest.fail("Test case requires bs4 installed") + await cognee.prune.prune_data() await 
cognee.prune.prune_system(metadata=True) - extraction_rules = { - "title": {"selector": "title"}, - "headings": {"selector": "h1, h2, h3", "all": True}, - "links": {"selector": "a", "attr": "href", "all": True}, - "paragraphs": {"selector": "p", "all": True}, - } + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + content = Path(file_path).read_text() + + soup = BeautifulSoup(content, "html.parser") + assert soup.find() is not None, "File should contain parseable HTML" + + has_html_elements = any( + [ + soup.find("html"), + soup.find("head"), + soup.find("body"), + soup.find("div"), + soup.find("p"), + ] + ) + assert has_html_elements, "File should contain common HTML elements" + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_add_url(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + await cognee.add("https://en.wikipedia.org/wiki/Large_language_model") + + +@pytest.mark.asyncio +async def test_add_url_without_incremental_loading(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) try: await cognee.add( @@ -43,17 +82,10 @@ async def test_add_succesfully_adds_url_when_fetcher_config_specified(): @pytest.mark.asyncio -async def test_add_with_incremental_loading_works(): +async def test_add_url_with_incremental_loading(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - extraction_rules = { - "title": {"selector": "title"}, - "headings": {"selector": "h1, h2, h3", "all": True}, - "links": {"selector": "a", "attr": "href", "all": True}, - "paragraphs": {"selector": "p", "all": True}, - } - try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", @@ -64,7 +96,7 @@ async def test_add_with_incremental_loading_works(): @pytest.mark.asyncio -async def 
test_add_without_incremental_loading_works(): +async def test_add_url_with_extraction_rules(): # TODO: this'll fail due to not implemented `load()` yet await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -78,7 +110,75 @@ async def test_add_without_incremental_loading_works(): try: await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", - incremental_loading=False, + preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) except Exception as e: pytest.fail(f"Failed to add url: {e}") + + +@pytest.mark.asyncio +async def test_loader_is_none_by_default(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + + loader_engine = LoaderEngine() + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + loader = loader_engine.get_loader( + file_path, + preferred_loaders=preferred_loaders, + ) + + assert loader is None + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_provided(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + try: + 
original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + + loader_engine = LoaderEngine() + bs_loader = BeautifulSoupLoader() + loader_engine.register_loader(bs_loader) + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + loader = loader_engine.get_loader( + file_path, + preferred_loaders=preferred_loaders, + ) + + assert loader == bs_loader + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") diff --git a/examples/python/web_url_fetcher_example.py b/examples/python/web_url_fetcher_example.py index 2195a62c0..aff8094bf 100644 --- a/examples/python/web_url_fetcher_example.py +++ b/examples/python/web_url_fetcher_example.py @@ -23,6 +23,7 @@ async def main(): await cognee.add( "https://en.wikipedia.org/wiki/Large_language_model", incremental_loading=False, + preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) await cognee.cognify() From ed4eba4c4415b310d835e187326abb625887f476 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 21:55:06 +0100 Subject: [PATCH 37/44] add back in-code comments for `ingest_data` --- cognee/tasks/ingestion/ingest_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 25b2aa6ae..0572d0f1e 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -96,9 +96,13 @@ async def ingest_data( # Standard flow: extract metadata from both original and stored files async with open_data_file(original_file_path) as file: classified_data = ingestion.classify(file) + + # data_id is the hash of original file contents + owner id to avoid duplicate data + data_id = 
ingestion.identify(classified_data, user) original_file_metadata = classified_data.get_metadata() + # Find metadata from Cognee data storage text file async with open_data_file(cognee_storage_file_path) as file: classified_data = ingestion.classify(file) storage_file_metadata = classified_data.get_metadata() From 6895813ae88b49335500d4ea13b8c270f96e1e07 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 22:10:42 +0100 Subject: [PATCH 38/44] tests: name integration tests more meaningfully --- .../{test_bs4_crawler.py => test_default_url_crawler.py} | 0 .../web_url_crawler/{test_add.py => test_url_adding_e2e.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename cognee/tests/integration/web_url_crawler/{test_bs4_crawler.py => test_default_url_crawler.py} (100%) rename cognee/tests/integration/web_url_crawler/{test_add.py => test_url_adding_e2e.py} (100%) diff --git a/cognee/tests/integration/web_url_crawler/test_bs4_crawler.py b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py similarity index 100% rename from cognee/tests/integration/web_url_crawler/test_bs4_crawler.py rename to cognee/tests/integration/web_url_crawler/test_default_url_crawler.py diff --git a/cognee/tests/integration/web_url_crawler/test_add.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py similarity index 100% rename from cognee/tests/integration/web_url_crawler/test_add.py rename to cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py From 0f6aac19e8aef5f071a1c74fa45ad80c97d2ac4f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 22:35:23 +0100 Subject: [PATCH 39/44] TDD: add test cases and finish loading stage --- cognee/infrastructure/loaders/LoaderEngine.py | 14 +- .../loaders/external/beautiful_soup_loader.py | 60 +++++++-- .../web_url_crawler/test_url_adding_e2e.py | 126 ++++++++++++++++++ 3 files changed, 189 insertions(+), 11 deletions(-) diff --git 
a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 1a47eea56..725f37b14 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -126,7 +126,7 @@ class LoaderEngine: Args: file_path: Path to the file to be processed - preferred_loaders: List of preferred loader names to try first + preferred_loaders: Dict of loader names to their configurations **kwargs: Additional loader-specific configuration Raises: @@ -138,8 +138,16 @@ class LoaderEngine: raise ValueError(f"No loader found for file: {file_path}") logger.debug(f"Loading {file_path} with {loader.loader_name}") - # TODO: loading needs to be reworked to work with both file streams and file locations - return await loader.load(file_path, **kwargs) + + # Extract loader-specific config from preferred_loaders + loader_config = {} + if preferred_loaders and loader.loader_name in preferred_loaders: + loader_config = preferred_loaders[loader.loader_name] + + # Merge with any additional kwargs (kwargs take precedence) + merged_kwargs = {**loader_config, **kwargs} + + return await loader.load(file_path, **merged_kwargs) def get_available_loaders(self) -> List[str]: """ diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py index 05330a095..bd6d8025b 100644 --- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py +++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py @@ -66,20 +66,64 @@ class BeautifulSoupLoader(LoaderInterface): can = extension in self.supported_extensions and mime_type in self.supported_mime_types return can - async def load(self, file_path: str, **kwargs): - """Load an HTML file and return its path. - - For HTML files stored on disk, we simply return the file path - since the content is already in text format and can be processed directly. 
+ async def load( + self, + file_path: str, + extraction_rules: dict[str, Any] = None, + join_all_matches: bool = False, + **kwargs, + ): + """Load an HTML file, extract content, and save to storage. Args: file_path: Path to the HTML file + extraction_rules: Dict of CSS selector rules for content extraction + join_all_matches: If True, extract all matching elements for each rule **kwargs: Additional arguments Returns: - The file path to the HTML file + Path to the stored extracted text file """ - raise NotImplementedError + if extraction_rules is None: + raise ValueError("extraction_rules required for BeautifulSoupLoader") + + logger.info(f"Processing HTML file: {file_path}") + + from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata + from cognee.infrastructure.files.storage import get_file_storage, get_storage_config + + with open(file_path, "rb") as f: + file_metadata = await get_file_metadata(f) + f.seek(0) + html = f.read() + + storage_file_name = "text_" + file_metadata["content_hash"] + ".txt" + + # Normalize extraction rules + normalized_rules: List[ExtractionRule] = [] + for _, rule in extraction_rules.items(): + r = self._normalize_rule(rule) + if join_all_matches: + r.all = True + normalized_rules.append(r) + + pieces = [] + for rule in normalized_rules: + text = self._extract_from_html(html, rule) + if text: + pieces.append(text) + + full_content = " ".join(pieces).strip() + + # Store the extracted content + storage_config = get_storage_config() + data_root_directory = storage_config["data_root_directory"] + storage = get_file_storage(data_root_directory) + + full_file_path = await storage.store(storage_file_name, full_content) + + logger.info(f"Extracted {len(full_content)} characters from HTML") + return full_file_path def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule: """Normalize an extraction rule to an ExtractionRule dataclass. 
@@ -105,7 +149,7 @@ class BeautifulSoupLoader(LoaderInterface): ) raise ValueError(f"Invalid extraction rule: {rule}") - def extract(self, html: str, rule: ExtractionRule) -> str: + def _extract_from_html(self, html: str, rule: ExtractionRule) -> str: """Extract content from HTML using BeautifulSoup or lxml XPath. Args: diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index 27a627680..afe2dce7f 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -182,3 +182,129 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov assert loader == bs_loader except Exception as e: pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_beautiful_soup_loader_raises_if_required_args_are_missing(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + + loader_engine = LoaderEngine() + bs_loader = BeautifulSoupLoader() + loader_engine.register_loader(bs_loader) + preferred_loaders = {"beautiful_soup_loader": {}} + with pytest.raises(ValueError): + await loader_engine.load_file( + file_path, + preferred_loaders=preferred_loaders, + ) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + await loader_engine.load_file( + file_path, + 
preferred_loaders=preferred_loaders, + ) + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + file = Path(file_path) + assert file.exists() + assert file.stat().st_size > 0 + + loader_engine = LoaderEngine() + bs_loader = BeautifulSoupLoader() + loader_engine.register_loader(bs_loader) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + await loader_engine.load_file( + file_path, + preferred_loaders=preferred_loaders, + ) + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") + + +@pytest.mark.asyncio +async def test_beautiful_soup_loads_file_successfully(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + extraction_rules = { + "title": {"selector": "title"}, + "headings": {"selector": "h1, h2, h3", "all": True}, + "links": {"selector": "a", "attr": "href", "all": True}, + "paragraphs": {"selector": "p", "all": True}, + } + + try: + original_file_path = await save_data_item_to_storage( + "https://en.wikipedia.org/wiki/Large_language_model" + ) + file_path = get_data_file_path(original_file_path) + assert file_path.endswith(".html") + original_file = Path(file_path) + assert original_file.exists() + assert original_file.stat().st_size > 0 + + loader_engine = LoaderEngine() + bs_loader = BeautifulSoupLoader() 
+ loader_engine.register_loader(bs_loader) + preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}} + loader = loader_engine.get_loader( + file_path, + preferred_loaders=preferred_loaders, + ) + + assert loader == bs_loader + + cognee_loaded_txt_path = await loader_engine.load_file( + file_path=file_path, preferred_loaders=preferred_loaders + ) + + cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path) + + assert cognee_loaded_txt_path.endswith(".txt") + + extracted_file = Path(cognee_loaded_txt_path) + + assert extracted_file.exists() + assert extracted_file.stat().st_size > 0 + + original_basename = original_file.stem + extracted_basename = extracted_file.stem + assert original_basename == extracted_basename, ( + f"Expected same base name: {original_basename} vs {extracted_basename}" + ) + except Exception as e: + pytest.fail(f"Failed to save data item to storage: {e}") From f02aa1abfc4f1f098621383ec6edd57c1fca2fb4 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:02:25 +0100 Subject: [PATCH 40/44] ruff format --- cognee/context_global_variables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index aad53341a..d52de4b4e 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -14,6 +14,7 @@ vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) + async def set_session_user_context_variable(user): session_user.set(user) From 3f5c09eb45a52e23623bc2ef32e83d251f9afc1f Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:11:01 +0100 Subject: [PATCH 41/44] lazy load `cron_web_scraper_task` and `web_scraper_task` --- cognee/tasks/web_scraper/__init__.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git 
a/cognee/tasks/web_scraper/__init__.py b/cognee/tasks/web_scraper/__init__.py index 26c3e68cf..d52129c05 100644 --- a/cognee/tasks/web_scraper/__init__.py +++ b/cognee/tasks/web_scraper/__init__.py @@ -6,9 +6,24 @@ BeautifulSoup or Tavily, defining data models, and handling scraping configurati """ from .utils import fetch_page_content -from .web_scraper_task import cron_web_scraper_task, web_scraper_task from .default_url_crawler import DefaultUrlCrawler +# Lazy import for web_scraper_task to avoid requiring apscheduler +# Import these directly if needed: from cognee.tasks.web_scraper.web_scraper_task import ... + + +def __getattr__(name): + """Lazy load web scraper task functions that require apscheduler.""" + if name == "cron_web_scraper_task": + from .web_scraper_task import cron_web_scraper_task + + return cron_web_scraper_task + elif name == "web_scraper_task": + from .web_scraper_task import web_scraper_task + + return web_scraper_task + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + __all__ = [ "BeautifulSoupCrawler", From a35bcecdf9dd386e62b07e912c788d0bf20682b4 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:13:40 +0100 Subject: [PATCH 42/44] refactor tavily_crawler test --- .../tests/integration/web_url_crawler/test_tavily_crawler.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 7edb9b8d3..50b409f8f 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -1,14 +1,11 @@ -import os import pytest -from cognee.tasks.web_scraper.config import TavilyConfig from cognee.tasks.web_scraper.utils import fetch_with_tavily @pytest.mark.asyncio async def test_fetch(): url = "https://en.wikipedia.org/wiki/Large_language_model" - tavily_config = TavilyConfig() - 
results = await fetch_with_tavily(url, tavily_config) + results = await fetch_with_tavily(url) assert len(results) == 1 assert isinstance(results, dict) html = results[url] From 20c9e5498b5179cfae0ac56a2579b7b45b1f0b85 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:27:18 +0100 Subject: [PATCH 43/44] skip tavily in Github CI for now --- .../integration/web_url_crawler/test_tavily_crawler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 50b409f8f..946ce8378 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -1,12 +1,19 @@ +import os import pytest from cognee.tasks.web_scraper.utils import fetch_with_tavily +skip_in_ci = pytest.mark.skipif( + os.getenv("GITHUB_ACTIONS") == "true", + reason="Skipping in Github for now - before we get TAVILY_API_KEY", +) + +@skip_in_ci @pytest.mark.asyncio async def test_fetch(): url = "https://en.wikipedia.org/wiki/Large_language_model" results = await fetch_with_tavily(url) - assert len(results) == 1 assert isinstance(results, dict) + assert len(results) == 1 html = results[url] assert isinstance(html, str) From 10e4fd7681833013c358f90d2ac7633fea7ec112 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Tue, 21 Oct 2025 23:46:21 +0100 Subject: [PATCH 44/44] Make BS4 loader compatible with tavily fetcher --- .../loaders/external/beautiful_soup_loader.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py index bd6d8025b..04954a228 100644 --- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py +++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py @@ -115,6 +115,23 @@ class 
BeautifulSoupLoader(LoaderInterface): full_content = " ".join(pieces).strip() + # Fallback: If no content extracted, check if the file is plain text (not HTML) + if not full_content: + from bs4 import BeautifulSoup + + soup = BeautifulSoup(html, "html.parser") + # If there are no HTML tags, treat as plain text + if not soup.find(): + logger.warning( + f"No HTML tags found in {file_path}. Treating as plain text. " + "This may happen when content is pre-extracted (e.g., via Tavily with text format)." + ) + full_content = html.decode("utf-8") if isinstance(html, bytes) else html + full_content = full_content.strip() + + if not full_content: + logger.warning(f"No content extracted from HTML file: {file_path}") + # Store the extracted content storage_config = get_storage_config() data_root_directory = storage_config["data_root_directory"]