From 0f6aac19e8aef5f071a1c74fa45ad80c97d2ac4f Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov
Date: Tue, 21 Oct 2025 22:35:23 +0100
Subject: [PATCH] TDD: add test cases and finish loading stage

---
 cognee/infrastructure/loaders/LoaderEngine.py |  14 +-
 .../loaders/external/beautiful_soup_loader.py |  60 +++++++--
 .../web_url_crawler/test_url_adding_e2e.py    | 126 ++++++++++++++++++
 3 files changed, 189 insertions(+), 11 deletions(-)

diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py
index 1a47eea56..725f37b14 100644
--- a/cognee/infrastructure/loaders/LoaderEngine.py
+++ b/cognee/infrastructure/loaders/LoaderEngine.py
@@ -126,7 +126,7 @@ class LoaderEngine:
 
         Args:
             file_path: Path to the file to be processed
-            preferred_loaders: List of preferred loader names to try first
+            preferred_loaders: Dict mapping loader names to their configurations
             **kwargs: Additional loader-specific configuration
 
         Raises:
@@ -138,8 +138,16 @@ class LoaderEngine:
             raise ValueError(f"No loader found for file: {file_path}")
 
         logger.debug(f"Loading {file_path} with {loader.loader_name}")
-        # TODO: loading needs to be reworked to work with both file streams and file locations
-        return await loader.load(file_path, **kwargs)
+
+        # Extract loader-specific config from preferred_loaders
+        loader_config = {}
+        if preferred_loaders and loader.loader_name in preferred_loaders:
+            loader_config = preferred_loaders[loader.loader_name]
+
+        # Merge with any additional kwargs (kwargs take precedence)
+        merged_kwargs = {**loader_config, **kwargs}
+
+        return await loader.load(file_path, **merged_kwargs)
 
     def get_available_loaders(self) -> List[str]:
         """
diff --git a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
index 05330a095..bd6d8025b 100644
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@@ -66,20 +66,64 @@ class BeautifulSoupLoader(LoaderInterface):
         can = extension in self.supported_extensions and mime_type in self.supported_mime_types
         return can
 
-    async def load(self, file_path: str, **kwargs):
-        """Load an HTML file and return its path.
-
-        For HTML files stored on disk, we simply return the file path
-        since the content is already in text format and can be processed directly.
+    async def load(
+        self,
+        file_path: str,
+        extraction_rules: Dict[str, Any] = None,
+        join_all_matches: bool = False,
+        **kwargs,
+    ):
+        """Load an HTML file, extract content, and save to storage.
 
         Args:
             file_path: Path to the HTML file
+            extraction_rules: Dict mapping rule names to CSS selector or XPath rules
+            join_all_matches: If True, extract all matching elements for each rule
             **kwargs: Additional arguments
 
         Returns:
-            The file path to the HTML file
+            Path to the stored extracted text file
         """
-        raise NotImplementedError
+        if extraction_rules is None:
+            raise ValueError("extraction_rules is required for BeautifulSoupLoader")
+
+        logger.info(f"Processing HTML file: {file_path}")
+
+        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            f.seek(0)
+            html = f.read()
+
+        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        # Normalize extraction rules
+        normalized_rules: List[ExtractionRule] = []
+        for rule in extraction_rules.values():
+            r = self._normalize_rule(rule)
+            if join_all_matches:
+                r.all = True
+            normalized_rules.append(r)
+
+        pieces = []
+        for rule in normalized_rules:
+            text = self._extract_from_html(html, rule)
+            if text:
+                pieces.append(text)
+
+        full_content = " ".join(pieces).strip()
+
+        # Store the extracted content
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+
+        full_file_path = await storage.store(storage_file_name, full_content)
+
+        logger.info(f"Extracted {len(full_content)} characters from HTML")
+        return full_file_path
 
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
@@ -105,7 +149,7 @@ class BeautifulSoupLoader(LoaderInterface):
             )
             raise ValueError(f"Invalid extraction rule: {rule}")
 
-    def extract(self, html: str, rule: ExtractionRule) -> str:
+    def _extract_from_html(self, html: Union[str, bytes], rule: ExtractionRule) -> str:
         """Extract content from HTML using BeautifulSoup or lxml XPath.
 
         Args:
diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
index 27a627680..afe2dce7f 100644
--- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
+++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py
@@ -182,3 +182,129 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov
         assert loader == bs_loader
     except Exception as e:
         pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {}}
+        with pytest.raises(ValueError):
+            await loader_engine.load_file(
+                file_path,
+                preferred_loaders=preferred_loaders,
+            )
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"BeautifulSoup loader missing-args test failed: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"BeautifulSoup loader failed to load file: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loads_file_successfully():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        original_file = Path(file_path)
+        assert original_file.exists()
+        assert original_file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader == bs_loader
+
+        cognee_loaded_txt_path = await loader_engine.load_file(
+            file_path=file_path, preferred_loaders=preferred_loaders
+        )
+
+        cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
+
+        assert cognee_loaded_txt_path.endswith(".txt")
+
+        extracted_file = Path(cognee_loaded_txt_path)
+
+        assert extracted_file.exists()
+        assert extracted_file.stat().st_size > 0
+
+        original_basename = original_file.stem
+        extracted_basename = extracted_file.stem
+        assert original_basename == extracted_basename, (
+            f"Expected same base name: {original_basename} vs {extracted_basename}"
+        )
+    except Exception as e:
+        pytest.fail(f"BeautifulSoup loader end-to-end test failed: {e}")
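
Notes (usage sketch, not part of the diff):

With this patch, per-loader configuration travels through preferred_loaders,
keyed by loader_name, and load_file() merges that config with call-site kwargs
(kwargs win on conflicts). A minimal sketch of the call path; the file name
"page.html" and the asyncio runner are illustrative, not taken from this patch:

    import asyncio

    from cognee.infrastructure.loaders.LoaderEngine import LoaderEngine
    from cognee.infrastructure.loaders.external.beautiful_soup_loader import (
        BeautifulSoupLoader,
    )

    async def main():
        engine = LoaderEngine()
        engine.register_loader(BeautifulSoupLoader())

        # Config is keyed by loader_name; load_file() merges it with any
        # call-site kwargs, with kwargs taking precedence.
        preferred_loaders = {
            "beautiful_soup_loader": {
                "extraction_rules": {
                    "title": {"selector": "title"},
                    "paragraphs": {"selector": "p", "all": True},
                }
            }
        }

        # Returns the storage path of the extracted text
        # ("text_<content_hash>.txt").
        txt_path = await engine.load_file(
            "page.html", preferred_loaders=preferred_loaders
        )
        print(txt_path)

    asyncio.run(main())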
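The rule shapes accepted per entry in extraction_rules, as exercised by the
tests; the bare-string form is an assumption inferred from the
Union[str, Dict[str, Any]] signature of _normalize_rule:

    rules = {
        # Dict form, as used in the tests above:
        "headings": {"selector": "h1, h2, h3", "all": True},       # every match
        "links": {"selector": "a", "attr": "href", "all": True},   # attribute values, not text
        # Bare selector string (assumed from the Union[str, ...] signature):
        "title": "title",
    }

load() joins the per-rule extractions with single spaces and stores the result
as text_<content_hash>.txt under the configured data_root_directory;
join_all_matches=True forces the "all" behaviour onto every rule.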
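The new tests fetch a live Wikipedia page, so they need network access. To run
just them (the -k expression is illustrative):

    pytest cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py -k beautiful_soup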