TDD: add test cases and finish loading stage
parent 6895813ae8
commit 0f6aac19e8

3 changed files with 189 additions and 11 deletions
@@ -126,7 +126,7 @@ class LoaderEngine:
 
         Args:
             file_path: Path to the file to be processed
-            preferred_loaders: List of preferred loader names to try first
+            preferred_loaders: Dict of loader names to their configurations
             **kwargs: Additional loader-specific configuration
 
         Raises:
@@ -138,8 +138,16 @@ class LoaderEngine:
             raise ValueError(f"No loader found for file: {file_path}")
 
         logger.debug(f"Loading {file_path} with {loader.loader_name}")
-        # TODO: loading needs to be reworked to work with both file streams and file locations
-        return await loader.load(file_path, **kwargs)
+
+        # Extract loader-specific config from preferred_loaders
+        loader_config = {}
+        if preferred_loaders and loader.loader_name in preferred_loaders:
+            loader_config = preferred_loaders[loader.loader_name]
+
+        # Merge with any additional kwargs (kwargs take precedence)
+        merged_kwargs = {**loader_config, **kwargs}
+
+        return await loader.load(file_path, **merged_kwargs)
 
     def get_available_loaders(self) -> List[str]:
         """
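Note on the new merge step: the per-loader entry from preferred_loaders is unpacked first and the call-site kwargs second, so explicit kwargs win. A standalone sketch of that ordering with plain dicts (this does not touch the real LoaderEngine):

# loader_config stands in for preferred_loaders[loader.loader_name],
# kwargs for explicit arguments passed to load_file.
loader_config = {"join_all_matches": True, "extraction_rules": {"title": {"selector": "title"}}}
kwargs = {"join_all_matches": False}

merged_kwargs = {**loader_config, **kwargs}  # later unpacking wins, so kwargs take precedence

assert merged_kwargs["join_all_matches"] is False
assert "extraction_rules" in merged_kwargs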
@@ -66,20 +66,64 @@ class BeautifulSoupLoader(LoaderInterface):
         can = extension in self.supported_extensions and mime_type in self.supported_mime_types
         return can
 
-    async def load(self, file_path: str, **kwargs):
-        """Load an HTML file and return its path.
-
-        For HTML files stored on disk, we simply return the file path
-        since the content is already in text format and can be processed directly.
+    async def load(
+        self,
+        file_path: str,
+        extraction_rules: dict[str, Any] = None,
+        join_all_matches: bool = False,
+        **kwargs,
+    ):
+        """Load an HTML file, extract content, and save to storage.
 
         Args:
             file_path: Path to the HTML file
+            extraction_rules: Dict of CSS selector rules for content extraction
+            join_all_matches: If True, extract all matching elements for each rule
             **kwargs: Additional arguments
 
         Returns:
-            The file path to the HTML file
+            Path to the stored extracted text file
         """
-        raise NotImplementedError
+        if extraction_rules is None:
+            raise ValueError("extraction_rules required for BeautifulSoupLoader")
+
+        logger.info(f"Processing HTML file: {file_path}")
+
+        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            f.seek(0)
+            html = f.read()
+
+        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        # Normalize extraction rules
+        normalized_rules: List[ExtractionRule] = []
+        for _, rule in extraction_rules.items():
+            r = self._normalize_rule(rule)
+            if join_all_matches:
+                r.all = True
+            normalized_rules.append(r)
+
+        pieces = []
+        for rule in normalized_rules:
+            text = self._extract_from_html(html, rule)
+            if text:
+                pieces.append(text)
+
+        full_content = " ".join(pieces).strip()
+
+        # Store the extracted content
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+
+        full_file_path = await storage.store(storage_file_name, full_content)
+
+        logger.info(f"Extracted {len(full_content)} characters from HTML")
+        return full_file_path
 
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
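The ExtractionRule type used above is not shown in this commit. As a hypothetical reference only, inferred from the fields the new code and the tests below touch (selector, attr, all), it could be shaped roughly like:

# Hypothetical sketch of ExtractionRule; the real dataclass in the repository may differ
# (the docstring also mentions lxml XPath, which this sketch omits).
from dataclasses import dataclass
from typing import Optional


@dataclass
class ExtractionRule:
    selector: Optional[str] = None  # CSS selector, e.g. "h1, h2, h3"
    attr: Optional[str] = None      # attribute to read instead of element text, e.g. "href"
    all: bool = False               # True: collect every match, not just the first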
@@ -105,7 +149,7 @@ class BeautifulSoupLoader(LoaderInterface):
             )
         raise ValueError(f"Invalid extraction rule: {rule}")
 
-    def extract(self, html: str, rule: ExtractionRule) -> str:
+    def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
         """Extract content from HTML using BeautifulSoup or lxml XPath.
 
         Args:
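The body of the renamed _extract_from_html is outside this diff; its docstring says it works with BeautifulSoup or lxml XPath. A rough, hypothetical illustration of the CSS-selector side only (assumes bs4 is installed; not the repository's implementation):

# Hypothetical helper mirroring what a CSS-selector branch could do with a rule:
# read element text or a named attribute, for the first match or for all matches.
from bs4 import BeautifulSoup


def extract_with_css(html: bytes, selector: str, attr=None, all_matches=False) -> str:
    soup = BeautifulSoup(html, "html.parser")
    elements = soup.select(selector)
    if not all_matches:
        elements = elements[:1]
    values = [
        (el.get(attr) or "") if attr else el.get_text(" ", strip=True)
        for el in elements
    ]
    return " ".join(v for v in values if v)

For the "links" rule in the tests below, a helper like this would join every href value into a single space-separated string.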
@@ -182,3 +182,129 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov
         assert loader == bs_loader
     except Exception as e:
         pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {}}
+        with pytest.raises(ValueError):
+            await loader_engine.load_file(
+                file_path,
+                preferred_loaders=preferred_loaders,
+            )
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loads_file_successfully():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        original_file = Path(file_path)
+        assert original_file.exists()
+        assert original_file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader == bs_loader
+
+        cognee_loaded_txt_path = await loader_engine.load_file(
+            file_path=file_path, preferred_loaders=preferred_loaders
+        )
+
+        cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
+
+        assert cognee_loaded_txt_path.endswith(".txt")
+
+        extracted_file = Path(cognee_loaded_txt_path)
+
+        assert extracted_file.exists()
+        assert extracted_file.stat().st_size > 0
+
+        original_basename = original_file.stem
+        extracted_basename = extracted_file.stem
+        assert original_basename == extracted_basename, (
+            f"Expected same base name: {original_basename} vs {extracted_basename}"
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")