TDD: add test cases and finish loading stage

Daulet Amirkhanov 2025-10-21 22:35:23 +01:00
parent 6895813ae8
commit 0f6aac19e8
3 changed files with 189 additions and 11 deletions

@@ -126,7 +126,7 @@ class LoaderEngine:
         Args:
             file_path: Path to the file to be processed
-            preferred_loaders: List of preferred loader names to try first
+            preferred_loaders: Dict of loader names to their configurations
             **kwargs: Additional loader-specific configuration

         Raises:
@@ -138,8 +138,16 @@ class LoaderEngine:
             raise ValueError(f"No loader found for file: {file_path}")

         logger.debug(f"Loading {file_path} with {loader.loader_name}")
         # TODO: loading needs to be reworked to work with both file streams and file locations
-        return await loader.load(file_path, **kwargs)
+        # Extract loader-specific config from preferred_loaders
+        loader_config = {}
+        if preferred_loaders and loader.loader_name in preferred_loaders:
+            loader_config = preferred_loaders[loader.loader_name]
+
+        # Merge with any additional kwargs (kwargs take precedence)
+        merged_kwargs = {**loader_config, **kwargs}
+
+        return await loader.load(file_path, **merged_kwargs)

     def get_available_loaders(self) -> List[str]:
         """

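As a quick reference for the merge semantics introduced in this hunk, here is a minimal, self-contained sketch; the loader name and kwargs are illustrative, and only the {**loader_config, **kwargs} behavior comes from the diff:

    # Per-loader config is looked up by loader_name; call-site kwargs win on collisions.
    preferred_loaders = {"beautiful_soup_loader": {"join_all_matches": False}}

    loader_config = preferred_loaders.get("beautiful_soup_loader", {})
    call_kwargs = {"join_all_matches": True}
    merged_kwargs = {**loader_config, **call_kwargs}

    assert merged_kwargs == {"join_all_matches": True}  # kwargs take precedence
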
@@ -66,20 +66,64 @@ class BeautifulSoupLoader(LoaderInterface):
         can = extension in self.supported_extensions and mime_type in self.supported_mime_types
         return can

-    async def load(self, file_path: str, **kwargs):
-        """Load an HTML file and return its path.
-
-        For HTML files stored on disk, we simply return the file path
-        since the content is already in text format and can be processed directly.
+    async def load(
+        self,
+        file_path: str,
+        extraction_rules: dict[str, Any] = None,
+        join_all_matches: bool = False,
+        **kwargs,
+    ):
+        """Load an HTML file, extract content, and save to storage.
+
+        Args:
+            file_path: Path to the HTML file
+            extraction_rules: Dict of CSS selector rules for content extraction
+            join_all_matches: If True, extract all matching elements for each rule
+            **kwargs: Additional arguments

         Returns:
-            The file path to the HTML file
+            Path to the stored extracted text file
         """
-        raise NotImplementedError
+        if extraction_rules is None:
+            raise ValueError("extraction_rules required for BeautifulSoupLoader")
+
+        logger.info(f"Processing HTML file: {file_path}")
+
+        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            f.seek(0)
+            html = f.read()
+
+        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        # Normalize extraction rules
+        normalized_rules: List[ExtractionRule] = []
+        for _, rule in extraction_rules.items():
+            r = self._normalize_rule(rule)
+            if join_all_matches:
+                r.all = True
+            normalized_rules.append(r)
+
+        pieces = []
+        for rule in normalized_rules:
+            text = self._extract_from_html(html, rule)
+            if text:
+                pieces.append(text)
+
+        full_content = " ".join(pieces).strip()
+
+        # Store the extracted content
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+        full_file_path = await storage.store(storage_file_name, full_content)
+
+        logger.info(f"Extracted {len(full_content)} characters from HTML")
+
+        return full_file_path

     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
@@ -105,7 +149,7 @@ class BeautifulSoupLoader(LoaderInterface):
             )
         raise ValueError(f"Invalid extraction rule: {rule}")

-    def extract(self, html: str, rule: ExtractionRule) -> str:
+    def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
         """Extract content from HTML using BeautifulSoup or lxml XPath.

         Args:

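A usage sketch of the reworked load method, using the rule shapes from the tests in this commit (selector, attr, all); the import path for BeautifulSoupLoader is not shown in this diff, so it is elided here:

    import asyncio

    # from cognee... import BeautifulSoupLoader  # actual module path not shown in this diff

    # Hypothetical stand-alone call; in practice LoaderEngine.load_file passes
    # these through preferred_loaders as loader-specific config.
    rules = {
        "title": {"selector": "title"},
        "paragraphs": {"selector": "p", "all": True},
    }
    loader = BeautifulSoupLoader()
    txt_path = asyncio.run(loader.load("page.html", extraction_rules=rules))
    print(txt_path)  # path of the stored text_<content_hash>.txt file

Calling load without extraction_rules raises ValueError, which is exactly what the first new test below exercises.
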
@@ -182,3 +182,129 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov
         assert loader == bs_loader
     except Exception as e:
         pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+
+        preferred_loaders = {"beautiful_soup_loader": {}}
+        with pytest.raises(ValueError):
+            await loader_engine.load_file(
+                file_path,
+                preferred_loaders=preferred_loaders,
+            )
+
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loads_file_successfully():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+
+        original_file = Path(file_path)
+        assert original_file.exists()
+        assert original_file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+        assert loader == bs_loader
+
+        cognee_loaded_txt_path = await loader_engine.load_file(
+            file_path=file_path, preferred_loaders=preferred_loaders
+        )
+        cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
+        assert cognee_loaded_txt_path.endswith(".txt")
+
+        extracted_file = Path(cognee_loaded_txt_path)
+        assert extracted_file.exists()
+        assert extracted_file.stat().st_size > 0
+
+        original_basename = original_file.stem
+        extracted_basename = extracted_file.stem
+        assert original_basename == extracted_basename, (
+            f"Expected same base name: {original_basename} vs {extracted_basename}"
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
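
For readers reconstructing the _normalize_rule contract exercised above, here is a minimal sketch of how a rule string or dict could map onto an ExtractionRule; the dataclass fields are assumed from the rule dicts in these tests (selector, attr, all), not taken verbatim from the module:

    from dataclasses import dataclass
    from typing import Any, Dict, Optional, Union

    @dataclass
    class ExtractionRule:  # assumed shape; the real dataclass lives in the loader module
        selector: str
        attr: Optional[str] = None
        all: bool = False

    def normalize(rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
        # A bare string is treated as shorthand for {"selector": rule}.
        if isinstance(rule, str):
            return ExtractionRule(selector=rule)
        if isinstance(rule, dict):
            return ExtractionRule(**rule)
        raise ValueError(f"Invalid extraction rule: {rule}")

    assert normalize("title") == ExtractionRule(selector="title")
    assert normalize({"selector": "a", "attr": "href", "all": True}).attr == "href"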