TDD: add test cases and finish loading stage
parent 6895813ae8
commit 0f6aac19e8

3 changed files with 189 additions and 11 deletions
@@ -126,7 +126,7 @@ class LoaderEngine:
 
         Args:
             file_path: Path to the file to be processed
-            preferred_loaders: List of preferred loader names to try first
+            preferred_loaders: Dict of loader names to their configurations
             **kwargs: Additional loader-specific configuration
 
         Raises:
@@ -138,8 +138,16 @@ class LoaderEngine:
             raise ValueError(f"No loader found for file: {file_path}")
 
         logger.debug(f"Loading {file_path} with {loader.loader_name}")
-        # TODO: loading needs to be reworked to work with both file streams and file locations
-        return await loader.load(file_path, **kwargs)
+
+        # Extract loader-specific config from preferred_loaders
+        loader_config = {}
+        if preferred_loaders and loader.loader_name in preferred_loaders:
+            loader_config = preferred_loaders[loader.loader_name]
+
+        # Merge with any additional kwargs (kwargs take precedence)
+        merged_kwargs = {**loader_config, **kwargs}
+
+        return await loader.load(file_path, **merged_kwargs)
 
     def get_available_loaders(self) -> List[str]:
         """
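The change above gives call-site kwargs precedence over per-loader configuration pulled from preferred_loaders. A minimal standalone sketch of that merge semantics (merge_loader_kwargs is a hypothetical helper for illustration, not part of cognee):

def merge_loader_kwargs(preferred_loaders, loader_name, **kwargs):
    # Per-loader config is applied first, then call-site kwargs override it
    loader_config = {}
    if preferred_loaders and loader_name in preferred_loaders:
        loader_config = preferred_loaders[loader_name]
    return {**loader_config, **kwargs}

merged = merge_loader_kwargs(
    {"beautiful_soup_loader": {"join_all_matches": False}},
    "beautiful_soup_loader",
    join_all_matches=True,  # kwargs win over the per-loader value
)
assert merged == {"join_all_matches": True}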
@@ -66,20 +66,64 @@ class BeautifulSoupLoader(LoaderInterface):
         can = extension in self.supported_extensions and mime_type in self.supported_mime_types
         return can
 
-    async def load(self, file_path: str, **kwargs):
-        """Load an HTML file and return its path.
-
-        For HTML files stored on disk, we simply return the file path
-        since the content is already in text format and can be processed directly.
+    async def load(
+        self,
+        file_path: str,
+        extraction_rules: dict[str, Any] = None,
+        join_all_matches: bool = False,
+        **kwargs,
+    ):
+        """Load an HTML file, extract content, and save to storage.
 
         Args:
             file_path: Path to the HTML file
+            extraction_rules: Dict of CSS selector rules for content extraction
+            join_all_matches: If True, extract all matching elements for each rule
             **kwargs: Additional arguments
 
         Returns:
-            The file path to the HTML file
+            Path to the stored extracted text file
         """
-        raise NotImplementedError
+        if extraction_rules is None:
+            raise ValueError("extraction_rules required for BeautifulSoupLoader")
+
+        logger.info(f"Processing HTML file: {file_path}")
+
+        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            f.seek(0)
+            html = f.read()
+
+        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        # Normalize extraction rules
+        normalized_rules: List[ExtractionRule] = []
+        for _, rule in extraction_rules.items():
+            r = self._normalize_rule(rule)
+            if join_all_matches:
+                r.all = True
+            normalized_rules.append(r)
+
+        pieces = []
+        for rule in normalized_rules:
+            text = self._extract_from_html(html, rule)
+            if text:
+                pieces.append(text)
+
+        full_content = " ".join(pieces).strip()
+
+        # Store the extracted content
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+
+        full_file_path = await storage.store(storage_file_name, full_content)
+
+        logger.info(f"Extracted {len(full_content)} characters from HTML")
+        return full_file_path
 
     def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
         """Normalize an extraction rule to an ExtractionRule dataclass.
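For context on what a single extraction rule does, here is a self-contained sketch using only the public bs4 API. The Rule dataclass below mirrors the rule dicts used in the tests ("selector", "attr", "all"); the real ExtractionRule dataclass is not shown in this diff, so its exact fields are an assumption:

from dataclasses import dataclass
from typing import Optional
from bs4 import BeautifulSoup

@dataclass
class Rule:  # hypothetical stand-in for ExtractionRule
    selector: str
    attr: Optional[str] = None
    all: bool = False

def extract(html: str, rule: Rule) -> str:
    soup = BeautifulSoup(html, "html.parser")
    # Take every match when rule.all is set, otherwise just the first
    nodes = soup.select(rule.selector)
    if not rule.all:
        nodes = nodes[:1]
    values = [
        node.get(rule.attr) if rule.attr else node.get_text(" ", strip=True)
        for node in nodes
    ]
    return " ".join(v for v in values if v)

html = "<html><title>LLM</title><p>alpha</p><p>beta</p></html>"
print(extract(html, Rule(selector="title")))        # LLM
print(extract(html, Rule(selector="p", all=True)))  # alpha beta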
@@ -105,7 +149,7 @@ class BeautifulSoupLoader(LoaderInterface):
         )
         raise ValueError(f"Invalid extraction rule: {rule}")
 
-    def extract(self, html: str, rule: ExtractionRule) -> str:
+    def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
         """Extract content from HTML using BeautifulSoup or lxml XPath.
 
         Args:
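The docstring also names lxml XPath as an extraction backend. A minimal XPath example with lxml (the actual XPath handling lives outside this hunk, so this is only an assumed illustration of the library call):

from lxml import html as lxml_html

tree = lxml_html.fromstring("<html><h1>Title</h1><p>body</p></html>")
headings = tree.xpath("//h1/text()")  # -> ['Title']
print(" ".join(headings))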
@@ -182,3 +182,129 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov
         assert loader == bs_loader
     except Exception as e:
         pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_raises_if_required_args_are_missing():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {}}
+        with pytest.raises(ValueError):
+            await loader_engine.load_file(
+                file_path,
+                preferred_loaders=preferred_loaders,
+            )
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_present():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        file = Path(file_path)
+        assert file.exists()
+        assert file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        extraction_rules = {
+            "title": {"selector": "title"},
+            "headings": {"selector": "h1, h2, h3", "all": True},
+            "links": {"selector": "a", "attr": "href", "all": True},
+            "paragraphs": {"selector": "p", "all": True},
+        }
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        await loader_engine.load_file(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
+
+
+@pytest.mark.asyncio
+async def test_beautiful_soup_loads_file_successfully():
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    extraction_rules = {
+        "title": {"selector": "title"},
+        "headings": {"selector": "h1, h2, h3", "all": True},
+        "links": {"selector": "a", "attr": "href", "all": True},
+        "paragraphs": {"selector": "p", "all": True},
+    }
+
+    try:
+        original_file_path = await save_data_item_to_storage(
+            "https://en.wikipedia.org/wiki/Large_language_model"
+        )
+        file_path = get_data_file_path(original_file_path)
+        assert file_path.endswith(".html")
+        original_file = Path(file_path)
+        assert original_file.exists()
+        assert original_file.stat().st_size > 0
+
+        loader_engine = LoaderEngine()
+        bs_loader = BeautifulSoupLoader()
+        loader_engine.register_loader(bs_loader)
+        preferred_loaders = {"beautiful_soup_loader": {"extraction_rules": extraction_rules}}
+        loader = loader_engine.get_loader(
+            file_path,
+            preferred_loaders=preferred_loaders,
+        )
+
+        assert loader == bs_loader
+
+        cognee_loaded_txt_path = await loader_engine.load_file(
+            file_path=file_path, preferred_loaders=preferred_loaders
+        )
+
+        cognee_loaded_txt_path = get_data_file_path(cognee_loaded_txt_path)
+
+        assert cognee_loaded_txt_path.endswith(".txt")
+
+        extracted_file = Path(cognee_loaded_txt_path)
+
+        assert extracted_file.exists()
+        assert extracted_file.stat().st_size > 0
+
+        original_basename = original_file.stem
+        extracted_basename = extracted_file.stem
+        assert original_basename == extracted_basename, (
+            f"Expected same base name: {original_basename} vs {extracted_basename}"
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to save data item to storage: {e}")
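Taken together, the new tests exercise the finished loading stage end to end: register the loader, pass extraction_rules through preferred_loaders, and get back a stored .txt path. A condensed usage sketch, assuming the same names and imports as the test module above (the import paths are not shown in this diff):

import asyncio

async def main():
    engine = LoaderEngine()
    engine.register_loader(BeautifulSoupLoader())
    rules = {
        "title": {"selector": "title"},
        "paragraphs": {"selector": "p", "all": True},
    }
    # Per-loader kwargs come from preferred_loaders; load_file raises
    # ValueError if extraction_rules is omitted for this loader
    txt_path = await engine.load_file(
        "page.html",
        preferred_loaders={"beautiful_soup_loader": {"extraction_rules": rules}},
    )
    print(txt_path)  # path to the stored "text_<content_hash>.txt"

asyncio.run(main())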