fix: use pypdf for pdf text extraction

This commit is contained in:
Boris Arzentar 2024-03-26 16:21:33 +01:00
parent 9c06e293b2
commit a87d627ec2
6 changed files with 75 additions and 1987 deletions

View file

@@ -3,7 +3,6 @@ from os import path, listdir
import asyncio import asyncio
import dlt import dlt
import duckdb import duckdb
from unstructured.cleaners.core import clean
from cognee.root_dir import get_absolute_path from cognee.root_dir import get_absolute_path
import cognee.modules.ingestion as ingestion import cognee.modules.ingestion as ingestion
from cognee.infrastructure.files import get_file_metadata from cognee.infrastructure.files import get_file_metadata
@@ -19,7 +18,7 @@ async def add(file_paths: Union[str, List[str]], dataset_name: str = None):
for file_or_dir in listdir(root_dir_path): for file_or_dir in listdir(root_dir_path):
if path.isdir(path.join(root_dir_path, file_or_dir)): if path.isdir(path.join(root_dir_path, file_or_dir)):
dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
dataset_name = clean(dataset_name.replace(" ", "_")) dataset_name = dataset_name.strip().replace(" ", "_")
nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name) nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)

View file

@@ -3,8 +3,7 @@ import asyncio
from typing import List, Union from typing import List, Union
import instructor import instructor
from openai import OpenAI from openai import OpenAI
from unstructured.cleaners.core import clean from pypdf import PdfReader
from unstructured.partition.pdf import partition_pdf
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
from cognee.modules.cognify.llm.label_content import label_content from cognee.modules.cognify.llm.label_content import label_content
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
@@ -61,8 +60,9 @@ async def cognify(datasets: Union[str, List[str]] = None, graphdatamodel: object
for file_metadata in files_metadata: for file_metadata in files_metadata:
with open(file_metadata["file_path"], "rb") as file: with open(file_metadata["file_path"], "rb") as file:
elements = partition_pdf(file = file, strategy = "fast") reader = PdfReader(stream = file)
text = "\n".join(map(lambda element: clean(element.text), elements)) pages = list(reader.pages[:3])
text = "\n".join([page.extract_text().strip() for page in pages])
awaitables.append(process_text(text, file_metadata)) awaitables.append(process_text(text, file_metadata))

View file

@@ -1,10 +1,10 @@
from typing import List from typing import List
from fastembed.embedding import FlagEmbedding from fastembed import TextEmbedding
from .EmbeddingEngine import EmbeddingEngine from .EmbeddingEngine import EmbeddingEngine
class DefaultEmbeddingEngine(EmbeddingEngine): class DefaultEmbeddingEngine(EmbeddingEngine):
async def embed_text(self, text: List[str]) -> List[float]: async def embed_text(self, text: List[str]) -> List[float]:
embedding_model = FlagEmbedding(model_name = "BAAI/bge-large-en-v1.5") embedding_model = TextEmbedding(model_name = "BAAI/bge-large-en-v1.5")
embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text))) embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
return embeddings_list return embeddings_list

View file

@@ -1,10 +1,8 @@
from typing import BinaryIO, TypedDict from typing import BinaryIO, TypedDict
import filetype import filetype
from unstructured.cleaners.core import clean from pypdf import PdfReader
from unstructured.partition.pdf import partition_pdf
from .extract_keywords import extract_keywords from .extract_keywords import extract_keywords
class FileTypeException(Exception): class FileTypeException(Exception):
message: str message: str
@@ -27,10 +25,10 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
keywords: list = [] keywords: list = []
if file_type.extension == "pdf": if file_type.extension == "pdf":
elements = partition_pdf(file = file, strategy = "fast") reader = PdfReader(stream = file)
keywords = extract_keywords( pages = list(reader.pages[:3])
"\n".join(map(lambda element: clean(element.text), elements)) text = "\n".join([page.extract_text().strip() for page in pages])
) keywords = extract_keywords(text)
file_path = file.name file_path = file.name
file_name = file_path.split("/")[-1].split(".")[0] file_name = file_path.split("/")[-1].split(".")[0]

2028
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@@ -34,9 +34,8 @@ debugpy = "^1.8.0"
pyarrow = "^15.0.0" pyarrow = "^15.0.0"
pylint = "^3.0.3" pylint = "^3.0.3"
aiosqlite = "^0.20.0" aiosqlite = "^0.20.0"
unstructured = {extras = ["all-docs"], version = "^0.12.5"}
pymupdf = "^1.23.25" pymupdf = "^1.23.25"
pandas = "^2.2.1" pandas = "^2.2.0"
greenlet = "^3.0.3" greenlet = "^3.0.3"
ruff = "^0.2.2" ruff = "^0.2.2"
filetype = "^1.2.0" filetype = "^1.2.0"
@@ -51,7 +50,8 @@ graphistry = "^0.33.5"
tenacity = "^8.2.3" tenacity = "^8.2.3"
weaviate-client = "^4.5.4" weaviate-client = "^4.5.4"
scikit-learn = "^1.4.1.post1" scikit-learn = "^1.4.1.post1"
fastembed = "0.1.3" fastembed = "^0.2.5"
pypdf = "^4.1.0"
[tool.poetry.extras] [tool.poetry.extras]
dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"] dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"]
@@ -93,7 +93,6 @@ mkdocstrings = "^0.22.0"
mkdocstrings-python = "^1.1.2" mkdocstrings-python = "^1.1.2"
pytest-examples = "^0.0.10" pytest-examples = "^0.0.10"
mkdocs-jupyter = "^0.24.6" mkdocs-jupyter = "^0.24.6"
mkdocs-rss-plugin = "^1.12.0"
mkdocs-minify-plugin = "^0.8.0" mkdocs-minify-plugin = "^0.8.0"
mkdocs-redirects = "^1.2.1" mkdocs-redirects = "^1.2.1"