fix: use pypdf for pdf text extraction

This commit is contained in:
Boris Arzentar 2024-03-26 16:21:33 +01:00
parent 9c06e293b2
commit a87d627ec2
6 changed files with 75 additions and 1987 deletions

View file

@ -3,7 +3,6 @@ from os import path, listdir
import asyncio
import dlt
import duckdb
from unstructured.cleaners.core import clean
from cognee.root_dir import get_absolute_path
import cognee.modules.ingestion as ingestion
from cognee.infrastructure.files import get_file_metadata
@ -19,7 +18,7 @@ async def add(file_paths: Union[str, List[str]], dataset_name: str = None):
for file_or_dir in listdir(root_dir_path):
if path.isdir(path.join(root_dir_path, file_or_dir)):
dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
dataset_name = clean(dataset_name.replace(" ", "_"))
dataset_name = dataset_name.strip().replace(" ", "_")
nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)

View file

@ -3,8 +3,7 @@ import asyncio
from typing import List, Union
import instructor
from openai import OpenAI
from unstructured.cleaners.core import clean
from unstructured.partition.pdf import partition_pdf
from pypdf import PdfReader
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
from cognee.modules.cognify.llm.label_content import label_content
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
@ -61,8 +60,9 @@ async def cognify(datasets: Union[str, List[str]] = None, graphdatamodel: object
for file_metadata in files_metadata:
with open(file_metadata["file_path"], "rb") as file:
elements = partition_pdf(file = file, strategy = "fast")
text = "\n".join(map(lambda element: clean(element.text), elements))
reader = PdfReader(stream = file)
pages = list(reader.pages[:3])
text = "\n".join([page.extract_text().strip() for page in pages])
awaitables.append(process_text(text, file_metadata))

View file

@ -1,10 +1,10 @@
from typing import List
from fastembed.embedding import FlagEmbedding
from fastembed import TextEmbedding
from .EmbeddingEngine import EmbeddingEngine
# Embedding engine that embeds text with the "BAAI/bge-large-en-v1.5" model.
# NOTE(review): this listing is a flattened commit diff (indentation and +/- markers
# are missing), so the two `embedding_model = ...` assignments below look like the
# removed (FlagEmbedding) and added (TextEmbedding) sides of a single change rather
# than two statements meant to run in sequence -- confirm against the real file.
class DefaultEmbeddingEngine(EmbeddingEngine):
# Embeds `text` and returns a list with one `.tolist()` result per item yielded by
# `embedding_model.embed(text)`.
# NOTE(review): because every element is itself a `.tolist()` result, the
# `List[float]` return annotation is presumably meant to be `List[List[float]]`
# -- TODO confirm against the EmbeddingEngine base class and callers.
async def embed_text(self, text: List[str]) -> List[float]:
# The model object is constructed inside the method, i.e. on every call.
embedding_model = FlagEmbedding(model_name = "BAAI/bge-large-en-v1.5")
embedding_model = TextEmbedding(model_name = "BAAI/bge-large-en-v1.5")
# Convert each yielded embedding with `.tolist()` (presumably a numpy array
# becoming a plain Python list -- verify) and materialize the results as a list.
embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
return embeddings_list

View file

@ -1,10 +1,8 @@
from typing import BinaryIO, TypedDict
import filetype
from unstructured.cleaners.core import clean
from unstructured.partition.pdf import partition_pdf
from pypdf import PdfReader
from .extract_keywords import extract_keywords
class FileTypeException(Exception):
message: str
@ -27,10 +25,10 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
keywords: list = []
if file_type.extension == "pdf":
elements = partition_pdf(file = file, strategy = "fast")
keywords = extract_keywords(
"\n".join(map(lambda element: clean(element.text), elements))
)
reader = PdfReader(stream = file)
pages = list(reader.pages[:3])
text = "\n".join([page.extract_text().strip() for page in pages])
keywords = extract_keywords(text)
file_path = file.name
file_name = file_path.split("/")[-1].split(".")[0]

2028
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -34,9 +34,8 @@ debugpy = "^1.8.0"
pyarrow = "^15.0.0"
pylint = "^3.0.3"
aiosqlite = "^0.20.0"
unstructured = {extras = ["all-docs"], version = "^0.12.5"}
pymupdf = "^1.23.25"
pandas = "^2.2.1"
pandas = "^2.2.0"
greenlet = "^3.0.3"
ruff = "^0.2.2"
filetype = "^1.2.0"
@ -51,7 +50,8 @@ graphistry = "^0.33.5"
tenacity = "^8.2.3"
weaviate-client = "^4.5.4"
scikit-learn = "^1.4.1.post1"
fastembed = "0.1.3"
fastembed = "^0.2.5"
pypdf = "^4.1.0"
[tool.poetry.extras]
dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"]
@ -93,7 +93,6 @@ mkdocstrings = "^0.22.0"
mkdocstrings-python = "^1.1.2"
pytest-examples = "^0.0.10"
mkdocs-jupyter = "^0.24.6"
mkdocs-rss-plugin = "^1.12.0"
mkdocs-minify-plugin = "^0.8.0"
mkdocs-redirects = "^1.2.1"