fix: use pypdf for pdf text extraction
This commit is contained in:
parent
9c06e293b2
commit
a87d627ec2
6 changed files with 75 additions and 1987 deletions
|
|
@@ -3,7 +3,6 @@ from os import path, listdir
|
|||
import asyncio
|
||||
import dlt
|
||||
import duckdb
|
||||
from unstructured.cleaners.core import clean
|
||||
from cognee.root_dir import get_absolute_path
|
||||
import cognee.modules.ingestion as ingestion
|
||||
from cognee.infrastructure.files import get_file_metadata
|
||||
|
|
@@ -19,7 +18,7 @@ async def add(file_paths: Union[str, List[str]], dataset_name: str = None):
|
|||
for file_or_dir in listdir(root_dir_path):
|
||||
if path.isdir(path.join(root_dir_path, file_or_dir)):
|
||||
dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
|
||||
dataset_name = clean(dataset_name.replace(" ", "_"))
|
||||
dataset_name = dataset_name.strip().replace(" ", "_")
|
||||
|
||||
nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)
|
||||
|
||||
|
|
|
|||
|
|
@@ -3,8 +3,7 @@ import asyncio
|
|||
from typing import List, Union
|
||||
import instructor
|
||||
from openai import OpenAI
|
||||
from unstructured.cleaners.core import clean
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from pypdf import PdfReader
|
||||
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
|
||||
from cognee.modules.cognify.llm.label_content import label_content
|
||||
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
|
||||
|
|
@@ -61,8 +60,9 @@ async def cognify(datasets: Union[str, List[str]] = None, graphdatamodel: object
|
|||
|
||||
for file_metadata in files_metadata:
|
||||
with open(file_metadata["file_path"], "rb") as file:
|
||||
elements = partition_pdf(file = file, strategy = "fast")
|
||||
text = "\n".join(map(lambda element: clean(element.text), elements))
|
||||
reader = PdfReader(stream = file)
|
||||
pages = list(reader.pages[:3])
|
||||
text = "\n".join([page.extract_text().strip() for page in pages])
|
||||
|
||||
awaitables.append(process_text(text, file_metadata))
|
||||
|
||||
|
|
|
|||
|
|
@@ -1,10 +1,10 @@
|
|||
from typing import List
|
||||
from fastembed.embedding import FlagEmbedding
|
||||
from fastembed import TextEmbedding
|
||||
from .EmbeddingEngine import EmbeddingEngine
|
||||
|
||||
class DefaultEmbeddingEngine(EmbeddingEngine):
|
||||
async def embed_text(self, text: List[str]) -> List[float]:
|
||||
embedding_model = FlagEmbedding(model_name = "BAAI/bge-large-en-v1.5")
|
||||
embedding_model = TextEmbedding(model_name = "BAAI/bge-large-en-v1.5")
|
||||
embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
|
||||
|
||||
return embeddings_list
|
||||
|
|
|
|||
|
|
@@ -1,10 +1,8 @@
|
|||
from typing import BinaryIO, TypedDict
|
||||
import filetype
|
||||
from unstructured.cleaners.core import clean
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from pypdf import PdfReader
|
||||
from .extract_keywords import extract_keywords
|
||||
|
||||
|
||||
class FileTypeException(Exception):
|
||||
message: str
|
||||
|
||||
|
|
@@ -27,10 +25,10 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
|
|||
keywords: list = []
|
||||
|
||||
if file_type.extension == "pdf":
|
||||
elements = partition_pdf(file = file, strategy = "fast")
|
||||
keywords = extract_keywords(
|
||||
"\n".join(map(lambda element: clean(element.text), elements))
|
||||
)
|
||||
reader = PdfReader(stream = file)
|
||||
pages = list(reader.pages[:3])
|
||||
text = "\n".join([page.extract_text().strip() for page in pages])
|
||||
keywords = extract_keywords(text)
|
||||
|
||||
file_path = file.name
|
||||
file_name = file_path.split("/")[-1].split(".")[0]
|
||||
|
|
|
|||
2028
poetry.lock
generated
2028
poetry.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@@ -34,9 +34,8 @@ debugpy = "^1.8.0"
|
|||
pyarrow = "^15.0.0"
|
||||
pylint = "^3.0.3"
|
||||
aiosqlite = "^0.20.0"
|
||||
unstructured = {extras = ["all-docs"], version = "^0.12.5"}
|
||||
pymupdf = "^1.23.25"
|
||||
pandas = "^2.2.1"
|
||||
pandas = "^2.2.0"
|
||||
greenlet = "^3.0.3"
|
||||
ruff = "^0.2.2"
|
||||
filetype = "^1.2.0"
|
||||
|
|
@@ -51,7 +50,8 @@ graphistry = "^0.33.5"
|
|||
tenacity = "^8.2.3"
|
||||
weaviate-client = "^4.5.4"
|
||||
scikit-learn = "^1.4.1.post1"
|
||||
fastembed = "0.1.3"
|
||||
fastembed = "^0.2.5"
|
||||
pypdf = "^4.1.0"
|
||||
|
||||
[tool.poetry.extras]
|
||||
dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"]
|
||||
|
|
@@ -93,7 +93,6 @@ mkdocstrings = "^0.22.0"
|
|||
mkdocstrings-python = "^1.1.2"
|
||||
pytest-examples = "^0.0.10"
|
||||
mkdocs-jupyter = "^0.24.6"
|
||||
mkdocs-rss-plugin = "^1.12.0"
|
||||
mkdocs-minify-plugin = "^0.8.0"
|
||||
mkdocs-redirects = "^1.2.1"
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue