fix: use pypdf for pdf text extraction
This commit is contained in:
parent
9c06e293b2
commit
a87d627ec2
6 changed files with 75 additions and 1987 deletions
|
|
@ -3,7 +3,6 @@ from os import path, listdir
|
||||||
import asyncio
|
import asyncio
|
||||||
import dlt
|
import dlt
|
||||||
import duckdb
|
import duckdb
|
||||||
from unstructured.cleaners.core import clean
|
|
||||||
from cognee.root_dir import get_absolute_path
|
from cognee.root_dir import get_absolute_path
|
||||||
import cognee.modules.ingestion as ingestion
|
import cognee.modules.ingestion as ingestion
|
||||||
from cognee.infrastructure.files import get_file_metadata
|
from cognee.infrastructure.files import get_file_metadata
|
||||||
|
|
@ -19,7 +18,7 @@ async def add(file_paths: Union[str, List[str]], dataset_name: str = None):
|
||||||
for file_or_dir in listdir(root_dir_path):
|
for file_or_dir in listdir(root_dir_path):
|
||||||
if path.isdir(path.join(root_dir_path, file_or_dir)):
|
if path.isdir(path.join(root_dir_path, file_or_dir)):
|
||||||
dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
|
dataset_name = file_or_dir if parent_dir == "root" else parent_dir + "." + file_or_dir
|
||||||
dataset_name = clean(dataset_name.replace(" ", "_"))
|
dataset_name = dataset_name.strip().replace(" ", "_")
|
||||||
|
|
||||||
nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)
|
nested_datasets = list_dir_files(path.join(root_dir_path, file_or_dir), dataset_name)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,7 @@ import asyncio
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
import instructor
|
import instructor
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from unstructured.cleaners.core import clean
|
from pypdf import PdfReader
|
||||||
from unstructured.partition.pdf import partition_pdf
|
|
||||||
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
|
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
|
||||||
from cognee.modules.cognify.llm.label_content import label_content
|
from cognee.modules.cognify.llm.label_content import label_content
|
||||||
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
|
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
|
||||||
|
|
@ -61,8 +60,9 @@ async def cognify(datasets: Union[str, List[str]] = None, graphdatamodel: object
|
||||||
|
|
||||||
for file_metadata in files_metadata:
|
for file_metadata in files_metadata:
|
||||||
with open(file_metadata["file_path"], "rb") as file:
|
with open(file_metadata["file_path"], "rb") as file:
|
||||||
elements = partition_pdf(file = file, strategy = "fast")
|
reader = PdfReader(stream = file)
|
||||||
text = "\n".join(map(lambda element: clean(element.text), elements))
|
pages = list(reader.pages[:3])
|
||||||
|
text = "\n".join([page.extract_text().strip() for page in pages])
|
||||||
|
|
||||||
awaitables.append(process_text(text, file_metadata))
|
awaitables.append(process_text(text, file_metadata))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
from typing import List
|
from typing import List
|
||||||
from fastembed.embedding import FlagEmbedding
|
from fastembed import TextEmbedding
|
||||||
from .EmbeddingEngine import EmbeddingEngine
|
from .EmbeddingEngine import EmbeddingEngine
|
||||||
|
|
||||||
class DefaultEmbeddingEngine(EmbeddingEngine):
|
class DefaultEmbeddingEngine(EmbeddingEngine):
|
||||||
async def embed_text(self, text: List[str]) -> List[float]:
|
async def embed_text(self, text: List[str]) -> List[float]:
|
||||||
embedding_model = FlagEmbedding(model_name = "BAAI/bge-large-en-v1.5")
|
embedding_model = TextEmbedding(model_name = "BAAI/bge-large-en-v1.5")
|
||||||
embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
|
embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
|
||||||
|
|
||||||
return embeddings_list
|
return embeddings_list
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,8 @@
|
||||||
from typing import BinaryIO, TypedDict
|
from typing import BinaryIO, TypedDict
|
||||||
import filetype
|
import filetype
|
||||||
from unstructured.cleaners.core import clean
|
from pypdf import PdfReader
|
||||||
from unstructured.partition.pdf import partition_pdf
|
|
||||||
from .extract_keywords import extract_keywords
|
from .extract_keywords import extract_keywords
|
||||||
|
|
||||||
|
|
||||||
class FileTypeException(Exception):
|
class FileTypeException(Exception):
|
||||||
message: str
|
message: str
|
||||||
|
|
||||||
|
|
@ -27,10 +25,10 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
|
||||||
keywords: list = []
|
keywords: list = []
|
||||||
|
|
||||||
if file_type.extension == "pdf":
|
if file_type.extension == "pdf":
|
||||||
elements = partition_pdf(file = file, strategy = "fast")
|
reader = PdfReader(stream = file)
|
||||||
keywords = extract_keywords(
|
pages = list(reader.pages[:3])
|
||||||
"\n".join(map(lambda element: clean(element.text), elements))
|
text = "\n".join([page.extract_text().strip() for page in pages])
|
||||||
)
|
keywords = extract_keywords(text)
|
||||||
|
|
||||||
file_path = file.name
|
file_path = file.name
|
||||||
file_name = file_path.split("/")[-1].split(".")[0]
|
file_name = file_path.split("/")[-1].split(".")[0]
|
||||||
|
|
|
||||||
2028
poetry.lock
generated
2028
poetry.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -34,9 +34,8 @@ debugpy = "^1.8.0"
|
||||||
pyarrow = "^15.0.0"
|
pyarrow = "^15.0.0"
|
||||||
pylint = "^3.0.3"
|
pylint = "^3.0.3"
|
||||||
aiosqlite = "^0.20.0"
|
aiosqlite = "^0.20.0"
|
||||||
unstructured = {extras = ["all-docs"], version = "^0.12.5"}
|
|
||||||
pymupdf = "^1.23.25"
|
pymupdf = "^1.23.25"
|
||||||
pandas = "^2.2.1"
|
pandas = "^2.2.0"
|
||||||
greenlet = "^3.0.3"
|
greenlet = "^3.0.3"
|
||||||
ruff = "^0.2.2"
|
ruff = "^0.2.2"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
|
|
@ -51,7 +50,8 @@ graphistry = "^0.33.5"
|
||||||
tenacity = "^8.2.3"
|
tenacity = "^8.2.3"
|
||||||
weaviate-client = "^4.5.4"
|
weaviate-client = "^4.5.4"
|
||||||
scikit-learn = "^1.4.1.post1"
|
scikit-learn = "^1.4.1.post1"
|
||||||
fastembed = "0.1.3"
|
fastembed = "^0.2.5"
|
||||||
|
pypdf = "^4.1.0"
|
||||||
|
|
||||||
[tool.poetry.extras]
|
[tool.poetry.extras]
|
||||||
dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"]
|
dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"]
|
||||||
|
|
@ -93,7 +93,6 @@ mkdocstrings = "^0.22.0"
|
||||||
mkdocstrings-python = "^1.1.2"
|
mkdocstrings-python = "^1.1.2"
|
||||||
pytest-examples = "^0.0.10"
|
pytest-examples = "^0.0.10"
|
||||||
mkdocs-jupyter = "^0.24.6"
|
mkdocs-jupyter = "^0.24.6"
|
||||||
mkdocs-rss-plugin = "^1.12.0"
|
|
||||||
mkdocs-minify-plugin = "^0.8.0"
|
mkdocs-minify-plugin = "^0.8.0"
|
||||||
mkdocs-redirects = "^1.2.1"
|
mkdocs-redirects = "^1.2.1"
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue