cognee/cognee/infrastructure/files/utils/extract_text_from_file.py
2024-03-29 13:53:59 +01:00

11 lines
392 B
Python

from typing import BinaryIO, Optional

from pypdf import PdfReader
def extract_text_from_file(
    file: BinaryIO,
    file_type,
    *,
    max_pages: Optional[int] = 3,
) -> Optional[str]:
    """Extract plain text from an open binary file, based on its type.

    Args:
        file: Open binary stream holding the file content.
        file_type: Object exposing an ``extension`` attribute (compared
            against ``"pdf"`` and ``"txt"``).
        max_pages: Upper bound on the number of leading PDF pages whose text
            is extracted. Defaults to 3; pass ``None`` to read every page.
            Ignored for non-PDF input.

    Returns:
        For ``"pdf"``: the stripped text of the first ``max_pages`` pages,
        joined with newlines. For ``"txt"``: the remaining stream content
        decoded as UTF-8. For any other extension: ``None``.

    Raises:
        ValueError: If ``max_pages`` is negative.
        UnicodeDecodeError: If a ``"txt"`` stream is not valid UTF-8.
    """
    # A negative bound would silently turn the slice into "all but the last
    # N pages", which is never what a page *limit* means.
    if max_pages is not None and max_pages < 0:
        raise ValueError("max_pages must be non-negative or None")

    extension = file_type.extension

    if extension == "pdf":
        reader = PdfReader(stream=file)
        # Slicing with ``None`` as the stop yields every page.
        return "\n".join(
            page.extract_text().strip() for page in reader.pages[:max_pages]
        )

    if extension == "txt":
        return file.read().decode("utf-8")

    # Unsupported types yield no text; kept as ``None`` so existing callers
    # observe the same result as before.
    return None