cognee/cognee/infrastructure/files/utils/extract_text_from_file.py
2024-11-04 17:24:46 +01:00

13 lines
458 B
Python

from typing import BinaryIO, Optional

import filetype
from pypdf import PdfReader
def extract_text_from_file(
    file: BinaryIO,
    file_type: filetype.Type,
    *,
    max_pages: Optional[int] = 3,
) -> Optional[str]:
    """Extract text from a PDF or plain-text file.

    Args:
        file: Binary stream to read from. For text files, reading starts at
            the stream's current position.
        file_type: Detected type of ``file``; only its ``extension``
            attribute is consulted.
        max_pages: Maximum number of leading PDF pages to extract. ``None``
            extracts every page. Not used for text files.

    Returns:
        For ``"pdf"``: the stripped text of each extracted page, joined with
        newlines. For ``"txt"``: the stream content decoded as UTF-8, with a
        leading byte-order mark removed if present. ``None`` for any other
        extension.

    Raises:
        ValueError: If ``max_pages`` is negative (PDF input only).
        UnicodeDecodeError: If a text file is not valid UTF-8.
    """
    extension = file_type.extension

    if extension == "pdf":
        if max_pages is not None and max_pages < 0:
            # A negative stop would silently slice relative to the end.
            raise ValueError(
                f"max_pages must be non-negative or None, got {max_pages}"
            )
        reader = PdfReader(stream=file)
        # A stop of ``None`` makes the slice cover every page.
        return "\n".join(
            page.extract_text().strip() for page in reader.pages[:max_pages]
        )

    if extension == "txt":
        # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
        # BOM, which would otherwise leak into the text as U+FEFF.
        return file.read().decode("utf-8-sig")

    # Unsupported types yield None, exactly as the previous implicit
    # fall-through did; it is spelled out so the contract is visible.
    return None