chore: Update dependencies to handle different document types

Update the unstructured dependency so that it installs support for different document types

Chore COG-685
This commit is contained in:
Igor Ilic 2024-12-09 09:49:26 +01:00
parent 5567370214
commit df289deb18
6 changed files with 112 additions and 6 deletions

View file

@ -27,7 +27,7 @@ TEST_TEXT = """
def test_AudioDocument():
document = AudioDocument(
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
)
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(

View file

@ -16,7 +16,7 @@ The commotion has attracted an audience: a murder of crows has gathered in the l
def test_ImageDocument():
document = ImageDocument(
id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
)
with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):

View file

@ -17,7 +17,8 @@ def test_PdfDocument():
"artificial-intelligence.pdf",
)
document = PdfDocument(
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4(),
mime_type="",
)
for ground_truth, paragraph_data in zip(

View file

@ -29,7 +29,7 @@ def test_TextDocument(input_file, chunk_size):
input_file,
)
document = TextDocument(
id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4()
id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4(), mime_type="",
)
for ground_truth, paragraph_data in zip(

105
poetry.lock generated
View file

@ -1525,6 +1525,17 @@ files = [
[package.extras]
dev = ["coverage", "pytest (>=7.4.4)"]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
description = "An implementation of lxml.xmlfile for the standard library"
optional = true
python-versions = ">=3.8"
files = [
{file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
{file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
]
[[package]]
name = "exceptiongroup"
version = "1.2.2"
@ -4655,6 +4666,20 @@ typing-extensions = ">=4.11,<5"
[package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
[[package]]
name = "openpyxl"
version = "3.1.5"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
optional = true
python-versions = ">=3.8"
files = [
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
]
[package.dependencies]
et-xmlfile = "*"
[[package]]
name = "opentelemetry-api"
version = "1.27.0"
@ -5885,6 +5910,17 @@ bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "r
dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", "grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"]
model = ["milvus-model (>=0.1.0)"]
[[package]]
name = "pypandoc"
version = "1.14"
description = "Thin wrapper for pandoc."
optional = true
python-versions = ">=3.6"
files = [
{file = "pypandoc-1.14-py3-none-any.whl", hash = "sha256:1315c7ad7fac7236dacf69a05b521ed2c3f1d0177f70e9b92bfffce6c023df22"},
{file = "pypandoc-1.14.tar.gz", hash = "sha256:6b4c45f5f1b9fb5bb562079164806bdbbc3e837b5402bcf3f1139edc5730a197"},
]
[[package]]
name = "pyparsing"
version = "3.2.0"
@ -6008,6 +6044,21 @@ files = [
[package.dependencies]
six = ">=1.5"
[[package]]
name = "python-docx"
version = "1.1.2"
description = "Create, read, and update Microsoft Word .docx files."
optional = true
python-versions = ">=3.7"
files = [
{file = "python_docx-1.1.2-py3-none-any.whl", hash = "sha256:08c20d6058916fb19853fcf080f7f42b6270d89eac9fa5f8c15f691c0017fabe"},
{file = "python_docx-1.1.2.tar.gz", hash = "sha256:0cf1f22e95b9002addca7948e16f2cd7acdfd498047f1941ca5d293db7762efd"},
]
[package.dependencies]
lxml = ">=3.1.0"
typing-extensions = ">=4.9.0"
[[package]]
name = "python-dotenv"
version = "1.0.1"
@ -6085,6 +6136,23 @@ click = "*"
olefile = "*"
typing-extensions = ">=4.9.0"
[[package]]
name = "python-pptx"
version = "1.0.2"
description = "Create, read, and update PowerPoint 2007+ (.pptx) files."
optional = true
python-versions = ">=3.8"
files = [
{file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"},
{file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"},
]
[package.dependencies]
lxml = ">=3.1.0"
Pillow = ">=3.3.2"
typing-extensions = ">=4.9.0"
XlsxWriter = ">=0.5.7"
[[package]]
name = "pytz"
version = "2024.2"
@ -8141,18 +8209,26 @@ filetype = "*"
html5lib = "*"
langdetect = "*"
lxml = "*"
markdown = {version = "*", optional = true, markers = "extra == \"md\""}
networkx = {version = "*", optional = true, markers = "extra == \"xlsx\""}
nltk = "*"
numpy = "<2"
openpyxl = {version = "*", optional = true, markers = "extra == \"xlsx\""}
pandas = {version = "*", optional = true, markers = "extra == \"csv\" or extra == \"tsv\" or extra == \"xlsx\""}
psutil = "*"
pypandoc = {version = "*", optional = true, markers = "extra == \"epub\" or extra == \"odt\" or extra == \"org\" or extra == \"rst\" or extra == \"rtf\""}
python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"doc\" or extra == \"docx\" or extra == \"odt\""}
python-iso639 = "*"
python-magic = "*"
python-oxmsg = "*"
python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"ppt\" or extra == \"pptx\""}
rapidfuzz = "*"
requests = "*"
tqdm = "*"
typing-extensions = "*"
unstructured-client = "*"
wrapt = "*"
xlrd = {version = "*", optional = true, markers = "extra == \"xlsx\""}
[package.extras]
all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"]
@ -8498,6 +8574,33 @@ files = [
{file = "wrapt-1.17.0.tar.gz", hash = "sha256:16187aa2317c731170a88ef35e8937ae0f533c402872c1ee5e6d079fcf320801"},
]
[[package]]
name = "xlrd"
version = "2.0.1"
description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files"
optional = true
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
files = [
{file = "xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd"},
{file = "xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88"},
]
[package.extras]
build = ["twine", "wheel"]
docs = ["sphinx"]
test = ["pytest", "pytest-cov"]
[[package]]
name = "xlsxwriter"
version = "3.2.0"
description = "A Python module for creating Excel XLSX files."
optional = true
python-versions = ">=3.6"
files = [
{file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"},
{file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"},
]
[[package]]
name = "xxhash"
version = "3.5.0"
@ -8765,4 +8868,4 @@ weaviate = ["weaviate-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9.0,<3.12"
content-hash = "c9a760447a62b3c71fa84f20a614b6d3c5725b3869fc87f78b03eb2c80841ce1"
content-hash = "c1f30981f79db94213a89aec3207f0b4775944968e97dda8aa49c3aa143ce7b5"

View file

@ -73,7 +73,9 @@ llama-index-core = {version = "^0.11.22", optional = true}
deepeval = {version = "^2.0.1", optional = true}
transformers = "^4.46.3"
pymilvus = {version = "^2.5.0", optional = true}
unstructured = {version = "^0.16.10", optional = true}
# Each extra listed here enables the matching optional parser dependencies of
# unstructured, as resolved in poetry.lock (for example "xlsx" -> openpyxl and
# xlrd, "docx" -> python-docx, "pptx" -> python-pptx, "epub"/"rst"/"rtf" -> pypandoc).
unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.10", optional = true }
[tool.poetry.extras]
filesystem = ["s3fs", "botocore"]