chore: Update dependencies to handle different document types
Update the `unstructured` dependency so that it installs the extras needed to support additional document types. Chore COG-685
This commit is contained in:
parent
5567370214
commit
df289deb18
6 changed files with 112 additions and 6 deletions
|
|
@ -27,7 +27,7 @@ TEST_TEXT = """
|
|||
def test_AudioDocument():
|
||||
|
||||
document = AudioDocument(
|
||||
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
|
||||
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
|
||||
)
|
||||
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
|
||||
for ground_truth, paragraph_data in zip(
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ The commotion has attracted an audience: a murder of crows has gathered in the l
|
|||
def test_ImageDocument():
|
||||
|
||||
document = ImageDocument(
|
||||
id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
|
||||
id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
|
||||
)
|
||||
with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
|
||||
|
||||
|
|
|
|||
|
|
@ -17,7 +17,8 @@ def test_PdfDocument():
|
|||
"artificial-intelligence.pdf",
|
||||
)
|
||||
document = PdfDocument(
|
||||
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4()
|
||||
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4(),
|
||||
mime_type="",
|
||||
)
|
||||
|
||||
for ground_truth, paragraph_data in zip(
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ def test_TextDocument(input_file, chunk_size):
|
|||
input_file,
|
||||
)
|
||||
document = TextDocument(
|
||||
id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4()
|
||||
id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4(), mime_type="",
|
||||
)
|
||||
|
||||
for ground_truth, paragraph_data in zip(
|
||||
|
|
|
|||
105
poetry.lock
generated
105
poetry.lock
generated
|
|
@ -1525,6 +1525,17 @@ files = [
|
|||
[package.extras]
|
||||
dev = ["coverage", "pytest (>=7.4.4)"]
|
||||
|
||||
[[package]]
|
||||
name = "et-xmlfile"
|
||||
version = "2.0.0"
|
||||
description = "An implementation of lxml.xmlfile for the standard library"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
|
||||
{file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "exceptiongroup"
|
||||
version = "1.2.2"
|
||||
|
|
@ -4655,6 +4666,20 @@ typing-extensions = ">=4.11,<5"
|
|||
[package.extras]
|
||||
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.1.5"
|
||||
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
|
||||
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
et-xmlfile = "*"
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-api"
|
||||
version = "1.27.0"
|
||||
|
|
@ -5885,6 +5910,17 @@ bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "r
|
|||
dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", "grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"]
|
||||
model = ["milvus-model (>=0.1.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pypandoc"
|
||||
version = "1.14"
|
||||
description = "Thin wrapper for pandoc."
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "pypandoc-1.14-py3-none-any.whl", hash = "sha256:1315c7ad7fac7236dacf69a05b521ed2c3f1d0177f70e9b92bfffce6c023df22"},
|
||||
{file = "pypandoc-1.14.tar.gz", hash = "sha256:6b4c45f5f1b9fb5bb562079164806bdbbc3e837b5402bcf3f1139edc5730a197"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyparsing"
|
||||
version = "3.2.0"
|
||||
|
|
@ -6008,6 +6044,21 @@ files = [
|
|||
[package.dependencies]
|
||||
six = ">=1.5"
|
||||
|
||||
[[package]]
|
||||
name = "python-docx"
|
||||
version = "1.1.2"
|
||||
description = "Create, read, and update Microsoft Word .docx files."
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "python_docx-1.1.2-py3-none-any.whl", hash = "sha256:08c20d6058916fb19853fcf080f7f42b6270d89eac9fa5f8c15f691c0017fabe"},
|
||||
{file = "python_docx-1.1.2.tar.gz", hash = "sha256:0cf1f22e95b9002addca7948e16f2cd7acdfd498047f1941ca5d293db7762efd"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
lxml = ">=3.1.0"
|
||||
typing-extensions = ">=4.9.0"
|
||||
|
||||
[[package]]
|
||||
name = "python-dotenv"
|
||||
version = "1.0.1"
|
||||
|
|
@ -6085,6 +6136,23 @@ click = "*"
|
|||
olefile = "*"
|
||||
typing-extensions = ">=4.9.0"
|
||||
|
||||
[[package]]
|
||||
name = "python-pptx"
|
||||
version = "1.0.2"
|
||||
description = "Create, read, and update PowerPoint 2007+ (.pptx) files."
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"},
|
||||
{file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
lxml = ">=3.1.0"
|
||||
Pillow = ">=3.3.2"
|
||||
typing-extensions = ">=4.9.0"
|
||||
XlsxWriter = ">=0.5.7"
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2024.2"
|
||||
|
|
@ -8141,18 +8209,26 @@ filetype = "*"
|
|||
html5lib = "*"
|
||||
langdetect = "*"
|
||||
lxml = "*"
|
||||
markdown = {version = "*", optional = true, markers = "extra == \"md\""}
|
||||
networkx = {version = "*", optional = true, markers = "extra == \"xlsx\""}
|
||||
nltk = "*"
|
||||
numpy = "<2"
|
||||
openpyxl = {version = "*", optional = true, markers = "extra == \"xlsx\""}
|
||||
pandas = {version = "*", optional = true, markers = "extra == \"csv\" or extra == \"tsv\" or extra == \"xlsx\""}
|
||||
psutil = "*"
|
||||
pypandoc = {version = "*", optional = true, markers = "extra == \"epub\" or extra == \"odt\" or extra == \"org\" or extra == \"rst\" or extra == \"rtf\""}
|
||||
python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"doc\" or extra == \"docx\" or extra == \"odt\""}
|
||||
python-iso639 = "*"
|
||||
python-magic = "*"
|
||||
python-oxmsg = "*"
|
||||
python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"ppt\" or extra == \"pptx\""}
|
||||
rapidfuzz = "*"
|
||||
requests = "*"
|
||||
tqdm = "*"
|
||||
typing-extensions = "*"
|
||||
unstructured-client = "*"
|
||||
wrapt = "*"
|
||||
xlrd = {version = "*", optional = true, markers = "extra == \"xlsx\""}
|
||||
|
||||
[package.extras]
|
||||
all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"]
|
||||
|
|
@ -8498,6 +8574,33 @@ files = [
|
|||
{file = "wrapt-1.17.0.tar.gz", hash = "sha256:16187aa2317c731170a88ef35e8937ae0f533c402872c1ee5e6d079fcf320801"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xlrd"
|
||||
version = "2.0.1"
|
||||
description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files"
|
||||
optional = true
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
||||
files = [
|
||||
{file = "xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd"},
|
||||
{file = "xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
build = ["twine", "wheel"]
|
||||
docs = ["sphinx"]
|
||||
test = ["pytest", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "xlsxwriter"
|
||||
version = "3.2.0"
|
||||
description = "A Python module for creating Excel XLSX files."
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"},
|
||||
{file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xxhash"
|
||||
version = "3.5.0"
|
||||
|
|
@ -8765,4 +8868,4 @@ weaviate = ["weaviate-client"]
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.9.0,<3.12"
|
||||
content-hash = "c9a760447a62b3c71fa84f20a614b6d3c5725b3869fc87f78b03eb2c80841ce1"
|
||||
content-hash = "c1f30981f79db94213a89aec3207f0b4775944968e97dda8aa49c3aa143ce7b5"
|
||||
|
|
|
|||
|
|
@ -73,7 +73,9 @@ llama-index-core = {version = "^0.11.22", optional = true}
|
|||
deepeval = {version = "^2.0.1", optional = true}
|
||||
transformers = "^4.46.3"
|
||||
pymilvus = {version = "^2.5.0", optional = true}
|
||||
unstructured = {version = "^0.16.10", optional = true}
|
||||
unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.10", optional = true }
|
||||
|
||||
|
||||
|
||||
[tool.poetry.extras]
|
||||
filesystem = ["s3fs", "botocore"]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue