Compare commits
4 commits
main
...
1gb_data_t
| Author | SHA1 | Date |
|---|---|---|
| | 6a7f5be561 | |
| | 278ede9dbe | |
| | e68b19ec83 | |
| | 7429f3253c | |
3 changed files with 71 additions and 7 deletions
47
1gb_nonparallel_cognee.py
Normal file
47
1gb_nonparallel_cognee.py
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
import modal
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import cognee
|
||||||
|
from cognee.shared.logging_utils import get_logger, setup_logging, INFO
|
||||||
|
|
||||||
|
# Module-level logger for this script (cognee's logging helper).
logger = get_logger()

# Modal application object; the name doubles as the deployed app's identifier.
app = modal.App("1gb_nonparallel_cognee")

# Container image for the remote function: built from the local Dockerfile,
# with the project's dependency manifests copied in and a handful of extra
# packages pip-installed on top.
# NOTE(review): `copy_local_file` is deprecated in newer Modal releases in
# favour of `add_local_file` — confirm against the pinned modal version.
image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .pip_install(
        "protobuf",
        "h2",
        "deepeval",
        "gdown",
        "plotly",
        # Pinned DB drivers: presumably matched to the Postgres backend used
        # by cognee in this benchmark — verify against the Dockerfile.
        "psycopg2-binary==2.9.10",
        "asyncpg==0.30.0",
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.function(
    image=image,
    max_containers=1,  # force a single container: this is the non-parallel benchmark
    timeout=86400,  # 24 h — a 1 GB cognify run can be very long
    secrets=[modal.Secret.from_name("1gb_nonparallel_cognee")],
)
async def run_cognee_1gb():
    """Run the full 1 GB cognee ingestion benchmark in a single container.

    Wipes all existing cognee data and system metadata, then ingests the
    benchmark S3 bucket and builds the knowledge graph with ``cognify``.

    Returns:
        True on completion (the prune/add/cognify calls raise on failure).
    """
    # Configure logging FIRST so the prune calls below also emit logs.
    # (Previously this ran after pruning, losing those log records.)
    setup_logging(log_level=INFO)

    # Start from a clean slate: drop ingested data and system metadata.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # Ingest the 1 GB test bucket, then build the knowledge graph.
    await cognee.add("s3://s3-test-laszlo")
    await cognee.cognify()
    return True
|
||||||
|
|
||||||
|
|
||||||
|
@app.local_entrypoint()
async def main():
    """Local entrypoint: kick off the remote benchmark run and wait for it."""
    # A single remote invocation; gather keeps the shape ready for fan-out.
    await asyncio.gather(run_cognee_1gb.remote.aio())
|
||||||
|
|
@ -151,13 +151,13 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
||||||
extract_graph_from_data,
|
extract_graph_from_data,
|
||||||
graph_model=graph_model,
|
graph_model=graph_model,
|
||||||
ontology_adapter=OntologyResolver(ontology_file=ontology_file_path),
|
ontology_adapter=OntologyResolver(ontology_file=ontology_file_path),
|
||||||
task_config={"batch_size": 10},
|
task_config={"batch_size": 50},
|
||||||
), # Generate knowledge graphs from the document chunks.
|
), # Generate knowledge graphs from the document chunks.
|
||||||
Task(
|
Task(
|
||||||
summarize_text,
|
summarize_text,
|
||||||
task_config={"batch_size": 10},
|
task_config={"batch_size": 50},
|
||||||
),
|
),
|
||||||
Task(add_data_points, task_config={"batch_size": 10}),
|
Task(add_data_points, task_config={"batch_size": 50}),
|
||||||
]
|
]
|
||||||
|
|
||||||
return default_tasks
|
return default_tasks
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,11 @@
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
from pypdf.errors import PdfReadError
|
||||||
from cognee.modules.chunking.Chunker import Chunker
|
from cognee.modules.chunking.Chunker import Chunker
|
||||||
from .open_data_file import open_data_file
|
from .open_data_file import open_data_file
|
||||||
from .Document import Document
|
from .Document import Document
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger("PDFDocument")
|
||||||
|
|
||||||
|
|
||||||
class PdfDocument(Document):
|
class PdfDocument(Document):
|
||||||
|
|
@ -9,12 +13,25 @@ class PdfDocument(Document):
|
||||||
|
|
||||||
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
||||||
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
||||||
file = PdfReader(stream)
|
logger.info(f"Reading PDF:{self.raw_data_location}")
|
||||||
|
try:
|
||||||
|
file = PdfReader(stream, strict=False)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location} with error: {e}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
def get_text():
|
def get_text():
|
||||||
for page in file.pages:
|
try:
|
||||||
page_text = page.extract_text()
|
for page in file.pages:
|
||||||
yield page_text
|
page_text = page.extract_text()
|
||||||
|
yield page_text
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location} with error: {e}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue