take 0
commit 6882fe59d2
9 changed files with 2222 additions and 0 deletions
12  .gitignore  vendored  Normal file
@@ -0,0 +1,12 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

.idea/
1  .python-version  Normal file
@@ -0,0 +1 @@
3.13
40  Dockerfile  Normal file
@@ -0,0 +1,40 @@
FROM opensearchproject/opensearch:3.0.0

USER root

RUN echo y | dnf install less procps-ng findutils sysstat perf sudo

# Grant the opensearch user sudo privileges
# 'wheel' is the sudo group in Amazon Linux
RUN usermod -aG wheel opensearch

# Change the sudoers file to allow passwordless sudo
RUN echo "opensearch ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# FIXME handle the machine arch better, somehow
ARG ASYNC_PROFILER_URL=https://github.com/async-profiler/async-profiler/releases/download/v4.0/async-profiler-4.0-linux-x64.tar.gz

RUN mkdir /opt/async-profiler
RUN curl -s -L $ASYNC_PROFILER_URL | tar zxvf - --strip-components=1 -C /opt/async-profiler
RUN chown -R opensearch:opensearch /opt/async-profiler

RUN echo "#!/bin/bash" > /usr/share/opensearch/profile.sh
RUN echo "export PATH=\$PATH:/opt/async-profiler/bin" >> /usr/share/opensearch/profile.sh
RUN echo "echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid >/dev/null" >> /usr/share/opensearch/profile.sh
RUN echo "echo 0 | sudo tee /proc/sys/kernel/kptr_restrict >/dev/null" >> /usr/share/opensearch/profile.sh
RUN echo "asprof \$@" >> /usr/share/opensearch/profile.sh

RUN chmod 777 /usr/share/opensearch/profile.sh

USER opensearch

RUN opensearch-plugin remove opensearch-neural-search
RUN opensearch-plugin remove opensearch-knn

# FIXME installing the prom exporter plugin ahead of time isn't compatible with the operator, for now
# RUN opensearch-plugin install https://github.com/Virtimo/prometheus-exporter-plugin-for-opensearch/releases/download/v2.18.0/prometheus-exporter-2.18.0.0.zip

RUN echo y | opensearch-plugin install https://repo1.maven.org/maven2/org/opensearch/plugin/opensearch-jvector-plugin/3.0.0.3/opensearch-jvector-plugin-3.0.0.3.zip
RUN echo y | opensearch-plugin install repository-gcs
RUN echo y | opensearch-plugin install repository-azure
RUN echo y | opensearch-plugin install repository-s3
0  README.md  Normal file
25  docker-compose.yml  Normal file
@@ -0,0 +1,25 @@
services:
  opensearch:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: os
    environment:
      - discovery.type=single-node
      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=OSisgendb1!
    ports:
      - "9200:9200"
      - "9600:9600"

  dashboards:
    image: opensearchproject/opensearch-dashboards:3.0.0
    container_name: osdash
    depends_on:
      - opensearch
    environment:
      OPENSEARCH_HOSTS: '["https://opensearch:9200"]'
      OPENSEARCH_USERNAME: "admin"
      OPENSEARCH_PASSWORD: "OSisgendb1!"
    ports:
      - "5601:5601"
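Not part of the commit: a minimal connectivity check, assuming the compose stack above is running locally with the demo self-signed certificate and the admin password from the environment block. The synchronous opensearch-py client is used here only for brevity.

# sketch: confirm the single-node cluster from docker-compose.yml is reachable
from opensearchpy import OpenSearch

client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    use_ssl=True,
    verify_certs=False,               # demo self-signed certificate
    http_auth=("admin", "OSisgendb1!"),
)
print(client.info())                  # prints cluster name and version if the node is up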
BIN  documents/2506.08231v1.pdf  Normal file  (binary file not shown)
28  pyproject.toml  Normal file
@@ -0,0 +1,28 @@
[project]
name = "gendb"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "aiofiles>=24.1.0",
    "docling>=2.41.0",
    "opensearch-py[async]>=3.0.0",
    "python-multipart>=0.0.20",
    "starlette>=0.47.1",
    "torch>=2.7.1",
    "uvicorn>=0.35.0",
]

[tool.uv.sources]
torch = [
    { index = "pytorch-cu128" },
]
torchvision = [
    { index = "pytorch-cu128" },
]

[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
explicit = true
172  src/app.py  Normal file
@@ -0,0 +1,172 @@
# app.py

import os
os.environ['USE_CPU_ONLY'] = 'true'

import json
import hashlib
import tempfile
import asyncio

from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import JSONResponse
from starlette.routing import Route

import aiofiles
from opensearchpy import AsyncOpenSearch
from opensearchpy._async.http_aiohttp import AIOHttpConnection
from docling.document_converter import DocumentConverter


# Initialize Docling converter
converter = DocumentConverter()  # basic converter; tweak via PipelineOptions if you need OCR, etc.

# Initialize Async OpenSearch (adjust hosts/auth as needed)
es = AsyncOpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    connection_class=AIOHttpConnection,
    scheme="https",
    use_ssl=True,
    verify_certs=False,
    ssl_assert_fingerprint=None,
    http_auth=("admin", "OSisgendb1!"),
    http_compress=True,
)

INDEX_NAME = "documents"

index_body = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 1},
    "mappings": {
        "properties": {
            "origin": {
                "properties": {
                    "binary_hash": {"type": "keyword"}
                }
            }
        }
    }
}


async def init_index():
    if not await es.indices.exists(index=INDEX_NAME):
        await es.indices.create(index=INDEX_NAME, body=index_body)
        print(f"Created index '{INDEX_NAME}'")
    else:
        print(f"Index '{INDEX_NAME}' already exists, skipping creation.")

# Index will be initialized when the app starts


# ——————————————
# CORE PROCESSING LOGIC
# ——————————————

async def process_file_on_disk(path: str):
    """
    1. Compute a SHA-256 hash by streaming the file in chunks.
    2. If OpenSearch already has a doc with that ID, skip.
    3. Otherwise, run Docling convert(path) → dict → index into OpenSearch.
    """
    # 1) compute hash
    sha256 = hashlib.sha256()
    async with aiofiles.open(path, "rb") as f:
        while True:
            chunk = await f.read(1 << 20)  # 1 MiB
            if not chunk:
                break
            sha256.update(chunk)
    file_hash = sha256.hexdigest()

    # 2) check in OpenSearch
    exists = await es.exists(index=INDEX_NAME, id=file_hash)
    if exists:
        return {"path": path, "status": "unchanged", "id": file_hash}

    # 3) parse + index
    result = converter.convert(path)
    doc_dict = result.document.export_to_dict()
    await es.index(index=INDEX_NAME, id=file_hash, body=doc_dict)

    return {"path": path, "status": "indexed", "id": file_hash}


async def upload(request: Request):
    """
    POST /upload
    Form-data with a `file` field. Streams to disk + processes it.
    """
    form = await request.form()
    upload_file = form["file"]  # starlette.datastructures.UploadFile

    # stream into a temp file while hashing
    sha256 = hashlib.sha256()
    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        while True:
            chunk = await upload_file.read(1 << 20)
            if not chunk:
                break
            sha256.update(chunk)
            tmp.write(chunk)
        tmp.flush()

        file_hash = sha256.hexdigest()
        # if you prefer the Datastax pattern for naming IDs, see:
        # https://github.com/datastax/astra-assistants-api/blob/main/impl/utils.py#L229

        # check + index
        exists = await es.exists(index=INDEX_NAME, id=file_hash)
        if exists:
            return JSONResponse({"status": "unchanged", "id": file_hash})

        result = converter.convert(tmp.name)
        doc_dict = result.document.export_to_dict()
        await es.index(index=INDEX_NAME, id=file_hash, body=doc_dict)

        return JSONResponse({"status": "indexed", "id": file_hash})

    finally:
        tmp.close()
        os.remove(tmp.name)


async def upload_path(request: Request):
    """
    POST /upload_path
    JSON body: { "path": "/absolute/path/to/dir" }
    Recursively processes every file found there in parallel.
    """
    payload = await request.json()
    base_dir = payload.get("path")
    if not base_dir or not os.path.isdir(base_dir):
        return JSONResponse({"error": "Invalid path"}, status_code=400)

    tasks = []
    for root, _, files in os.walk(base_dir):
        for fn in files:
            full = os.path.join(root, fn)
            tasks.append(process_file_on_disk(full))

    results = await asyncio.gather(*tasks)
    return JSONResponse({"results": results})


app = Starlette(debug=True, routes=[
    Route("/upload", upload, methods=["POST"]),
    Route("/upload_path", upload_path, methods=["POST"]),
])


if __name__ == "__main__":
    import uvicorn

    # Initialize index before starting server
    asyncio.run(init_index())

    uvicorn.run(
        "app:app",   # "module:variable"
        host="0.0.0.0",
        port=8000,
        reload=True,  # dev only
    )
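Not part of the commit: a minimal sketch of exercising the /upload_path route above once the app is running on localhost:8000. The directory path is a placeholder; only the standard library is used. (/upload takes multipart form data with a `file` field instead.)

# sketch: call /upload_path with a JSON body, as upload_path() expects
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8000/upload_path",
    data=json.dumps({"path": "/absolute/path/to/documents"}).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp))  # {"results": [{"path": ..., "status": ..., "id": ...}, ...]}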