take 0
commit 6882fe59d2
9 changed files with 2222 additions and 0 deletions
12  .gitignore  vendored  Normal file
@@ -0,0 +1,12 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

.idea/
1  .python-version  Normal file
@@ -0,0 +1 @@
3.13
40  Dockerfile  Normal file
@@ -0,0 +1,40 @@
FROM opensearchproject/opensearch:3.0.0

USER root

RUN echo y | dnf install less procps-ng findutils sysstat perf sudo

# Grant the opensearch user sudo privileges
# 'wheel' is the sudo group in Amazon Linux
RUN usermod -aG wheel opensearch

# Change the sudoers file to allow passwordless sudo
RUN echo "opensearch ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# FIXME handle the machine arch better, somehow
ARG ASYNC_PROFILER_URL=https://github.com/async-profiler/async-profiler/releases/download/v4.0/async-profiler-4.0-linux-x64.tar.gz

RUN mkdir /opt/async-profiler
RUN curl -s -L $ASYNC_PROFILER_URL | tar zxvf - --strip-components=1 -C /opt/async-profiler
RUN chown -R opensearch:opensearch /opt/async-profiler

RUN echo "#!/bin/bash" > /usr/share/opensearch/profile.sh
RUN echo "export PATH=\$PATH:/opt/async-profiler/bin" >> /usr/share/opensearch/profile.sh
RUN echo "echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid >/dev/null" >> /usr/share/opensearch/profile.sh
RUN echo "echo 0 | sudo tee /proc/sys/kernel/kptr_restrict >/dev/null" >> /usr/share/opensearch/profile.sh
RUN echo "asprof \$@" >> /usr/share/opensearch/profile.sh

RUN chmod 777 /usr/share/opensearch/profile.sh

USER opensearch

RUN opensearch-plugin remove opensearch-neural-search
RUN opensearch-plugin remove opensearch-knn

# FIXME installing the prom exporter plugin ahead of time isn't compatible with the operator, for now
# RUN opensearch-plugin install https://github.com/Virtimo/prometheus-exporter-plugin-for-opensearch/releases/download/v2.18.0/prometheus-exporter-2.18.0.0.zip

RUN echo y | opensearch-plugin install https://repo1.maven.org/maven2/org/opensearch/plugin/opensearch-jvector-plugin/3.0.0.3/opensearch-jvector-plugin-3.0.0.3.zip
RUN echo y | opensearch-plugin install repository-gcs
RUN echo y | opensearch-plugin install repository-azure
RUN echo y | opensearch-plugin install repository-s3
0  README.md  Normal file
25  docker-compose.yml  Normal file
@@ -0,0 +1,25 @@
services:
  opensearch:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: os
    environment:
      - discovery.type=single-node
      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=OSisgendb1!
    ports:
      - "9200:9200"
      - "9600:9600"

  dashboards:
    image: opensearchproject/opensearch-dashboards:3.0.0
    container_name: osdash
    depends_on:
      - opensearch
    environment:
      OPENSEARCH_HOSTS: '["https://opensearch:9200"]'
      OPENSEARCH_USERNAME: "admin"
      OPENSEARCH_PASSWORD: "OSisgendb1!"
    ports:
      - "5601:5601"
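Not part of the commit: a minimal connectivity check, assuming the compose stack above is running locally with the demo self-signed certificate and the admin password from the environment block. The synchronous opensearch-py client is used here only for brevity.

# sketch: confirm the single-node cluster from docker-compose.yml is reachable
from opensearchpy import OpenSearch

client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    use_ssl=True,
    verify_certs=False,               # demo self-signed certificate
    http_auth=("admin", "OSisgendb1!"),
)
print(client.info())                  # prints cluster name and version if the node is up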
BIN  documents/2506.08231v1.pdf  Normal file  (binary file not shown)
28  pyproject.toml  Normal file
@@ -0,0 +1,28 @@
[project]
name = "gendb"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "aiofiles>=24.1.0",
    "docling>=2.41.0",
    "opensearch-py[async]>=3.0.0",
    "python-multipart>=0.0.20",
    "starlette>=0.47.1",
    "torch>=2.7.1",
    "uvicorn>=0.35.0",
]

[tool.uv.sources]
torch = [
    { index = "pytorch-cu128" },
]
torchvision = [
    { index = "pytorch-cu128" },
]

[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
explicit = true
172  src/app.py  Normal file
@@ -0,0 +1,172 @@
# app.py

import os
os.environ['USE_CPU_ONLY'] = 'true'

import json
import hashlib
import tempfile
import asyncio

from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import JSONResponse
from starlette.routing import Route

import aiofiles
from opensearchpy import AsyncOpenSearch
from opensearchpy._async.http_aiohttp import AIOHttpConnection
from docling.document_converter import DocumentConverter


# Initialize Docling converter
converter = DocumentConverter()  # basic converter; tweak via PipelineOptions if you need OCR, etc.

# Initialize Async OpenSearch (adjust hosts/auth as needed)
es = AsyncOpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    connection_class=AIOHttpConnection,
    scheme="https",
    use_ssl=True,
    verify_certs=False,
    ssl_assert_fingerprint=None,
    http_auth=("admin", "OSisgendb1!"),
    http_compress=True,
)

INDEX_NAME = "documents"

index_body = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 1},
    "mappings": {
        "properties": {
            "origin": {
                "properties": {
                    "binary_hash": {"type": "keyword"}
                }
            }
        }
    }
}


async def init_index():
    if not await es.indices.exists(index=INDEX_NAME):
        await es.indices.create(index=INDEX_NAME, body=index_body)
        print(f"Created index '{INDEX_NAME}'")
    else:
        print(f"Index '{INDEX_NAME}' already exists, skipping creation.")

# Index will be initialized when the app starts


# ——————————————
# CORE PROCESSING LOGIC
# ——————————————

async def process_file_on_disk(path: str):
    """
    1. Compute a SHA-256 hash by streaming the file in chunks.
    2. If OpenSearch already has a doc with that ID, skip.
    3. Otherwise, run Docling convert(path) → dict → index into OpenSearch.
    """
    # 1) compute hash
    sha256 = hashlib.sha256()
    async with aiofiles.open(path, "rb") as f:
        while True:
            chunk = await f.read(1 << 20)  # 1 MiB
            if not chunk:
                break
            sha256.update(chunk)
    file_hash = sha256.hexdigest()

    # 2) check in OpenSearch
    exists = await es.exists(index=INDEX_NAME, id=file_hash)
    if exists:
        return {"path": path, "status": "unchanged", "id": file_hash}

    # 3) parse + index
    result = converter.convert(path)
    doc_dict = result.document.export_to_dict()
    await es.index(index=INDEX_NAME, id=file_hash, body=doc_dict)

    return {"path": path, "status": "indexed", "id": file_hash}


async def upload(request: Request):
    """
    POST /upload
    Form-data with a `file` field. Streams to disk + processes it.
    """
    form = await request.form()
    upload_file = form["file"]  # starlette.datastructures.UploadFile

    # stream into a temp file while hashing
    sha256 = hashlib.sha256()
    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        while True:
            chunk = await upload_file.read(1 << 20)
            if not chunk:
                break
            sha256.update(chunk)
            tmp.write(chunk)
        tmp.flush()

        file_hash = sha256.hexdigest()
        # if you prefer the Datastax pattern for naming IDs, see:
        # https://github.com/datastax/astra-assistants-api/blob/main/impl/utils.py#L229

        # check + index
        exists = await es.exists(index=INDEX_NAME, id=file_hash)
        if exists:
            return JSONResponse({"status": "unchanged", "id": file_hash})

        result = converter.convert(tmp.name)
        doc_dict = result.document.export_to_dict()
        await es.index(index=INDEX_NAME, id=file_hash, body=doc_dict)

        return JSONResponse({"status": "indexed", "id": file_hash})

    finally:
        tmp.close()
        os.remove(tmp.name)


async def upload_path(request: Request):
    """
    POST /upload_path
    JSON body: { "path": "/absolute/path/to/dir" }
    Recursively processes every file found there in parallel.
    """
    payload = await request.json()
    base_dir = payload.get("path")
    if not base_dir or not os.path.isdir(base_dir):
        return JSONResponse({"error": "Invalid path"}, status_code=400)

    tasks = []
    for root, _, files in os.walk(base_dir):
        for fn in files:
            full = os.path.join(root, fn)
            tasks.append(process_file_on_disk(full))

    results = await asyncio.gather(*tasks)
    return JSONResponse({"results": results})


app = Starlette(debug=True, routes=[
    Route("/upload", upload, methods=["POST"]),
    Route("/upload_path", upload_path, methods=["POST"]),
])


if __name__ == "__main__":
    import uvicorn

    # Initialize index before starting server
    asyncio.run(init_index())

    uvicorn.run(
        "app:app",   # "module:variable"
        host="0.0.0.0",
        port=8000,
        reload=True,  # dev only
    )
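Not part of the commit: a minimal sketch of exercising the /upload_path route above once the app is running on localhost:8000. The directory path is a placeholder; only the standard library is used. (/upload takes multipart form data with a `file` field instead.)

# sketch: call /upload_path with a JSON body, as upload_path() expects
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8000/upload_path",
    data=json.dumps({"path": "/absolute/path/to/documents"}).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp))  # {"results": [{"path": ..., "status": ..., "id": ...}, ...]}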