openrag/src/app.py
estevez.sebastian@gmail.com a95bc06ff1 take 0
2025-07-10 22:36:45 -04:00

172 lines
5 KiB
Python

# app.py
import os
os.environ['USE_CPU_ONLY'] = 'true'
import json
import hashlib
import tempfile
import asyncio
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import JSONResponse
from starlette.routing import Route
import aiofiles
from opensearchpy import AsyncOpenSearch
from opensearchpy._async.http_aiohttp import AIOHttpConnection
from docling.document_converter import DocumentConverter
# Initialize the Docling converter once at import time; the request handlers below reuse it.
converter = DocumentConverter()  # basic converter; tweak via PipelineOptions if you need OCR, etc.
# Initialize the async OpenSearch client.
# Host, port and credentials can be overridden through environment variables;
# the defaults reproduce the original hard-coded local development setup.
# NOTE(review): the fallback password and verify_certs=False are only suitable
# for local development -- set OPENSEARCH_PASSWORD and enable certificate
# verification before deploying anywhere else.
es = AsyncOpenSearch(
    hosts=[
        {
            "host": os.getenv("OPENSEARCH_HOST", "localhost"),
            "port": int(os.getenv("OPENSEARCH_PORT", "9200")),
        }
    ],
    connection_class=AIOHttpConnection,
    scheme="https",
    use_ssl=True,
    verify_certs=False,
    ssl_assert_fingerprint=None,
    http_auth=(
        os.getenv("OPENSEARCH_USERNAME", "admin"),
        os.getenv("OPENSEARCH_PASSWORD", "OSisgendb1!"),
    ),
    http_compress=True,
)
# Name of the OpenSearch index that receives the converted documents.
INDEX_NAME = "documents"

# Body passed to the index-creation call: index settings plus field mappings.
index_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1,
    },
    "mappings": {
        "properties": {
            "origin": {
                "properties": {
                    # Explicit keyword mapping for origin.binary_hash.
                    "binary_hash": {"type": "keyword"},
                },
            },
        },
    },
}
async def init_index():
    """Create the ``INDEX_NAME`` index from ``index_body`` unless it already exists."""
    already_present = await es.indices.exists(index=INDEX_NAME)
    if already_present:
        print(f"Index '{INDEX_NAME}' already exists, skipping creation.")
        return
    await es.indices.create(index=INDEX_NAME, body=index_body)
    print(f"Created index '{INDEX_NAME}'")
# init_index() is run by the script entry point at the bottom of this file, before the server starts.
# ——————————————
# CORE PROCESSING LOGIC
# ——————————————
async def process_file_on_disk(path: str):
    """
    Index one file from disk into OpenSearch, keyed by its content hash.

    The SHA-256 hex digest of the file is used as the document ID. If a
    document with that ID already exists, nothing is converted or written;
    otherwise the file is converted with Docling and the exported dict is
    indexed under that ID.

    Returns a dict with ``path``, ``status`` (``"unchanged"`` or
    ``"indexed"``) and ``id`` (the hex digest).
    """
    # Hash incrementally so the whole file is never held in memory at once.
    digest = hashlib.sha256()
    async with aiofiles.open(path, "rb") as handle:
        block = await handle.read(1 << 20)  # 1 MiB per read
        while block:
            digest.update(block)
            block = await handle.read(1 << 20)
    doc_id = digest.hexdigest()

    # Same content hash already stored: nothing to do.
    if await es.exists(index=INDEX_NAME, id=doc_id):
        return {"path": path, "status": "unchanged", "id": doc_id}

    # NOTE: converter.convert is a plain synchronous call, so the event loop
    # is blocked for as long as the conversion runs.
    conversion = converter.convert(path)
    payload = conversion.document.export_to_dict()
    await es.index(index=INDEX_NAME, id=doc_id, body=payload)
    return {"path": path, "status": "indexed", "id": doc_id}
async def upload(request: Request):
    """
    POST /upload

    Multipart form-data with a `file` field. The upload is streamed to a
    temporary file while its SHA-256 digest is computed; the digest is used
    as the OpenSearch document ID.

    Responses:
      * 400 ``{"error": ...}`` when `file` is missing or is not a file upload.
      * 200 ``{"status": "unchanged", "id": ...}`` when that ID is already indexed.
      * 200 ``{"status": "indexed", "id": ...}`` after converting and indexing.

    The temporary file is always removed before returning.
    """
    form = await request.form()
    upload_file = form.get("file")  # starlette.datastructures.UploadFile
    # A missing field yields None and a plain text field yields str; neither
    # can be streamed, so answer 400 instead of failing with a 500.
    if upload_file is None or isinstance(upload_file, str):
        return JSONResponse(
            {"error": "Missing file upload in 'file' field"}, status_code=400
        )

    # Keep the client's file extension on the temp file so Docling can use it
    # when detecting the input format. Only plain alphanumeric extensions are
    # accepted because the filename is untrusted input.
    # NOTE(review): confirm Docling's extension-based detection for your formats.
    ext = os.path.splitext(upload_file.filename or "")[1]
    suffix = ext if ext[1:].isalnum() else ""

    # stream into a temp file while hashing
    sha256 = hashlib.sha256()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        while True:
            chunk = await upload_file.read(1 << 20)  # 1 MiB
            if not chunk:
                break
            sha256.update(chunk)
            tmp.write(chunk)
        # Closing flushes the data and releases the handle, so the converter
        # can reopen the file by name on every platform.
        tmp.close()
        file_hash = sha256.hexdigest()
        # if you prefer the Datastax pattern for naming IDs, see:
        # https://github.com/datastax/astra-assistants-api/blob/main/impl/utils.py#L229

        # check + index
        exists = await es.exists(index=INDEX_NAME, id=file_hash)
        if exists:
            return JSONResponse({"status": "unchanged", "id": file_hash})

        # NOTE: converter.convert is synchronous and blocks the event loop
        # for the duration of the conversion.
        result = converter.convert(tmp.name)
        doc_dict = result.document.export_to_dict()
        await es.index(index=INDEX_NAME, id=file_hash, body=doc_dict)
        return JSONResponse({"status": "indexed", "id": file_hash})
    finally:
        tmp.close()  # no-op if already closed above
        os.remove(tmp.name)
async def upload_path(request: Request):
    """
    POST /upload_path

    JSON body: { "path": "/absolute/path/to/dir" }
    Recursively processes every file found there, a bounded number at a time.

    Responds with 400 when the body is not valid JSON, is not a JSON object,
    or `path` is not an existing directory. Otherwise responds with
    ``{"results": [...]}`` holding one entry per file in walk order: the dict
    returned by ``process_file_on_disk`` or, if processing that file raised,
    ``{"path": ..., "status": "error", "error": "<message>"}``.
    """
    # NOTE(review): `path` is client-supplied and is walked as-is, so any
    # directory readable by the server process can be indexed -- restrict it
    # to an allowed root before exposing this endpoint to untrusted callers.
    try:
        payload = await request.json()
    except ValueError:  # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    base_dir = payload.get("path") if isinstance(payload, dict) else None
    if not isinstance(base_dir, str) or not base_dir or not os.path.isdir(base_dir):
        return JSONResponse({"error": "Invalid path"}, status_code=400)

    # Cap the number of files in flight so a large tree cannot open every
    # file at once and run out of file descriptors.
    max_in_flight = 8
    limiter = asyncio.Semaphore(max_in_flight)

    async def _process_one(full: str):
        """Process one file; report a failure as a result entry instead of raising."""
        async with limiter:
            try:
                return await process_file_on_disk(full)
            except Exception as exc:  # per-file boundary: one bad file must not abort the batch
                return {"path": full, "status": "error", "error": str(exc)}

    paths = [
        os.path.join(root, fn)
        for root, _, files in os.walk(base_dir)
        for fn in files
    ]
    results = await asyncio.gather(*(_process_one(p) for p in paths))
    return JSONResponse({"results": results})
# ASGI application exposing the two ingestion endpoints (both POST-only).
# NOTE(review): debug=True is a development setting -- confirm it is turned
# off for any non-development deployment.
app = Starlette(
    routes=[
        Route("/upload", endpoint=upload, methods=["POST"]),
        Route("/upload_path", endpoint=upload_path, methods=["POST"]),
    ],
    debug=True,
)
if __name__ == "__main__":
    import uvicorn

    # Make sure the index exists before the server starts.
    asyncio.run(init_index())

    # Listen on all interfaces, port 8000; auto-reload is for development only.
    server_options = {"host": "0.0.0.0", "port": 8000, "reload": True}
    uvicorn.run("app:app", **server_options)  # target is "module:variable"