# app.py

import os

os.environ['USE_CPU_ONLY'] = 'true'

import json
import hashlib
import tempfile
import asyncio

from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import JSONResponse
from starlette.routing import Route

import aiofiles
from opensearchpy import AsyncOpenSearch
from opensearchpy._async.http_aiohttp import AIOHttpConnection
from docling.document_converter import DocumentConverter

# Initialize Docling converter
converter = DocumentConverter()  # basic converter; tweak via PipelineOptions if you need OCR, etc.
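
# The comment above mentions PipelineOptions; here is a minimal, commented-out sketch of
# an OCR-enabled converter. It is illustrative only and assumes a docling release that
# exposes PdfPipelineOptions / PdfFormatOption / InputFormat under these module paths:
#
#   from docling.datamodel.base_models import InputFormat
#   from docling.datamodel.pipeline_options import PdfPipelineOptions
#   from docling.document_converter import PdfFormatOption
#
#   pdf_options = PdfPipelineOptions()
#   pdf_options.do_ocr = True               # run OCR on scanned pages
#   pdf_options.do_table_structure = True   # recover table structure
#
#   converter = DocumentConverter(
#       format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}
#   )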

# Initialize Async OpenSearch (adjust hosts/auth as needed)
es = AsyncOpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    connection_class=AIOHttpConnection,
    scheme="https",
    use_ssl=True,
    verify_certs=False,
    ssl_assert_fingerprint=None,
    http_auth=("admin", "OSisgendb1!"),
    http_compress=True,
)

INDEX_NAME = "documents"

index_body = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 1},
    "mappings": {
        "properties": {
            "origin": {
                "properties": {
                    "binary_hash": {"type": "keyword"}
                }
            }
        }
    }
}


async def init_index():
    if not await es.indices.exists(index=INDEX_NAME):
        await es.indices.create(index=INDEX_NAME, body=index_body)
        print(f"Created index '{INDEX_NAME}'")
    else:
        print(f"Index '{INDEX_NAME}' already exists, skipping creation.")

# Index will be initialized when the app starts (via the on_startup hook below).


# ——————————————
# CORE PROCESSING LOGIC
# ——————————————

async def process_file_on_disk(path: str):
    """
    1. Compute a SHA-256 hash by streaming the file in chunks.
    2. If OpenSearch already has a doc with that ID, skip.
    3. Otherwise, run converter.convert(path) → dict → index into OpenSearch.
    """
    # 1) compute hash
    sha256 = hashlib.sha256()
    async with aiofiles.open(path, "rb") as f:
        while True:
            chunk = await f.read(1 << 20)  # 1 MiB
            if not chunk:
                break
            sha256.update(chunk)
    file_hash = sha256.hexdigest()

    # 2) check in OpenSearch
    exists = await es.exists(index=INDEX_NAME, id=file_hash)
    if exists:
        return {"path": path, "status": "unchanged", "id": file_hash}

    # 3) parse + index; Docling conversion is synchronous and CPU-bound, so run it in a
    # worker thread to keep the event loop responsive
    result = await asyncio.to_thread(converter.convert, path)
    doc_dict = result.document.export_to_dict()
    await es.index(index=INDEX_NAME, id=file_hash, body=doc_dict)

    return {"path": path, "status": "indexed", "id": file_hash}

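# process_file_on_disk() can also be driven from a one-off script, outside the HTTP
# layer (the path below is purely illustrative):
#
#   asyncio.run(process_file_on_disk("/data/docs/example.pdf"))
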

async def upload(request: Request):
    """
    POST /upload
    Form-data with a `file` field. Streams it to disk and processes it.
    """
    form = await request.form()
    upload_file = form.get("file")  # starlette.datastructures.UploadFile
    if upload_file is None:
        return JSONResponse({"error": "Missing 'file' form field"}, status_code=400)

    # stream into a temp file while hashing
    sha256 = hashlib.sha256()
    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        while True:
            chunk = await upload_file.read(1 << 20)
            if not chunk:
                break
            sha256.update(chunk)
            tmp.write(chunk)
        tmp.flush()

        file_hash = sha256.hexdigest()
        # if you prefer the Datastax pattern for naming IDs, see:
        # https://github.com/datastax/astra-assistants-api/blob/main/impl/utils.py#L229

        # check + index
        exists = await es.exists(index=INDEX_NAME, id=file_hash)
        if exists:
            return JSONResponse({"status": "unchanged", "id": file_hash})

        # run the synchronous Docling conversion in a worker thread (see note above)
        result = await asyncio.to_thread(converter.convert, tmp.name)
        doc_dict = result.document.export_to_dict()
        await es.index(index=INDEX_NAME, id=file_hash, body=doc_dict)

        return JSONResponse({"status": "indexed", "id": file_hash})

    finally:
        tmp.close()
        os.remove(tmp.name)


async def upload_path(request: Request):
    """
    POST /upload_path
    JSON body: { "path": "/absolute/path/to/dir" }
    Recursively processes every file found there concurrently.
    """
    payload = await request.json()
    base_dir = payload.get("path")
    if not base_dir or not os.path.isdir(base_dir):
        return JSONResponse({"error": "Invalid path"}, status_code=400)

    tasks = []
    for root, _, files in os.walk(base_dir):
        for fn in files:
            full = os.path.join(root, fn)
            tasks.append(process_file_on_disk(full))

    results = await asyncio.gather(*tasks)
    return JSONResponse({"results": results})

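# Note: the gather() above launches one task per file with no upper bound. For large
# directory trees you may want to cap concurrency; a minimal sketch (the limit of 8, the
# name `bounded`, and `paths` standing in for the collected file list are illustrative,
# not part of the original app):
#
#   sem = asyncio.Semaphore(8)
#
#   async def bounded(path: str):
#       async with sem:
#           return await process_file_on_disk(path)
#
#   results = await asyncio.gather(*(bounded(p) for p in paths))
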

app = Starlette(
    debug=True,
    routes=[
        Route("/upload", upload, methods=["POST"]),
        Route("/upload_path", upload_path, methods=["POST"]),
    ],
    on_startup=[init_index],  # create the index once the server's event loop is running
)

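# Example requests once the server is up (file name and directory are placeholders):
#
#   curl -X POST -F "file=@example.pdf" http://localhost:8000/upload
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"path": "/data/docs"}' http://localhost:8000/upload_path
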

if __name__ == "__main__":
    import uvicorn

    # The index is created by the on_startup hook when the server starts.
    uvicorn.run(
        "app:app",  # "module:variable"
        host="0.0.0.0",
        port=8000,
        reload=True,  # dev only
    )
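
# Equivalent CLI invocation:
#   uvicorn app:app --host 0.0.0.0 --port 8000 --reload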