per chunk index

estevez.sebastian@gmail.com 2025-07-11 00:57:32 -04:00
parent b97a86bdd5
commit 5465a1204a


@@ -47,31 +47,21 @@ index_body = {
},
"mappings": {
"properties": {
"id": { "type": "keyword" },
"origin": {
"properties": {
"binary_hash": { "type": "keyword" }
}
},
"filename": { "type": "keyword" },
"mimetype": { "type": "keyword" },
"chunks": {
"type": "nested",
"properties": {
"page": { "type": "integer" },
"text": { "type": "text" },
"chunk_embedding": {
"type": "knn_vector",
"dimension": VECTOR_DIM,
"method": {
"name": "disk_ann",
"engine": "jvector",
"space_type": "l2",
"parameters": {
"ef_construction": 100,
"m": 16
}
}
"document_id": { "type": "keyword" },
"filename": { "type": "keyword" },
"mimetype": { "type": "keyword" },
"page": { "type": "integer" },
"text": { "type": "text" },
"chunk_embedding": {
"type": "knn_vector",
"dimension": VECTOR_DIM,
"method": {
"name": "disk_ann",
"engine": "jvector",
"space_type": "l2",
"parameters": {
"ef_construction": 100,
"m": 16
}
}
}
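
The flattened mapping above can be applied at startup with the async client the rest of the file appears to use. A minimal sketch, assuming opensearch-py's AsyncOpenSearch (the actual client construction isn't shown in this diff) and the index_body dict from above:

from opensearchpy import AsyncOpenSearch

es = AsyncOpenSearch(hosts=["http://localhost:9200"])  # connection details assumed

async def ensure_index():
    # Create the per-chunk index only if it does not already exist.
    if not await es.indices.exists(index=INDEX_NAME):
        await es.indices.create(index=INDEX_NAME, body=index_body)
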
@@ -148,6 +138,8 @@ async def process_file_common(file_path: str, file_hash: str = None):
# return {"status": "unchanged", "id": file_hash}
# convert and extract
# TODO: Check if docling can handle in-memory bytes instead of file path
# This would eliminate the need for temp files in upload flow
result = converter.convert(file_path)
full_doc = result.document.export_to_dict()
slim_doc = extract_relevant(full_doc)
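
On the TODO above: if the pinned docling version exposes DocumentStream (worth verifying), convert() can read from memory and the upload flow could skip temp files entirely. An unverified sketch:

from io import BytesIO
from docling.datamodel.base_models import DocumentStream

def convert_bytes(data: bytes, filename: str):
    # Hypothetical helper, not part of this commit: wraps raw upload bytes so
    # converter.convert() never touches the filesystem.
    source = DocumentStream(name=filename, stream=BytesIO(data))
    return converter.convert(source)
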
@@ -156,11 +148,18 @@ async def process_file_common(file_path: str, file_hash: str = None):
resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
embeddings = [d.embedding for d in resp.data]
# attach embeddings
for chunk, vect in zip(slim_doc["chunks"], embeddings):
chunk["chunk_embedding"] = vect
await es.index(index=INDEX_NAME, id=file_hash, body=slim_doc)
# Index each chunk as a separate document
for i, (chunk, vect) in enumerate(zip(slim_doc["chunks"], embeddings)):
chunk_doc = {
"document_id": file_hash,
"filename": slim_doc["filename"],
"mimetype": slim_doc["mimetype"],
"page": chunk["page"],
"text": chunk["text"],
"chunk_embedding": vect
}
chunk_id = f"{file_hash}_{i}"
await es.index(index=INDEX_NAME, id=chunk_id, body=chunk_doc)
return {"status": "indexed", "id": file_hash}
async def process_file_on_disk(path: str):
@@ -221,27 +220,34 @@ async def search(request: Request):
resp = client.embeddings.create(model=EMBED_MODEL, input=[query])
query_embedding = resp.data[0].embedding
# Search using vector similarity
# Search using vector similarity on individual chunks
search_body = {
"query": {
"nested": {
"path": "chunks",
"query": {
"knn": {
"chunks.chunk_embedding": {
"vector": query_embedding,
"k": 10
}
}
"knn": {
"chunk_embedding": {
"vector": query_embedding,
"k": 10
}
}
},
"_source": ["chunks.text", "chunks.page", "filename", "mimetype"],
"_source": ["filename", "mimetype", "page", "text"],
"size": 10
}
results = await es.search(index=INDEX_NAME, body=search_body)
return JSONResponse({"results": results["hits"]["hits"]})
# Transform results to match expected format
chunks = []
for hit in results["hits"]["hits"]:
chunks.append({
"filename": hit["_source"]["filename"],
"mimetype": hit["_source"]["mimetype"],
"page": hit["_source"]["page"],
"text": hit["_source"]["text"],
"score": hit["_score"]
})
return JSONResponse({"results": chunks})
app = Starlette(debug=True, routes=[
Route("/upload", upload, methods=["POST"]),