per chunk index

2025-07-11 00:57:32 -04:00 · 2025-07-11 00:57:32 -04:00 · c0097f9b79
commit c0097f9b79
parent 07a65a2082
1 changed files with 48 additions and 42 deletions
--- a/src/app.py
+++ b/src/app.py
@ -47,31 +47,21 @@ index_body = {
    },
    "mappings": {
        "properties": {
-            "id":        { "type": "keyword" },
+            "document_id": { "type": "keyword" },
-            "origin": {
+            "filename":    { "type": "keyword" },
-                "properties": {
+            "mimetype":    { "type": "keyword" },
-                    "binary_hash": { "type": "keyword" }
+            "page":        { "type": "integer" },
-                }
+            "text":        { "type": "text" },
-            },
+            "chunk_embedding": {
-            "filename":  { "type": "keyword" },
+                "type": "knn_vector",
-            "mimetype":  { "type": "keyword" },
+                "dimension": VECTOR_DIM,
-            "chunks": {
+                "method": {
-                "type": "nested",
+                    "name":       "disk_ann",
-                "properties": {
+                    "engine":     "jvector",
-                    "page":  { "type": "integer" },
+                    "space_type": "l2",
-                    "text":  { "type": "text" },
+                    "parameters": {
-                    "chunk_embedding": {
+                        "ef_construction": 100,
-                        "type": "knn_vector",
+                        "m":               16
                        "dimension": VECTOR_DIM,
                        "method": {
                            "name":       "disk_ann",
                            "engine":     "jvector",
                            "space_type": "l2",
                            "parameters": {
                                "ef_construction": 100,
                                "m":               16
                            }
                        }
                    }
                }
            }
@ -148,6 +138,8 @@ async def process_file_common(file_path: str, file_hash: str = None):
    #    return {"status": "unchanged", "id": file_hash}
    # convert and extract
    # TODO: Check if docling can handle in-memory bytes instead of file path
    # This would eliminate the need for temp files in upload flow
    result = converter.convert(file_path)
    full_doc = result.document.export_to_dict()
    slim_doc = extract_relevant(full_doc)
@ -156,11 +148,18 @@ async def process_file_common(file_path: str, file_hash: str = None):
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
    embeddings = [d.embedding for d in resp.data]
-    # attach embeddings
+    # Index each chunk as a separate document
-    for chunk, vect in zip(slim_doc["chunks"], embeddings):
+    for i, (chunk, vect) in enumerate(zip(slim_doc["chunks"], embeddings)):
-        chunk["chunk_embedding"] = vect
+        chunk_doc = {
-
+            "document_id": file_hash,
-    await es.index(index=INDEX_NAME, id=file_hash, body=slim_doc)
+            "filename": slim_doc["filename"],
            "mimetype": slim_doc["mimetype"],
            "page": chunk["page"],
            "text": chunk["text"],
            "chunk_embedding": vect
        }
        chunk_id = f"{file_hash}_{i}"
        await es.index(index=INDEX_NAME, id=chunk_id, body=chunk_doc)
    return {"status": "indexed", "id": file_hash}
 async def process_file_on_disk(path: str):
@ -221,27 +220,34 @@ async def search(request: Request):
    resp = client.embeddings.create(model=EMBED_MODEL, input=[query])
    query_embedding = resp.data[0].embedding
-    # Search using vector similarity
+    # Search using vector similarity on individual chunks
    search_body = {
        "query": {
-            "nested": {
+            "knn": {
-                "path": "chunks",
+                "chunk_embedding": {
-                "query": {
+                    "vector": query_embedding,
-                    "knn": {
+                    "k": 10
                        "chunks.chunk_embedding": {
                            "vector": query_embedding,
                            "k": 10
                        }
                    }
                }
            }
        },
-        "_source": ["chunks.text", "chunks.page", "filename", "mimetype"],
+        "_source": ["filename", "mimetype", "page", "text"],
        "size": 10
    }
    results = await es.search(index=INDEX_NAME, body=search_body)
-    return JSONResponse({"results": results["hits"]["hits"]})
+    
    # Transform results to match expected format
    chunks = []
    for hit in results["hits"]["hits"]:
        chunks.append({
            "filename": hit["_source"]["filename"],
            "mimetype": hit["_source"]["mimetype"],
            "page": hit["_source"]["page"],
            "text": hit["_source"]["text"],
            "score": hit["_score"]
        })
    return JSONResponse({"results": chunks})
 app = Starlette(debug=True, routes=[
    Route("/upload",      upload,       methods=["POST"]),