Index each chunk as a separate document (per-chunk index)

2025-07-11 00:57:32 -04:00 · 2025-07-11 00:57:32 -04:00 · c0097f9b79
commit c0097f9b79
parent 07a65a2082
1 changed files with 48 additions and 42 deletions
--- a/src/app.py
+++ b/src/app.py
@ -47,17 +47,9 @@ index_body = {
    },
    "mappings": {
        "properties": {
-            "id":        { "type": "keyword" },
+            "document_id": { "type": "keyword" },
            "origin": {
                "properties": {
                    "binary_hash": { "type": "keyword" }
                }
            },
            "filename":    { "type": "keyword" },
            "mimetype":    { "type": "keyword" },
            "chunks": {
                "type": "nested",
                "properties": {
            "page":        { "type": "integer" },
            "text":        { "type": "text" },
            "chunk_embedding": {
@ -76,8 +68,6 @@ index_body = {
        }
    }
 }
    }
 }
 client = patch_openai_with_mcp(OpenAI())  # Get the patched client back
@ -148,6 +138,8 @@ async def process_file_common(file_path: str, file_hash: str = None):
    #    return {"status": "unchanged", "id": file_hash}
    # convert and extract
    # TODO: Check if docling can handle in-memory bytes instead of file path
    # This would eliminate the need for temp files in upload flow
    result = converter.convert(file_path)
    full_doc = result.document.export_to_dict()
    slim_doc = extract_relevant(full_doc)
@ -156,11 +148,18 @@ async def process_file_common(file_path: str, file_hash: str = None):
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
    embeddings = [d.embedding for d in resp.data]
-    # attach embeddings
+    # Index each chunk as a separate document
-    for chunk, vect in zip(slim_doc["chunks"], embeddings):
+    for i, (chunk, vect) in enumerate(zip(slim_doc["chunks"], embeddings)):
-        chunk["chunk_embedding"] = vect
+        chunk_doc = {
-
+            "document_id": file_hash,
-    await es.index(index=INDEX_NAME, id=file_hash, body=slim_doc)
+            "filename": slim_doc["filename"],
            "mimetype": slim_doc["mimetype"],
            "page": chunk["page"],
            "text": chunk["text"],
            "chunk_embedding": vect
        }
        chunk_id = f"{file_hash}_{i}"
        await es.index(index=INDEX_NAME, id=chunk_id, body=chunk_doc)
    return {"status": "indexed", "id": file_hash}
 async def process_file_on_disk(path: str):
@ -221,27 +220,34 @@ async def search(request: Request):
    resp = client.embeddings.create(model=EMBED_MODEL, input=[query])
    query_embedding = resp.data[0].embedding
-    # Search using vector similarity
+    # Search using vector similarity on individual chunks
    search_body = {
        "query": {
            "nested": {
                "path": "chunks",
        "query": {
            "knn": {
-                        "chunks.chunk_embedding": {
+                "chunk_embedding": {
                    "vector": query_embedding,
                    "k": 10
                }
            }
                }
            }
        },
-        "_source": ["chunks.text", "chunks.page", "filename", "mimetype"],
+        "_source": ["filename", "mimetype", "page", "text"],
        "size": 10
    }
    results = await es.search(index=INDEX_NAME, body=search_body)
-    return JSONResponse({"results": results["hits"]["hits"]})
+    
    # Transform results to match expected format
    chunks = []
    for hit in results["hits"]["hits"]:
        chunks.append({
            "filename": hit["_source"]["filename"],
            "mimetype": hit["_source"]["mimetype"],
            "page": hit["_source"]["page"],
            "text": hit["_source"]["text"],
            "score": hit["_score"]
        })
    return JSONResponse({"results": chunks})
 app = Starlette(debug=True, routes=[
    Route("/upload",      upload,       methods=["POST"]),