per chunk index

This commit is contained in:
phact 2025-07-11 00:57:32 -04:00
parent 07a65a2082
commit c0097f9b79

View file

@ -47,17 +47,9 @@ index_body = {
}, },
"mappings": { "mappings": {
"properties": { "properties": {
"id": { "type": "keyword" }, "document_id": { "type": "keyword" },
"origin": {
"properties": {
"binary_hash": { "type": "keyword" }
}
},
"filename": { "type": "keyword" }, "filename": { "type": "keyword" },
"mimetype": { "type": "keyword" }, "mimetype": { "type": "keyword" },
"chunks": {
"type": "nested",
"properties": {
"page": { "type": "integer" }, "page": { "type": "integer" },
"text": { "type": "text" }, "text": { "type": "text" },
"chunk_embedding": { "chunk_embedding": {
@ -76,8 +68,6 @@ index_body = {
} }
} }
} }
}
}
client = patch_openai_with_mcp(OpenAI()) # Get the patched client back client = patch_openai_with_mcp(OpenAI()) # Get the patched client back
@ -148,6 +138,8 @@ async def process_file_common(file_path: str, file_hash: str = None):
# return {"status": "unchanged", "id": file_hash} # return {"status": "unchanged", "id": file_hash}
# convert and extract # convert and extract
# TODO: Check whether docling can convert from in-memory bytes instead of a file path;
# that would eliminate the need for temp files in the upload flow.
result = converter.convert(file_path) result = converter.convert(file_path)
full_doc = result.document.export_to_dict() full_doc = result.document.export_to_dict()
slim_doc = extract_relevant(full_doc) slim_doc = extract_relevant(full_doc)
@ -156,11 +148,18 @@ async def process_file_common(file_path: str, file_hash: str = None):
resp = client.embeddings.create(model=EMBED_MODEL, input=texts) resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
embeddings = [d.embedding for d in resp.data] embeddings = [d.embedding for d in resp.data]
# attach embeddings # Index each chunk as a separate document
for chunk, vect in zip(slim_doc["chunks"], embeddings): for i, (chunk, vect) in enumerate(zip(slim_doc["chunks"], embeddings)):
chunk["chunk_embedding"] = vect chunk_doc = {
"document_id": file_hash,
await es.index(index=INDEX_NAME, id=file_hash, body=slim_doc) "filename": slim_doc["filename"],
"mimetype": slim_doc["mimetype"],
"page": chunk["page"],
"text": chunk["text"],
"chunk_embedding": vect
}
chunk_id = f"{file_hash}_{i}"
await es.index(index=INDEX_NAME, id=chunk_id, body=chunk_doc)
return {"status": "indexed", "id": file_hash} return {"status": "indexed", "id": file_hash}
async def process_file_on_disk(path: str): async def process_file_on_disk(path: str):
@ -221,27 +220,34 @@ async def search(request: Request):
resp = client.embeddings.create(model=EMBED_MODEL, input=[query]) resp = client.embeddings.create(model=EMBED_MODEL, input=[query])
query_embedding = resp.data[0].embedding query_embedding = resp.data[0].embedding
# Search using vector similarity # Search using vector similarity on individual chunks
search_body = { search_body = {
"query": {
"nested": {
"path": "chunks",
"query": { "query": {
"knn": { "knn": {
"chunks.chunk_embedding": { "chunk_embedding": {
"vector": query_embedding, "vector": query_embedding,
"k": 10 "k": 10
} }
} }
}
}
}, },
"_source": ["chunks.text", "chunks.page", "filename", "mimetype"], "_source": ["filename", "mimetype", "page", "text"],
"size": 10 "size": 10
} }
results = await es.search(index=INDEX_NAME, body=search_body) results = await es.search(index=INDEX_NAME, body=search_body)
return JSONResponse({"results": results["hits"]["hits"]})
# Transform results to match expected format
chunks = []
for hit in results["hits"]["hits"]:
chunks.append({
"filename": hit["_source"]["filename"],
"mimetype": hit["_source"]["mimetype"],
"page": hit["_source"]["page"],
"text": hit["_source"]["text"],
"score": hit["_score"]
})
return JSONResponse({"results": chunks})
app = Starlette(debug=True, routes=[ app = Starlette(debug=True, routes=[
Route("/upload", upload, methods=["POST"]), Route("/upload", upload, methods=["POST"]),