per chunk index

This commit is contained in:
phact 2025-07-11 00:57:32 -04:00
parent 07a65a2082
commit c0097f9b79

View file

@ -47,31 +47,21 @@ index_body = {
}, },
"mappings": { "mappings": {
"properties": { "properties": {
"id": { "type": "keyword" }, "document_id": { "type": "keyword" },
"origin": { "filename": { "type": "keyword" },
"properties": { "mimetype": { "type": "keyword" },
"binary_hash": { "type": "keyword" } "page": { "type": "integer" },
} "text": { "type": "text" },
}, "chunk_embedding": {
"filename": { "type": "keyword" }, "type": "knn_vector",
"mimetype": { "type": "keyword" }, "dimension": VECTOR_DIM,
"chunks": { "method": {
"type": "nested", "name": "disk_ann",
"properties": { "engine": "jvector",
"page": { "type": "integer" }, "space_type": "l2",
"text": { "type": "text" }, "parameters": {
"chunk_embedding": { "ef_construction": 100,
"type": "knn_vector", "m": 16
"dimension": VECTOR_DIM,
"method": {
"name": "disk_ann",
"engine": "jvector",
"space_type": "l2",
"parameters": {
"ef_construction": 100,
"m": 16
}
}
} }
} }
} }
@ -148,6 +138,8 @@ async def process_file_common(file_path: str, file_hash: str = None):
# return {"status": "unchanged", "id": file_hash} # return {"status": "unchanged", "id": file_hash}
# convert and extract # convert and extract
# TODO: Check whether docling can handle in-memory bytes instead of a file path.
# This would eliminate the need for temp files in the upload flow.
result = converter.convert(file_path) result = converter.convert(file_path)
full_doc = result.document.export_to_dict() full_doc = result.document.export_to_dict()
slim_doc = extract_relevant(full_doc) slim_doc = extract_relevant(full_doc)
@ -156,11 +148,18 @@ async def process_file_common(file_path: str, file_hash: str = None):
resp = client.embeddings.create(model=EMBED_MODEL, input=texts) resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
embeddings = [d.embedding for d in resp.data] embeddings = [d.embedding for d in resp.data]
# attach embeddings # Index each chunk as a separate document
for chunk, vect in zip(slim_doc["chunks"], embeddings): for i, (chunk, vect) in enumerate(zip(slim_doc["chunks"], embeddings)):
chunk["chunk_embedding"] = vect chunk_doc = {
"document_id": file_hash,
await es.index(index=INDEX_NAME, id=file_hash, body=slim_doc) "filename": slim_doc["filename"],
"mimetype": slim_doc["mimetype"],
"page": chunk["page"],
"text": chunk["text"],
"chunk_embedding": vect
}
chunk_id = f"{file_hash}_{i}"
await es.index(index=INDEX_NAME, id=chunk_id, body=chunk_doc)
return {"status": "indexed", "id": file_hash} return {"status": "indexed", "id": file_hash}
async def process_file_on_disk(path: str): async def process_file_on_disk(path: str):
@ -221,27 +220,34 @@ async def search(request: Request):
resp = client.embeddings.create(model=EMBED_MODEL, input=[query]) resp = client.embeddings.create(model=EMBED_MODEL, input=[query])
query_embedding = resp.data[0].embedding query_embedding = resp.data[0].embedding
# Search using vector similarity # Search using vector similarity on individual chunks
search_body = { search_body = {
"query": { "query": {
"nested": { "knn": {
"path": "chunks", "chunk_embedding": {
"query": { "vector": query_embedding,
"knn": { "k": 10
"chunks.chunk_embedding": {
"vector": query_embedding,
"k": 10
}
}
} }
} }
}, },
"_source": ["chunks.text", "chunks.page", "filename", "mimetype"], "_source": ["filename", "mimetype", "page", "text"],
"size": 10 "size": 10
} }
results = await es.search(index=INDEX_NAME, body=search_body) results = await es.search(index=INDEX_NAME, body=search_body)
return JSONResponse({"results": results["hits"]["hits"]})
# Transform results to match expected format
chunks = []
for hit in results["hits"]["hits"]:
chunks.append({
"filename": hit["_source"]["filename"],
"mimetype": hit["_source"]["mimetype"],
"page": hit["_source"]["page"],
"text": hit["_source"]["text"],
"score": hit["_score"]
})
return JSONResponse({"results": chunks})
app = Starlette(debug=True, routes=[ app = Starlette(debug=True, routes=[
Route("/upload", upload, methods=["POST"]), Route("/upload", upload, methods=["POST"]),