per chunk index
commit 5465a1204a (parent b97a86bdd5)

1 changed file with 48 additions and 42 deletions:

src/app.py (+48, -42)
@@ -47,31 +47,21 @@ index_body = {
     },
     "mappings": {
         "properties": {
-            "id": { "type": "keyword" },
-            "origin": {
-                "properties": {
-                    "binary_hash": { "type": "keyword" }
-                }
-            },
-            "filename": { "type": "keyword" },
-            "mimetype": { "type": "keyword" },
-            "chunks": {
-                "type": "nested",
-                "properties": {
-                    "page": { "type": "integer" },
-                    "text": { "type": "text" },
-                    "chunk_embedding": {
-                        "type": "knn_vector",
-                        "dimension": VECTOR_DIM,
-                        "method": {
-                            "name": "disk_ann",
-                            "engine": "jvector",
-                            "space_type": "l2",
-                            "parameters": {
-                                "ef_construction": 100,
-                                "m": 16
-                            }
-                        }
-                    }
-                }
-            }
+            "document_id": { "type": "keyword" },
+            "filename": { "type": "keyword" },
+            "mimetype": { "type": "keyword" },
+            "page": { "type": "integer" },
+            "text": { "type": "text" },
+            "chunk_embedding": {
+                "type": "knn_vector",
+                "dimension": VECTOR_DIM,
+                "method": {
+                    "name": "disk_ann",
+                    "engine": "jvector",
+                    "space_type": "l2",
+                    "parameters": {
+                        "ef_construction": 100,
+                        "m": 16
+                    }
+                }
+            }
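
Note: the flattened mapping above stores every chunk as its own top-level document, so the nested type (and the nested query it required) goes away. For reference, a minimal sketch of creating the index with this mapping at startup, assuming `es` is the async OpenSearch/Elasticsearch-compatible client already used in src/app.py and that `INDEX_NAME` and `index_body` are the existing module-level definitions:

async def ensure_index():
    # Sketch only: create INDEX_NAME with the per-chunk mapping if it does not exist yet.
    # `es`, `INDEX_NAME`, and `index_body` are assumed to be the objects defined in src/app.py.
    if not await es.indices.exists(index=INDEX_NAME):
        await es.indices.create(index=INDEX_NAME, body=index_body)
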
@@ -148,6 +138,8 @@ async def process_file_common(file_path: str, file_hash: str = None):
         # return {"status": "unchanged", "id": file_hash}
 
     # convert and extract
+    # TODO: Check if docling can handle in-memory bytes instead of file path
+    # This would eliminate the need for temp files in upload flow
     result = converter.convert(file_path)
     full_doc = result.document.export_to_dict()
     slim_doc = extract_relevant(full_doc)
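
Regarding the TODO added above: recent docling releases appear to support in-memory input via a DocumentStream source, which would remove the need for temp files in the upload flow. Unverified sketch, not part of this commit; the import path and fields should be checked against the installed docling version:

from io import BytesIO
from docling.datamodel.base_models import DocumentStream

# Hypothetical in-memory variant of the conversion step above.
def convert_bytes(converter, filename: str, data: bytes):
    source = DocumentStream(name=filename, stream=BytesIO(data))
    return converter.convert(source)
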
@@ -156,11 +148,18 @@ async def process_file_common(file_path: str, file_hash: str = None):
     resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
     embeddings = [d.embedding for d in resp.data]
 
-    # attach embeddings
-    for chunk, vect in zip(slim_doc["chunks"], embeddings):
-        chunk["chunk_embedding"] = vect
-
-    await es.index(index=INDEX_NAME, id=file_hash, body=slim_doc)
+    # Index each chunk as a separate document
+    for i, (chunk, vect) in enumerate(zip(slim_doc["chunks"], embeddings)):
+        chunk_doc = {
+            "document_id": file_hash,
+            "filename": slim_doc["filename"],
+            "mimetype": slim_doc["mimetype"],
+            "page": chunk["page"],
+            "text": chunk["text"],
+            "chunk_embedding": vect
+        }
+        chunk_id = f"{file_hash}_{i}"
+        await es.index(index=INDEX_NAME, id=chunk_id, body=chunk_doc)
     return {"status": "indexed", "id": file_hash}
 
 async def process_file_on_disk(path: str):
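
One consequence of the per-chunk layout above: chunk IDs are derived from file_hash plus the chunk's position, so reprocessing a file whose chunk count shrank would leave stale chunk documents behind. A possible cleanup step before re-indexing, assuming the client exposes the standard delete_by_query API (hypothetical helper, not part of this commit):

async def delete_chunks_for(file_hash: str):
    # Remove all chunk documents belonging to a previously indexed file,
    # matching on the document_id keyword field introduced in the new mapping.
    await es.delete_by_query(
        index=INDEX_NAME,
        body={"query": {"term": {"document_id": file_hash}}},
    )
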
@@ -221,27 +220,34 @@ async def search(request: Request):
     resp = client.embeddings.create(model=EMBED_MODEL, input=[query])
     query_embedding = resp.data[0].embedding
 
-    # Search using vector similarity
+    # Search using vector similarity on individual chunks
     search_body = {
         "query": {
-            "nested": {
-                "path": "chunks",
-                "query": {
-                    "knn": {
-                        "chunks.chunk_embedding": {
-                            "vector": query_embedding,
-                            "k": 10
-                        }
-                    }
-                }
-            }
+            "knn": {
+                "chunk_embedding": {
+                    "vector": query_embedding,
+                    "k": 10
+                }
+            }
         },
-        "_source": ["chunks.text", "chunks.page", "filename", "mimetype"],
+        "_source": ["filename", "mimetype", "page", "text"],
         "size": 10
     }
 
     results = await es.search(index=INDEX_NAME, body=search_body)
-    return JSONResponse({"results": results["hits"]["hits"]})
+
+    # Transform results to match expected format
+    chunks = []
+    for hit in results["hits"]["hits"]:
+        chunks.append({
+            "filename": hit["_source"]["filename"],
+            "mimetype": hit["_source"]["mimetype"],
+            "page": hit["_source"]["page"],
+            "text": hit["_source"]["text"],
+            "score": hit["_score"]
+        })
+
+    return JSONResponse({"results": chunks})
 
 app = Starlette(debug=True, routes=[
     Route("/upload", upload, methods=["POST"]),
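
With the transformation above, the search handler now returns a flat list of chunk hits shaped as {"filename", "mimetype", "page", "text", "score"}. A usage sketch with httpx, assuming the handler is mounted at a /search route and reads the query from the JSON body (neither detail is visible in this diff):

import asyncio
import httpx

async def demo():
    async with httpx.AsyncClient() as http:
        # Route path and request shape are assumptions; only the /upload route is shown above.
        r = await http.post("http://localhost:8000/search", json={"query": "example question"})
        for hit in r.json()["results"]:
            print(hit["score"], hit["filename"], hit["page"], hit["text"][:80])

asyncio.run(demo())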