fix: parent-child chunking method

This commit is contained in:
buua436 2025-12-08 14:44:11 +08:00
parent 43f51baa96
commit 55f7d4e877
2 changed files with 6 additions and 2 deletions

View file

@@ -389,6 +389,9 @@ class Dealer:
"topk": top,
"similarity": similarity_threshold,
"available_int": 1,
+"fields":["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
+"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
+"question_kwd", "question_tks", "doc_type_kwd","available_int", "content_with_weight","mom_id", PAGERANK_FLD, TAG_FLD]
}
if isinstance(tenant_ids, str):
@@ -469,6 +472,7 @@ class Dealer:
"vector": chunk.get(vector_column, zero_vector),
"positions": position_int,
"doc_type_kwd": chunk.get("doc_type_kwd", ""),
+"mom_id": chunk.get("mom_id", ""),
}
if highlight and sres.highlight:
if id in sres.highlight:

View file

@@ -727,17 +727,17 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
if not mom:
continue
id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
+ck["mom_id"] = id
if id in mother_ids:
continue
mother_ids.add(id)
-ck["mom_id"] = id
mom_ck = copy.deepcopy(ck)
mom_ck["id"] = id
mom_ck["content_with_weight"] = mom
mom_ck["available_int"] = 0
flds = list(mom_ck.keys())
for fld in flds:
-if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int", "position_int"]:
+if fld not in ["id", "content_with_weight", "doc_id", "docnm_kwd", "kb_id", "available_int", "position_int"]:
del mom_ck[fld]
mothers.append(mom_ck)