chunk
This commit is contained in:
parent
b2fa0a861d
commit
057fa0057a
1 changed files with 83 additions and 69 deletions
|
|
@ -13,6 +13,7 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
import asyncio
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
@ -147,31 +148,35 @@ async def set():
|
||||||
d["available_int"] = req["available_int"]
|
d["available_int"] = req["available_int"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
|
def _set_sync():
|
||||||
if not tenant_id:
|
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
|
||||||
return get_data_error_result(message="Tenant not found!")
|
if not tenant_id:
|
||||||
|
return get_data_error_result(message="Tenant not found!")
|
||||||
|
|
||||||
embd_id = DocumentService.get_embd_id(req["doc_id"])
|
embd_id = DocumentService.get_embd_id(req["doc_id"])
|
||||||
embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING, embd_id)
|
embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING, embd_id)
|
||||||
|
|
||||||
e, doc = DocumentService.get_by_id(req["doc_id"])
|
e, doc = DocumentService.get_by_id(req["doc_id"])
|
||||||
if not e:
|
if not e:
|
||||||
return get_data_error_result(message="Document not found!")
|
return get_data_error_result(message="Document not found!")
|
||||||
|
|
||||||
if doc.parser_id == ParserType.QA:
|
_d = d
|
||||||
arr = [
|
if doc.parser_id == ParserType.QA:
|
||||||
t for t in re.split(
|
arr = [
|
||||||
r"[\n\t]",
|
t for t in re.split(
|
||||||
req["content_with_weight"]) if len(t) > 1]
|
r"[\n\t]",
|
||||||
q, a = rmPrefix(arr[0]), rmPrefix("\n".join(arr[1:]))
|
req["content_with_weight"]) if len(t) > 1]
|
||||||
d = beAdoc(d, q, a, not any(
|
q, a = rmPrefix(arr[0]), rmPrefix("\n".join(arr[1:]))
|
||||||
[rag_tokenizer.is_chinese(t) for t in q + a]))
|
_d = beAdoc(d, q, a, not any(
|
||||||
|
[rag_tokenizer.is_chinese(t) for t in q + a]))
|
||||||
|
|
||||||
v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])])
|
v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not _d.get("question_kwd") else "\n".join(_d["question_kwd"])])
|
||||||
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
||||||
d["q_%d_vec" % len(v)] = v.tolist()
|
_d["q_%d_vec" % len(v)] = v.tolist()
|
||||||
settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
|
settings.docStoreConn.update({"id": req["chunk_id"]}, _d, search.index_name(tenant_id), doc.kb_id)
|
||||||
return get_json_result(data=True)
|
return get_json_result(data=True)
|
||||||
|
|
||||||
|
return await asyncio.to_thread(_set_sync)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return server_error_response(e)
|
return server_error_response(e)
|
||||||
|
|
||||||
|
|
@ -182,16 +187,19 @@ async def set():
|
||||||
async def switch():
|
async def switch():
|
||||||
req = await get_request_json()
|
req = await get_request_json()
|
||||||
try:
|
try:
|
||||||
e, doc = DocumentService.get_by_id(req["doc_id"])
|
def _switch_sync():
|
||||||
if not e:
|
e, doc = DocumentService.get_by_id(req["doc_id"])
|
||||||
return get_data_error_result(message="Document not found!")
|
if not e:
|
||||||
for cid in req["chunk_ids"]:
|
return get_data_error_result(message="Document not found!")
|
||||||
if not settings.docStoreConn.update({"id": cid},
|
for cid in req["chunk_ids"]:
|
||||||
{"available_int": int(req["available_int"])},
|
if not settings.docStoreConn.update({"id": cid},
|
||||||
search.index_name(DocumentService.get_tenant_id(req["doc_id"])),
|
{"available_int": int(req["available_int"])},
|
||||||
doc.kb_id):
|
search.index_name(DocumentService.get_tenant_id(req["doc_id"])),
|
||||||
return get_data_error_result(message="Index updating failure")
|
doc.kb_id):
|
||||||
return get_json_result(data=True)
|
return get_data_error_result(message="Index updating failure")
|
||||||
|
return get_json_result(data=True)
|
||||||
|
|
||||||
|
return await asyncio.to_thread(_switch_sync)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return server_error_response(e)
|
return server_error_response(e)
|
||||||
|
|
||||||
|
|
@ -202,20 +210,23 @@ async def switch():
|
||||||
async def rm():
|
async def rm():
|
||||||
req = await get_request_json()
|
req = await get_request_json()
|
||||||
try:
|
try:
|
||||||
e, doc = DocumentService.get_by_id(req["doc_id"])
|
def _rm_sync():
|
||||||
if not e:
|
e, doc = DocumentService.get_by_id(req["doc_id"])
|
||||||
return get_data_error_result(message="Document not found!")
|
if not e:
|
||||||
if not settings.docStoreConn.delete({"id": req["chunk_ids"]},
|
return get_data_error_result(message="Document not found!")
|
||||||
search.index_name(DocumentService.get_tenant_id(req["doc_id"])),
|
if not settings.docStoreConn.delete({"id": req["chunk_ids"]},
|
||||||
doc.kb_id):
|
search.index_name(DocumentService.get_tenant_id(req["doc_id"])),
|
||||||
return get_data_error_result(message="Chunk deleting failure")
|
doc.kb_id):
|
||||||
deleted_chunk_ids = req["chunk_ids"]
|
return get_data_error_result(message="Chunk deleting failure")
|
||||||
chunk_number = len(deleted_chunk_ids)
|
deleted_chunk_ids = req["chunk_ids"]
|
||||||
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
|
chunk_number = len(deleted_chunk_ids)
|
||||||
for cid in deleted_chunk_ids:
|
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
|
||||||
if settings.STORAGE_IMPL.obj_exist(doc.kb_id, cid):
|
for cid in deleted_chunk_ids:
|
||||||
settings.STORAGE_IMPL.rm(doc.kb_id, cid)
|
if settings.STORAGE_IMPL.obj_exist(doc.kb_id, cid):
|
||||||
return get_json_result(data=True)
|
settings.STORAGE_IMPL.rm(doc.kb_id, cid)
|
||||||
|
return get_json_result(data=True)
|
||||||
|
|
||||||
|
return await asyncio.to_thread(_rm_sync)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return server_error_response(e)
|
return server_error_response(e)
|
||||||
|
|
||||||
|
|
@ -245,35 +256,38 @@ async def create():
|
||||||
d["tag_feas"] = req["tag_feas"]
|
d["tag_feas"] = req["tag_feas"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
e, doc = DocumentService.get_by_id(req["doc_id"])
|
def _create_sync():
|
||||||
if not e:
|
e, doc = DocumentService.get_by_id(req["doc_id"])
|
||||||
return get_data_error_result(message="Document not found!")
|
if not e:
|
||||||
d["kb_id"] = [doc.kb_id]
|
return get_data_error_result(message="Document not found!")
|
||||||
d["docnm_kwd"] = doc.name
|
d["kb_id"] = [doc.kb_id]
|
||||||
d["title_tks"] = rag_tokenizer.tokenize(doc.name)
|
d["docnm_kwd"] = doc.name
|
||||||
d["doc_id"] = doc.id
|
d["title_tks"] = rag_tokenizer.tokenize(doc.name)
|
||||||
|
d["doc_id"] = doc.id
|
||||||
|
|
||||||
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
|
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
|
||||||
if not tenant_id:
|
if not tenant_id:
|
||||||
return get_data_error_result(message="Tenant not found!")
|
return get_data_error_result(message="Tenant not found!")
|
||||||
|
|
||||||
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
|
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
|
||||||
if not e:
|
if not e:
|
||||||
return get_data_error_result(message="Knowledgebase not found!")
|
return get_data_error_result(message="Knowledgebase not found!")
|
||||||
if kb.pagerank:
|
if kb.pagerank:
|
||||||
d[PAGERANK_FLD] = kb.pagerank
|
d[PAGERANK_FLD] = kb.pagerank
|
||||||
|
|
||||||
embd_id = DocumentService.get_embd_id(req["doc_id"])
|
embd_id = DocumentService.get_embd_id(req["doc_id"])
|
||||||
embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING.value, embd_id)
|
embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING.value, embd_id)
|
||||||
|
|
||||||
v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
|
v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
|
||||||
v = 0.1 * v[0] + 0.9 * v[1]
|
v = 0.1 * v[0] + 0.9 * v[1]
|
||||||
d["q_%d_vec" % len(v)] = v.tolist()
|
d["q_%d_vec" % len(v)] = v.tolist()
|
||||||
settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
|
settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
|
||||||
|
|
||||||
DocumentService.increment_chunk_num(
|
DocumentService.increment_chunk_num(
|
||||||
doc.id, doc.kb_id, c, 1, 0)
|
doc.id, doc.kb_id, c, 1, 0)
|
||||||
return get_json_result(data={"chunk_id": chunck_id})
|
return get_json_result(data={"chunk_id": chunck_id})
|
||||||
|
|
||||||
|
return await asyncio.to_thread(_create_sync)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return server_error_response(e)
|
return server_error_response(e)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue