fix: make ids parameter optional and optimize input text cleaning
- Add default None value for ids parameter
- Move text cleaning into else branch
- Only clean text when auto-generating ids
- Preserve original text with custom ids
- Improve code readability
This commit is contained in:
parent
411782797b
commit
845e914f1b
1 changed file with 3 additions and 4 deletions
|
|
@@ -577,7 +577,7 @@ class LightRAG:
|
|||
await self._insert_done()
|
||||
|
||||
async def apipeline_enqueue_documents(
|
||||
self, input: str | list[str], ids: list[str] | None
|
||||
self, input: str | list[str], ids: list[str] | None = None
|
||||
) -> None:
|
||||
"""
|
||||
Pipeline for Processing Documents
|
||||
|
|
@@ -591,9 +591,6 @@ class LightRAG:
|
|||
if isinstance(input, str):
|
||||
input = [input]
|
||||
|
||||
# Clean input text and remove duplicates
|
||||
input = list(set(self.clean_text(doc) for doc in input))
|
||||
|
||||
# 1. Validate ids if provided or generate MD5 hash IDs
|
||||
if ids is not None:
|
||||
# Check if the number of IDs matches the number of documents
|
||||
|
|
@@ -607,6 +604,8 @@ class LightRAG:
|
|||
# Generate contents dict of IDs provided by user and documents
|
||||
contents = {id_: doc for id_, doc in zip(ids, input)}
|
||||
else:
|
||||
# Clean input text and remove duplicates
|
||||
input = list(set(self.clean_text(doc) for doc in input))
|
||||
# Generate contents dict of MD5 hash IDs and documents
|
||||
contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue