Refactor chunk_by_paragraph to be isomorphic

2024-11-13 12:11:56 +01:00 · 2024-11-13 12:11:56 +01:00 · ce498d97dd
commit ce498d97dd
parent ab55a73d18
1 changed files with 71 additions and 62 deletions
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -1,69 +1,78 @@
 from uuid import uuid5, NAMESPACE_OID
 from typing import Dict, Any, Iterator
 from .chunk_by_sentence import chunk_by_sentence
-def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs = True):
+def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]:
-    paragraph = ""
+    """
-    last_cut_type = None
+    Chunks text by paragraph while preserving exact text reconstruction capability.
    When chunks are joined with empty string "", they reproduce the original text exactly.
    """
    current_chunk = ""
    current_word_count = 0
    chunk_index = 0
    last_paragraph_id = None
-    paragraph_word_count = 0
+    last_cut_type = None
-    paragraph_chunk_index = 0
+    
-
+    for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data):
-    for (paragraph_id, __, sentence, word_count, end_type) in chunk_by_sentence(data):
+        # Check if this sentence would exceed length limit
-        if paragraph_word_count > 0 and paragraph_word_count + word_count > paragraph_length:
+        if current_word_count > 0 and current_word_count + word_count > paragraph_length:
-            if batch_paragraphs is True:
+            # Yield current chunk
-                chunk_id = uuid5(NAMESPACE_OID, paragraph)
+            chunk_dict = {
-                yield dict(
+                "text": current_chunk,
-                    text = paragraph.strip(),
+                "word_count": current_word_count,
-                    word_count = paragraph_word_count,
+                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
-                    id = chunk_id, # When batching paragraphs, the paragraph_id is the same as chunk_id.
+                "chunk_index": chunk_index,
-                                   # paragraph_id doens't mean anything since multiple paragraphs are merged.
+                "cut_type": last_cut_type
-                    chunk_id = chunk_id,
+            }
-                    chunk_index = paragraph_chunk_index,
+            
-                    cut_type = last_cut_type,
+            if batch_paragraphs:
-                )
+                chunk_dict["id"] = chunk_dict["chunk_id"]
            else:
-                yield dict(
+                chunk_dict["id"] = last_paragraph_id
-                    text = paragraph.strip(),
+                
-                    word_count = paragraph_word_count,
+            yield chunk_dict
-                    id = last_paragraph_id,
+            
-                    chunk_id = uuid5(NAMESPACE_OID, paragraph),
+            # Start new chunk with current sentence
-                    chunk_index = paragraph_chunk_index,
+            current_chunk = sentence
-                    cut_type = last_cut_type,
+            current_word_count = word_count
-                )
+            chunk_index += 1
-
+        else:
-            paragraph_chunk_index += 1
+            # Just concatenate directly - no space handling
-            paragraph_word_count = 0
+            current_chunk += sentence
-            paragraph = ""
+            current_word_count += word_count
-
+        
-        paragraph += (" " if len(paragraph) > 0 else "") + sentence
+        # Handle end of paragraph
-        paragraph_word_count += word_count
+        if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
-
+            # For non-batch mode, yield each paragraph separately
-        if end_type == "paragraph_end" or end_type == "sentence_cut":
+            chunk_dict = {
-            if batch_paragraphs is True:
+                "text": current_chunk,
-                paragraph += "\n\n" if end_type == "paragraph_end" else ""
+                "word_count": current_word_count,
-            else:
+                "id": paragraph_id,
-                yield dict(
+                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
-                    text = paragraph.strip(),
+                "chunk_index": chunk_index,
-                    word_count = paragraph_word_count,
+                "cut_type": end_type
-                    paragraph_id = paragraph_id,
+            }
-                    chunk_id = uuid5(NAMESPACE_OID, paragraph),
+            yield chunk_dict
-                    chunk_index = paragraph_chunk_index,
+            current_chunk = ""
-                    cut_type = end_type,
+            current_word_count = 0
-                )
+            chunk_index = 0
-
+        
                paragraph_chunk_index = 0
                paragraph_word_count = 0
                paragraph = ""
        last_cut_type = end_type
        last_paragraph_id = paragraph_id
-
+    
-    if len(paragraph) > 0:
+    # Yield any remaining text
-        yield dict(
+    if current_chunk:
-            chunk_id = uuid5(NAMESPACE_OID, paragraph),
+        chunk_dict = {
-            text = paragraph,
+            "text": current_chunk,
-            word_count = paragraph_word_count,
+            "word_count": current_word_count,
-            paragraph_id = last_paragraph_id,
+            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
-            chunk_index = paragraph_chunk_index,
+            "chunk_index": chunk_index,
-            cut_type = last_cut_type,
+            "cut_type": last_cut_type
-        )
+        }
        if batch_paragraphs:
            chunk_dict["id"] = chunk_dict["chunk_id"]
        else:
            chunk_dict["id"] = last_paragraph_id
        yield chunk_dict