From cb75e6631e9712c99794943b9266e44835bf233d Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 5 Aug 2025 17:58:34 +0800
Subject: [PATCH] Remove quantized embedding info from LLM cache

- Delete quantize_embedding function
- Delete dequantize_embedding function
- Remove embedding fields from CacheData
- Update save_to_cache to exclude embedding data
- Clean up unused quantization-related code
---
 lightrag/operate.py | 12 ------------
 lightrag/utils.py   | 45 ---------------------------------------------
 2 files changed, 57 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index ca21881b..254dfdac 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1833,9 +1833,6 @@ async def kg_query(
                 args_hash=args_hash,
                 content=response,
                 prompt=query,
-                quantized=quantized,
-                min_val=min_val,
-                max_val=max_val,
                 mode=query_param.mode,
                 cache_type="query",
             ),
@@ -1972,9 +1969,6 @@ async def extract_keywords_only(
                     args_hash=args_hash,
                     content=json.dumps(cache_data),
                     prompt=text,
-                    quantized=quantized,
-                    min_val=min_val,
-                    max_val=max_val,
                     mode=param.mode,
                     cache_type="keywords",
                 ),
@@ -3105,9 +3099,6 @@ async def naive_query(
                 args_hash=args_hash,
                 content=response,
                 prompt=query,
-                quantized=quantized,
-                min_val=min_val,
-                max_val=max_val,
                 mode=query_param.mode,
                 cache_type="query",
             ),
@@ -3231,9 +3222,6 @@ async def kg_query_with_keywords(
                     args_hash=args_hash,
                     content=response,
                     prompt=query,
-                    quantized=quantized,
-                    min_val=min_val,
-                    max_val=max_val,
                     mode=query_param.mode,
                     cache_type="query",
                 ),
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 354ca0a3..9e818d6b 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -756,40 +756,6 @@ def cosine_similarity(v1, v2):
     return dot_product / (norm1 * norm2)
 
 
-def quantize_embedding(embedding: np.ndarray | list[float], bits: int = 8) -> tuple:
-    """Quantize embedding to specified bits"""
-    # Convert list to numpy array if needed
-    if isinstance(embedding, list):
-        embedding = np.array(embedding)
-
-    # Calculate min/max values for reconstruction
-    min_val = embedding.min()
-    max_val = embedding.max()
-
-    if min_val == max_val:
-        # handle constant vector
-        quantized = np.zeros_like(embedding, dtype=np.uint8)
-        return quantized, min_val, max_val
-
-    # Quantize to 0-255 range
-    scale = (2**bits - 1) / (max_val - min_val)
-    quantized = np.round((embedding - min_val) * scale).astype(np.uint8)
-
-    return quantized, min_val, max_val
-
-
-def dequantize_embedding(
-    quantized: np.ndarray, min_val: float, max_val: float, bits=8
-) -> np.ndarray:
-    """Restore quantized embedding"""
-    if min_val == max_val:
-        # handle constant vector
-        return np.full_like(quantized, min_val, dtype=np.float32)
-
-    scale = (max_val - min_val) / (2**bits - 1)
-    return (quantized * scale + min_val).astype(np.float32)
-
-
 async def handle_cache(
     hashing_kv,
     args_hash,
@@ -824,9 +790,6 @@ class CacheData:
     args_hash: str
     content: str
     prompt: str
-    quantized: np.ndarray | None = None
-    min_val: float | None = None
-    max_val: float | None = None
     mode: str = "default"
     cache_type: str = "query"
     chunk_id: str | None = None
@@ -866,14 +829,6 @@ async def save_to_cache(hashing_kv, cache_data: CacheData):
         "return": cache_data.content,
         "cache_type": cache_data.cache_type,
         "chunk_id": cache_data.chunk_id if cache_data.chunk_id is not None else None,
-        "embedding": cache_data.quantized.tobytes().hex()
-        if cache_data.quantized is not None
-        else None,
-        "embedding_shape": cache_data.quantized.shape
-        if cache_data.quantized is not None
-        else None,
-        "embedding_min": cache_data.min_val,
-        "embedding_max": cache_data.max_val,
         "original_prompt": cache_data.prompt,
     }