From cb75e6631e9712c99794943b9266e44835bf233d Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 5 Aug 2025 17:58:34 +0800 Subject: [PATCH] Remove quantized embedding info from LLM cache - Delete quantize_embedding function - Delete dequantize_embedding function - Remove embedding fields from CacheData - Update save_to_cache to exclude embedding data - Clean up unused quantization-related code --- lightrag/operate.py | 12 ------------ lightrag/utils.py | 45 --------------------------------------------- 2 files changed, 57 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index ca21881b..254dfdac 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1833,9 +1833,6 @@ async def kg_query( args_hash=args_hash, content=response, prompt=query, - quantized=quantized, - min_val=min_val, - max_val=max_val, mode=query_param.mode, cache_type="query", ), @@ -1972,9 +1969,6 @@ async def extract_keywords_only( args_hash=args_hash, content=json.dumps(cache_data), prompt=text, - quantized=quantized, - min_val=min_val, - max_val=max_val, mode=param.mode, cache_type="keywords", ), @@ -3105,9 +3099,6 @@ async def naive_query( args_hash=args_hash, content=response, prompt=query, - quantized=quantized, - min_val=min_val, - max_val=max_val, mode=query_param.mode, cache_type="query", ), @@ -3231,9 +3222,6 @@ async def kg_query_with_keywords( args_hash=args_hash, content=response, prompt=query, - quantized=quantized, - min_val=min_val, - max_val=max_val, mode=query_param.mode, cache_type="query", ), diff --git a/lightrag/utils.py b/lightrag/utils.py index 354ca0a3..9e818d6b 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -756,40 +756,6 @@ def cosine_similarity(v1, v2): return dot_product / (norm1 * norm2) -def quantize_embedding(embedding: np.ndarray | list[float], bits: int = 8) -> tuple: - """Quantize embedding to specified bits""" - # Convert list to numpy array if needed - if isinstance(embedding, list): - embedding = np.array(embedding) - - # Calculate min/max values for reconstruction - min_val = embedding.min() - max_val = embedding.max() - - if min_val == max_val: - # handle constant vector - quantized = np.zeros_like(embedding, dtype=np.uint8) - return quantized, min_val, max_val - - # Quantize to 0-255 range - scale = (2**bits - 1) / (max_val - min_val) - quantized = np.round((embedding - min_val) * scale).astype(np.uint8) - - return quantized, min_val, max_val - - -def dequantize_embedding( - quantized: np.ndarray, min_val: float, max_val: float, bits=8 -) -> np.ndarray: - """Restore quantized embedding""" - if min_val == max_val: - # handle constant vector - return np.full_like(quantized, min_val, dtype=np.float32) - - scale = (max_val - min_val) / (2**bits - 1) - return (quantized * scale + min_val).astype(np.float32) - - async def handle_cache( hashing_kv, args_hash, @@ -824,9 +790,6 @@ class CacheData: args_hash: str content: str prompt: str - quantized: np.ndarray | None = None - min_val: float | None = None - max_val: float | None = None mode: str = "default" cache_type: str = "query" chunk_id: str | None = None @@ -866,14 +829,6 @@ async def save_to_cache(hashing_kv, cache_data: CacheData): "return": cache_data.content, "cache_type": cache_data.cache_type, "chunk_id": cache_data.chunk_id if cache_data.chunk_id is not None else None, - "embedding": cache_data.quantized.tobytes().hex() - if cache_data.quantized is not None - else None, - "embedding_shape": cache_data.quantized.shape - if cache_data.quantized is not None - else None, - "embedding_min": cache_data.min_val, - "embedding_max": cache_data.max_val, "original_prompt": cache_data.prompt, }