Remove quantized embedding info from LLM cache
- Delete quantize_embedding function
- Delete dequantize_embedding function
- Remove embedding fields from CacheData
- Update save_to_cache to exclude embedding data
- Clean up unused quantization-related code
This commit is contained in:
parent
c7d17f13c1
commit
cb75e6631e
2 changed files with 0 additions and 57 deletions
|
|
@ -1833,9 +1833,6 @@ async def kg_query(
|
|||
args_hash=args_hash,
|
||||
content=response,
|
||||
prompt=query,
|
||||
quantized=quantized,
|
||||
min_val=min_val,
|
||||
max_val=max_val,
|
||||
mode=query_param.mode,
|
||||
cache_type="query",
|
||||
),
|
||||
|
|
@ -1972,9 +1969,6 @@ async def extract_keywords_only(
|
|||
args_hash=args_hash,
|
||||
content=json.dumps(cache_data),
|
||||
prompt=text,
|
||||
quantized=quantized,
|
||||
min_val=min_val,
|
||||
max_val=max_val,
|
||||
mode=param.mode,
|
||||
cache_type="keywords",
|
||||
),
|
||||
|
|
@ -3105,9 +3099,6 @@ async def naive_query(
|
|||
args_hash=args_hash,
|
||||
content=response,
|
||||
prompt=query,
|
||||
quantized=quantized,
|
||||
min_val=min_val,
|
||||
max_val=max_val,
|
||||
mode=query_param.mode,
|
||||
cache_type="query",
|
||||
),
|
||||
|
|
@ -3231,9 +3222,6 @@ async def kg_query_with_keywords(
|
|||
args_hash=args_hash,
|
||||
content=response,
|
||||
prompt=query,
|
||||
quantized=quantized,
|
||||
min_val=min_val,
|
||||
max_val=max_val,
|
||||
mode=query_param.mode,
|
||||
cache_type="query",
|
||||
),
|
||||
|
|
|
|||
|
|
@ -756,40 +756,6 @@ def cosine_similarity(v1, v2):
|
|||
return dot_product / (norm1 * norm2)
|
||||
|
||||
|
||||
def quantize_embedding(embedding: np.ndarray | list[float], bits: int = 8) -> tuple:
|
||||
"""Quantize embedding to specified bits"""
|
||||
# Convert list to numpy array if needed
|
||||
if isinstance(embedding, list):
|
||||
embedding = np.array(embedding)
|
||||
|
||||
# Calculate min/max values for reconstruction
|
||||
min_val = embedding.min()
|
||||
max_val = embedding.max()
|
||||
|
||||
if min_val == max_val:
|
||||
# handle constant vector
|
||||
quantized = np.zeros_like(embedding, dtype=np.uint8)
|
||||
return quantized, min_val, max_val
|
||||
|
||||
# Quantize to 0-255 range
|
||||
scale = (2**bits - 1) / (max_val - min_val)
|
||||
quantized = np.round((embedding - min_val) * scale).astype(np.uint8)
|
||||
|
||||
return quantized, min_val, max_val
|
||||
|
||||
|
||||
def dequantize_embedding(
    quantized: np.ndarray, min_val: float, max_val: float, bits=8
) -> np.ndarray:
    """Invert quantize_embedding: map integer codes back to float32 values.

    Args:
        quantized: Integer codes produced by quantize_embedding.
        min_val: Minimum of the original vector (stored alongside the codes).
        max_val: Maximum of the original vector.
        bits: Bit width used during quantization (default 8).

    Returns:
        float32 ndarray approximating the original embedding.
    """
    # A constant vector was encoded as all-zero codes; just rebuild it.
    if min_val == max_val:
        return np.full_like(quantized, min_val, dtype=np.float32)

    # Size of one quantization step; code k maps to min_val + k * step.
    step = (max_val - min_val) / (2**bits - 1)
    return (quantized * step + min_val).astype(np.float32)
|
||||
|
||||
|
||||
async def handle_cache(
|
||||
hashing_kv,
|
||||
args_hash,
|
||||
|
|
@ -824,9 +790,6 @@ class CacheData:
|
|||
args_hash: str
|
||||
content: str
|
||||
prompt: str
|
||||
quantized: np.ndarray | None = None
|
||||
min_val: float | None = None
|
||||
max_val: float | None = None
|
||||
mode: str = "default"
|
||||
cache_type: str = "query"
|
||||
chunk_id: str | None = None
|
||||
|
|
@ -866,14 +829,6 @@ async def save_to_cache(hashing_kv, cache_data: CacheData):
|
|||
"return": cache_data.content,
|
||||
"cache_type": cache_data.cache_type,
|
||||
"chunk_id": cache_data.chunk_id if cache_data.chunk_id is not None else None,
|
||||
"embedding": cache_data.quantized.tobytes().hex()
|
||||
if cache_data.quantized is not None
|
||||
else None,
|
||||
"embedding_shape": cache_data.quantized.shape
|
||||
if cache_data.quantized is not None
|
||||
else None,
|
||||
"embedding_min": cache_data.min_val,
|
||||
"embedding_max": cache_data.max_val,
|
||||
"original_prompt": cache_data.prompt,
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue