LightRAG/lightrag/llm/hf.py
clssck dd1413f3eb test(lightrag,examples): add prompt accuracy and quality tests
Add comprehensive test suites for prompt evaluation:
- test_prompt_accuracy.py: 365 lines testing prompt extraction accuracy
- test_prompt_quality_deep.py: 672 lines for deep quality analysis
- Refactor prompt.py to consolidate optimized variants (removed prompt_optimized.py)
- Apply ruff formatting and type hints across 30 files
- Update pyrightconfig.json for static type checking
- Modernize reproduce scripts and examples with improved type annotations
- Sync uv.lock dependencies
2025-12-05 16:39:52 +01:00


import copy
import os
from functools import lru_cache

import pipmaster as pm  # Pipmaster for dynamic library install

# Install required modules on demand if they are missing
if not pm.is_installed('transformers'):
    pm.install('transformers')
if not pm.is_installed('torch'):
    pm.install('torch')
if not pm.is_installed('numpy'):
    pm.install('numpy')

import numpy as np
import torch
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
from transformers import AutoModelForCausalLM, AutoTokenizer

from lightrag.exceptions import (
    APIConnectionError,
    APITimeoutError,
    RateLimitError,
)
from lightrag.utils import wrap_embedding_func_with_attrs

# Disable tokenizer parallelism to avoid fork warnings when called from async code
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

@lru_cache(maxsize=1)
def initialize_hf_model(model_name):
    """Load and cache a Hugging Face causal LM and its tokenizer for `model_name`."""
    hf_tokenizer = AutoTokenizer.from_pretrained(
        model_name, device_map='auto', trust_remote_code=True
    )
    hf_model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map='auto', trust_remote_code=True
    )
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.pad_token = hf_tokenizer.eos_token
    return hf_model, hf_tokenizer

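# Usage note (added commentary, not a behavioural change): lru_cache(maxsize=1)
# keeps only the most recently requested checkpoint in memory, so alternating
# between two model names reloads weights on every switch:
#
#   model_a, tok_a = initialize_hf_model('org/model-a')  # hypothetical names
#   model_b, tok_b = initialize_hf_model('org/model-b')  # evicts 'org/model-a'
#   initialize_hf_model('org/model-a')                   # cache miss, loads again
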
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(
        (RateLimitError, APIConnectionError, APITimeoutError)
    ),
)
async def hf_model_if_cache(
    model,
    prompt,
    system_prompt=None,
    history_messages=None,
    enable_cot: bool = False,
    **kwargs,
) -> str:
    if history_messages is None:
        history_messages = []
    if enable_cot:
        from lightrag.utils import logger

        logger.debug(
            'enable_cot=True is not supported for Hugging Face local models and will be ignored.'
        )
    model_name = model
    hf_model, hf_tokenizer = initialize_hf_model(model_name)

    # Build the chat message list: optional system prompt, prior history, current prompt
    messages = []
    if system_prompt:
        messages.append({'role': 'system', 'content': system_prompt})
    messages.extend(history_messages)
    messages.append({'role': 'user', 'content': prompt})
    kwargs.pop('hashing_kv', None)

    # Render the messages with the tokenizer's chat template, falling back to
    # progressively simpler formats for models that do not ship one
    input_prompt = ''
    try:
        input_prompt = hf_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    except Exception:
        try:
            ori_message = copy.deepcopy(messages)
            # Fold the system prompt into the first user message and retry
            if messages[0]['role'] == 'system':
                messages[1]['content'] = (
                    '<system>'
                    + messages[0]['content']
                    + '</system>\n'
                    + messages[1]['content']
                )
                messages = messages[1:]
            input_prompt = hf_tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        except Exception:
            # Last resort: serialize each message as <role>content</role>
            for msg in ori_message:
                input_prompt = (
                    input_prompt
                    + '<'
                    + msg['role']
                    + '>'
                    + msg['content']
                    + '</'
                    + msg['role']
                    + '>\n'
                )

    device = hf_model.device
    tokenized = hf_tokenizer(
        input_prompt, return_tensors='pt', padding=True, truncation=True
    ).to(device)
    inputs = {k: v.to(device) for k, v in tokenized.items()}
    output = hf_model.generate(
        **inputs, max_new_tokens=512, num_return_sequences=1, early_stopping=True
    )
    # Decode only the newly generated tokens, skipping the echoed prompt
    response_text = hf_tokenizer.decode(
        output[0][len(inputs['input_ids'][0]) :], skip_special_tokens=True
    )
    return response_text

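# Example (illustrative sketch, not part of the library API): calling
# hf_model_if_cache directly with a small instruction-tuned checkpoint. The model
# name below is an assumed example; any causal LM on the Hugging Face Hub works.
#
#   import asyncio
#
#   async def _demo():
#       reply = await hf_model_if_cache(
#           'Qwen/Qwen2.5-0.5B-Instruct',        # assumed example checkpoint
#           'What is a knowledge graph?',
#           system_prompt='Answer in one sentence.',
#       )
#       print(reply)
#
#   asyncio.run(_demo())
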
async def hf_model_complete(
    prompt,
    system_prompt=None,
    history_messages=None,
    keyword_extraction=False,
    enable_cot: bool = False,
    **kwargs,
) -> str:
    if history_messages is None:
        history_messages = []
    kwargs.pop('keyword_extraction', None)
    # The model name comes from the global config injected by LightRAG via hashing_kv
    model_name = kwargs['hashing_kv'].global_config['llm_model_name']
    result = await hf_model_if_cache(
        model_name,
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        enable_cot=enable_cot,
        **kwargs,
    )
    return result

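# Usage note (sketch following the Hugging Face example pattern; the model name is
# an assumed placeholder): hf_model_complete expects a hashing_kv object whose
# global_config carries 'llm_model_name', which LightRAG supplies automatically
# when the function is registered as the llm_model_func:
#
#   rag = LightRAG(
#       working_dir='./rag_storage',
#       llm_model_func=hf_model_complete,
#       llm_model_name='meta-llama/Llama-3.1-8B-Instruct',  # assumed example model
#       embedding_func=...,  # see hf_embed below
#   )
#
# Called outside LightRAG, the function raises KeyError unless a compatible
# hashing_kv object is passed in kwargs.
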
@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192)
async def hf_embed(texts: list[str], tokenizer, embed_model) -> np.ndarray:
    # Detect the appropriate device
    if torch.cuda.is_available():
        device = next(embed_model.parameters()).device  # Use CUDA if available
    elif torch.backends.mps.is_available():
        device = torch.device('mps')  # Use MPS for Apple Silicon
    else:
        device = torch.device('cpu')  # Fallback to CPU

    # Move the model to the detected device
    embed_model = embed_model.to(device)

    # Tokenize the input texts and move them to the same device
    encoded_texts = tokenizer(
        texts, return_tensors='pt', padding=True, truncation=True
    ).to(device)

    # Perform inference and mean-pool the last hidden states into one vector per text
    with torch.no_grad():
        outputs = embed_model(
            input_ids=encoded_texts['input_ids'],
            attention_mask=encoded_texts['attention_mask'],
        )
        embeddings = outputs.last_hidden_state.mean(dim=1)

    # Convert embeddings to NumPy (bfloat16 tensors must be upcast first)
    if embeddings.dtype == torch.bfloat16:
        return embeddings.detach().to(torch.float32).cpu().numpy()
    else:
        return embeddings.detach().cpu().numpy()
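

# Example (illustrative sketch): binding hf_embed to a concrete encoder so it can
# be passed to LightRAG as an embedding function. The checkpoint is an assumed
# example; its hidden size must match the embedding_dim declared above (1024 here),
# otherwise adjust the EmbeddingFunc attributes accordingly.
#
#   from transformers import AutoModel, AutoTokenizer
#   from lightrag.utils import EmbeddingFunc
#
#   _tok = AutoTokenizer.from_pretrained('BAAI/bge-large-en-v1.5')  # 1024-dim encoder
#   _mdl = AutoModel.from_pretrained('BAAI/bge-large-en-v1.5')
#
#   embedding_func = EmbeddingFunc(
#       embedding_dim=1024,
#       max_token_size=8192,
#       func=lambda texts: hf_embed(texts, tokenizer=_tok, embed_model=_mdl),
#   )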