Merge branch 'main' into add-Memgraph-graph-db

This commit is contained in:
yangdx 2025-07-05 13:14:39 +08:00
commit a567601da2
8 changed files with 144 additions and 116 deletions

View file

@@ -757,6 +757,8 @@ async def initialize_rag():
<details> <details>
<summary> <b>使用Faiss进行存储</b> </summary> <summary> <b>使用Faiss进行存储</b> </summary>
在使用Faiss向量数据库之前必须手工安装`faiss-cpu`或`faiss-gpu`
- 安装所需依赖: - 安装所需依赖:

View file

@@ -819,6 +819,8 @@ For production level scenarios you will most likely want to leverage an enterpri
<details> <details>
<summary> <b>Using Faiss for Storage</b> </summary> <summary> <b>Using Faiss for Storage</b> </summary>
You must manually install faiss-cpu or faiss-gpu before using FAISS vector db.
Manually install `faiss-cpu` or `faiss-gpu` before using FAISS vector db.
- Install the required dependencies: - Install the required dependencies:

View file

@@ -108,11 +108,28 @@ EMBEDDING_BINDING_HOST=http://localhost:11434
# AZURE_EMBEDDING_ENDPOINT=your_endpoint # AZURE_EMBEDDING_ENDPOINT=your_endpoint
# AZURE_EMBEDDING_API_KEY=your_api_key # AZURE_EMBEDDING_API_KEY=your_api_key
###########################
### Data storage selection ### Data storage selection
###########################
### PostgreSQL
# LIGHTRAG_KV_STORAGE=PGKVStorage # LIGHTRAG_KV_STORAGE=PGKVStorage
# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage # LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
# LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
### MongoDB
# LIGHTRAG_KV_STORAGE=MongoKVStorage
# LIGHTRAG_DOC_STATUS_STORAGE=MongoDocStatusStorage
# LIGHTRAG_VECTOR_STORAGE=MongoVectorDBStorage
# LIGHTRAG_GRAPH_STORAGE=MongoGraphStorage
### KV Storage
# LIGHTRAG_KV_STORAGE=RedisKVStorage
# LIGHTRAG_DOC_STATUS_STORAGE=RedisDocStatusStorage
### Vector Storage
# LIGHTRAG_VECTOR_STORAGE=FaissVectorDBStorage
# LIGHTRAG_VECTOR_STORAGE=MilvusVectorDBStorage
### Graph Storage
# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage # LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
# LIGHTRAG_GRAPH_STORAGE=MemgraphStorage
### PostgreSQL Configuration ### PostgreSQL Configuration
POSTGRES_HOST=localhost POSTGRES_HOST=localhost

View file

@@ -4,9 +4,7 @@ import asyncio
from typing import Any, final from typing import Any, final
import json import json
import numpy as np import numpy as np
from dataclasses import dataclass from dataclasses import dataclass
import pipmaster as pm
from lightrag.utils import logger, compute_mdhash_id from lightrag.utils import logger, compute_mdhash_id
from lightrag.base import BaseVectorStorage from lightrag.base import BaseVectorStorage
@@ -17,11 +15,7 @@ from .shared_storage import (
set_all_update_flags, set_all_update_flags,
) )
USE_GPU = os.getenv("FAISS_USE_GPU", "0") == "1" # You must manually install faiss-cpu or faiss-gpu before using FAISS vector db
FAISS_PACKAGE = "faiss-gpu" if USE_GPU else "faiss-cpu"
if not pm.is_installed(FAISS_PACKAGE):
pm.install(FAISS_PACKAGE)
import faiss # type: ignore import faiss # type: ignore
@@ -165,7 +159,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
meta["__vector__"] = embeddings[i].tolist() meta["__vector__"] = embeddings[i].tolist()
self._id_to_meta.update({fid: meta}) self._id_to_meta.update({fid: meta})
logger.info(f"Upserted {len(list_data)} vectors into Faiss index.") logger.debug(f"Upserted {len(list_data)} vectors into Faiss index.")
return [m["__id__"] for m in list_data] return [m["__id__"] for m in list_data]
async def query( async def query(
@@ -228,7 +222,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
2. Only one process should updating the storage at a time before index_done_callback, 2. Only one process should updating the storage at a time before index_done_callback,
KG-storage-log should be used to avoid data corruption KG-storage-log should be used to avoid data corruption
""" """
logger.info(f"Deleting {len(ids)} vectors from {self.namespace}") logger.debug(f"Deleting {len(ids)} vectors from {self.namespace}")
to_remove = [] to_remove = []
for cid in ids: for cid in ids:
fid = self._find_faiss_id_by_custom_id(cid) fid = self._find_faiss_id_by_custom_id(cid)
@@ -330,7 +324,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
and rebuild in-memory structures so we can query. and rebuild in-memory structures so we can query.
""" """
if not os.path.exists(self._faiss_index_file): if not os.path.exists(self._faiss_index_file):
logger.warning("No existing Faiss index file found. Starting fresh.") logger.warning(f"No existing Faiss index file found for {self.namespace}")
return return
try: try:

View file

@@ -168,6 +168,13 @@ async def _handle_single_entity_extraction(
# Normalize entity name # Normalize entity name
entity_name = normalize_extracted_info(entity_name, is_entity=True) entity_name = normalize_extracted_info(entity_name, is_entity=True)
# Check if entity name became empty after normalization
if not entity_name or not entity_name.strip():
logger.warning(
f"Entity extraction error: entity name became empty after normalization. Original: '{record_attributes[1]}'"
)
return None
# Clean and validate entity type # Clean and validate entity type
entity_type = clean_str(record_attributes[2]).strip('"') entity_type = clean_str(record_attributes[2]).strip('"')
if not entity_type.strip() or entity_type.startswith('("'): if not entity_type.strip() or entity_type.startswith('("'):
@@ -209,6 +216,20 @@ async def _handle_single_relationship_extraction(
# Normalize source and target entity names # Normalize source and target entity names
source = normalize_extracted_info(source, is_entity=True) source = normalize_extracted_info(source, is_entity=True)
target = normalize_extracted_info(target, is_entity=True) target = normalize_extracted_info(target, is_entity=True)
# Check if source or target became empty after normalization
if not source or not source.strip():
logger.warning(
f"Relationship extraction error: source entity became empty after normalization. Original: '{record_attributes[1]}'"
)
return None
if not target or not target.strip():
logger.warning(
f"Relationship extraction error: target entity became empty after normalization. Original: '{record_attributes[2]}'"
)
return None
if source == target: if source == target:
logger.debug( logger.debug(
f"Relationship source and target are the same in: {record_attributes}" f"Relationship source and target are the same in: {record_attributes}"

93
pyproject.toml Normal file
View file

@@ -0,0 +1,93 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "lightrag-hku"
dynamic = ["version"]
authors = [
{name = "Zirui Guo"}
]
description = "LightRAG: Simple and Fast Retrieval-Augmented Generation"
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.9"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
"aiohttp",
"configparser",
"dotenv",
"future",
"numpy",
"pandas>=2.0.0",
"pipmaster",
"pydantic",
"python-dotenv",
"pyuca",
"setuptools",
"tenacity",
"tiktoken",
"xlsxwriter>=3.1.0",
]
[project.optional-dependencies]
api = [
# Core dependencies
"aiohttp",
"configparser",
"dotenv",
"future",
"numpy",
"openai",
"pandas>=2.0.0",
"pipmaster",
"pydantic",
"python-dotenv",
"pyuca",
"setuptools",
"tenacity",
"tiktoken",
"xlsxwriter>=3.1.0",
# API-specific dependencies
"aiofiles",
"ascii_colors",
"asyncpg",
"distro",
"fastapi",
"httpcore",
"httpx",
"jiter",
"passlib[bcrypt]",
"PyJWT",
"python-jose[cryptography]",
"python-multipart",
"pytz",
"uvicorn",
]
[project.scripts]
lightrag-server = "lightrag.api.lightrag_server:main"
lightrag-gunicorn = "lightrag.api.run_with_gunicorn:main"
[project.urls]
Homepage = "https://github.com/HKUDS/LightRAG"
Documentation = "https://github.com/HKUDS/LightRAG"
Repository = "https://github.com/HKUDS/LightRAG"
"Bug Tracker" = "https://github.com/HKUDS/LightRAG/issues"
[tool.setuptools]
packages = ["lightrag"]
include-package-data = true
[tool.setuptools.dynamic]
version = {attr = "lightrag.__version__"}
[tool.setuptools.package-data]
lightrag = ["api/webui/**/*"]

109
setup.py
View file

@@ -1,107 +1,6 @@
import setuptools # Minimal setup.py for backward compatibility
from pathlib import Path # Primary configuration is now in pyproject.toml
from setuptools import setup
# Reading the long description from README.md setup()
def read_long_description():
try:
return Path("README.md").read_text(encoding="utf-8")
except FileNotFoundError:
return "A description of LightRAG is currently unavailable."
# Retrieving metadata from __init__.py
def retrieve_metadata():
vars2find = ["__author__", "__version__", "__url__"]
vars2readme = {}
try:
with open("./lightrag/__init__.py") as f:
for line in f.readlines():
for v in vars2find:
if line.startswith(v):
line = (
line.replace(" ", "")
.replace('"', "")
.replace("'", "")
.strip()
)
vars2readme[v] = line.split("=")[1]
except FileNotFoundError:
raise FileNotFoundError("Metadata file './lightrag/__init__.py' not found.")
# Checking if all required variables are found
missing_vars = [v for v in vars2find if v not in vars2readme]
if missing_vars:
raise ValueError(
f"Missing required metadata variables in __init__.py: {missing_vars}"
)
return vars2readme
# Reading dependencies from requirements.txt
def read_requirements(file_path="requirements.txt"):
deps = []
try:
with open(file_path) as f:
deps = [
line.strip() for line in f if line.strip() and not line.startswith("#")
]
except FileNotFoundError:
print(f"Warning: '{file_path}' not found. No dependencies will be installed.")
return deps
def read_api_requirements():
return read_requirements("lightrag/api/requirements.txt")
def read_extra_requirements():
return read_requirements("lightrag/tools/lightrag_visualizer/requirements.txt")
metadata = retrieve_metadata()
long_description = read_long_description()
requirements = read_requirements()
setuptools.setup(
name="lightrag-hku",
url=metadata["__url__"],
version=metadata["__version__"],
author=metadata["__author__"],
description="LightRAG: Simple and Fast Retrieval-Augmented Generation",
long_description=long_description,
long_description_content_type="text/markdown",
packages=setuptools.find_packages(
exclude=("tests*", "docs*")
), # Automatically find packages
classifiers=[
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Topic :: Software Development :: Libraries :: Python Modules",
],
python_requires=">=3.9",
install_requires=requirements,
include_package_data=True, # Includes non-code files from MANIFEST.in
project_urls={ # Additional project metadata
"Documentation": metadata.get("__url__", ""),
"Source": metadata.get("__url__", ""),
"Tracker": f"{metadata.get('__url__', '')}/issues"
if metadata.get("__url__")
else "",
},
extras_require={
"api": requirements + read_api_requirements(),
"tools": read_extra_requirements(), # API requirements as optional
},
entry_points={
"console_scripts": [
"lightrag-server=lightrag.api.lightrag_server:main [api]",
"lightrag-gunicorn=lightrag.api.run_with_gunicorn:main [api]",
"lightrag-viewer=lightrag.tools.lightrag_visualizer.graph_visualizer:main [tools]",
],
},
)