diff --git a/README-zh.md b/README-zh.md index 917736a6..34fece43 100644 --- a/README-zh.md +++ b/README-zh.md @@ -757,6 +757,8 @@ async def initialize_rag():
使用Faiss进行存储 +在使用Faiss向量数据库之前必须手工安装`faiss-cpu`或`faiss-gpu`。 + - 安装所需依赖: diff --git a/README.md b/README.md index 2068f205..1caffef4 100644 --- a/README.md +++ b/README.md @@ -819,6 +819,8 @@ For production level scenarios you will most likely want to leverage an enterpri
Using Faiss for Storage +You must manually install `faiss-cpu` or `faiss-gpu` before using the Faiss vector database. + - Install the required dependencies: diff --git a/env.example b/env.example index 98e4790b..514f8425 100644 --- a/env.example +++ b/env.example @@ -108,11 +108,28 @@ EMBEDDING_BINDING_HOST=http://localhost:11434 # AZURE_EMBEDDING_ENDPOINT=your_endpoint # AZURE_EMBEDDING_API_KEY=your_api_key +########################### ### Data storage selection +########################### +### PostgreSQL # LIGHTRAG_KV_STORAGE=PGKVStorage -# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage # LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage +# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage +# LIGHTRAG_GRAPH_STORAGE=PGGraphStorage +### MongoDB +# LIGHTRAG_KV_STORAGE=MongoKVStorage +# LIGHTRAG_DOC_STATUS_STORAGE=MongoDocStatusStorage +# LIGHTRAG_VECTOR_STORAGE=MongoVectorDBStorage +# LIGHTRAG_GRAPH_STORAGE=MongoGraphStorage +### KV Storage +# LIGHTRAG_KV_STORAGE=RedisKVStorage +# LIGHTRAG_DOC_STATUS_STORAGE=RedisDocStatusStorage +### Vector Storage +# LIGHTRAG_VECTOR_STORAGE=FaissVectorDBStorage +# LIGHTRAG_VECTOR_STORAGE=MilvusVectorDBStorage +### Graph Storage # LIGHTRAG_GRAPH_STORAGE=Neo4JStorage +# LIGHTRAG_GRAPH_STORAGE=MemgraphStorage ### PostgreSQL Configuration POSTGRES_HOST=localhost diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/deprecated/age_impl.py similarity index 100% rename from lightrag/kg/age_impl.py rename to lightrag/kg/deprecated/age_impl.py diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index cb19497a..af691458 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -4,9 +4,7 @@ import asyncio from typing import Any, final import json import numpy as np - from dataclasses import dataclass -import pipmaster as pm from lightrag.utils import logger, compute_mdhash_id from lightrag.base import BaseVectorStorage @@ -17,11 +15,7 @@ from .shared_storage import ( 
set_all_update_flags, ) -USE_GPU = os.getenv("FAISS_USE_GPU", "0") == "1" -FAISS_PACKAGE = "faiss-gpu" if USE_GPU else "faiss-cpu" -if not pm.is_installed(FAISS_PACKAGE): - pm.install(FAISS_PACKAGE) - +# You must manually install faiss-cpu or faiss-gpu before using FAISS vector db import faiss # type: ignore @@ -165,7 +159,7 @@ class FaissVectorDBStorage(BaseVectorStorage): meta["__vector__"] = embeddings[i].tolist() self._id_to_meta.update({fid: meta}) - logger.info(f"Upserted {len(list_data)} vectors into Faiss index.") + logger.debug(f"Upserted {len(list_data)} vectors into Faiss index.") return [m["__id__"] for m in list_data] async def query( @@ -228,7 +222,7 @@ class FaissVectorDBStorage(BaseVectorStorage): 2. Only one process should updating the storage at a time before index_done_callback, KG-storage-log should be used to avoid data corruption """ - logger.info(f"Deleting {len(ids)} vectors from {self.namespace}") + logger.debug(f"Deleting {len(ids)} vectors from {self.namespace}") to_remove = [] for cid in ids: fid = self._find_faiss_id_by_custom_id(cid) @@ -330,7 +324,7 @@ class FaissVectorDBStorage(BaseVectorStorage): and rebuild in-memory structures so we can query. """ if not os.path.exists(self._faiss_index_file): - logger.warning("No existing Faiss index file found. Starting fresh.") + logger.warning(f"No existing Faiss index file found for {self.namespace}") return try: diff --git a/lightrag/operate.py b/lightrag/operate.py index 60425148..88837435 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -168,6 +168,13 @@ async def _handle_single_entity_extraction( # Normalize entity name entity_name = normalize_extracted_info(entity_name, is_entity=True) + # Check if entity name became empty after normalization + if not entity_name or not entity_name.strip(): + logger.warning( + f"Entity extraction error: entity name became empty after normalization. 
Original: '{record_attributes[1]}'" + ) + return None + # Clean and validate entity type entity_type = clean_str(record_attributes[2]).strip('"') if not entity_type.strip() or entity_type.startswith('("'): @@ -209,6 +216,20 @@ async def _handle_single_relationship_extraction( # Normalize source and target entity names source = normalize_extracted_info(source, is_entity=True) target = normalize_extracted_info(target, is_entity=True) + + # Check if source or target became empty after normalization + if not source or not source.strip(): + logger.warning( + f"Relationship extraction error: source entity became empty after normalization. Original: '{record_attributes[1]}'" + ) + return None + + if not target or not target.strip(): + logger.warning( + f"Relationship extraction error: target entity became empty after normalization. Original: '{record_attributes[2]}'" + ) + return None + if source == target: logger.debug( f"Relationship source and target are the same in: {record_attributes}" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..b87df3bc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,93 @@ +[build-system] +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "lightrag-hku" +dynamic = ["version"] +authors = [ + {name = "Zirui Guo"} +] +description = "LightRAG: Simple and Fast Retrieval-Augmented Generation" +readme = "README.md" +license = {text = "MIT"} +requires-python = ">=3.9" +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ + "aiohttp", + "configparser", + "dotenv", + "future", + "numpy", + "pandas>=2.0.0", + "pipmaster", + "pydantic", + "python-dotenv", + "pyuca", + "setuptools", + "tenacity", + "tiktoken", + "xlsxwriter>=3.1.0", 
+] + +[project.optional-dependencies] +api = [ + # Core dependencies + "aiohttp", + "configparser", + "dotenv", + "future", + "numpy", + "openai", + "pandas>=2.0.0", + "pipmaster", + "pydantic", + "python-dotenv", + "pyuca", + "setuptools", + "tenacity", + "tiktoken", + "xlsxwriter>=3.1.0", + # API-specific dependencies + "aiofiles", + "ascii_colors", + "asyncpg", + "distro", + "fastapi", + "httpcore", + "httpx", + "jiter", + "passlib[bcrypt]", + "PyJWT", + "python-jose[cryptography]", + "python-multipart", + "pytz", + "uvicorn", +] + +[project.scripts] +lightrag-server = "lightrag.api.lightrag_server:main" +lightrag-gunicorn = "lightrag.api.run_with_gunicorn:main" + +[project.urls] +Homepage = "https://github.com/HKUDS/LightRAG" +Documentation = "https://github.com/HKUDS/LightRAG" +Repository = "https://github.com/HKUDS/LightRAG" +"Bug Tracker" = "https://github.com/HKUDS/LightRAG/issues" + +[tool.setuptools] +packages = ["lightrag"] +include-package-data = true + +[tool.setuptools.dynamic] +version = {attr = "lightrag.__version__"} + +[tool.setuptools.package-data] +lightrag = ["api/webui/**/*"] diff --git a/setup.py b/setup.py index 14e5c56b..655e2e9e 100644 --- a/setup.py +++ b/setup.py @@ -1,107 +1,6 @@ -import setuptools -from pathlib import Path +# Minimal setup.py for backward compatibility +# Primary configuration is now in pyproject.toml +from setuptools import setup -# Reading the long description from README.md -def read_long_description(): - try: - return Path("README.md").read_text(encoding="utf-8") - except FileNotFoundError: - return "A description of LightRAG is currently unavailable." 
- - -# Retrieving metadata from __init__.py -def retrieve_metadata(): - vars2find = ["__author__", "__version__", "__url__"] - vars2readme = {} - try: - with open("./lightrag/__init__.py") as f: - for line in f.readlines(): - for v in vars2find: - if line.startswith(v): - line = ( - line.replace(" ", "") - .replace('"', "") - .replace("'", "") - .strip() - ) - vars2readme[v] = line.split("=")[1] - except FileNotFoundError: - raise FileNotFoundError("Metadata file './lightrag/__init__.py' not found.") - - # Checking if all required variables are found - missing_vars = [v for v in vars2find if v not in vars2readme] - if missing_vars: - raise ValueError( - f"Missing required metadata variables in __init__.py: {missing_vars}" - ) - - return vars2readme - - -# Reading dependencies from requirements.txt -def read_requirements(file_path="requirements.txt"): - deps = [] - try: - with open(file_path) as f: - deps = [ - line.strip() for line in f if line.strip() and not line.startswith("#") - ] - except FileNotFoundError: - print(f"Warning: '{file_path}' not found. 
No dependencies will be installed.") - return deps - - -def read_api_requirements(): - return read_requirements("lightrag/api/requirements.txt") - - -def read_extra_requirements(): - return read_requirements("lightrag/tools/lightrag_visualizer/requirements.txt") - - -metadata = retrieve_metadata() -long_description = read_long_description() -requirements = read_requirements() - -setuptools.setup( - name="lightrag-hku", - url=metadata["__url__"], - version=metadata["__version__"], - author=metadata["__author__"], - description="LightRAG: Simple and Fast Retrieval-Augmented Generation", - long_description=long_description, - long_description_content_type="text/markdown", - packages=setuptools.find_packages( - exclude=("tests*", "docs*") - ), # Automatically find packages - classifiers=[ - "Development Status :: 4 - Beta", - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Intended Audience :: Developers", - "Topic :: Software Development :: Libraries :: Python Modules", - ], - python_requires=">=3.9", - install_requires=requirements, - include_package_data=True, # Includes non-code files from MANIFEST.in - project_urls={ # Additional project metadata - "Documentation": metadata.get("__url__", ""), - "Source": metadata.get("__url__", ""), - "Tracker": f"{metadata.get('__url__', '')}/issues" - if metadata.get("__url__") - else "", - }, - extras_require={ - "api": requirements + read_api_requirements(), - "tools": read_extra_requirements(), # API requirements as optional - }, - entry_points={ - "console_scripts": [ - "lightrag-server=lightrag.api.lightrag_server:main [api]", - "lightrag-gunicorn=lightrag.api.run_with_gunicorn:main [api]", - "lightrag-viewer=lightrag.tools.lightrag_visualizer.graph_visualizer:main [tools]", - ], - }, -) +setup()