Merge branch 'main' into add-Memgraph-graph-db

commit a567601da2 by yangdx, 2025-07-05 13:14:39 +08:00
8 changed files with 144 additions and 116 deletions


@@ -757,6 +757,8 @@ async def initialize_rag():
<details>
<summary> <b>Using Faiss for Storage</b> </summary>
You must manually install `faiss-cpu` or `faiss-gpu` before using the Faiss vector database.
- Install the required dependencies:


@@ -819,6 +819,8 @@ For production level scenarios you will most likely want to leverage an enterprise
<details>
<summary> <b>Using Faiss for Storage</b> </summary>
You must manually install `faiss-cpu` or `faiss-gpu` before using the FAISS vector db.
- Install the required dependencies:
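
A minimal pre-flight sketch of what this manual-install requirement means in practice. The package names come from the note above; the wording of the error message is an assumption, not LightRAG's own:

```python
# Fail fast with an actionable message if neither Faiss build is present.
# Install beforehand with `pip install faiss-cpu` (or `pip install faiss-gpu`
# for CUDA builds); LightRAG no longer auto-installs it via pipmaster.
try:
    import faiss  # type: ignore
except ImportError as exc:
    raise ImportError(
        "FaissVectorDBStorage requires faiss: run `pip install faiss-cpu` "
        "or `pip install faiss-gpu` first."
    ) from exc
```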


@@ -108,11 +108,28 @@ EMBEDDING_BINDING_HOST=http://localhost:11434
# AZURE_EMBEDDING_ENDPOINT=your_endpoint
# AZURE_EMBEDDING_API_KEY=your_api_key
###########################
### Data storage selection
###########################
### PostgreSQL
# LIGHTRAG_KV_STORAGE=PGKVStorage
# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
# LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
### MongoDB
# LIGHTRAG_KV_STORAGE=MongoKVStorage
# LIGHTRAG_DOC_STATUS_STORAGE=MongoDocStatusStorage
# LIGHTRAG_VECTOR_STORAGE=MongoVectorDBStorage
# LIGHTRAG_GRAPH_STORAGE=MongoGraphStorage
### KV Storage
# LIGHTRAG_KV_STORAGE=RedisKVStorage
# LIGHTRAG_DOC_STATUS_STORAGE=RedisDocStatusStorage
### Vector Storage
# LIGHTRAG_VECTOR_STORAGE=FaissVectorDBStorage
# LIGHTRAG_VECTOR_STORAGE=MilvusVectorDBStorage
### Graph Storage
# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
# LIGHTRAG_GRAPH_STORAGE=MemgraphStorage
### PostgreSQL Configuration
POSTGRES_HOST=localhost
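
For context, a hedged sketch of how these selections surface in code. The keyword names follow LightRAG's storage parameters, but treat the exact kwargs and the fallback defaults as assumptions:

```python
import os

from lightrag import LightRAG

# Each LIGHTRAG_*_STORAGE variable selects an implementation class by name;
# unset variables fall back to the file-based defaults.
rag = LightRAG(
    working_dir="./rag_storage",
    kv_storage=os.getenv("LIGHTRAG_KV_STORAGE", "JsonKVStorage"),
    vector_storage=os.getenv("LIGHTRAG_VECTOR_STORAGE", "NanoVectorDBStorage"),
    graph_storage=os.getenv("LIGHTRAG_GRAPH_STORAGE", "NetworkXStorage"),
    doc_status_storage=os.getenv("LIGHTRAG_DOC_STATUS_STORAGE", "JsonDocStatusStorage"),
    # llm_model_func and embedding_func are omitted for brevity; both are
    # required for a working instance.
)
```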


@@ -4,9 +4,7 @@ import asyncio
from typing import Any, final
import json
import numpy as np
from dataclasses import dataclass
import pipmaster as pm
from lightrag.utils import logger, compute_mdhash_id
from lightrag.base import BaseVectorStorage
@@ -17,11 +15,7 @@ from .shared_storage import (
set_all_update_flags,
)
USE_GPU = os.getenv("FAISS_USE_GPU", "0") == "1"
FAISS_PACKAGE = "faiss-gpu" if USE_GPU else "faiss-cpu"
if not pm.is_installed(FAISS_PACKAGE):
pm.install(FAISS_PACKAGE)
# You must manually install faiss-cpu or faiss-gpu before using FAISS vector db
import faiss # type: ignore
@@ -165,7 +159,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
meta["__vector__"] = embeddings[i].tolist()
self._id_to_meta.update({fid: meta})
logger.info(f"Upserted {len(list_data)} vectors into Faiss index.")
logger.debug(f"Upserted {len(list_data)} vectors into Faiss index.")
return [m["__id__"] for m in list_data]
async def query(
@@ -228,7 +222,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
2. Only one process should update the storage at a time before index_done_callback;
KG-storage-log should be used to avoid data corruption
"""
logger.info(f"Deleting {len(ids)} vectors from {self.namespace}")
logger.debug(f"Deleting {len(ids)} vectors from {self.namespace}")
to_remove = []
for cid in ids:
fid = self._find_faiss_id_by_custom_id(cid)
@@ -330,7 +324,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
and rebuild in-memory structures so we can query.
"""
if not os.path.exists(self._faiss_index_file):
logger.warning("No existing Faiss index file found. Starting fresh.")
logger.warning(f"No existing Faiss index file found for {self.namespace}")
return
try:


@@ -168,6 +168,13 @@ async def _handle_single_entity_extraction(
# Normalize entity name
entity_name = normalize_extracted_info(entity_name, is_entity=True)
# Check if entity name became empty after normalization
if not entity_name or not entity_name.strip():
logger.warning(
f"Entity extraction error: entity name became empty after normalization. Original: '{record_attributes[1]}'"
)
return None
# Clean and validate entity type
entity_type = clean_str(record_attributes[2]).strip('"')
if not entity_type.strip() or entity_type.startswith('("'):
@@ -209,6 +216,20 @@ async def _handle_single_relationship_extraction(
# Normalize source and target entity names
source = normalize_extracted_info(source, is_entity=True)
target = normalize_extracted_info(target, is_entity=True)
# Check if source or target became empty after normalization
if not source or not source.strip():
logger.warning(
f"Relationship extraction error: source entity became empty after normalization. Original: '{record_attributes[1]}'"
)
return None
if not target or not target.strip():
logger.warning(
f"Relationship extraction error: target entity became empty after normalization. Original: '{record_attributes[2]}'"
)
return None
if source == target:
logger.debug(
f"Relationship source and target are the same in: {record_attributes}"
)

pyproject.toml (new file, 93 lines)

@@ -0,0 +1,93 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "lightrag-hku"
dynamic = ["version"]
authors = [
{name = "Zirui Guo"}
]
description = "LightRAG: Simple and Fast Retrieval-Augmented Generation"
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.9"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
"aiohttp",
"configparser",
"dotenv",
"future",
"numpy",
"pandas>=2.0.0",
"pipmaster",
"pydantic",
"python-dotenv",
"pyuca",
"setuptools",
"tenacity",
"tiktoken",
"xlsxwriter>=3.1.0",
]
[project.optional-dependencies]
api = [
# Core dependencies
"aiohttp",
"configparser",
"dotenv",
"future",
"numpy",
"openai",
"pandas>=2.0.0",
"pipmaster",
"pydantic",
"python-dotenv",
"pyuca",
"setuptools",
"tenacity",
"tiktoken",
"xlsxwriter>=3.1.0",
# API-specific dependencies
"aiofiles",
"ascii_colors",
"asyncpg",
"distro",
"fastapi",
"httpcore",
"httpx",
"jiter",
"passlib[bcrypt]",
"PyJWT",
"python-jose[cryptography]",
"python-multipart",
"pytz",
"uvicorn",
]
[project.scripts]
lightrag-server = "lightrag.api.lightrag_server:main"
lightrag-gunicorn = "lightrag.api.run_with_gunicorn:main"
[project.urls]
Homepage = "https://github.com/HKUDS/LightRAG"
Documentation = "https://github.com/HKUDS/LightRAG"
Repository = "https://github.com/HKUDS/LightRAG"
"Bug Tracker" = "https://github.com/HKUDS/LightRAG/issues"
[tool.setuptools]
packages = ["lightrag"]
include-package-data = true
[tool.setuptools.dynamic]
version = {attr = "lightrag.__version__"}
[tool.setuptools.package-data]
lightrag = ["api/webui/**/*"]

setup.py (109 lines changed)

@ -1,107 +1,6 @@
import setuptools
from pathlib import Path
# Minimal setup.py for backward compatibility
# Primary configuration is now in pyproject.toml
from setuptools import setup
# Reading the long description from README.md
def read_long_description():
try:
return Path("README.md").read_text(encoding="utf-8")
except FileNotFoundError:
return "A description of LightRAG is currently unavailable."
# Retrieving metadata from __init__.py
def retrieve_metadata():
vars2find = ["__author__", "__version__", "__url__"]
vars2readme = {}
try:
with open("./lightrag/__init__.py") as f:
for line in f.readlines():
for v in vars2find:
if line.startswith(v):
line = (
line.replace(" ", "")
.replace('"', "")
.replace("'", "")
.strip()
)
vars2readme[v] = line.split("=")[1]
except FileNotFoundError:
raise FileNotFoundError("Metadata file './lightrag/__init__.py' not found.")
# Checking if all required variables are found
missing_vars = [v for v in vars2find if v not in vars2readme]
if missing_vars:
raise ValueError(
f"Missing required metadata variables in __init__.py: {missing_vars}"
)
return vars2readme
# Reading dependencies from requirements.txt
def read_requirements(file_path="requirements.txt"):
deps = []
try:
with open(file_path) as f:
deps = [
line.strip() for line in f if line.strip() and not line.startswith("#")
]
except FileNotFoundError:
print(f"Warning: '{file_path}' not found. No dependencies will be installed.")
return deps
def read_api_requirements():
return read_requirements("lightrag/api/requirements.txt")
def read_extra_requirements():
return read_requirements("lightrag/tools/lightrag_visualizer/requirements.txt")
metadata = retrieve_metadata()
long_description = read_long_description()
requirements = read_requirements()
setuptools.setup(
name="lightrag-hku",
url=metadata["__url__"],
version=metadata["__version__"],
author=metadata["__author__"],
description="LightRAG: Simple and Fast Retrieval-Augmented Generation",
long_description=long_description,
long_description_content_type="text/markdown",
packages=setuptools.find_packages(
exclude=("tests*", "docs*")
), # Automatically find packages
classifiers=[
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Topic :: Software Development :: Libraries :: Python Modules",
],
python_requires=">=3.9",
install_requires=requirements,
include_package_data=True, # Includes non-code files from MANIFEST.in
project_urls={ # Additional project metadata
"Documentation": metadata.get("__url__", ""),
"Source": metadata.get("__url__", ""),
"Tracker": f"{metadata.get('__url__', '')}/issues"
if metadata.get("__url__")
else "",
},
extras_require={
"api": requirements + read_api_requirements(),
"tools": read_extra_requirements(), # API requirements as optional
},
entry_points={
"console_scripts": [
"lightrag-server=lightrag.api.lightrag_server:main [api]",
"lightrag-gunicorn=lightrag.api.run_with_gunicorn:main [api]",
"lightrag-viewer=lightrag.tools.lightrag_visualizer.graph_visualizer:main [tools]",
],
},
)
setup()
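
With all metadata now coming from pyproject.toml, a quick way to confirm an install picked it up is the stdlib `importlib.metadata`; the package name comes from the `[project]` table above:

```python
from importlib.metadata import metadata, version

print(version("lightrag-hku"))              # resolved from lightrag.__version__
print(metadata("lightrag-hku")["Summary"])  # description string from pyproject.toml
```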