Enhance Neo4j fulltext search with Chinese language support
• Add CJK analyzer for Chinese text • Auto-detect Chinese characters • Recreate index if needed • Separate Chinese/Latin search logic • Improve fallback for Chinese queries
This commit is contained in:
parent
3b502af858
commit
8e2a1fa59e
1 changed files with 169 additions and 69 deletions
|
|
@ -70,6 +70,11 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
"""Return workspace label (guaranteed non-empty during initialization)"""
|
"""Return workspace label (guaranteed non-empty during initialization)"""
|
||||||
return self.workspace
|
return self.workspace
|
||||||
|
|
||||||
|
def _is_chinese_text(self, text: str) -> bool:
|
||||||
|
"""Check if text contains Chinese characters."""
|
||||||
|
chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')
|
||||||
|
return bool(chinese_pattern.search(text))
|
||||||
|
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
async with get_data_init_lock():
|
async with get_data_init_lock():
|
||||||
URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None))
|
URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None))
|
||||||
|
|
@ -225,37 +230,91 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
async def _create_fulltext_index(
|
async def _create_fulltext_index(
|
||||||
self, driver: AsyncDriver, database: str, workspace_label: str
|
self, driver: AsyncDriver, database: str, workspace_label: str
|
||||||
):
|
):
|
||||||
"""Create a full-text index on the entity_id property if it doesn't exist."""
|
"""Create a full-text index on the entity_id property with Chinese tokenizer support."""
|
||||||
index_name = "entity_id_fulltext_idx"
|
index_name = "entity_id_fulltext_idx"
|
||||||
try:
|
try:
|
||||||
async with driver.session(database=database) as session:
|
async with driver.session(database=database) as session:
|
||||||
# Check if the full-text index exists
|
# Check if the full-text index exists and get its configuration
|
||||||
check_index_query = "SHOW FULLTEXT INDEXES"
|
check_index_query = "SHOW FULLTEXT INDEXES"
|
||||||
result = await session.run(check_index_query)
|
result = await session.run(check_index_query)
|
||||||
indexes = await result.data()
|
indexes = await result.data()
|
||||||
await result.consume()
|
await result.consume()
|
||||||
|
|
||||||
index_exists = any(idx["name"] == index_name for idx in indexes)
|
existing_index = None
|
||||||
|
for idx in indexes:
|
||||||
|
if idx["name"] == index_name:
|
||||||
|
existing_index = idx
|
||||||
|
break
|
||||||
|
|
||||||
if not index_exists:
|
# Check if we need to recreate the index
|
||||||
|
needs_recreation = False
|
||||||
|
if existing_index:
|
||||||
|
# Check if the existing index has CJK analyzer
|
||||||
|
index_config = existing_index.get("options", {})
|
||||||
|
current_analyzer = index_config.get("indexConfig", {}).get("fulltext.analyzer", "standard")
|
||||||
|
|
||||||
|
if current_analyzer != "cjk":
|
||||||
|
logger.info(
|
||||||
|
f"[{self.workspace}] Existing index '{index_name}' uses '{current_analyzer}' analyzer. "
|
||||||
|
"Recreating with CJK analyzer for Chinese support."
|
||||||
|
)
|
||||||
|
needs_recreation = True
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
f"[{self.workspace}] Full-text index '{index_name}' already exists with CJK analyzer."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
if not existing_index or needs_recreation:
|
||||||
|
# Drop existing index if it needs recreation
|
||||||
|
if needs_recreation:
|
||||||
|
try:
|
||||||
|
drop_query = f"DROP INDEX {index_name}"
|
||||||
|
result = await session.run(drop_query)
|
||||||
|
await result.consume()
|
||||||
|
logger.info(f"[{self.workspace}] Dropped existing index '{index_name}'")
|
||||||
|
except Exception as drop_error:
|
||||||
|
logger.warning(f"[{self.workspace}] Failed to drop existing index: {str(drop_error)}")
|
||||||
|
|
||||||
|
# Create new index with CJK analyzer
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[{self.workspace}] Full-text index '{index_name}' not found. Creating it now."
|
f"[{self.workspace}] Creating full-text index '{index_name}' with Chinese tokenizer support."
|
||||||
)
|
|
||||||
# Create the full-text index
|
|
||||||
create_index_query = f"""
|
|
||||||
CREATE FULLTEXT INDEX {index_name} FOR (n:`{workspace_label}`) ON EACH [n.entity_id]
|
|
||||||
"""
|
|
||||||
result = await session.run(create_index_query)
|
|
||||||
await result.consume()
|
|
||||||
logger.info(
|
|
||||||
f"[{self.workspace}] Successfully created full-text index '{index_name}'."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
f"[{self.workspace}] Full-text index '{index_name}' already exists."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
create_index_query = f"""
|
||||||
|
CREATE FULLTEXT INDEX {index_name}
|
||||||
|
FOR (n:`{workspace_label}`) ON EACH [n.entity_id]
|
||||||
|
OPTIONS {{
|
||||||
|
indexConfig: {{
|
||||||
|
`fulltext.analyzer`: 'cjk',
|
||||||
|
`fulltext.eventually_consistent`: true
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
result = await session.run(create_index_query)
|
||||||
|
await result.consume()
|
||||||
|
logger.info(
|
||||||
|
f"[{self.workspace}] Successfully created full-text index '{index_name}' with CJK analyzer."
|
||||||
|
)
|
||||||
|
except Exception as cjk_error:
|
||||||
|
# Fallback to standard analyzer if CJK is not supported
|
||||||
|
logger.warning(
|
||||||
|
f"[{self.workspace}] CJK analyzer not supported: {str(cjk_error)}. "
|
||||||
|
"Falling back to standard analyzer."
|
||||||
|
)
|
||||||
|
create_index_query = f"""
|
||||||
|
CREATE FULLTEXT INDEX {index_name}
|
||||||
|
FOR (n:`{workspace_label}`) ON EACH [n.entity_id]
|
||||||
|
"""
|
||||||
|
result = await session.run(create_index_query)
|
||||||
|
await result.consume()
|
||||||
|
logger.info(
|
||||||
|
f"[{self.workspace}] Successfully created full-text index '{index_name}' with standard analyzer."
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Handle cases where the command might not be supported (e.g., community edition before 5.x)
|
# Handle cases where the command might not be supported
|
||||||
if "Unknown command" in str(e) or "invalid syntax" in str(e).lower():
|
if "Unknown command" in str(e) or "invalid syntax" in str(e).lower():
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"[{self.workspace}] Could not create or verify full-text index '{index_name}'. "
|
f"[{self.workspace}] Could not create or verify full-text index '{index_name}'. "
|
||||||
|
|
@ -1594,6 +1653,7 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
async def search_labels(self, query: str, limit: int = 50) -> list[str]:
|
async def search_labels(self, query: str, limit: int = 50) -> list[str]:
|
||||||
"""
|
"""
|
||||||
Search labels with fuzzy matching, using a full-text index for performance if available.
|
Search labels with fuzzy matching, using a full-text index for performance if available.
|
||||||
|
Enhanced with Chinese text support using CJK analyzer.
|
||||||
Falls back to a slower CONTAINS search if the index is not available or fails.
|
Falls back to a slower CONTAINS search if the index is not available or fails.
|
||||||
"""
|
"""
|
||||||
workspace_label = self._get_workspace_label()
|
workspace_label = self._get_workspace_label()
|
||||||
|
|
@ -1602,7 +1662,7 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
query_lower = query_strip.lower()
|
query_lower = query_strip.lower()
|
||||||
|
is_chinese = self._is_chinese_text(query_strip)
|
||||||
index_name = "entity_id_fulltext_idx"
|
index_name = "entity_id_fulltext_idx"
|
||||||
|
|
||||||
# Attempt to use the full-text index first
|
# Attempt to use the full-text index first
|
||||||
|
|
@ -1610,77 +1670,117 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
# The query uses a full-text index.
|
if is_chinese:
|
||||||
# The native score from the index is used as the primary sorting key.
|
# For Chinese text, use different search strategies
|
||||||
# We add extra scoring for exact and prefix matches to align with NetworkX logic.
|
cypher_query = f"""
|
||||||
cypher_query = f"""
|
CALL db.index.fulltext.queryNodes($index_name, $search_query) YIELD node, score
|
||||||
CALL db.index.fulltext.queryNodes($index_name, $search_query) YIELD node, score
|
WITH node, score
|
||||||
WITH node, score
|
WHERE node:`{workspace_label}`
|
||||||
WHERE node:`{workspace_label}`
|
WITH node.entity_id AS label, score
|
||||||
WITH node.entity_id AS label, toLower(node.entity_id) AS label_lower, score
|
WITH label, score,
|
||||||
WITH label, label_lower, score,
|
CASE
|
||||||
CASE
|
WHEN label = $query_strip THEN score + 1000
|
||||||
WHEN label_lower = $query_lower THEN score + 1000
|
WHEN label CONTAINS $query_strip THEN score + 500
|
||||||
WHEN label_lower STARTS WITH $query_lower THEN score + 500
|
ELSE score
|
||||||
WHEN label_lower CONTAINS ' ' + $query_lower OR label_lower CONTAINS '_' + $query_lower THEN score + 50
|
END AS final_score
|
||||||
ELSE score
|
RETURN label
|
||||||
END AS final_score
|
ORDER BY final_score DESC, label ASC
|
||||||
RETURN label
|
LIMIT $limit
|
||||||
ORDER BY final_score DESC, label ASC
|
"""
|
||||||
LIMIT $limit
|
# For Chinese, don't add wildcard as it may not work properly with CJK analyzer
|
||||||
"""
|
search_query = query_strip
|
||||||
|
else:
|
||||||
|
# For non-Chinese text, use the original logic with wildcard
|
||||||
|
cypher_query = f"""
|
||||||
|
CALL db.index.fulltext.queryNodes($index_name, $search_query) YIELD node, score
|
||||||
|
WITH node, score
|
||||||
|
WHERE node:`{workspace_label}`
|
||||||
|
WITH node.entity_id AS label, toLower(node.entity_id) AS label_lower, score
|
||||||
|
WITH label, label_lower, score,
|
||||||
|
CASE
|
||||||
|
WHEN label_lower = $query_lower THEN score + 1000
|
||||||
|
WHEN label_lower STARTS WITH $query_lower THEN score + 500
|
||||||
|
WHEN label_lower CONTAINS ' ' + $query_lower OR label_lower CONTAINS '_' + $query_lower THEN score + 50
|
||||||
|
ELSE score
|
||||||
|
END AS final_score
|
||||||
|
RETURN label
|
||||||
|
ORDER BY final_score DESC, label ASC
|
||||||
|
LIMIT $limit
|
||||||
|
"""
|
||||||
|
search_query = f"{query_strip}*"
|
||||||
|
|
||||||
result = await session.run(
|
result = await session.run(
|
||||||
cypher_query,
|
cypher_query,
|
||||||
index_name=index_name,
|
index_name=index_name,
|
||||||
search_query=f"{query_strip}*", # Add wildcard for prefix/contains matching
|
search_query=search_query,
|
||||||
query_lower=query_lower,
|
query_lower=query_lower,
|
||||||
|
query_strip=query_strip,
|
||||||
limit=limit,
|
limit=limit,
|
||||||
)
|
)
|
||||||
labels = [record["label"] async for record in result]
|
labels = [record["label"] async for record in result]
|
||||||
await result.consume()
|
await result.consume()
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"[{self.workspace}] Full-text search for '{query}' returned {len(labels)} results (limit: {limit})"
|
f"[{self.workspace}] Full-text search ({'Chinese' if is_chinese else 'Latin'}) for '{query}' returned {len(labels)} results (limit: {limit})"
|
||||||
)
|
)
|
||||||
return labels
|
return labels
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# If the full-text search fails (e.g., index doesn't exist, unsupported version),
|
# If the full-text search fails, fall back to CONTAINS search
|
||||||
# fall back to the old, slower method.
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"[{self.workspace}] Full-text search failed with error: {str(e)}. "
|
f"[{self.workspace}] Full-text search failed with error: {str(e)}. "
|
||||||
"Falling back to slower, non-indexed search. "
|
"Falling back to slower, non-indexed search."
|
||||||
"Ensure you are on Neo4j Enterprise or a version supporting full-text indexes."
|
|
||||||
)
|
)
|
||||||
logger.warning(
|
|
||||||
f"[{self.workspace}] Falling back to slower, non-indexed search. "
|
# Enhanced fallback implementation
|
||||||
"Ensure you are on Neo4j Enterprise or a version supporting full-text indexes."
|
|
||||||
)
|
|
||||||
# Fallback implementation
|
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
cypher_query = f"""
|
if is_chinese:
|
||||||
MATCH (n:`{workspace_label}`)
|
# For Chinese text, use direct CONTAINS without case conversion
|
||||||
WHERE n.entity_id IS NOT NULL
|
cypher_query = f"""
|
||||||
WITH n.entity_id AS label, toLower(n.entity_id) AS label_lower
|
MATCH (n:`{workspace_label}`)
|
||||||
WHERE label_lower CONTAINS $query_lower
|
WHERE n.entity_id IS NOT NULL
|
||||||
WITH label, label_lower,
|
WITH n.entity_id AS label
|
||||||
CASE
|
WHERE label CONTAINS $query_strip
|
||||||
WHEN label_lower = $query_lower THEN 1000
|
WITH label,
|
||||||
WHEN label_lower STARTS WITH $query_lower THEN 500
|
CASE
|
||||||
ELSE 100 - size(label)
|
WHEN label = $query_strip THEN 1000
|
||||||
END AS score
|
WHEN label STARTS WITH $query_strip THEN 500
|
||||||
ORDER BY score DESC, label ASC
|
ELSE 100 - size(label)
|
||||||
LIMIT $limit
|
END AS score
|
||||||
RETURN label
|
ORDER BY score DESC, label ASC
|
||||||
"""
|
LIMIT $limit
|
||||||
result = await session.run(
|
RETURN label
|
||||||
cypher_query, query_lower=query_lower, limit=limit
|
"""
|
||||||
)
|
result = await session.run(
|
||||||
|
cypher_query, query_strip=query_strip, limit=limit
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# For non-Chinese text, use the original fallback logic
|
||||||
|
cypher_query = f"""
|
||||||
|
MATCH (n:`{workspace_label}`)
|
||||||
|
WHERE n.entity_id IS NOT NULL
|
||||||
|
WITH n.entity_id AS label, toLower(n.entity_id) AS label_lower
|
||||||
|
WHERE label_lower CONTAINS $query_lower
|
||||||
|
WITH label, label_lower,
|
||||||
|
CASE
|
||||||
|
WHEN label_lower = $query_lower THEN 1000
|
||||||
|
WHEN label_lower STARTS WITH $query_lower THEN 500
|
||||||
|
ELSE 100 - size(label)
|
||||||
|
END AS score
|
||||||
|
ORDER BY score DESC, label ASC
|
||||||
|
LIMIT $limit
|
||||||
|
RETURN label
|
||||||
|
"""
|
||||||
|
result = await session.run(
|
||||||
|
cypher_query, query_lower=query_lower, limit=limit
|
||||||
|
)
|
||||||
|
|
||||||
labels = [record["label"] async for record in result]
|
labels = [record["label"] async for record in result]
|
||||||
await result.consume()
|
await result.consume()
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"[{self.workspace}] Fallback search for '{query}' returned {len(labels)} results (limit: {limit})"
|
f"[{self.workspace}] Fallback search ({'Chinese' if is_chinese else 'Latin'}) for '{query}' returned {len(labels)} results (limit: {limit})"
|
||||||
)
|
)
|
||||||
return labels
|
return labels
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue