Merge branch 'optimize-extraction' into return-data-only
This commit is contained in:
commit
20c5127c7c
3 changed files with 44 additions and 27 deletions
|
|
@ -324,7 +324,7 @@ async def _handle_single_entity_extraction(
|
|||
if len(record_attributes) != 4 or "entity" not in record_attributes[0]:
|
||||
if len(record_attributes) > 1 and "entity" in record_attributes[0]:
|
||||
logger.warning(
|
||||
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}`"
|
||||
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` of type {record_attributes[2] if len(record_attributes) > 2 else 'N/A'}"
|
||||
)
|
||||
return None
|
||||
|
||||
|
|
@ -392,10 +392,12 @@ async def _handle_single_relationship_extraction(
|
|||
timestamp: int,
|
||||
file_path: str = "unknown_source",
|
||||
):
|
||||
if len(record_attributes) != 5 or "relationship" not in record_attributes[0]:
|
||||
if len(record_attributes) > 1 and "relationship" in record_attributes[0]:
|
||||
if (
|
||||
len(record_attributes) != 5 or "relation" not in record_attributes[0]
|
||||
): # treat "relationship" and "relation" as interchangeable
|
||||
if len(record_attributes) > 1 and "relation" in record_attributes[0]:
|
||||
logger.warning(
|
||||
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`"
|
||||
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`~`{record_attributes[2] if len(record_attributes) >2 else 'N/A'}`"
|
||||
)
|
||||
return None
|
||||
|
||||
|
|
@ -878,7 +880,7 @@ async def _process_extraction_result(
|
|||
# Split LLM output result into records by "\n"
|
||||
records = split_string_by_multi_markers(
|
||||
result,
|
||||
["\n", completion_delimiter],
|
||||
["\n", completion_delimiter, completion_delimiter.lower()],
|
||||
)
|
||||
|
||||
# Fix LLM output format errors where tuple_delimiter is used to separate records instead of "\n"
|
||||
|
|
@ -892,18 +894,23 @@ async def _process_extraction_result(
|
|||
)
|
||||
for entity_record in entity_records:
|
||||
if not entity_record.startswith("entity") and not entity_record.startswith(
|
||||
"relationship"
|
||||
"relation"
|
||||
):
|
||||
entity_record = f"entity<|{entity_record}"
|
||||
entity_relation_records = split_string_by_multi_markers(
|
||||
entity_record, [f"{tuple_delimiter}relationship{tuple_delimiter}"]
|
||||
# treat "relationship" and "relation" as interchangeable
|
||||
entity_record,
|
||||
[
|
||||
f"{tuple_delimiter}relationship{tuple_delimiter}",
|
||||
f"{tuple_delimiter}relation{tuple_delimiter}",
|
||||
],
|
||||
)
|
||||
for entity_relation_record in entity_relation_records:
|
||||
if not entity_relation_record.startswith(
|
||||
"entity"
|
||||
) and not entity_relation_record.startswith("relationship"):
|
||||
) and not entity_relation_record.startswith("relation"):
|
||||
entity_relation_record = (
|
||||
f"relationship{tuple_delimiter}{entity_relation_record}"
|
||||
f"relation{tuple_delimiter}{entity_relation_record}"
|
||||
)
|
||||
fixed_records = fixed_records + [entity_relation_record]
|
||||
|
||||
|
|
@ -920,9 +927,12 @@ async def _process_extraction_result(
|
|||
# Fix various forms of tuple_delimiter corruption from the LLM output using the dedicated function
|
||||
delimiter_core = tuple_delimiter[2:-2] # Extract "#" from "<|#|>"
|
||||
record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter)
|
||||
# change delimiter_core to lower case, and fix again
|
||||
delimiter_core = delimiter_core.lower()
|
||||
record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter)
|
||||
if delimiter_core != delimiter_core.lower():
|
||||
# change delimiter_core to lower case, and fix again
|
||||
delimiter_core = delimiter_core.lower()
|
||||
record = fix_tuple_delimiter_corruption(
|
||||
record, delimiter_core, tuple_delimiter
|
||||
)
|
||||
|
||||
record_attributes = split_string_by_multi_markers(record, [tuple_delimiter])
|
||||
|
||||
|
|
|
|||
|
|
@ -32,8 +32,8 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel
|
|||
* `target_entity`: The name of the target entity. Ensure **consistent naming** with entity extraction. Capitalize the first letter of each significant word (title case) if the name is case-insensitive.
|
||||
* `relationship_keywords`: One or more high-level keywords summarizing the overarching nature, concepts, or themes of the relationship. Multiple keywords within this field must be separated by a comma `,`. **DO NOT use `{tuple_delimiter}` for separating multiple keywords within this field.**
|
||||
* `relationship_description`: A concise explanation of the nature of the relationship between the source and target entities, providing a clear rationale for their connection.
|
||||
* **Output Format - Relationships:** Output a total of 5 fields for each relationship, delimited by `{tuple_delimiter}`, on a single line. The first field *must* be the literal string `relationship`.
|
||||
* Format: `relationship{tuple_delimiter}source_entity{tuple_delimiter}target_entity{tuple_delimiter}relationship_keywords{tuple_delimiter}relationship_description`
|
||||
* **Output Format - Relationships:** Output a total of 5 fields for each relationship, delimited by `{tuple_delimiter}`, on a single line. The first field *must* be the literal string `relation`.
|
||||
* Format: `relation{tuple_delimiter}source_entity{tuple_delimiter}target_entity{tuple_delimiter}relationship_keywords{tuple_delimiter}relationship_description`
|
||||
|
||||
3. **Delimiter Usage Protocol:**
|
||||
* The `{tuple_delimiter}` is a complete, atomic marker and **must not be filled with content**. It serves strictly as a field separator.
|
||||
|
|
@ -113,11 +113,11 @@ entity{tuple_delimiter}Taylor{tuple_delimiter}person{tuple_delimiter}Taylor is p
|
|||
entity{tuple_delimiter}Jordan{tuple_delimiter}person{tuple_delimiter}Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device.
|
||||
entity{tuple_delimiter}Cruz{tuple_delimiter}person{tuple_delimiter}Cruz is associated with a vision of control and order, influencing the dynamics among other characters.
|
||||
entity{tuple_delimiter}The Device{tuple_delimiter}equipment{tuple_delimiter}The Device is central to the story, with potential game-changing implications, and is revered by Taylor.
|
||||
relationship{tuple_delimiter}Alex{tuple_delimiter}Taylor{tuple_delimiter}power dynamics, observation{tuple_delimiter}Alex observes Taylor's authoritarian behavior and notes changes in Taylor's attitude toward the device.
|
||||
relationship{tuple_delimiter}Alex{tuple_delimiter}Jordan{tuple_delimiter}shared goals, rebellion{tuple_delimiter}Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision.
|
||||
relationship{tuple_delimiter}Taylor{tuple_delimiter}Jordan{tuple_delimiter}conflict resolution, mutual respect{tuple_delimiter}Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce.
|
||||
relationship{tuple_delimiter}Jordan{tuple_delimiter}Cruz{tuple_delimiter}ideological conflict, rebellion{tuple_delimiter}Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order.
|
||||
relationship{tuple_delimiter}Taylor{tuple_delimiter}The Device{tuple_delimiter}reverence, technological significance{tuple_delimiter}Taylor shows reverence towards the device, indicating its importance and potential impact.
|
||||
relation{tuple_delimiter}Alex{tuple_delimiter}Taylor{tuple_delimiter}power dynamics, observation{tuple_delimiter}Alex observes Taylor's authoritarian behavior and notes changes in Taylor's attitude toward the device.
|
||||
relation{tuple_delimiter}Alex{tuple_delimiter}Jordan{tuple_delimiter}shared goals, rebellion{tuple_delimiter}Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision.
|
||||
relation{tuple_delimiter}Taylor{tuple_delimiter}Jordan{tuple_delimiter}conflict resolution, mutual respect{tuple_delimiter}Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce.
|
||||
relation{tuple_delimiter}Jordan{tuple_delimiter}Cruz{tuple_delimiter}ideological conflict, rebellion{tuple_delimiter}Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order.
|
||||
relation{tuple_delimiter}Taylor{tuple_delimiter}The Device{tuple_delimiter}reverence, technological significance{tuple_delimiter}Taylor shows reverence towards the device, indicating its importance and potential impact.
|
||||
{completion_delimiter}
|
||||
|
||||
""",
|
||||
|
|
@ -141,10 +141,10 @@ entity{tuple_delimiter}Crude Oil{tuple_delimiter}product{tuple_delimiter}Crude o
|
|||
entity{tuple_delimiter}Market Selloff{tuple_delimiter}category{tuple_delimiter}Market selloff refers to the significant decline in stock values due to investor concerns over interest rates and regulations.
|
||||
entity{tuple_delimiter}Federal Reserve Policy Announcement{tuple_delimiter}category{tuple_delimiter}The Federal Reserve's upcoming policy announcement is expected to impact investor confidence and market stability.
|
||||
entity{tuple_delimiter}3.4% Decline{tuple_delimiter}category{tuple_delimiter}The Global Tech Index experienced a 3.4% decline in midday trading.
|
||||
relationship{tuple_delimiter}Global Tech Index{tuple_delimiter}Market Selloff{tuple_delimiter}market performance, investor sentiment{tuple_delimiter}The decline in the Global Tech Index is part of the broader market selloff driven by investor concerns.
|
||||
relationship{tuple_delimiter}Nexon Technologies{tuple_delimiter}Global Tech Index{tuple_delimiter}company impact, index movement{tuple_delimiter}Nexon Technologies' stock decline contributed to the overall drop in the Global Tech Index.
|
||||
relationship{tuple_delimiter}Gold Futures{tuple_delimiter}Market Selloff{tuple_delimiter}market reaction, safe-haven investment{tuple_delimiter}Gold prices rose as investors sought safe-haven assets during the market selloff.
|
||||
relationship{tuple_delimiter}Federal Reserve Policy Announcement{tuple_delimiter}Market Selloff{tuple_delimiter}interest rate impact, financial regulation{tuple_delimiter}Speculation over Federal Reserve policy changes contributed to market volatility and investor selloff.
|
||||
relation{tuple_delimiter}Global Tech Index{tuple_delimiter}Market Selloff{tuple_delimiter}market performance, investor sentiment{tuple_delimiter}The decline in the Global Tech Index is part of the broader market selloff driven by investor concerns.
|
||||
relation{tuple_delimiter}Nexon Technologies{tuple_delimiter}Global Tech Index{tuple_delimiter}company impact, index movement{tuple_delimiter}Nexon Technologies' stock decline contributed to the overall drop in the Global Tech Index.
|
||||
relation{tuple_delimiter}Gold Futures{tuple_delimiter}Market Selloff{tuple_delimiter}market reaction, safe-haven investment{tuple_delimiter}Gold prices rose as investors sought safe-haven assets during the market selloff.
|
||||
relation{tuple_delimiter}Federal Reserve Policy Announcement{tuple_delimiter}Market Selloff{tuple_delimiter}interest rate impact, financial regulation{tuple_delimiter}Speculation over Federal Reserve policy changes contributed to market volatility and investor selloff.
|
||||
{completion_delimiter}
|
||||
|
||||
""",
|
||||
|
|
@ -160,10 +160,10 @@ entity{tuple_delimiter}Noah Carter{tuple_delimiter}person{tuple_delimiter}Noah C
|
|||
entity{tuple_delimiter}100m Sprint Record{tuple_delimiter}category{tuple_delimiter}The 100m sprint record is a benchmark in athletics, recently broken by Noah Carter.
|
||||
entity{tuple_delimiter}Carbon-Fiber Spikes{tuple_delimiter}equipment{tuple_delimiter}Carbon-fiber spikes are advanced sprinting shoes that provide enhanced speed and traction.
|
||||
entity{tuple_delimiter}World Athletics Federation{tuple_delimiter}organization{tuple_delimiter}The World Athletics Federation is the governing body overseeing the World Athletics Championship and record validations.
|
||||
relationship{tuple_delimiter}World Athletics Championship{tuple_delimiter}Tokyo{tuple_delimiter}event location, international competition{tuple_delimiter}The World Athletics Championship is being hosted in Tokyo.
|
||||
relationship{tuple_delimiter}Noah Carter{tuple_delimiter}100m Sprint Record{tuple_delimiter}athlete achievement, record-breaking{tuple_delimiter}Noah Carter set a new 100m sprint record at the championship.
|
||||
relationship{tuple_delimiter}Noah Carter{tuple_delimiter}Carbon-Fiber Spikes{tuple_delimiter}athletic equipment, performance boost{tuple_delimiter}Noah Carter used carbon-fiber spikes to enhance performance during the race.
|
||||
relationship{tuple_delimiter}Noah Carter{tuple_delimiter}World Athletics Championship{tuple_delimiter}athlete participation, competition{tuple_delimiter}Noah Carter is competing at the World Athletics Championship.
|
||||
relation{tuple_delimiter}World Athletics Championship{tuple_delimiter}Tokyo{tuple_delimiter}event location, international competition{tuple_delimiter}The World Athletics Championship is being hosted in Tokyo.
|
||||
relation{tuple_delimiter}Noah Carter{tuple_delimiter}100m Sprint Record{tuple_delimiter}athlete achievement, record-breaking{tuple_delimiter}Noah Carter set a new 100m sprint record at the championship.
|
||||
relation{tuple_delimiter}Noah Carter{tuple_delimiter}Carbon-Fiber Spikes{tuple_delimiter}athletic equipment, performance boost{tuple_delimiter}Noah Carter used carbon-fiber spikes to enhance performance during the race.
|
||||
relation{tuple_delimiter}Noah Carter{tuple_delimiter}World Athletics Championship{tuple_delimiter}athlete participation, competition{tuple_delimiter}Noah Carter is competing at the World Athletics Championship.
|
||||
{completion_delimiter}
|
||||
|
||||
""",
|
||||
|
|
|
|||
|
|
@ -2652,6 +2652,13 @@ def fix_tuple_delimiter_corruption(
|
|||
record,
|
||||
)
|
||||
|
||||
# Fix <|#: -> <|#|> (missing closing >)
|
||||
record = re.sub(
|
||||
rf"<\|{escaped_delimiter_core}:(?!>)",
|
||||
tuple_delimiter,
|
||||
record,
|
||||
)
|
||||
|
||||
# Fix: <|| -> <|#|>
|
||||
record = re.sub(
|
||||
r"<\|\|(?!>)",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue