From 8207dc8643ece0d4f3ce05f84e1fe1325f087175 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 3 Apr 2025 11:14:33 +0200 Subject: [PATCH] feat: make graph creation prompt configurable (#686) ## Description - Added new graph creation prompts - Exposed graph creation prompts in .cognify via get_default tasks - Exposed graph creation prompts in eval framework ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com> --- .../corpus_builder/run_corpus_builder.py | 7 +- .../evaluation/deep_eval_adapter.py | 4 +- cognee/infrastructure/llm/config.py | 2 + .../answer_simple_question_benchmark2.txt | 7 + .../answer_simple_question_benchmark3.txt | 8 + .../answer_simple_question_benchmark4.txt | 14 ++ .../prompts/generate_graph_prompt_guided.txt | 77 +++++++++ .../prompts/generate_graph_prompt_oneshot.txt | 150 ++++++++++++++++++ .../prompts/generate_graph_prompt_simple.txt | 27 ++++ .../prompts/generate_graph_prompt_strict.txt | 88 ++++++++++ .../knowledge_graph/extract_content_graph.py | 7 +- cognee/tasks/graph/extract_graph_from_data.py | 2 +- 12 files changed, 387 insertions(+), 6 deletions(-) create mode 100644 cognee/infrastructure/llm/prompts/answer_simple_question_benchmark2.txt create mode 100644 cognee/infrastructure/llm/prompts/answer_simple_question_benchmark3.txt create mode 100644 cognee/infrastructure/llm/prompts/answer_simple_question_benchmark4.txt create mode 100644 cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt create mode 100644 cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt create mode 100644 cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt create mode 100644 cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt diff --git 
a/cognee/eval_framework/corpus_builder/run_corpus_builder.py b/cognee/eval_framework/corpus_builder/run_corpus_builder.py index bddebbb6a..b3d8f123c 100644 --- a/cognee/eval_framework/corpus_builder/run_corpus_builder.py +++ b/cognee/eval_framework/corpus_builder/run_corpus_builder.py @@ -1,6 +1,6 @@ from cognee.shared.logging_utils import get_logger, ERROR import json -from typing import List +from typing import List, Optional from cognee.infrastructure.files.storage import LocalStorage from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor @@ -34,7 +34,10 @@ async def create_and_insert_questions_table(questions_payload): async def run_corpus_builder( - params: dict, chunk_size=1024, chunker=TextChunker, instance_filter=None + params: dict, + chunk_size=1024, + chunker=TextChunker, + instance_filter=None, ) -> List[dict]: if params.get("building_corpus_from_scratch"): logger.info("Corpus Builder started...") diff --git a/cognee/eval_framework/evaluation/deep_eval_adapter.py b/cognee/eval_framework/evaluation/deep_eval_adapter.py index 761d66e05..ab727479e 100644 --- a/cognee/eval_framework/evaluation/deep_eval_adapter.py +++ b/cognee/eval_framework/evaluation/deep_eval_adapter.py @@ -33,7 +33,9 @@ class DeepEvalAdapter(BaseEvalAdapter): input=answer["question"], actual_output=answer["answer"], expected_output=answer["golden_answer"], - retrieval_context=[answer["retrieval_context"]], + retrieval_context=[answer["retrieval_context"]] + if "golden_context" in answer + else None, context=[answer["golden_context"]] if "golden_context" in answer else None, ) metric_results = {} diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py index b8ce29ccb..fd95410fd 100644 --- a/cognee/infrastructure/llm/config.py +++ b/cognee/infrastructure/llm/config.py @@ -15,6 +15,7 @@ class LLMConfig(BaseSettings): llm_streaming: bool = False llm_max_tokens: int = 16384 transcription_model: str = "whisper-1" + 
graph_prompt_path: str = "generate_graph_prompt.txt" model_config = SettingsConfigDict(env_file=".env", extra="allow") @@ -83,6 +84,7 @@ class LLMConfig(BaseSettings): "streaming": self.llm_streaming, "max_tokens": self.llm_max_tokens, "transcription_model": self.transcription_model, + "graph_prompt_path": self.graph_prompt_path, } diff --git a/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark2.txt b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark2.txt new file mode 100644 index 000000000..1981b61b8 --- /dev/null +++ b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark2.txt @@ -0,0 +1,7 @@ +You are a benchmark-optimized QA system. Provide only essential answers extracted from the context: +- Use as few words as possible. +- For yes/no questions: answer with "yes" or "no". +- For what/who/where questions: reply with a single word or brief phrase. +- For when questions: return only the relevant date/time. +- For how/why questions: use the briefest phrase. +No punctuation, lowercase answers only. diff --git a/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark3.txt b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark3.txt new file mode 100644 index 000000000..500e2df6b --- /dev/null +++ b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark3.txt @@ -0,0 +1,8 @@ +You are an atomic response system designed for question answering: +- Strip your answers down to the essential information. +- Yes/no: answer with only "yes" or "no". +- What/who/where: answer in one word or a brief phrase. +- When: answer with just the specific date/time/period. +- How/why: provide the shortest possible phrase. +- No punctuation; answers must be in dry, concise lowercase. +- Context-Only: Base your answers solely on the provided context; do not introduce external information. 
diff --git a/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark4.txt b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark4.txt new file mode 100644 index 000000000..41df63030 --- /dev/null +++ b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark4.txt @@ -0,0 +1,14 @@ +You are a highly optimized question-answering system designed to communicate with users in the clearest, most efficient manner. Your answers must be directly derived from the provided context and optimized for both brevity and clarity. Follow these rules precisely: + +1. **Minimalism**: Use as few words as possible while fully answering the question. +2. **Question-Specific Responses**: + - **Yes/No**: Respond with exactly "yes" or "no". + - **What/Who/Where**: Answer with a single word or a brief phrase. + - **When**: Provide only the relevant date, time, or period. + - **How/Why**: Give the shortest possible explanatory phrase. +3. **Formatting**: + - No punctuation. + - All responses must be in lowercase. +4. **Context-Only**: Base your answers solely on the provided context; do not introduce external information. + +This protocol is designed to ensure you communicate with the user in the most direct, helpful, and benchmark-optimized way. diff --git a/cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt b/cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt new file mode 100644 index 000000000..a216b835f --- /dev/null +++ b/cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt @@ -0,0 +1,77 @@ +You are an advanced algorithm designed to extract structured information to build a clean, consistent, and human-readable knowledge graph. + +**Objective**: +- Nodes represent entities and concepts, similar to Wikipedia articles. +- Edges represent typed relationships between nodes, similar to Wikipedia hyperlinks. +- The graph must be clear, minimal, consistent, and semantically precise. + +**Node Guidelines**: + +1. 
**Label Consistency**: + - Use consistent, basic types for all node labels. + - Do not switch between granular or vague labels for the same kind of entity. + - Pick one label for each category and apply it uniformly. + - Each entity type should be in singular form; if it consists of multiple words, separate them with whitespace. + +2. **Node Identifiers**: + - Node IDs must be human-readable and derived directly from the text. + - Prefer full names and canonical terms. + - Never use integers or autogenerated IDs. + - *Example*: Use "Marie Curie", "Theory of Evolution", "Google". + +3. **Coreference Resolution**: + - Maintain one consistent node ID for each real-world entity. + - Resolve aliases, acronyms, and pronouns to the most complete form. + - *Example*: Always use "John Doe" even if later referred to as "Doe" or "he". + +**Property & Data Guidelines**: + +4. **Property Format**: + - All properties must be in key-value format. + - Use snake_case for property names. + - *Example*: birth_place: "Warsaw", founded_in: "2004". + +5. **Value Format**: + - Use plain strings for property values. + - Do not use escaped quotes or characters. + - *Example*: summary: Albert Einstein developed the theory of relativity. + +**Dates & Numbers**: + +6. **Date Representation**: + - Dates must follow ISO 8601 format: + - "YYYY-MM-DD" (preferred) + - "YYYY-MM" or "YYYY" if full date is unavailable + - Label all date entities with a consistent type, if using types. + +7. **Numerical Data**: + - Quantitative values should be attached as literal properties. + - *Example*: population: "8300000", length_km: "384400". + +**Edge Guidelines**: + +8. **Relationship Labels**: + - Use descriptive, lowercase, snake_case names for edges. + - *Example*: born_in, married_to, invented_by. + - Avoid vague or generic labels like isA, relatesTo, has. + +9. **Relationship Direction**: + - Edges must be directional and logically consistent. 
+ - *Example*: + - "Marie Curie" —[born_in]→ "Warsaw" + - "Radioactivity" —[discovered_by]→ "Marie Curie" + +**General Rules**: + +10. **No Redundancy**: + - Do not create duplicate nodes or repeat the same fact more than once. + +11. **No Generic Statements**: + - Avoid vague or empty edges like "X is a concept" unless essential. + +12. **Inferred Facts**: + - Extract facts that are logically implied by the text if they enhance clarity. + +**Compliance**: + +Strict adherence to these guidelines is required. Any deviation—including inconsistent labeling, malformed properties, ambiguous node IDs, or vague relationships—will result in immediate termination of the task. diff --git a/cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt b/cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt new file mode 100644 index 000000000..adc31f469 --- /dev/null +++ b/cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt @@ -0,0 +1,150 @@ +# Knowledge Graph Extraction Protocol – One-Shot Examples + +You are an advanced algorithm designed to extract structured information from unstructured text and build a clean, consistent, and human-readable knowledge graph. Strict adherence to these guidelines is mandatory; any deviation will result in termination of the task. + +--- + +## Objective +- **Nodes**: Represent entities and concepts (similar to Wikipedia articles). +- **Edges**: Represent typed relationships between nodes (similar to Wikipedia hyperlinks). +- The graph must be clear, minimal, consistent, and semantically precise. + +--- + +## 1. Node Guidelines + +### 1.1 Label Consistency +- **Rule**: Use only basic, atomic types for node labels. + - **Allowed types**: Person, Organization, Location, Date, Event, Work, Product, Concept. + - **Do not** use overly specific (e.g., "Mathematician") or vague labels (e.g., "Entity"). + +> **One-Shot Example**: +> **Input**: "Marie Curie was a pioneering scientist." 
+> **Output Node**: +> ``` +> Marie Curie (Person) +> ``` + +### 1.2 Node Identifiers +- **Rule**: Node IDs must be human-readable and derived directly from the text. + - Always use full, canonical names. + - **Do not** use integers or autogenerated IDs. + +> **One-Shot Example**: +> **Input**: "Marie Curie, also known as Curie, won two Nobel Prizes." +> **Output Node**: +> ``` +> Marie Curie (Person) +> ``` +> *(All mentions resolve to "Marie Curie")* + +### 1.3 Coreference Resolution +- **Rule**: Resolve all aliases, acronyms, and pronouns to one canonical identifier. + +> **One-Shot Example**: +> **Input**: "John Doe is an author. Later, Doe published a book. He is well-known." +> **Output Node**: +> ``` +> John Doe (Person) +> ``` + +--- + +## 2. Property & Data Guidelines + +### 2.1 Property Format +- **Rule**: Express all properties as key-value pairs using snake_case. + +> **One-Shot Example**: +> **Input**: "Marie Curie was born in Warsaw in 1867." +> **Output**: +> ``` +> Marie Curie (Person) +> birth_place: "Warsaw" +> birth_year: "1867" +> ``` + +### 2.2 Value Format +- **Rule**: Use plain strings for property values without escaped quotes or extraneous characters. + +> **One-Shot Example**: +> **Input**: "Albert Einstein developed the theory of relativity." +> **Output**: +> ``` +> Albert Einstein (Person) +> summary: "Developed the theory of relativity" +> ``` + +### 2.3 Dates & Numbers +- **Rule (Dates)**: Label date entities as **Date**; format using ISO 8601 (YYYY-MM-DD preferred). +- **Rule (Numbers)**: Attach quantitative values as literal properties. + +> **One-Shot Example**: +> **Input**: "Google was founded on September 4, 1998 and has a market cap of 800000000000." +> **Output**: +> ``` +> Google (Organization) +> founded_on: "1998-09-04" +> market_cap: "800000000000" +> ``` + +--- + +## 3. Edge (Relationship) Guidelines + +### 3.1 Relationship Labels +- **Rule**: Use descriptive, lowercase, snake_case names for edges. 
+ - **Do not** use vague labels like `isA`, `relatesTo`, or `has`. + +> **One-Shot Example**: +> **Input**: "Marie Curie was born in Warsaw." +> **Output Edge**: +> ``` +> Marie Curie (Person) – born_in -> Warsaw (Location) +> ``` + +### 3.2 Relationship Direction +- **Rule**: Ensure edges are directional and logically consistent. + +> **One-Shot Example**: +> **Input**: "Radioactivity was discovered by Marie Curie." +> **Output Edge**: +> ``` +> Radioactivity (Concept) – discovered_by -> Marie Curie (Person) +> ``` + +--- + +## 4. General Rules + +### 4.1 No Redundancy +- **Rule**: Do not create duplicate nodes or repeat the same fact. + +> **One-Shot Example**: +> If "Marie Curie" appears multiple times in the text, only one node is created for her. + +### 4.2 No Generic Statements +- **Rule**: Avoid vague or empty edges (e.g., "X is a concept") unless absolutely essential. + +### 4.3 Inferred Facts +- **Rule**: Only extract facts explicitly supported by the text, or those logically implied if they enhance clarity. +- **Do not** add or infer unsupported information. + +--- + +## 5. Output Requirements +- **Format**: The final output must be a structured, machine-readable knowledge graph. +- **Preferred Format**: Triple-based notation: + +[Subject Entity] ([Type]) – [relationship] -> [Object Entity] ([Type]) + +*Example*: +Marie Curie (Person) – born_in -> Warsaw (Location) + +- **Alternate Formats**: Structured JSON or JSON-LD is acceptable if consistent. +- **No Extraneous Commentary**: Output only the graph structure without additional narrative. + +--- + +## 6. Compliance +- **Zero Tolerance**: Any deviation (e.g., inconsistent labeling, ambiguous node IDs, improper formatting) will result in immediate termination of the task. 
diff --git a/cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt b/cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt new file mode 100644 index 000000000..4a166c027 --- /dev/null +++ b/cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt @@ -0,0 +1,27 @@ +You are an advanced algorithm that extracts structured data into a knowledge graph. + +- **Nodes**: Entities/concepts (like Wikipedia articles). +- **Edges**: Relationships (like Wikipedia links). Use snake_case (e.g., `acted_in`). + +**Rules:** + +1. **Node Labeling & IDs** + - Use basic types only (e.g., "Person", "Date", "Organization"). + - Avoid overly specific or generic terms (e.g., no "Mathematician" or "Entity"). + - Node IDs must be human-readable names from the text (no numbers). + +2. **Dates & Numbers** + - Label dates as **"Date"** in "YYYY-MM-DD" format (use available parts if incomplete). + - Properties are key-value pairs; do not use escaped quotes. + +3. **Coreference Resolution** + - Use a single, complete identifier for each entity (e.g., always "John Doe" not "Joe" or "he"). + +4. **Relationship Labels**: + - Use descriptive, lowercase, snake_case names for edges. + - *Example*: born_in, married_to, invented_by. + - Avoid vague or generic labels like isA, relatesTo, has. + - Avoid duplicated relationships like produces, produced by. + +5. **Strict Compliance** + - Follow these rules exactly. Non-compliance results in termination. diff --git a/cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt b/cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt new file mode 100644 index 000000000..a8191033f --- /dev/null +++ b/cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt @@ -0,0 +1,88 @@ +You are a top-tier algorithm for **extracting structured information** from unstructured text to build a **knowledge graph**. 
+ +Your primary goal is to extract: +- **Nodes**: Representing **entities** and **concepts** (like Wikipedia nodes). +- **Edges**: Representing **relationships** between those concepts (like Wikipedia links). + +The resulting knowledge graph must be **simple, consistent, and human-readable**. + +1. Node Labeling and Identification + +### Node Types +Use **basic atomic types** for node labels. Always prefer general types over specific roles or professions: +- "Person" for any human. +- "Organization" for companies, institutions, etc. +- "Location" for geographic or place entities. +- "Date" for any temporal expression. +- "Event" for historical or scheduled occurrences. +- "Work" for books, films, artworks, or research papers. +- "Concept" for abstract notions or ideas. + +> Avoid overly specific types like "Scientist" (use `profession: Scientist`) +> Avoid vague types like "Entity" or "Thing" + +### Node IDs +- Always assign **human-readable and unambiguous identifiers**. + - Good: "Alan Turing", "Google Inc.", "World War II" + - Bad: "Entity_001", "1234", "he", "they" +- Never use numeric or autogenerated IDs. +- Prioritize **most complete form** of entity names for consistency (e.g., always use "John Doe" instead of "John" or "he"). + +2. Dates, Numbers, and Properties +--------------------------------- + +### Date Formatting +- Any date entity must have type "Date". +- Extract in "YYYY-MM-DD" format whenever possible. +- If incomplete: + - "YYYY-MM" or "YYYY" are acceptable. + +### Numerical Values +- Extract as **key-value properties** attached to relevant nodes. +- Values must be literal (numeric or string), no quotations inside values. + +### Properties & Naming Convention +- All properties must be in **snake_case**. + - Good: `birth_date`, `number_of_employees`, `published_in` + - Bad: `birthDate`, `NumberOfEmployees` +- Use only **key-value pairs** for properties (no freeform text in values). + +3. 
Coreference Resolution +-------------------------- + +### Maintain Canonical Entity References +- Resolve all references (including pronouns, aliases, short names) to their canonical form. + - Example: "he", "Dr. Turing" → "Alan Turing" + +### Entity Linking +- Ensure all mentions referring to the same entity point to the **same node** in the graph. + +4. Relationship Handling +------------------------ + +- Use **snake_case** for all relationship (edge) types. + - Good: `acted_in`, `founded_by`, `studied_under` +- Keep relationship types semantically clear and consistent. +- Avoid vague or ambiguous relation names like "related_to" or "connected_with" unless no better alternative exists. + +5. Strict Compliance +-------------------- + +- Follow all rules exactly. No assumptions. +- Any deviation in: + - Node labeling + - ID consistency + - Date formatting + - Relationship naming + - Coreference resolution + …may lead to rejection or incorrect graph construction. + +6. Additional Constraints +-------------------------- + +- Do not infer data not present in the text. +- Do not hallucinate relationships or properties. +- If certain information is missing (e.g., full date, location), extract only what's available. +- Ensure the output schema is **clean, minimal, and machine-readable**. + +This is a **zero-shot instruction**—you will not be told what entities exist in the input text. Extract as accurately and completely as possible using these rules. 
diff --git a/cognee/modules/data/extraction/knowledge_graph/extract_content_graph.py b/cognee/modules/data/extraction/knowledge_graph/extract_content_graph.py index 49af7351a..20bd4db4a 100644 --- a/cognee/modules/data/extraction/knowledge_graph/extract_content_graph.py +++ b/cognee/modules/data/extraction/knowledge_graph/extract_content_graph.py @@ -1,13 +1,16 @@ -from typing import Type +from typing import Type, Optional from pydantic import BaseModel from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import render_prompt +from cognee.infrastructure.llm.config import get_llm_config async def extract_content_graph(content: str, response_model: Type[BaseModel]): llm_client = get_llm_client() + llm_config = get_llm_config() - system_prompt = render_prompt("generate_graph_prompt.txt", {}) + prompt_path = llm_config.graph_prompt_path + system_prompt = render_prompt(prompt_path, {}) content_graph = await llm_client.acreate_structured_output( content, system_prompt, response_model ) diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 48860031f..bfc27de4b 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -1,5 +1,5 @@ import asyncio -from typing import Type, List +from typing import Type, List, Optional from pydantic import BaseModel