diff --git a/cognee/base_config.py b/cognee/base_config.py index 2f9ca8cdb..da086711d 100644 --- a/cognee/base_config.py +++ b/cognee/base_config.py @@ -9,7 +9,7 @@ from pydantic_settings import BaseSettings, SettingsConfigDict class BaseConfig(BaseSettings): data_root_directory: str = get_absolute_path(".data_storage") monitoring_tool: object = Observer.LANGFUSE - structured_output_framework: str = os.getenv("STRUCTURED_OUTPUT_FRAMEWORK") + structured_output_framework: str = os.getenv("STRUCTURED_OUTPUT_FRAMEWORK", "") graphistry_username: Optional[str] = os.getenv("GRAPHISTRY_USERNAME") graphistry_password: Optional[str] = os.getenv("GRAPHISTRY_PASSWORD") langfuse_public_key: Optional[str] = os.getenv("LANGFUSE_PUBLIC_KEY") diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py index dd837cf5a..cb58fb552 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py @@ -101,6 +101,20 @@ class BamlAsyncClient: "content": content,"prompt_override": prompt_override, }) return typing.cast(types.KnowledgeGraph, result.cast_to(types, types, stream_types, False, __runtime__)) + async def SummarizeCode(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> types.SummarizedCode: + result = await self.__options.merge_options(baml_options).call_function_async(function_name="SummarizeCode", args={ + "content": content, + }) + return typing.cast(types.SummarizedCode, result.cast_to(types, types, stream_types, False, __runtime__)) + async def SummarizeContent(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> types.SummarizedContent: + result = await self.__options.merge_options(baml_options).call_function_async(function_name="SummarizeContent", args={ + "content": content, + }) + return typing.cast(types.SummarizedContent, result.cast_to(types, types, stream_types, False, __runtime__)) @@ -158,6 +172,30 @@ class BamlStreamClient: lambda x: typing.cast(types.KnowledgeGraph, x.cast_to(types, types, stream_types, False, __runtime__)), ctx, ) + def SummarizeCode(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.BamlStream[stream_types.SummarizedCode, types.SummarizedCode]: + ctx, result = self.__options.merge_options(baml_options).create_async_stream(function_name="SummarizeCode", args={ + "content": content, + }) + return baml_py.BamlStream[stream_types.SummarizedCode, types.SummarizedCode]( + result, + lambda x: typing.cast(stream_types.SummarizedCode, x.cast_to(types, types, stream_types, True, __runtime__)), + lambda x: typing.cast(types.SummarizedCode, x.cast_to(types, types, stream_types, False, __runtime__)), + ctx, + ) + def SummarizeContent(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.BamlStream[stream_types.SummarizedContent, types.SummarizedContent]: + ctx, result = self.__options.merge_options(baml_options).create_async_stream(function_name="SummarizeContent", args={ + "content": content, + }) + return baml_py.BamlStream[stream_types.SummarizedContent, types.SummarizedContent]( + result, + lambda x: typing.cast(stream_types.SummarizedContent, x.cast_to(types, types, stream_types, True, __runtime__)), + lambda x: typing.cast(types.SummarizedContent, x.cast_to(types, types, stream_types, False, __runtime__)), + ctx, + ) class BamlHttpRequestClient: @@ -194,6 +232,20 @@ class BamlHttpRequestClient: "content": content,"prompt_override": prompt_override, }, mode="request") return result + async def SummarizeCode(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.baml_py.HTTPRequest: + result = await self.__options.merge_options(baml_options).create_http_request_async(function_name="SummarizeCode", args={ + "content": content, + }, mode="request") + return result + async def SummarizeContent(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.baml_py.HTTPRequest: + result = await self.__options.merge_options(baml_options).create_http_request_async(function_name="SummarizeContent", args={ + "content": content, + }, mode="request") + return result class BamlHttpStreamRequestClient: @@ -230,6 +282,20 @@ class BamlHttpStreamRequestClient: "content": content,"prompt_override": prompt_override, }, mode="stream") return result + async def SummarizeCode(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.baml_py.HTTPRequest: + result = await self.__options.merge_options(baml_options).create_http_request_async(function_name="SummarizeCode", args={ + "content": content, + }, mode="stream") + return result + async def SummarizeContent(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.baml_py.HTTPRequest: + result = await self.__options.merge_options(baml_options).create_http_request_async(function_name="SummarizeContent", args={ + "content": content, + }, mode="stream") + return result b = BamlAsyncClient(DoNotUseDirectlyCallManager({})) \ No newline at end of file diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py index 4e5b22a42..09c747670 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py @@ -13,7 +13,7 @@ _file_map = { "extract_categories.baml": "", - "extract_content_graph.baml": "class Node {\n id string\n name string\n type string\n description string\n @@dynamic\n}\n\n/// doc string for edge\nclass Edge {\n /// doc string for source_node_id\n source_node_id string\n target_node_id string\n relationship_name string\n}\n\nclass KnowledgeGraph {\n nodes (Node @stream.done)[]\n edges Edge[]\n}\n\n// Simple template for basic extraction (fast, good quality)\ntemplate_string ExtractContentGraphPrompt() #\"\n You are an advanced algorithm that extracts structured data into a knowledge graph.\n\n - **Nodes**: Entities/concepts (like Wikipedia articles).\n - **Edges**: Relationships (like Wikipedia links). Use snake_case (e.g., `acted_in`).\n\n **Rules:**\n\n 1. **Node Labeling & IDs**\n - Use basic types only (e.g., \"Person\", \"Date\", \"Organization\").\n - Avoid overly specific or generic terms (e.g., no \"Mathematician\" or \"Entity\").\n - Node IDs must be human-readable names from the text (no numbers).\n\n 2. **Dates & Numbers**\n - Label dates as **\"Date\"** in \"YYYY-MM-DD\" format (use available parts if incomplete).\n - Properties are key-value pairs; do not use escaped quotes.\n\n 3. **Coreference Resolution**\n - Use a single, complete identifier for each entity (e.g., always \"John Doe\" not \"Joe\" or \"he\").\n\n 4. **Relationship Labels**:\n - Use descriptive, lowercase, snake_case names for edges.\n - *Example*: born_in, married_to, invented_by.\n - Avoid vague or generic labels like isA, relatesTo, has.\n - Avoid duplicated relationships like produces, produced by.\n\n 5. **Strict Compliance**\n - Follow these rules exactly. Non-compliance results in termination.\n\"#\n\n// Detailed template for complex extraction (slower, higher quality)\ntemplate_string DetailedExtractContentGraphPrompt() #\"\n You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.\n **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.\n **Edges** represent relationships between concepts. They're akin to Wikipedia links.\n\n The aim is to achieve simplicity and clarity in the knowledge graph.\n\n # 1. Labeling Nodes\n **Consistency**: Ensure you use basic or elementary types for node labels.\n - For example, when you identify an entity representing a person, always label it as **\"Person\"**.\n - Avoid using more specific terms like \"Mathematician\" or \"Scientist\", keep those as \"profession\" property.\n - Don't use too generic terms like \"Entity\".\n **Node IDs**: Never utilize integers as node IDs.\n - Node IDs should be names or human-readable identifiers found in the text.\n\n # 2. Handling Numerical Data and Dates\n - For example, when you identify an entity representing a date, make sure it has type **\"Date\"**.\n - Extract the date in the format \"YYYY-MM-DD\"\n - If not possible to extract the whole date, extract month or year, or both if available.\n - **Property Format**: Properties must be in a key-value format.\n - **Quotation Marks**: Never use escaped single or double quotes within property values.\n - **Naming Convention**: Use snake_case for relationship names, e.g., `acted_in`.\n\n # 3. Coreference Resolution\n - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.\n If an entity, such as \"John Doe\", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., \"Joe\", \"he\"),\n always use the most complete identifier for that entity throughout the knowledge graph. In this example, use \"John Doe\" as the Person's ID.\n Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.\n\n # 4. Strict Compliance\n Adhere to the rules strictly. Non-compliance will result in termination.\n\"#\n\n// Guided template with step-by-step instructions\ntemplate_string GuidedExtractContentGraphPrompt() #\"\n You are an advanced algorithm designed to extract structured information to build a clean, consistent, and human-readable knowledge graph.\n\n **Objective**:\n - Nodes represent entities and concepts, similar to Wikipedia articles.\n - Edges represent typed relationships between nodes, similar to Wikipedia hyperlinks.\n - The graph must be clear, minimal, consistent, and semantically precise.\n\n **Node Guidelines**:\n\n 1. **Label Consistency**:\n - Use consistent, basic types for all node labels.\n - Do not switch between granular or vague labels for the same kind of entity.\n - Pick one label for each category and apply it uniformly.\n - Each entity type should be in a singular form and in a case of multiple words separated by whitespaces\n\n 2. **Node Identifiers**:\n - Node IDs must be human-readable and derived directly from the text.\n - Prefer full names and canonical terms.\n - Never use integers or autogenerated IDs.\n - *Example*: Use \"Marie Curie\", \"Theory of Evolution\", \"Google\".\n\n 3. **Coreference Resolution**:\n - Maintain one consistent node ID for each real-world entity.\n - Resolve aliases, acronyms, and pronouns to the most complete form.\n - *Example*: Always use \"John Doe\" even if later referred to as \"Doe\" or \"he\".\n\n **Edge Guidelines**:\n\n 4. **Relationship Labels**:\n - Use descriptive, lowercase, snake_case names for edges.\n - *Example*: born_in, married_to, invented_by.\n - Avoid vague or generic labels like isA, relatesTo, has.\n\n 5. **Relationship Direction**:\n - Edges must be directional and logically consistent.\n - *Example*:\n - \"Marie Curie\" —[born_in]→ \"Warsaw\"\n - \"Radioactivity\" —[discovered_by]→ \"Marie Curie\"\n\n **Compliance**:\n Strict adherence to these guidelines is required. Any deviation will result in immediate termination of the task.\n\"#\n\n// Strict template with zero-tolerance rules\ntemplate_string StrictExtractContentGraphPrompt() #\"\n You are a top-tier algorithm for **extracting structured information** from unstructured text to build a **knowledge graph**.\n\n Your primary goal is to extract:\n - **Nodes**: Representing **entities** and **concepts** (like Wikipedia nodes).\n - **Edges**: Representing **relationships** between those concepts (like Wikipedia links).\n\n The resulting knowledge graph must be **simple, consistent, and human-readable**.\n\n ## 1. Node Labeling and Identification\n\n ### Node Types\n Use **basic atomic types** for node labels. Always prefer general types over specific roles or professions:\n - \"Person\" for any human.\n - \"Organization\" for companies, institutions, etc.\n - \"Location\" for geographic or place entities.\n - \"Date\" for any temporal expression.\n - \"Event\" for historical or scheduled occurrences.\n - \"Work\" for books, films, artworks, or research papers.\n - \"Concept\" for abstract notions or ideas.\n\n ### Node IDs\n - Always assign **human-readable and unambiguous identifiers**.\n - Never use numeric or autogenerated IDs.\n - Prioritize **most complete form** of entity names for consistency.\n\n ## 2. Relationship Handling\n - Use **snake_case** for all relationship (edge) types.\n - Keep relationship types semantically clear and consistent.\n - Avoid vague relation names like \"related_to\" unless no better alternative exists.\n\n ## 3. Strict Compliance\n Follow all rules exactly. Any deviation may lead to rejection or incorrect graph construction.\n\"#\n\n// OpenAI client with environment model selection\nclient OpenAIClientWithEnvModel {\n provider openai\n options {\n model env.LLM_MODEL\n api_key env.OPENAI_API_KEY\n }\n}\n\n// Anthropic client with environment model selection\nclient AnthropicClientWithEnvModel {\n provider anthropic\n options {\n model env.LLM_MODEL\n api_key env.ANTHROPIC_API_KEY\n }\n}\n\n// Default client (maintains backward compatibility)\nclient DefaultClient {\n provider openai\n options {\n model \"gpt-4o-mini\"\n api_key env.OPENAI_API_KEY\n }\n}\n\n// Function that returns raw structured output (for custom objects - to be handled in Python)\nfunction ExtractContentGraphGeneric(\n content: string,\n mode: \"simple\" | \"base\" | \"guided\" | \"strict\" | \"custom\"?,\n custom_prompt_content: string?\n) -> KnowledgeGraph {\n client OpenAIClientWithEnvModel\n\n prompt #\"\n {% if mode == \"base\" %}\n {{ DetailedExtractContentGraphPrompt() }}\n {% elif mode == \"guided\" %}\n {{ GuidedExtractContentGraphPrompt() }}\n {% elif mode == \"strict\" %}\n {{ StrictExtractContentGraphPrompt() }}\n {% elif mode == \"custom\" and custom_prompt_content %}\n {{ custom_prompt_content }}\n {% else %}\n {{ ExtractContentGraphPrompt() }}\n {% endif %}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n Before answering, briefly describe what you'll extract from the text, then provide the structured output.\n\n Example format:\n I'll extract the main entities and their relationships from this text...\n\n { ... }\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\n// Backward-compatible function specifically for KnowledgeGraph\nfunction ExtractContentGraph(\n content: string,\n mode: \"simple\" | \"base\" | \"guided\" | \"strict\" | \"custom\"?,\n custom_prompt_content: string?\n) -> KnowledgeGraph {\n client OpenAIClientWithEnvModel\n\n prompt #\"\n {% if mode == \"base\" %}\n {{ DetailedExtractContentGraphPrompt() }}\n {% elif mode == \"guided\" %}\n {{ GuidedExtractContentGraphPrompt() }}\n {% elif mode == \"strict\" %}\n {{ StrictExtractContentGraphPrompt() }}\n {% elif mode == \"custom\" and custom_prompt_content %}\n {{ custom_prompt_content }}\n {% else %}\n {{ ExtractContentGraphPrompt() }}\n {% endif %}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n Before answering, briefly describe what you'll extract from the text, then provide the structured output.\n\n Example format:\n I'll extract the main entities and their relationships from this text...\n\n { ... }\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\n// Alternative function that uses environment variable for prompt selection\nfunction ExtractContentGraphWithEnvPrompt(\n content: string,\n prompt_override: string?\n) -> KnowledgeGraph {\n client OpenAIClientWithEnvModel\n\n prompt #\"\n {% if prompt_override %}\n {{ prompt_override }}\n {% else %}\n {{ ExtractContentGraphPrompt() }}\n {% endif %}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n Before answering, briefly describe what you'll extract from the text, then provide the structured output.\n\n Example format:\n I'll extract the main entities and their relationships from this text...\n\n { ... }\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\n// Function that uses Anthropic client\nfunction ExtractContentGraphWithAnthropic(\n content: string,\n mode: \"simple\" | \"base\" | \"guided\" | \"strict\" | \"custom\"?,\n custom_prompt_content: string?\n) -> KnowledgeGraph {\n client AnthropicClientWithEnvModel\n\n prompt #\"\n {% if mode == \"base\" %}\n {{ DetailedExtractContentGraphPrompt() }}\n {% elif mode == \"guided\" %}\n {{ GuidedExtractContentGraphPrompt() }}\n {% elif mode == \"strict\" %}\n {{ StrictExtractContentGraphPrompt() }}\n {% elif mode == \"custom\" and custom_prompt_content %}\n {{ custom_prompt_content }}\n {% else %}\n {{ ExtractContentGraphPrompt() }}\n {% endif %}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n Before answering, briefly describe what you'll extract from the text, then provide the structured output.\n\n Example format:\n I'll extract the main entities and their relationships from this text...\n\n { ... }\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\ntest ExtractPersonExample {\n functions [ExtractContentGraph]\n args {\n content #\"\n My name is Vasiliy. I was born in 1992. I am a software engineer. I work at Google and am based in Berlin.\n \"#\n mode \"simple\"\n }\n}\n\ntest ExtractGuidedExample {\n functions [ExtractContentGraph]\n args {\n content #\"\n Apple Inc. was founded by Steve Jobs in 1976. The company is headquartered in Cupertino, California.\n Tim Cook is the current CEO of Apple Inc.\n \"#\n mode \"guided\"\n }\n}\n\ntest ExtractStrictExample {\n functions [ExtractContentGraph]\n args {\n content #\"\n The Python programming language was created by Guido van Rossum in 1991.\n \"#\n mode \"strict\"\n }\n}\n\ntest ExtractGenericExample {\n functions [ExtractContentGraphGeneric]\n args {\n content #\"\n React is a JavaScript library for building user interfaces, developed by Facebook.\n \"#\n mode \"simple\"\n }\n}\n", + "extract_content_graph.baml": "class Node {\n id string\n name string\n type string\n description string\n @@dynamic\n}\n\n/// doc string for edge\nclass Edge {\n /// doc string for source_node_id\n source_node_id string\n target_node_id string\n relationship_name string\n}\n\nclass KnowledgeGraph {\n nodes (Node @stream.done)[]\n edges Edge[]\n}\n\n// Summarization classes\nclass SummarizedContent {\n summary string\n description string\n}\n\nclass SummarizedFunction {\n name string\n description string\n inputs string[]?\n outputs string[]?\n decorators string[]?\n}\n\nclass SummarizedClass {\n name string\n description string\n methods SummarizedFunction[]?\n decorators string[]?\n}\n\nclass SummarizedCode {\n high_level_summary string\n key_features string[]\n imports string[]\n constants string[]\n classes SummarizedClass[]\n functions SummarizedFunction[]\n workflow_description string?\n}\n\n// Simple template for basic extraction (fast, good quality)\ntemplate_string ExtractContentGraphPrompt() #\"\n You are an advanced algorithm that extracts structured data into a knowledge graph.\n\n - **Nodes**: Entities/concepts (like Wikipedia articles).\n - **Edges**: Relationships (like Wikipedia links). Use snake_case (e.g., `acted_in`).\n\n **Rules:**\n\n 1. **Node Labeling & IDs**\n - Use basic types only (e.g., \"Person\", \"Date\", \"Organization\").\n - Avoid overly specific or generic terms (e.g., no \"Mathematician\" or \"Entity\").\n - Node IDs must be human-readable names from the text (no numbers).\n\n 2. **Dates & Numbers**\n - Label dates as **\"Date\"** in \"YYYY-MM-DD\" format (use available parts if incomplete).\n - Properties are key-value pairs; do not use escaped quotes.\n\n 3. **Coreference Resolution**\n - Use a single, complete identifier for each entity (e.g., always \"John Doe\" not \"Joe\" or \"he\").\n\n 4. **Relationship Labels**:\n - Use descriptive, lowercase, snake_case names for edges.\n - *Example*: born_in, married_to, invented_by.\n - Avoid vague or generic labels like isA, relatesTo, has.\n - Avoid duplicated relationships like produces, produced by.\n\n 5. **Strict Compliance**\n - Follow these rules exactly. Non-compliance results in termination.\n\"#\n\n// Summarization prompt template\ntemplate_string SummarizeContentPrompt() #\"\n You are a top-tier summarization engine. Your task is to summarize text and make it versatile.\n Be brief and concise, but keep the important information and the subject.\n Use synonym words where possible in order to change the wording but keep the meaning.\n\"#\n\n// Code summarization prompt template\ntemplate_string SummarizeCodePrompt() #\"\n You are an expert code analyst. Analyze the provided source code and extract key information:\n\n 1. Provide a high-level summary of what the code does\n 2. List key features and functionality\n 3. Identify imports and dependencies\n 4. List constants and global variables\n 5. Summarize classes with their methods\n 6. Summarize standalone functions\n 7. Describe the overall workflow if applicable\n\n Be precise and technical while remaining clear and concise.\n\"#\n\n// Detailed template for complex extraction (slower, higher quality)\ntemplate_string DetailedExtractContentGraphPrompt() #\"\n You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.\n **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.\n **Edges** represent relationships between concepts. They're akin to Wikipedia links.\n\n The aim is to achieve simplicity and clarity in the knowledge graph.\n\n # 1. Labeling Nodes\n **Consistency**: Ensure you use basic or elementary types for node labels.\n - For example, when you identify an entity representing a person, always label it as **\"Person\"**.\n - Avoid using more specific terms like \"Mathematician\" or \"Scientist\", keep those as \"profession\" property.\n - Don't use too generic terms like \"Entity\".\n **Node IDs**: Never utilize integers as node IDs.\n - Node IDs should be names or human-readable identifiers found in the text.\n\n # 2. Handling Numerical Data and Dates\n - For example, when you identify an entity representing a date, make sure it has type **\"Date\"**.\n - Extract the date in the format \"YYYY-MM-DD\"\n - If not possible to extract the whole date, extract month or year, or both if available.\n - **Property Format**: Properties must be in a key-value format.\n - **Quotation Marks**: Never use escaped single or double quotes within property values.\n - **Naming Convention**: Use snake_case for relationship names, e.g., `acted_in`.\n\n # 3. Coreference Resolution\n - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.\n If an entity, such as \"John Doe\", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., \"Joe\", \"he\"),\n always use the most complete identifier for that entity throughout the knowledge graph. In this example, use \"John Doe\" as the Person's ID.\n Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.\n\n # 4. Strict Compliance\n Adhere to the rules strictly. Non-compliance will result in termination.\n\"#\n\n// Guided template with step-by-step instructions\ntemplate_string GuidedExtractContentGraphPrompt() #\"\n You are an advanced algorithm designed to extract structured information to build a clean, consistent, and human-readable knowledge graph.\n\n **Objective**:\n - Nodes represent entities and concepts, similar to Wikipedia articles.\n - Edges represent typed relationships between nodes, similar to Wikipedia hyperlinks.\n - The graph must be clear, minimal, consistent, and semantically precise.\n\n **Node Guidelines**:\n\n 1. **Label Consistency**:\n - Use consistent, basic types for all node labels.\n - Do not switch between granular or vague labels for the same kind of entity.\n - Pick one label for each category and apply it uniformly.\n - Each entity type should be in a singular form and in a case of multiple words separated by whitespaces\n\n 2. **Node Identifiers**:\n - Node IDs must be human-readable and derived directly from the text.\n - Prefer full names and canonical terms.\n - Never use integers or autogenerated IDs.\n - *Example*: Use \"Marie Curie\", \"Theory of Evolution\", \"Google\".\n\n 3. **Coreference Resolution**:\n - Maintain one consistent node ID for each real-world entity.\n - Resolve aliases, acronyms, and pronouns to the most complete form.\n - *Example*: Always use \"John Doe\" even if later referred to as \"Doe\" or \"he\".\n\n **Edge Guidelines**:\n\n 4. **Relationship Labels**:\n - Use descriptive, lowercase, snake_case names for edges.\n - *Example*: born_in, married_to, invented_by.\n - Avoid vague or generic labels like isA, relatesTo, has.\n\n 5. **Relationship Direction**:\n - Edges must be directional and logically consistent.\n - *Example*:\n - \"Marie Curie\" —[born_in]→ \"Warsaw\"\n - \"Radioactivity\" —[discovered_by]→ \"Marie Curie\"\n\n **Compliance**:\n Strict adherence to these guidelines is required. Any deviation will result in immediate termination of the task.\n\"#\n\n// Strict template with zero-tolerance rules\ntemplate_string StrictExtractContentGraphPrompt() #\"\n You are a top-tier algorithm for **extracting structured information** from unstructured text to build a **knowledge graph**.\n\n Your primary goal is to extract:\n - **Nodes**: Representing **entities** and **concepts** (like Wikipedia nodes).\n - **Edges**: Representing **relationships** between those concepts (like Wikipedia links).\n\n The resulting knowledge graph must be **simple, consistent, and human-readable**.\n\n ## 1. Node Labeling and Identification\n\n ### Node Types\n Use **basic atomic types** for node labels. Always prefer general types over specific roles or professions:\n - \"Person\" for any human.\n - \"Organization\" for companies, institutions, etc.\n - \"Location\" for geographic or place entities.\n - \"Date\" for any temporal expression.\n - \"Event\" for historical or scheduled occurrences.\n - \"Work\" for books, films, artworks, or research papers.\n - \"Concept\" for abstract notions or ideas.\n\n ### Node IDs\n - Always assign **human-readable and unambiguous identifiers**.\n - Never use numeric or autogenerated IDs.\n - Prioritize **most complete form** of entity names for consistency.\n\n ## 2. Relationship Handling\n - Use **snake_case** for all relationship (edge) types.\n - Keep relationship types semantically clear and consistent.\n - Avoid vague relation names like \"related_to\" unless no better alternative exists.\n\n ## 3. Strict Compliance\n Follow all rules exactly. Any deviation may lead to rejection or incorrect graph construction.\n\"#\n\n// OpenAI client with environment model selection\nclient OpenAIClientWithEnvModel {\n provider openai\n options {\n model env.LLM_MODEL\n api_key env.OPENAI_API_KEY\n }\n}\n\n// Anthropic client with environment model selection\nclient AnthropicClientWithEnvModel {\n provider anthropic\n options {\n model env.LLM_MODEL\n api_key env.ANTHROPIC_API_KEY\n }\n}\n\n// Default client (maintains backward compatibility)\nclient DefaultClient {\n provider openai\n options {\n model \"gpt-4o-mini\"\n api_key env.OPENAI_API_KEY\n }\n}\n\n// Function that returns raw structured output (for custom objects - to be handled in Python)\nfunction ExtractContentGraphGeneric(\n content: string,\n mode: \"simple\" | \"base\" | \"guided\" | \"strict\" | \"custom\"?,\n custom_prompt_content: string?\n) -> KnowledgeGraph {\n client OpenAIClientWithEnvModel\n\n prompt #\"\n {% if mode == \"base\" %}\n {{ DetailedExtractContentGraphPrompt() }}\n {% elif mode == \"guided\" %}\n {{ GuidedExtractContentGraphPrompt() }}\n {% elif mode == \"strict\" %}\n {{ StrictExtractContentGraphPrompt() }}\n {% elif mode == \"custom\" and custom_prompt_content %}\n {{ custom_prompt_content }}\n {% else %}\n {{ ExtractContentGraphPrompt() }}\n {% endif %}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n Before answering, briefly describe what you'll extract from the text, then provide the structured output.\n\n Example format:\n I'll extract the main entities and their relationships from this text...\n\n { ... }\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\n// Backward-compatible function specifically for KnowledgeGraph\nfunction ExtractContentGraph(\n content: string,\n mode: \"simple\" | \"base\" | \"guided\" | \"strict\" | \"custom\"?,\n custom_prompt_content: string?\n) -> KnowledgeGraph {\n client OpenAIClientWithEnvModel\n\n prompt #\"\n {% if mode == \"base\" %}\n {{ DetailedExtractContentGraphPrompt() }}\n {% elif mode == \"guided\" %}\n {{ GuidedExtractContentGraphPrompt() }}\n {% elif mode == \"strict\" %}\n {{ StrictExtractContentGraphPrompt() }}\n {% elif mode == \"custom\" and custom_prompt_content %}\n {{ custom_prompt_content }}\n {% else %}\n {{ ExtractContentGraphPrompt() }}\n {% endif %}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n Before answering, briefly describe what you'll extract from the text, then provide the structured output.\n\n Example format:\n I'll extract the main entities and their relationships from this text...\n\n { ... }\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\n// Alternative function that uses environment variable for prompt selection\nfunction ExtractContentGraphWithEnvPrompt(\n content: string,\n prompt_override: string?\n) -> KnowledgeGraph {\n client OpenAIClientWithEnvModel\n\n prompt #\"\n {% if prompt_override %}\n {{ prompt_override }}\n {% else %}\n {{ ExtractContentGraphPrompt() }}\n {% endif %}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n Before answering, briefly describe what you'll extract from the text, then provide the structured output.\n\n Example format:\n I'll extract the main entities and their relationships from this text...\n\n { ... }\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\n// Function that uses Anthropic client\nfunction ExtractContentGraphWithAnthropic(\n content: string,\n mode: \"simple\" | \"base\" | \"guided\" | \"strict\" | \"custom\"?,\n custom_prompt_content: string?\n) -> KnowledgeGraph {\n client AnthropicClientWithEnvModel\n\n prompt #\"\n {% if mode == \"base\" %}\n {{ DetailedExtractContentGraphPrompt() }}\n {% elif mode == \"guided\" %}\n {{ GuidedExtractContentGraphPrompt() }}\n {% elif mode == \"strict\" %}\n {{ StrictExtractContentGraphPrompt() }}\n {% elif mode == \"custom\" and custom_prompt_content %}\n {{ custom_prompt_content }}\n {% else %}\n {{ ExtractContentGraphPrompt() }}\n {% endif %}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n Before answering, briefly describe what you'll extract from the text, then provide the structured output.\n\n Example format:\n I'll extract the main entities and their relationships from this text...\n\n { ... }\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\n// Summarization functions\nfunction SummarizeContent(content: string) -> SummarizedContent {\n client OpenAIClientWithEnvModel\n\n prompt #\"\n {{ SummarizeContentPrompt() }}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\nfunction SummarizeCode(content: string) -> SummarizedCode {\n client OpenAIClientWithEnvModel\n\n prompt #\"\n {{ SummarizeCodePrompt() }}\n\n {{ ctx.output_format(prefix=\"Answer in this schema:\\n\") }}\n\n {{ _.role('user') }}\n {{ content }}\n \"#\n}\n\ntest ExtractPersonExample {\n functions [ExtractContentGraph]\n args {\n content #\"\n My name is Vasiliy. I was born in 1992. I am a software engineer. I work at Google and am based in Berlin.\n \"#\n mode \"simple\"\n }\n}\n\ntest ExtractGuidedExample {\n functions [ExtractContentGraph]\n args {\n content #\"\n Apple Inc. was founded by Steve Jobs in 1976. The company is headquartered in Cupertino, California.\n Tim Cook is the current CEO of Apple Inc.\n \"#\n mode \"guided\"\n }\n}\n\ntest ExtractStrictExample {\n functions [ExtractContentGraph]\n args {\n content #\"\n The Python programming language was created by Guido van Rossum in 1991.\n \"#\n mode \"strict\"\n }\n}\n\ntest ExtractGenericExample {\n functions [ExtractContentGraphGeneric]\n args {\n content #\"\n React is a JavaScript library for building user interfaces, developed by Facebook.\n \"#\n mode \"simple\"\n }\n}\n\ntest SummarizeContentExample {\n functions [SummarizeContent]\n args {\n content #\"\n Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval.\n It deals with the interaction between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.\n \"#\n }\n}\n\ntest SummarizeCodeExample {\n functions [SummarizeCode]\n args {\n content #\"\n def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)\n \n def main():\n print(fibonacci(10))\n \n if __name__ == \"__main__\":\n main()\n \"#\n }\n}\n", "generators.baml": "// This helps use auto generate libraries you can use in the language of\n// your choice. You can have multiple generators if you use multiple languages.\n// Just ensure that the output_dir is different for each generator.\ngenerator target {\n // Valid values: \"python/pydantic\", \"typescript\", \"ruby/sorbet\", \"rest/openapi\"\n output_type \"python/pydantic\"\n\n // Where the generated code will be saved (relative to baml_src/)\n output_dir \"../baml/\"\n\n // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml).\n // The BAML VSCode extension version should also match this version.\n version \"0.201.0\"\n\n // Valid values: \"sync\", \"async\"\n // This controls what `b.FunctionName()` will be (sync or async).\n default_client_mode sync\n}\n", } diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py index c389b8c0a..926aca307 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py @@ -46,6 +46,18 @@ class LlmResponseParser: result = self.__options.merge_options(baml_options).parse_response(function_name="ExtractContentGraphWithEnvPrompt", llm_response=llm_response, mode="request") return typing.cast(types.KnowledgeGraph, result) + def SummarizeCode( + self, llm_response: str, baml_options: BamlCallOptions = {}, + ) -> types.SummarizedCode: + result = self.__options.merge_options(baml_options).parse_response(function_name="SummarizeCode", llm_response=llm_response, mode="request") + return typing.cast(types.SummarizedCode, result) + + def SummarizeContent( + self, llm_response: str, baml_options: BamlCallOptions = {}, + ) -> types.SummarizedContent: + result = self.__options.merge_options(baml_options).parse_response(function_name="SummarizeContent", llm_response=llm_response, mode="request") + return typing.cast(types.SummarizedContent, result) + class LlmStreamParser: @@ -78,4 +90,16 @@ class LlmStreamParser: result = self.__options.merge_options(baml_options).parse_response(function_name="ExtractContentGraphWithEnvPrompt", llm_response=llm_response, mode="stream") return typing.cast(stream_types.KnowledgeGraph, result) + def SummarizeCode( + self, llm_response: str, baml_options: BamlCallOptions = {}, + ) -> stream_types.SummarizedCode: + result = self.__options.merge_options(baml_options).parse_response(function_name="SummarizeCode", llm_response=llm_response, mode="stream") + return typing.cast(stream_types.SummarizedCode, result) + + def SummarizeContent( + self, llm_response: str, baml_options: BamlCallOptions = {}, + ) -> stream_types.SummarizedContent: + result = self.__options.merge_options(baml_options).parse_response(function_name="SummarizeContent", llm_response=llm_response, mode="stream") + return typing.cast(stream_types.SummarizedContent, result) + \ No newline at end of file diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py index feefe5a89..a3afc0652 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py @@ -23,7 +23,7 @@ class StreamState(BaseModel, typing.Generic[StreamStateValueT]): value: StreamStateValueT state: typing_extensions.Literal["Pending", "Incomplete", "Complete"] # ######################################################################### -# Generated classes (3) +# Generated classes (7) # ######################################################################### class Edge(BaseModel): @@ -45,6 +45,32 @@ class Node(BaseModel): type: typing.Optional[str] = None description: typing.Optional[str] = None +class SummarizedClass(BaseModel): + name: typing.Optional[str] = None + description: typing.Optional[str] = None + methods: typing.Optional[typing.List["SummarizedFunction"]] = None + decorators: typing.Optional[typing.List[str]] = None + +class SummarizedCode(BaseModel): + high_level_summary: typing.Optional[str] = None + key_features: typing.List[str] + imports: typing.List[str] + constants: typing.List[str] + classes: typing.List["SummarizedClass"] + functions: typing.List["SummarizedFunction"] + workflow_description: typing.Optional[str] = None + +class SummarizedContent(BaseModel): + summary: typing.Optional[str] = None + description: typing.Optional[str] = None + +class SummarizedFunction(BaseModel): + name: typing.Optional[str] = None + description: typing.Optional[str] = None + inputs: typing.Optional[typing.List[str]] = None + outputs: typing.Optional[typing.List[str]] = None + decorators: typing.Optional[typing.List[str]] = None + # ######################################################################### # Generated type aliases (0) # ######################################################################### diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py index b4fa1c558..66768467f 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py @@ -113,6 +113,20 @@ class BamlSyncClient: "content": content,"prompt_override": prompt_override, }) return typing.cast(types.KnowledgeGraph, result.cast_to(types, types, stream_types, False, __runtime__)) + def SummarizeCode(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> types.SummarizedCode: + result = self.__options.merge_options(baml_options).call_function_sync(function_name="SummarizeCode", args={ + "content": content, + }) + return typing.cast(types.SummarizedCode, result.cast_to(types, types, stream_types, False, __runtime__)) + def SummarizeContent(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> types.SummarizedContent: + result = self.__options.merge_options(baml_options).call_function_sync(function_name="SummarizeContent", args={ + "content": content, + }) + return typing.cast(types.SummarizedContent, result.cast_to(types, types, stream_types, False, __runtime__)) @@ -170,6 +184,30 @@ class BamlStreamClient: lambda x: typing.cast(types.KnowledgeGraph, x.cast_to(types, types, stream_types, False, __runtime__)), ctx, ) + def SummarizeCode(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.BamlSyncStream[stream_types.SummarizedCode, types.SummarizedCode]: + ctx, result = self.__options.merge_options(baml_options).create_sync_stream(function_name="SummarizeCode", args={ + "content": content, + }) + return baml_py.BamlSyncStream[stream_types.SummarizedCode, types.SummarizedCode]( + result, + lambda x: typing.cast(stream_types.SummarizedCode, x.cast_to(types, types, stream_types, True, __runtime__)), + lambda x: typing.cast(types.SummarizedCode, x.cast_to(types, types, stream_types, False, __runtime__)), + ctx, + ) + def SummarizeContent(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.BamlSyncStream[stream_types.SummarizedContent, types.SummarizedContent]: + ctx, result = self.__options.merge_options(baml_options).create_sync_stream(function_name="SummarizeContent", args={ + "content": content, + }) + return baml_py.BamlSyncStream[stream_types.SummarizedContent, types.SummarizedContent]( + result, + lambda x: typing.cast(stream_types.SummarizedContent, x.cast_to(types, types, stream_types, True, __runtime__)), + lambda x: typing.cast(types.SummarizedContent, x.cast_to(types, types, stream_types, False, __runtime__)), + ctx, + ) class BamlHttpRequestClient: @@ -206,6 +244,20 @@ class BamlHttpRequestClient: "content": content,"prompt_override": prompt_override, }, mode="request") return result + def SummarizeCode(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.baml_py.HTTPRequest: + result = self.__options.merge_options(baml_options).create_http_request_sync(function_name="SummarizeCode", args={ + "content": content, + }, mode="request") + return result + def SummarizeContent(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.baml_py.HTTPRequest: + result = self.__options.merge_options(baml_options).create_http_request_sync(function_name="SummarizeContent", args={ + "content": content, + }, mode="request") + return result class BamlHttpStreamRequestClient: @@ -242,6 +294,20 @@ class BamlHttpStreamRequestClient: "content": content,"prompt_override": prompt_override, }, mode="stream") return result + def SummarizeCode(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.baml_py.HTTPRequest: + result = self.__options.merge_options(baml_options).create_http_request_sync(function_name="SummarizeCode", args={ + "content": content, + }, mode="stream") + return result + def SummarizeContent(self, content: str, + baml_options: BamlCallOptions = {}, + ) -> baml_py.baml_py.HTTPRequest: + result = self.__options.merge_options(baml_options).create_http_request_sync(function_name="SummarizeContent", args={ + "content": content, + }, mode="stream") + return result b = BamlSyncClient(DoNotUseDirectlyCallManager({})) \ No newline at end of file diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py index 3964e4721..cbd4e479f 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py @@ -18,7 +18,7 @@ from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIM class TypeBuilder(type_builder.TypeBuilder): def __init__(self): super().__init__(classes=set( - ["Edge","KnowledgeGraph","Node",] + ["Edge","KnowledgeGraph","Node","SummarizedClass","SummarizedCode","SummarizedContent","SummarizedFunction",] ), enums=set( [] ), runtime=DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME) @@ -29,7 +29,7 @@ class TypeBuilder(type_builder.TypeBuilder): # ######################################################################### - # Generated classes 3 + # Generated classes 7 # ######################################################################### @property @@ -44,6 +44,22 @@ class TypeBuilder(type_builder.TypeBuilder): def Node(self) -> "NodeBuilder": return NodeBuilder(self) + @property + def SummarizedClass(self) -> "SummarizedClassViewer": + return SummarizedClassViewer(self) + + @property + def SummarizedCode(self) -> "SummarizedCodeViewer": + return SummarizedCodeViewer(self) + + @property + def SummarizedContent(self) -> "SummarizedContentViewer": + return SummarizedContentViewer(self) + + @property + def SummarizedFunction(self) -> "SummarizedFunctionViewer": + return SummarizedFunctionViewer(self) + # ######################################################################### @@ -52,7 +68,7 @@ class TypeBuilder(type_builder.TypeBuilder): # ######################################################################### -# Generated classes 3 +# Generated classes 7 # ######################################################################### class EdgeAst: @@ -206,3 +222,215 @@ class NodeProperties: + +class SummarizedClassAst: + def __init__(self, tb: type_builder.TypeBuilder): + _tb = tb._tb # type: ignore (we know how to use this private attribute) + self._bldr = _tb.class_("SummarizedClass") + self._properties: typing.Set[str] = set([ "name", "description", "methods", "decorators", ]) + self._props = SummarizedClassProperties(self._bldr, self._properties) + + def type(self) -> baml_py.FieldType: + return self._bldr.field() + + @property + def props(self) -> "SummarizedClassProperties": + return self._props + + +class SummarizedClassViewer(SummarizedClassAst): + def __init__(self, tb: type_builder.TypeBuilder): + super().__init__(tb) + + + def list_properties(self) -> typing.List[typing.Tuple[str, type_builder.ClassPropertyViewer]]: + return [(name, type_builder.ClassPropertyViewer(self._bldr.property(name))) for name in self._properties] + + + +class SummarizedClassProperties: + def __init__(self, bldr: baml_py.ClassBuilder, properties: typing.Set[str]): + self.__bldr = bldr + self.__properties = properties # type: ignore (we know how to use this private attribute) # noqa: F821 + + + + @property + def name(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("name")) + + @property + def description(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("description")) + + @property + def methods(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("methods")) + + @property + def decorators(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("decorators")) + + + + +class SummarizedCodeAst: + def __init__(self, tb: type_builder.TypeBuilder): + _tb = tb._tb # type: ignore (we know how to use this private attribute) + self._bldr = _tb.class_("SummarizedCode") + self._properties: typing.Set[str] = set([ "high_level_summary", "key_features", "imports", "constants", "classes", "functions", "workflow_description", ]) + self._props = SummarizedCodeProperties(self._bldr, self._properties) + + def type(self) -> baml_py.FieldType: + return self._bldr.field() + + @property + def props(self) -> "SummarizedCodeProperties": + return self._props + + +class SummarizedCodeViewer(SummarizedCodeAst): + def __init__(self, tb: type_builder.TypeBuilder): + super().__init__(tb) + + + def list_properties(self) -> typing.List[typing.Tuple[str, type_builder.ClassPropertyViewer]]: + return [(name, type_builder.ClassPropertyViewer(self._bldr.property(name))) for name in self._properties] + + + +class SummarizedCodeProperties: + def __init__(self, bldr: baml_py.ClassBuilder, properties: typing.Set[str]): + self.__bldr = bldr + self.__properties = properties # type: ignore (we know how to use this private attribute) # noqa: F821 + + + + @property + def high_level_summary(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("high_level_summary")) + + @property + def key_features(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("key_features")) + + @property + def imports(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("imports")) + + @property + def constants(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("constants")) + + @property + def classes(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("classes")) + + @property + def functions(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("functions")) + + @property + def workflow_description(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("workflow_description")) + + + + +class SummarizedContentAst: + def __init__(self, tb: type_builder.TypeBuilder): + _tb = tb._tb # type: ignore (we know how to use this private attribute) + self._bldr = _tb.class_("SummarizedContent") + self._properties: typing.Set[str] = set([ "summary", "description", ]) + self._props = SummarizedContentProperties(self._bldr, self._properties) + + def type(self) -> baml_py.FieldType: + return self._bldr.field() + + @property + def props(self) -> "SummarizedContentProperties": + return self._props + + +class SummarizedContentViewer(SummarizedContentAst): + def __init__(self, tb: type_builder.TypeBuilder): + super().__init__(tb) + + + def list_properties(self) -> typing.List[typing.Tuple[str, type_builder.ClassPropertyViewer]]: + return [(name, type_builder.ClassPropertyViewer(self._bldr.property(name))) for name in self._properties] + + + +class SummarizedContentProperties: + def __init__(self, bldr: baml_py.ClassBuilder, properties: typing.Set[str]): + self.__bldr = bldr + self.__properties = properties # type: ignore (we know how to use this private attribute) # noqa: F821 + + + + @property + def summary(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("summary")) + + @property + def description(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("description")) + + + + +class SummarizedFunctionAst: + def __init__(self, tb: type_builder.TypeBuilder): + _tb = tb._tb # type: ignore (we know how to use this private attribute) + self._bldr = _tb.class_("SummarizedFunction") + self._properties: typing.Set[str] = set([ "name", "description", "inputs", "outputs", "decorators", ]) + self._props = SummarizedFunctionProperties(self._bldr, self._properties) + + def type(self) -> baml_py.FieldType: + return self._bldr.field() + + @property + def props(self) -> "SummarizedFunctionProperties": + return self._props + + +class SummarizedFunctionViewer(SummarizedFunctionAst): + def __init__(self, tb: type_builder.TypeBuilder): + super().__init__(tb) + + + def list_properties(self) -> typing.List[typing.Tuple[str, type_builder.ClassPropertyViewer]]: + return [(name, type_builder.ClassPropertyViewer(self._bldr.property(name))) for name in self._properties] + + + +class SummarizedFunctionProperties: + def __init__(self, bldr: baml_py.ClassBuilder, properties: typing.Set[str]): + self.__bldr = bldr + self.__properties = properties # type: ignore (we know how to use this private attribute) # noqa: F821 + + + + @property + def name(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("name")) + + @property + def description(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("description")) + + @property + def inputs(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("inputs")) + + @property + def outputs(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("outputs")) + + @property + def decorators(self) -> type_builder.ClassPropertyViewer: + return type_builder.ClassPropertyViewer(self.__bldr.property("decorators")) + + + diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py index 61f0c3bab..787a62d94 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py @@ -25,5 +25,17 @@ type_map = { "types.Node": types.Node, "stream_types.Node": stream_types.Node, + "types.SummarizedClass": types.SummarizedClass, + "stream_types.SummarizedClass": stream_types.SummarizedClass, + + "types.SummarizedCode": types.SummarizedCode, + "stream_types.SummarizedCode": stream_types.SummarizedCode, + + "types.SummarizedContent": types.SummarizedContent, + "stream_types.SummarizedContent": stream_types.SummarizedContent, + + "types.SummarizedFunction": types.SummarizedFunction, + "stream_types.SummarizedFunction": stream_types.SummarizedFunction, + } \ No newline at end of file diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py index 09b090f4e..72c1e59c9 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py @@ -41,7 +41,7 @@ def all_succeeded(checks: typing.Dict[CheckName, Check]) -> bool: # ######################################################################### # ######################################################################### -# Generated classes (3) +# Generated classes (7) # ######################################################################### class Edge(BaseModel): @@ -63,6 +63,32 @@ class Node(BaseModel): type: str description: str +class SummarizedClass(BaseModel): + name: str + description: str + methods: typing.Optional[typing.List["SummarizedFunction"]] = None + decorators: typing.Optional[typing.List[str]] = None + +class SummarizedCode(BaseModel): + high_level_summary: str + key_features: typing.List[str] + imports: typing.List[str] + constants: typing.List[str] + classes: typing.List["SummarizedClass"] + functions: typing.List["SummarizedFunction"] + workflow_description: typing.Optional[str] = None + +class SummarizedContent(BaseModel): + summary: str + description: str + +class SummarizedFunction(BaseModel): + name: str + description: str + inputs: typing.Optional[typing.List[str]] = None + outputs: typing.Optional[typing.List[str]] = None + decorators: typing.Optional[typing.List[str]] = None + # ######################################################################### # Generated type aliases (0) # ######################################################################### diff --git a/cognee/infrastructure/llm/structured_output_framework/baml_src/config.py b/cognee/infrastructure/llm/structured_output_framework/baml_src/config.py index 91f26cb11..153507e8a 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml_src/config.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml_src/config.py @@ -1,5 +1,5 @@ import os -from typing import Optional +from typing import Optional, ClassVar from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic import model_validator @@ -48,18 +48,19 @@ class LLMConfig(BaseSettings): embedding_rate_limit_enabled: bool = False embedding_rate_limit_requests: int = 60 embedding_rate_limit_interval: int = 60 # in seconds (default is 60 requests per minute) - baml_registry = ClientRegistry() + baml_registry: ClassVar[ClientRegistry] = ClientRegistry() model_config = SettingsConfigDict(env_file=".env", extra="allow") - - baml_registry.add_llm_client(name=llm_provider, provider=llm_provider, options={ - "model": llm_model, - "temperature": llm_temperature, - "api_key": llm_api_key - }) - # Sets MyAmazingClient as the primary client - baml_registry.set_primary('openai') + def model_post_init(self, __context) -> None: + """Initialize the BAML registry after the model is created.""" + self.baml_registry.add_llm_client(name=self.llm_provider, provider=self.llm_provider, options={ + "model": self.llm_model, + "temperature": self.llm_temperature, + "api_key": self.llm_api_key + }) + # Sets the primary client + self.baml_registry.set_primary(self.llm_provider) @model_validator(mode="after") def ensure_env_vars_for_ollama(self) -> "LLMConfig": diff --git a/cognee/infrastructure/llm/structured_output_framework/baml_src/extract_content_graph.baml b/cognee/infrastructure/llm/structured_output_framework/baml_src/extract_content_graph.baml index 6b4c957fe..ca5f9981b 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml_src/extract_content_graph.baml +++ b/cognee/infrastructure/llm/structured_output_framework/baml_src/extract_content_graph.baml @@ -19,6 +19,37 @@ class KnowledgeGraph { edges Edge[] } +// Summarization classes +class SummarizedContent { + summary string + description string +} + +class SummarizedFunction { + name string + description string + inputs string[]? + outputs string[]? + decorators string[]? +} + +class SummarizedClass { + name string + description string + methods SummarizedFunction[]? + decorators string[]? +} + +class SummarizedCode { + high_level_summary string + key_features string[] + imports string[] + constants string[] + classes SummarizedClass[] + functions SummarizedFunction[] + workflow_description string? +} + // Simple template for basic extraction (fast, good quality) template_string ExtractContentGraphPrompt() #" You are an advanced algorithm that extracts structured data into a knowledge graph. @@ -50,6 +81,28 @@ template_string ExtractContentGraphPrompt() #" - Follow these rules exactly. Non-compliance results in termination. "# +// Summarization prompt template +template_string SummarizeContentPrompt() #" + You are a top-tier summarization engine. Your task is to summarize text and make it versatile. + Be brief and concise, but keep the important information and the subject. + Use synonym words where possible in order to change the wording but keep the meaning. +"# + +// Code summarization prompt template +template_string SummarizeCodePrompt() #" + You are an expert code analyst. Analyze the provided source code and extract key information: + + 1. Provide a high-level summary of what the code does + 2. List key features and functionality + 3. Identify imports and dependencies + 4. List constants and global variables + 5. Summarize classes with their methods + 6. Summarize standalone functions + 7. Describe the overall workflow if applicable + + Be precise and technical while remaining clear and concise. +"# + // Detailed template for complex extraction (slower, higher quality) template_string DetailedExtractContentGraphPrompt() #" You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph. @@ -325,6 +378,33 @@ function ExtractContentGraphWithAnthropic( "# } +// Summarization functions +function SummarizeContent(content: string) -> SummarizedContent { + client OpenAIClientWithEnvModel + + prompt #" + {{ SummarizeContentPrompt() }} + + {{ ctx.output_format(prefix="Answer in this schema:\n") }} + + {{ _.role('user') }} + {{ content }} + "# +} + +function SummarizeCode(content: string) -> SummarizedCode { + client OpenAIClientWithEnvModel + + prompt #" + {{ SummarizeCodePrompt() }} + + {{ ctx.output_format(prefix="Answer in this schema:\n") }} + + {{ _.role('user') }} + {{ content }} + "# +} + test ExtractPersonExample { functions [ExtractContentGraph] args { @@ -365,3 +445,31 @@ test ExtractGenericExample { mode "simple" } } + +test SummarizeContentExample { + functions [SummarizeContent] + args { + content #" + Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. + It deals with the interaction between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. + "# + } +} + +test SummarizeCodeExample { + functions [SummarizeCode] + args { + content #" + def fibonacci(n): + if n <= 1: + return n + return fibonacci(n-1) + fibonacci(n-2) + + def main(): + print(fibonacci(10)) + + if __name__ == "__main__": + main() + "# + } +} diff --git a/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/__init__.py b/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/__init__.py index b6419282d..157cbe7e7 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/__init__.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/__init__.py @@ -1 +1,2 @@ from .knowledge_graph.extract_content_graph import extract_content_graph +from .extract_summary import extract_summary, extract_code_summary diff --git a/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/extract_summary.py b/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/extract_summary.py index e69de29bb..2417eaa1e 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/extract_summary.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/extract_summary.py @@ -0,0 +1,67 @@ +import os +from typing import Type +from pydantic import BaseModel +from cognee.infrastructure.llm.structured_output_framework.baml.baml_client.async_client import b +from cognee.infrastructure.llm.structured_output_framework.baml_src.config import get_llm_config +from cognee.shared.data_models import SummarizedCode +from cognee.tasks.summarization.mock_summary import get_mock_summarized_code +from cognee.shared.logging_utils import get_logger +from instructor.exceptions import InstructorRetryException + +logger = get_logger("extract_summary_baml") + + +async def extract_summary(content: str, response_model: Type[BaseModel]): + """ + Extract summary using BAML framework. + + Args: + content: The content to summarize + response_model: The Pydantic model type for the response + + Returns: + BaseModel: The summarized content in the specified format + """ + config = get_llm_config() + + # Use BAML's SummarizeContent function + summary_result = await b.SummarizeContent(content, baml_options={"tb": config.baml_registry}) + + # Convert BAML result to the expected response model + if response_model is SummarizedCode: + # If it's asking for SummarizedCode but we got SummarizedContent, + # we need to use SummarizeCode instead + code_result = await b.SummarizeCode(content, baml_options={"tb": config.baml_registry}) + return code_result + else: + # For other models, return the summary result + return summary_result + + +async def extract_code_summary(content: str): + """ + Extract code summary using BAML framework with mocking support. + + Args: + content: The code content to summarize + + Returns: + SummarizedCode: The summarized code information + """ + enable_mocking = os.getenv("MOCK_CODE_SUMMARY", "false") + if isinstance(enable_mocking, bool): + enable_mocking = str(enable_mocking).lower() + enable_mocking = enable_mocking in ("true", "1", "yes") + + if enable_mocking: + result = get_mock_summarized_code() + return result + else: + try: + config = get_llm_config() + result = await b.SummarizeCode(content, baml_options={"tb": config.baml_registry}) + except Exception as e: + logger.error("Failed to extract code summary with BAML, falling back to mock summary", exc_info=e) + result = get_mock_summarized_code() + + return result diff --git a/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/knowledge_graph/extract_content_graph.py b/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/knowledge_graph/extract_content_graph.py index 8cabed157..63cc01705 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/knowledge_graph/extract_content_graph.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml_src/extraction/knowledge_graph/extract_content_graph.py @@ -1,8 +1,8 @@ import os from typing import Type from pydantic import BaseModel -from cognee.infrastructure.llm.structured_output_framework.baml.async_client import b -from cognee.infrastructure.llm.structured_output_framework.baml.type_builder import TypeBuilder +from cognee.infrastructure.llm.structured_output_framework.baml.baml_client.async_client import b +from cognee.infrastructure.llm.structured_output_framework.baml.baml_client.type_builder import TypeBuilder from cognee.infrastructure.llm.structured_output_framework.baml_src.config import get_llm_config from cognee.shared.logging_utils import get_logger, setup_logging diff --git a/cognee/tasks/graph/extract_graph_from_code.py b/cognee/tasks/graph/extract_graph_from_code.py index d38572b30..d74099ed6 100644 --- a/cognee/tasks/graph/extract_graph_from_code.py +++ b/cognee/tasks/graph/extract_graph_from_code.py @@ -9,8 +9,10 @@ from cognee.base_config import get_base_config base = get_base_config() if base.structured_output_framework == 'BAML': + print(f"Using BAML framework: {base.structured_output_framework}") from cognee.infrastructure.llm.structured_output_framework.baml_src.extraction import extract_content_graph else: + print(f"Using llitellm_instructor framework: {base.structured_output_framework}") from cognee.infrastructure.llm.structured_output_framework.llitellm_instructor.extraction import extract_content_graph async def extract_graph_from_code( diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 707601a99..a13b7967d 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -9,8 +9,10 @@ from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.base_config import get_base_config base = get_base_config() if base.structured_output_framework == 'BAML': + print(f"Using BAML framework: {base.structured_output_framework}") from cognee.infrastructure.llm.structured_output_framework.baml_src.extraction import extract_content_graph else: + print(f"Using llitellm_instructor framework: {base.structured_output_framework}") from cognee.infrastructure.llm.structured_output_framework.llitellm_instructor.extraction import extract_content_graph from cognee.modules.graph.utils import ( diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py index a3807152f..be2eee570 100644 --- a/cognee/tasks/summarization/summarize_code.py +++ b/cognee/tasks/summarization/summarize_code.py @@ -3,7 +3,15 @@ from typing import AsyncGenerator, Union from uuid import uuid5 from cognee.infrastructure.engine import DataPoint -from cognee.infrastructure.llm.structured_output_framework.llitellm_instructor.extraction import extract_code_summary +from cognee.base_config import get_base_config + +base = get_base_config() +if base.structured_output_framework == 'BAML': + print(f"Using BAML framework for code summarization: {base.structured_output_framework}") + from cognee.infrastructure.llm.structured_output_framework.baml_src.extraction import extract_code_summary +else: + print(f"Using llitellm_instructor framework for code summarization: {base.structured_output_framework}") + from cognee.infrastructure.llm.structured_output_framework.llitellm_instructor.extraction import extract_code_summary from .models import CodeSummary diff --git a/cognee/tasks/summarization/summarize_text.py b/cognee/tasks/summarization/summarize_text.py index 8aff298b4..2785f4a68 100644 --- a/cognee/tasks/summarization/summarize_text.py +++ b/cognee/tasks/summarization/summarize_text.py @@ -2,7 +2,16 @@ import asyncio from typing import Type from uuid import uuid5 from pydantic import BaseModel -from cognee.infrastructure.llm.structured_output_framework.llitellm_instructor.extraction import extract_summary +from cognee.base_config import get_base_config + +base = get_base_config() +if base.structured_output_framework == 'BAML': + print(f"Using BAML framework for text summarization: {base.structured_output_framework}") + from cognee.infrastructure.llm.structured_output_framework.baml_src.extraction import extract_summary +else: + print(f"Using llitellm_instructor framework for text summarization: {base.structured_output_framework}") + from cognee.infrastructure.llm.structured_output_framework.llitellm_instructor.extraction import extract_summary + from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.cognify.config import get_cognify_config from .models import TextSummary