# cognee/profiling/util/DummyLLMAdapter.py

import spacy
import textacy
from typing import Type
from uuid import uuid4
from pydantic import BaseModel
from cognee.infrastructure.llm.llm_interface import LLMInterface
from cognee.shared.data_models import SummarizedContent
from cognee.shared.data_models import KnowledgeGraph, Node, Edge


class DummyLLMAdapter(LLMInterface):
    """LLMInterface implementation that fabricates structured output locally.

    No LLM is called: summaries and knowledge graphs are produced by the
    module-level ``dummy_summarize_content`` and
    ``dummy_extract_knowledge_graph`` helpers.
    """

    # spaCy pipeline, loaded once when the class body executes and shared by
    # every instance.
    nlp = spacy.load('en_core_web_sm')

    async def acreate_structured_output(self,
                                        text_input: str,
                                        system_prompt: str,
                                        response_model: Type[BaseModel]) -> BaseModel:
        """Build a dummy instance of ``response_model`` from ``text_input``.

        Args:
            text_input: Text to summarize or to extract a graph from.
            system_prompt: Accepted for interface compatibility; not used.
            response_model: The model class to produce. Only
                ``SummarizedContent`` and ``KnowledgeGraph`` (or subclasses)
                are supported.

        Returns:
            A ``SummarizedContent`` or ``KnowledgeGraph`` instance.

        Raises:
            Exception: If ``response_model`` is any other model.
        """
        # ``response_model`` is a class, so it has to be compared with
        # issubclass(); isinstance(cls, Model) is always False. An instance is
        # still tolerated by falling back to its type.
        if isinstance(response_model, type):
            model_cls = response_model
        else:
            model_cls = type(response_model)

        if issubclass(model_cls, SummarizedContent):
            return dummy_summarize_content(text_input)
        if issubclass(model_cls, KnowledgeGraph):
            # The pipeline is a class attribute and must be reached through
            # ``self``; a bare ``nlp`` is not visible from method scope.
            return dummy_extract_knowledge_graph(text_input, self.nlp)
        raise Exception("Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph")


def dummy_extract_knowledge_graph(text, nlp):
    """Build a KnowledgeGraph from subject-verb-object triples found in ``text``.

    Args:
        text: Raw text to analyze.
        nlp: Callable (e.g. a loaded spaCy pipeline) that turns ``text`` into
            a document accepted by
            ``textacy.extract.subject_verb_object_triples``.

    Returns:
        A ``KnowledgeGraph`` holding one ``Node`` per distinct subject/object
        name and one ``Edge`` per extracted triple, named after the verb.
    """
    doc = nlp(text)

    nodes = {}  # node name -> Node, so repeated names share one node id
    edges = []

    def node_for(tokens):
        # Multi-token spans are flattened to a single "_"-joined name.
        name = "_".join(str(token) for token in tokens)
        # Create the node (and its UUID) only the first time a name is seen.
        if name not in nodes:
            nodes[name] = Node(id=str(uuid4()), name=name, type="object", description="")
        return nodes[name]

    for triple in textacy.extract.subject_verb_object_triples(doc):
        source_node = node_for(triple.subject)
        target_node = node_for(triple.object)
        edge_type = "_".join(str(token) for token in triple.verb)
        edges.append(Edge(source_node_id=source_node.id,
                          target_node_id=target_node.id,
                          relationship_name=edge_type))

    # Return the Node objects themselves (not their name keys): the edges
    # refer to nodes by the ids stored on those objects.
    return KnowledgeGraph(nodes=list(nodes.values()), edges=edges)


def dummy_summarize_content(text):
    """Produce a fake summary made of the longest distinct words in ``text``.

    The text is split on single spaces, duplicates are dropped, and the words
    are ranked longest first.

    Args:
        text: Text to "summarize".

    Returns:
        A ``SummarizedContent`` whose ``summary`` is the top 100 words and
        whose ``description`` is the top 10 words, each joined by spaces.
    """
    unique_words = set(text.split(" "))
    # Ties on length are broken alphabetically so the result does not depend
    # on set iteration order, which varies between interpreter runs when
    # string hash randomization is active.
    ranked = sorted(unique_words, key=lambda word: (-len(word), word))
    summary = " ".join(ranked[:100])
    description = " ".join(ranked[:10])
    return SummarizedContent(summary=summary, description=description)