docs: Add detailed backend module analysis documentation

Add comprehensive documentation covering 6 modules:
- 01-API-LAYER: Authentication, routing, SSE streaming
- 02-SERVICE-LAYER: Dialog, Task, LLM service analysis
- 03-RAG-ENGINE: Hybrid search, embedding, reranking
- 04-AGENT-SYSTEM: Canvas engine, components, tools
- 05-DOCUMENT-PROCESSING: Task executor, PDF parsing
- 06-ALGORITHMS: BM25, fusion, RAPTOR

Total 28 documentation files with code analysis, diagrams, and formulas.

2025-11-26 11:10:54 +00:00

12 KiB

Raw Blame History

Tool Integration Framework

Tổng Quan

Tool framework cho phép Agent components gọi external services và APIs.

File Locations

/agent/tools/
├── base.py           # ToolBase class
├── retrieval.py      # KB search
├── google.py         # Google search
├── tavily.py         # Tavily search
├── exesql.py         # SQL execution
├── code_executor.py  # Code execution
├── wikipedia.py      # Wikipedia
├── arxiv.py          # ArXiv papers
├── pubmed.py         # PubMed
└── ...               # Other tools

Tool Base Class

class ToolBase(ComponentBase):
    """Shared foundation that every concrete tool component extends."""

    def invoke(self, **kwargs):
        """
        Run the tool's ``_invoke`` and record timing and failures as outputs.

        The start timestamp is stored in the ``_created_time`` output and the
        duration in ``_elapsed_time``. An exception raised by ``_invoke`` is
        not propagated: its message is written to the ``_ERROR`` output and
        is also used as the return value.

        Returns:
            Whatever ``_invoke`` returned, or the error message on failure.
        """
        self.set_output("_created_time", time.perf_counter())

        try:
            outcome = self._invoke(**kwargs)
        except Exception as exc:
            self._param.outputs["_ERROR"] = {"value": str(exc)}
            outcome = str(exc)

        # The start time is read back from the outputs rather than kept in a
        # local, so the stored value stays the single source of truth.
        elapsed = time.perf_counter() - self.output("_created_time")
        self.set_output("_elapsed_time", elapsed)

        return outcome

    def _retrieve_chunks(self, res_list, get_title, get_url,
                         get_content, get_score=None):
        """
        Convert raw search hits into chunk dicts and hand them to the canvas.

        Args:
            res_list: Iterable of raw search results.
            get_title: Callable returning the title of one result.
            get_url: Callable returning the URL of one result.
            get_content: Callable returning the text of one result.
            get_score: Optional callable returning a similarity score; when
                omitted every chunk gets a similarity of 1.
        """
        def as_chunk(item):
            # The same hash-derived identifier serves as chunk id and doc id.
            text = get_content(item)
            ident = str(hash_str2int(text))
            return {
                "chunk_id": ident,
                "content": text[:10000],  # stored content is capped
                "doc_id": ident,
                "docnm_kwd": get_title(item),
                "similarity": get_score(item) if get_score else 1,
                "url": get_url(item),
            }

        chunks = [as_chunk(item) for item in res_list]

        # Register with the canvas; no aggregations are produced here.
        self._canvas.add_reference(chunks, [])

Retrieval Tool

class Retrieval(ToolBase, ABC):
    component_name = "Retrieval"

    def _invoke(self, **kwargs):
        """
        Look up chunks relevant to ``query`` in the configured knowledge bases.

        With a falsy ``query`` the ``formalized_content`` output is set to the
        configured empty response and nothing is searched. Otherwise the hits
        are registered as references and ``formalized_content`` is set to the
        prompt built from them by ``kb_prompt``.
        """
        query = kwargs.get("query")
        if not query:
            self.set_output("formalized_content", self._param.empty_response)
            return

        # Entries containing "@" are variable references resolved through the
        # canvas; everything else is used as a literal knowledge-base id.
        kb_ids = []
        for kb_ref in self._param.kb_ids:
            if "@" not in kb_ref:
                kb_ids.append(kb_ref)
                continue
            resolved = self._canvas.get_variable_value(kb_ref)
            if isinstance(resolved, list):
                kb_ids.extend(resolved)
            else:
                kb_ids.append(resolved)

        # Collect the search settings first, then run the retrieval.
        search_options = {
            "kb_ids": kb_ids,
            "query": query,
            "top_k": self._param.top_k,
            "top_n": self._param.top_n,
            "similarity_threshold": self._param.similarity_threshold,
            "use_kg": self._param.use_kg,
            "rerank_id": self._param.rerank_id,
            "cross_languages": self._param.cross_languages,
            "meta_filter": self._param.meta_data_filter,
        }
        hits = DocumentService.retrieval(**search_options)

        # Register the hits as canvas references.
        self._retrieve_chunks(
            hits,
            get_title=lambda hit: hit["doc_name"],
            get_url=lambda hit: hit.get("url", ""),
            get_content=lambda hit: hit["content"],
            get_score=lambda hit: hit["similarity"],
        )

        # Expose the formatted context to downstream components.
        self.set_output("formalized_content", kb_prompt(hits))

Google Search Tool

class Google(ToolBase, ABC):
    component_name = "Google"

    # Timeout in seconds handed to ``requests``. Without one, a stalled
    # connection would block the calling agent indefinitely.
    _REQUEST_TIMEOUT = 30

    def _invoke(self, **kwargs):
        """
        Run a Google Custom Search and register the hits as references.

        The request is sent with the API key, search engine id and result
        count taken from the component parameters. Each returned item is
        turned into a chunk with a fixed score of 1.0.

        Raises:
            requests.exceptions.RequestException: on timeout, connection
                failure, or a non-2xx HTTP status. Surfacing these lets the
                caller record the failure instead of treating an API error
                body as "no results".
        """
        query = kwargs.get("query")

        # Google API call
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": self._param.api_key,
                "cx": self._param.search_engine_id,
                "q": query,
                "num": self._param.num_results,
            },
            timeout=self._REQUEST_TIMEOUT,
        )
        # An error response carries no "items" key, so without this check a
        # bad key or exhausted quota would look like an empty result set.
        response.raise_for_status()

        results = response.json().get("items", [])

        # Normalize to chunks; snippets may be absent, hence the default.
        self._retrieve_chunks(
            results,
            get_title=lambda r: r["title"],
            get_url=lambda r: r["link"],
            get_content=lambda r: r.get("snippet", ""),
            get_score=lambda r: 1.0,
        )

SQL Execution Tool

class ExeSQL(ToolBase, ABC):
    component_name = "ExeSQL"

    def _invoke(self, **kwargs):
        """
        Run a SQL statement and publish the rows as the ``result`` output.

        At most ``max_records`` rows are returned: the query is read in
        chunks of that size and only the first chunk is kept. The database
        connection is always closed, including when the query fails.

        Raises:
            ValueError: if no ``sql`` argument was supplied, or the configured
                ``db_type`` is not supported.
        """
        sql = kwargs.get("sql")
        if not sql:
            raise ValueError("Missing required argument: sql")

        # Build connection
        conn = self._get_connection(
            db_type=self._param.db_type,
            host=self._param.host,
            port=self._param.port,
            database=self._param.database,
            username=self._param.username,
            password=self._param.password
        )

        try:
            # With a chunksize, read_sql returns an iterator of DataFrames,
            # not a DataFrame, so the first chunk is taken as the capped
            # result. No ``params`` are passed: an empty dict makes some
            # drivers apply %-formatting to the statement, which breaks SQL
            # containing a literal "%".
            fetched = pd.read_sql(
                sql, conn,
                chunksize=self._param.max_records
            )
            if isinstance(fetched, pd.DataFrame):
                # max_records unset: pandas returned the full frame directly.
                df = fetched
            else:
                df = next(iter(fetched), None)
        finally:
            conn.close()

        if df is None:
            df = pd.DataFrame()

        self.set_output("result", df.to_dict())

    def _get_connection(self, db_type, **kwargs):
        """
        Create a database connection for ``db_type``.

        Args:
            db_type: One of ``"mysql"``, ``"postgresql"`` or ``"sqlite"``.
            **kwargs: Connection settings. ``username`` is translated to the
                ``user`` keyword expected by the MySQL and PostgreSQL
                drivers. SQLite uses only ``database``.

        Raises:
            ValueError: if ``db_type`` is not one of the supported values.
        """
        if db_type == "sqlite":
            import sqlite3
            return sqlite3.connect(kwargs["database"])

        # Both network drivers name the login parameter ``user``.
        settings = dict(kwargs)
        if "username" in settings:
            settings["user"] = settings.pop("username")

        if db_type == "mysql":
            import pymysql
            return pymysql.connect(**settings)
        if db_type == "postgresql":
            import psycopg2
            return psycopg2.connect(**settings)

        # Fail loudly instead of returning None and crashing later.
        raise ValueError(f"Unsupported db_type: {db_type}")

Code Execution Tool

class CodeExec(ToolBase, ABC):
    component_name = "CodeExec"

    def _invoke(self, **kwargs):
        """
        Decode and run caller-supplied code, storing its result as JSON.

        Args (via ``kwargs``):
            code_b64: Base64-encoded UTF-8 source code. Required.
            language: ``"python"`` (default) or ``"nodejs"``.
            arguments: Value passed to the code's ``main``; defaults to ``{}``.

        Raises:
            ValueError: if ``code_b64`` is missing or ``language`` is not
                supported.
        """
        code_b64 = kwargs.get("code_b64")
        if not code_b64:
            # Otherwise b64decode(None) fails with an unhelpful TypeError.
            raise ValueError("Missing required argument: code_b64")

        code = base64.b64decode(code_b64).decode('utf-8')
        language = kwargs.get("language", "python")
        # Treat an explicit None the same as an omitted argument.
        arguments = kwargs.get("arguments") or {}

        if language == "python":
            result = self._execute_python(code, arguments)
        elif language == "nodejs":
            # NOTE(review): _execute_nodejs is not shown in this snippet.
            result = self._execute_nodejs(code, arguments)
        else:
            raise ValueError(f"Unsupported language: {language}")

        self.set_output("result", json.dumps(result))

    def _execute_python(self, code, arguments):
        """
        Execute Python source with a reduced set of builtins and call ``main``.

        Args:
            code: Python source that must define ``main(arguments)``.
            arguments: Value passed to ``main``.

        Returns:
            Whatever ``main(arguments)`` returns.

        Raises:
            ValueError: if the code does not define a callable ``main``.
        """
        # SECURITY: exec() runs in this process. Trimming __builtins__ is not
        # a sandbox: object introspection can reach arbitrary functionality.
        # Do not feed untrusted code through this path without real isolation
        # such as a separate process or container.
        restricted_builtins = {
            'print': print,
            'len': len,
            'range': range,
            'str': str,
            'int': int,
            'float': float,
            'list': list,
            'dict': dict,
            'json': json,
            # ... limited set
        }

        exec_globals = {"__builtins__": restricted_builtins}
        exec(code, exec_globals)

        entry_point = exec_globals.get("main")
        if not callable(entry_point):
            # Clearer than the bare KeyError a missing ``main`` would give.
            raise ValueError("Code must define a callable main(arguments)")

        return entry_point(arguments)

Tavily Search Tool

class Tavily(ToolBase, ABC):
    component_name = "Tavily"

    # Timeout in seconds handed to ``requests``. Without one, a stalled
    # connection would block the calling agent indefinitely.
    _REQUEST_TIMEOUT = 30

    def _invoke(self, **kwargs):
        """
        Run a Tavily web search and register the hits as references.

        When the response contains a direct answer it is stored in the
        ``answer`` output. Each entry of ``results`` becomes a chunk whose
        score is the entry's ``score`` if present, otherwise 1.0.

        Raises:
            requests.exceptions.RequestException: on timeout, connection
                failure, or a non-2xx HTTP status.
        """
        query = kwargs.get("query")

        response = requests.post(
            "https://api.tavily.com/search",
            json={
                "api_key": self._param.api_key,
                "query": query,
                "search_depth": self._param.search_depth,
                "include_answer": True,
                "max_results": self._param.max_results,
            },
            timeout=self._REQUEST_TIMEOUT,
        )
        # Surface API failures instead of parsing an error body as if it
        # were an empty result set.
        response.raise_for_status()

        data = response.json()

        # Get direct answer if available
        if data.get("answer"):
            self.set_output("answer", data["answer"])

        # Process results
        self._retrieve_chunks(
            data.get("results", []),
            get_title=lambda r: r["title"],
            get_url=lambda r: r["url"],
            get_content=lambda r: r["content"],
            get_score=lambda r: r.get("score", 1.0),
        )

Academic Search Tools

ArXiv

class ArXiv(ToolBase, ABC):
    component_name = "ArXiv"

    def _invoke(self, **kwargs):
        """
        Query ArXiv and register the matching papers as references.

        Papers are requested in relevance order, limited to the configured
        ``max_results``. Each one becomes a chunk built from its title, PDF
        URL and summary, with a fixed score of 1.0.
        """
        import arxiv

        search = arxiv.Search(
            query=kwargs.get("query"),
            max_results=self._param.max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )

        # Materialise the result iterator before normalising.
        papers = [*search.results()]

        self._retrieve_chunks(
            papers,
            get_title=lambda paper: paper.title,
            get_url=lambda paper: paper.pdf_url,
            get_content=lambda paper: paper.summary,
            get_score=lambda _paper: 1.0,
        )

PubMed

class PubMed(ToolBase, ABC):
    component_name = "PubMed"

    def _invoke(self, **kwargs):
        """
        Search PubMed for biomedical literature.

        Runs an ``esearch`` for the query (limited to the configured
        ``max_results``), then an ``efetch`` for the returned ids. Both
        response handles are closed as soon as they have been parsed.
        """
        from Bio import Entrez

        Entrez.email = self._param.email

        query = kwargs.get("query")

        # Search
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            retmax=self._param.max_results
        )
        try:
            record = Entrez.read(handle)
        finally:
            # Release the HTTP response even if parsing fails.
            handle.close()
        ids = record["IdList"]

        # Fetch details
        handle = Entrez.efetch(
            db="pubmed",
            id=ids,
            rettype="abstract"
        )
        try:
            results = Entrez.read(handle)
        finally:
            handle.close()

        # Process results
        # ...

Tool Meta for Agent

def get_meta(self) -> dict:
    """
    Describe this tool in the structure used for function calling.

    The function name is the lower-cased ``component_name``. The description,
    parameter properties and required-parameter list all come from the
    component's ``_param`` object.
    """
    name = self.component_name.lower()
    description = self._param.description

    # The schema is always an "object" whose fields the parameter set defines.
    parameters = {
        "type": "object",
        "properties": self._param.get_properties(),
        "required": self._param.get_required(),
    }

    return {
        "function": {
            "name": name,
            "description": description,
            "parameters": parameters,
        }
    }

# Example output:
{
    "function": {
        "name": "retrieval",
        "description": "Search knowledge base for relevant information",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query"
                }
            },
            "required": ["query"]
        }
    }
}

MCP Tool Integration

# Model Context Protocol tools
class MCPToolCallSession:
    """Forwards tool calls to a single MCP server."""

    def __init__(self, mcp_server, variables):
        """Keep the target server and the variables supplied alongside it."""
        self.variables = variables
        self.server = mcp_server

    def tool_call(self, name: str, args: dict) -> str:
        """Call tool ``name`` with ``args`` on the server; return its response."""
        return self.server.call_tool(name, args)

# Usage in Agent: one session is created per configured MCP server, and every
# tool name listed for that server is mapped to the session.
for mcp_config in self._param.mcp:
    _found, mcp_server = MCPServerService.get_by_id(mcp_config["mcp_id"])
    session = MCPToolCallSession(
        mcp_server,
        mcp_server.variables,
    )

    # Only the tool names are needed; the per-tool metadata is not used here.
    for tool_name, _meta in mcp_config["tools"].items():
        self.tools[tool_name] = session

Tool Configuration

# Retrieval parameters (read by Retrieval._invoke through self._param)
{
    # NOTE(review): Retrieval._invoke only treats entries containing "@" as
    # variable references. "{{sys.selected_kb}}" contains no "@", so with the
    # code shown it would be used as a literal id. Confirm the intended syntax.
    "kb_ids": ["kb_123", "{{sys.selected_kb}}"],  # Static + dynamic
    "top_k": 1024,
    "top_n": 6,
    "similarity_threshold": 0.2,
    "use_kg": False,
    "rerank_id": "jina-reranker-v2",
}

# Google parameters (sent as the "key", "cx" and "num" query parameters)
{
    "api_key": "...",
    "search_engine_id": "...",
    "num_results": 10
}

# SQL parameters (connection settings are forwarded to _get_connection)
{
    "db_type": "mysql",  # ExeSQL handles "mysql", "postgresql" and "sqlite"
    "host": "localhost",
    "port": 3306,
    "database": "mydb",
    "username": "user",
    "password": "***",
    "max_records": 1000  # passed to pd.read_sql as chunksize
}

Related Files

/agent/tools/base.py - ToolBase class
/agent/tools/retrieval.py - KB retrieval
/agent/tools/*.py - Individual tool implementations
/agent/component/agent_with_tools.py - Tool-enabled agent

12 KiB Raw Blame History

Tool Integration Framework

Tổng Quan

File Locations

Tool Base Class

Retrieval Tool

Google Search Tool

SQL Execution Tool

Code Execution Tool

Tavily Search Tool

Academic Search Tools

ArXiv

PubMed

Tool Meta for Agent

MCP Tool Integration

Tool Configuration

Related Files

12 KiB

Raw Blame History