diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 09cade767..c74ee69d8 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -78,6 +78,7 @@ async def add( - S3 path: "s3://my-bucket/documents/file.pdf" - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle] - Binary file object: open("file.txt", "rb") + - URL: A web page link (http or https) dataset_name: Name of the dataset to store data in. Defaults to "main_dataset". Create separate datasets to organize different knowledge domains. user: User object for authentication and permissions. Uses default user if None. @@ -88,6 +89,9 @@ async def add( vector_db_config: Optional configuration for vector database (for custom setups). graph_db_config: Optional configuration for graph database (for custom setups). dataset_id: Optional specific dataset UUID to use instead of dataset_name. + extraction_rules: Optional dictionary of rules (e.g., CSS selectors, XPath) for extracting specific content from web pages using BeautifulSoup. + tavily_config: Optional configuration for the Tavily API, including the API key and extraction settings. + soup_crawler_config: Optional configuration for the BeautifulSoup crawler, specifying concurrency, crawl delay, and extraction rules. 
Returns: PipelineRunInfo: Information about the ingestion pipeline execution including: @@ -136,6 +140,21 @@ async def add( # Add a single file await cognee.add("/home/user/documents/analysis.pdf") + + # Add a single URL using the BeautifulSoup (bs4) extraction method + extraction_rules = { + "title": "h1", + "description": "p", + "more_info": "a[href*='more-info']" + } + await cognee.add("https://example.com", extraction_rules=extraction_rules) + + # Add a single URL using the Tavily extraction method + # Make sure to set TAVILY_API_KEY=YOUR_TAVILY_API_KEY as an environment variable + await cognee.add("https://example.com") + + # Add multiple URLs + await cognee.add(["https://example.com", "https://books.toscrape.com"]) ``` Environment Variables: @@ -149,6 +168,7 @@ async def add( - DEFAULT_USER_PASSWORD: Custom default user password - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "pgvector" - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j" + - TAVILY_API_KEY: Your Tavily API key (used by the Tavily URL extraction method) """ tasks = [