From 64b8aac86f3e3b19391c813b39f5ab0777caa4d6 Mon Sep 17 00:00:00 2001
From: Boris
Date: Wed, 27 Nov 2024 09:32:29 +0100
Subject: [PATCH] feat: code graph swe integration

Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Co-authored-by: hande-k
Co-authored-by: Igor Ilic
Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
---
 .data/multimedia/example.png                  | Bin 0 -> 10784 bytes
 .data/multimedia/text_to_speech.mp3           | Bin 0 -> 28173 bytes
 .../test_cognee_multimedia_notebook.yml       |  63 +++++++
 .github/workflows/test_python_3_10.yml        |   7 +-
 .github/workflows/test_python_3_11.yml        |   7 +-
 .github/workflows/test_python_3_9.yml         |   7 +-
 README.md                                     |  52 ++++--
 .../databases/graph/get_graph_engine.py       |   2 +-
 .../databases/graph/neo4j_driver/adapter.py   |  48 ++++-
 .../databases/graph/networkx/adapter.py       |  38 +++-
 .../vector/lancedb/LanceDBAdapter.py          |  51 +++---
 .../vector/pgvector/PGVectorAdapter.py        |  61 +++++++
 .../vector/pgvector/create_db_and_tables.py   |   2 -
 .../infrastructure/databases/vector/utils.py  |  26 +++
 .../infrastructure/engine/models/DataPoint.py |   1 +
 cognee/infrastructure/llm/openai/adapter.py   |   6 +
 .../modules/graph/cognee_graph/CogneeGraph.py |  98 +++++++++-
 .../graph/cognee_graph/CogneeGraphElements.py |  16 +-
 cognee/modules/graph/utils/__init__.py        |   1 +
 .../graph/utils/convert_node_to_data_point.py |  23 +++
 .../graph/utils/get_graph_from_model.py       |  45 ++++-
 cognee/pipelines/__init__.py                  |   0
 cognee/pipelines/retriever/__init__.py        |   0
 .../retriever/diffusion_retriever.py          |  25 +++
 cognee/pipelines/retriever/g_retriever.py     |  25 +++
 .../retriever/two_steps_retriever.py          | 119 ++++++++++++
 cognee/shared/CodeGraphEntities.py            |  20 ++-
 .../code/enrich_dependency_graph_checker.py   |   4 +-
 .../code/expand_dependency_graph_checker.py   |   4 +-
 .../code/get_repo_dependency_graph_checker.py |   4 +-
 cognee/tasks/documents/classify_documents.py  |  47 ++++-
 .../graph/convert_graph_from_code_graph.py    |  54 ------
 cognee/tasks/graph/extract_graph_from_data.py |   6 +-
 cognee/tasks/repo_processor/__init__.py       |   2 +-
 .../repo_processor/enrich_dependency_graph.py |  98 ++++++++--
 .../repo_processor/expand_dependency_graph.py |  56 +++---
 .../get_repo_dependency_graph.py              |  61 -------
 .../get_repo_file_dependencies.py             |  87 +++++++++
 cognee/tasks/storage/add_data_points.py       |  11 +-
 cognee/tasks/storage/index_data_points.py     |  26 ++-
 cognee/tasks/summarization/summarize_code.py  |  14 +-
 .../graph/code_graph_test_data_generation.py  |  51 ------
 .../convert_graph_from_code_graph_test.py     |  27 ---
 .../graph/get_graph_from_huge_model_test.py   | 100 +++++++++++
 .../graph/cognee_graph_elements_test.py       |   4 +-
 .../unit/modules/graph/cognee_graph_test.py   |   4 +-
 docker-compose.yml                            |   2 +-
 evals/eval_swe_bench.py                       |  69 +++++--
 evals/eval_utils.py                           |  56 +++++-
 examples/python/code_graph_pipeline.py        |  67 ++-----
 examples/python/dynamic_steps_example.py      |  53 ++----
 examples/python/multimedia_example.py         |  48 +++++
 examples/python/simple_example.py             |  37 +++-
 notebooks/cognee_demo.ipynb                   |  38 ++--
 notebooks/cognee_llama_index.ipynb            |  25 +--
 notebooks/cognee_multimedia_demo.ipynb        | 169 ++++++++++++++++++
 poetry.lock                                   |   5 -
 57 files changed, 1494 insertions(+), 478 deletions(-)
 create mode 100644 .data/multimedia/example.png
 create mode 100644 .data/multimedia/text_to_speech.mp3
 create mode 100644 .github/workflows/test_cognee_multimedia_notebook.yml
 create mode 100644 cognee/infrastructure/databases/vector/utils.py
 create mode 100644 cognee/modules/graph/utils/convert_node_to_data_point.py
 create mode 100644 cognee/pipelines/__init__.py
 create mode 100644 cognee/pipelines/retriever/__init__.py
 create mode 100644 cognee/pipelines/retriever/diffusion_retriever.py
 create mode 100644 cognee/pipelines/retriever/g_retriever.py
 create mode 100644 cognee/pipelines/retriever/two_steps_retriever.py
 delete mode 100644 cognee/tasks/graph/convert_graph_from_code_graph.py
 delete mode 100644 cognee/tasks/repo_processor/get_repo_dependency_graph.py
 create mode 100644 cognee/tasks/repo_processor/get_repo_file_dependencies.py
 delete mode 100644 cognee/tests/tasks/graph/code_graph_test_data_generation.py
 delete mode 100644 cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py
 create mode 100644 cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py
 create mode 100644 examples/python/multimedia_example.py
 create mode 100644 notebooks/cognee_multimedia_demo.ipynb

diff --git a/.data/multimedia/example.png b/.data/multimedia/example.png
new file mode 100644
index 0000000000000000000000000000000000000000..4d406cafdeb7e6d778ff3fa37d99d42088673bf9
GIT binary patch
literal 10784
[... base85-encoded binary patch data omitted ...]
zJ_!YxZeCtTPEnd|&5-&7&pToA&S!Bw>h^h_I<46g`v0XHKC2~P(+8|tIV~&3}-h4VZqW00NVwE*$tZ1B2|M%EWqFj{DbAR^bvAO#tonT9o z{qH_B;0j%Ei#6;99OsYN*zo5PPE=c_89Dp62sW=|T^JtnqIr!L(Y%%uKKYg1%t(fi zm4#hA8D{>X_&f%n{SU2&7mhzz&{b0$C-o+qF*H}(k*K=Ny!Qav}4)pV=1Hi8>-#oekbdcM!d!Ad;drsxWnza3dUSqnm6I+mjHJz3B z^$$!JgnyC|BJ6Gp>fE5E+usF`?*bK8iu}x!()9&o0Vq9KQ{*@Y3Q4M*V&>7Scc0-^ zwQG{-p2091juk#VSMSSEkzopdG<(}4Q@)_PJ#rDN`Y&H4Wh_#xTk9R@^Wu6dXa&ms z6*#E*U$E){i!(D%=SL*=^+n|?AimWB##RU>fIia#e-MQX-7clo9Pxnxmsc^tb$<6b z`7o8(Ny1L~Qq!f$KKi0RSxXVOFXRMn;dmw5M^^He;m>kVqf>{ncO14hkm>o{w8Ki$ zC%Lcb&g-(NOrNr3Ih(f1`qZ5s;zI#FH6G9F1$~IO&}dm~=3&D^QEDJcQkwJ$A+u=;#tSJo ze>}dHrB8YeZ?0cO=3tS>$WlEvZDkL6qCV>VVQQ@Bnyhq`$1W`+%GYz+-Sy@@ud`&c zvAASlta{J`4=WU3uJ6uwlyB6qWwmO=be*Ug14Sz%aZ`Gjx*}4E3&F5R>pCxTvbH3h zPu{oXbTHC=hu$%VfrSQ(+nV#QMSZ5c1`#bZ9(*<@$!x@=a^c=Bf97!$tjxhfA_W2b zI;!-FnN5;b$!@3-Odgnggt?BIEXE=YGO~{zGLSQX#5w`GQdN4u%*P`ictq$(8`SQf z7y9jODV&53HWAc+lvN$S9D#msaQ%mY(lvrj!`0>%ZxrU98Q06fs~iJ^CvSyoJmfKX zUnbsEQV_gVPm62Jv|2B$aj3dncl{5SGAf9}pJhN&!*fu>~}7$E=v z5LtC2FN1#T1wNUp*5_>Y@htov3E6a++uF(c{gmbPv#&LDjEb;ZXS z=~FcW%{v}#2?~g3f*^4?(7|lk+a-5;l_gXw)MU$u^fA&(t=Jnw$+9MAOMLW zkdtCbcOQI_e5tOThx;oq}j0IDXqT5ghBrgOBhKJe2rByBEy$>*| zdSRROE6+=KZmcL_Q>G^lTi|Q%Yzb$sF`;zkQEHNS(+`hsvpq2o1v?40SNX8=`u#O| z&*Kc`jPf*FP;^DB2U`oNHh)<8{RJ*b5Q_k$SWwDN!xELNU_mDn;z0h@wldHx&7V%d-^s3d8N!*uPh9(P)@>tu~9Ol_L8~Bl@gYwY!|TsBmb47LFT`0%(}3 zE~8UoPbN%#?;BX?qklfGe5#e!hrC5!>!G9aGk z%AEDVov?p?iClUkpli@5wx{DBnSJd`c=On3nxanQB%@25c`vY`vMW^qtwUta*v^rbcWQU%*Z-p{JsUDMV#nGX9Nan^8 zVNwW>jBU|(I5P$HqbXgy(yHiAsArjtaCWPdinfWuT1mFKMq6dvYxAla!H4rx3C(_@ z2LFP6(iK$6w_}g`@hqRlK2VIWFhh9&j}0=aJvR78q@rS2bt1JewRSgkwqx<`YsuVa zVJ{*A?ntoTP6L_hlS-lmU8CD@3Vu7(LWYh)@;J&`;G}M7BiZbrmAZ~hAHa;ue$es! zMfmo_UOpGwV_D|bxbPG?k+8T;@tIG?ZWD<@q&g#SA9y}6q&z|3uHc_|Rv+wGY7PAG zkub=y9Lgb&YW6#OOi^~N(;T4P_#ye`J>%t0{$=NLUJg@jlopQ%u|)B`3^CYP^&UMk zLdlqlFR-=QFXt4%n32I_;*Y9Vw%1;Mu_HY~dDeAb13E(g<`O#s)X^H1kgmfjLy}c^ zfHcDq-I|{4yiyd&{p(1nP45Su9uE9XIACk;_Z)@7n*&L>X$a?DqKHs;CE{x}ug9tN zR*@4p9V1-cstAQe1_$CX3&|4|_VNsk2fU}d6P?GH)OSFsb5pUjdnUB~g3DFHx>@wY z8djLiD*wB5%s%of>SVdLUiM!8>v`Cs95nI%PWrY{Xs;nO1Kpr>^?M$hf(^17Cq6kW zs;5W$z*E8&CkF?ND4xo_u~CpCuV5JEMxhq0g-aZvnY@bn6`XQmZjo3ZV(+b?u)b%1 zadn64ARNf4QxgkdRMPOzaO(50yfA35{?{!Og% Z|FRx GraphDBInterface : +async def get_graph_engine() -> GraphDBInterface: """Factory function to get the appropriate graph client based on the graph type.""" config = get_graph_config() diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 1121a24d5..e6520e4e2 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -2,7 +2,7 @@ import logging import asyncio from textwrap import dedent -from typing import Optional, Any, List, Dict +from typing import Optional, Any, List, Dict, Union from contextlib import asynccontextmanager from uuid import UUID from neo4j import AsyncSession @@ -432,3 +432,49 @@ class Neo4jAdapter(GraphDBInterface): ) for record in result] return (nodes, edges) + + async def get_filtered_graph_data(self, attribute_filters): + """ + Fetches nodes and relationships filtered by specified attribute values. + + Args: + attribute_filters (list of dict): A list of dictionaries where keys are attributes and values are lists of values to filter on. + Example: [{"community": ["1", "2"]}] + + Returns: + tuple: A tuple containing two lists: nodes and edges. 
diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py
index a72376082..d249b6336 100644
--- a/cognee/infrastructure/databases/graph/networkx/adapter.py
+++ b/cognee/infrastructure/databases/graph/networkx/adapter.py
@@ -6,7 +6,7 @@ import json
 import asyncio
 import logging
 from re import A
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Union
 from uuid import UUID
 import aiofiles
 import aiofiles.os as aiofiles_os
@@ -301,3 +301,39 @@ class NetworkXAdapter(GraphDBInterface):
             logger.info("Graph deleted successfully.")
         except Exception as error:
             logger.error("Failed to delete graph: %s", error)
+
+    async def get_filtered_graph_data(self, attribute_filters: List[Dict[str, List[Union[str, int]]]]):
+        """
+        Fetches nodes and relationships filtered by specified attribute values.
+
+        Args:
+            attribute_filters (list of dict): A list of dictionaries where keys are attributes and values are lists of values to filter on.
+                Example: [{"community": ["1", "2"]}]
+
+        Returns:
+            tuple: A tuple containing two lists:
+                - Nodes: List of tuples (node_id, node_properties).
+                - Edges: List of tuples (source_id, target_id, relationship_type, edge_properties).
+        """
+        # Create filters for nodes based on the attribute filters
+        where_clauses = []
+        for attribute, values in attribute_filters[0].items():
+            where_clauses.append((attribute, values))
+
+        # Filter nodes
+        filtered_nodes = [
+            (node, data) for node, data in self.graph.nodes(data=True)
+            if all(data.get(attr) in values for attr, values in where_clauses)
+        ]
+
+        # Filter edges where both source and target nodes satisfy the filters
+        filtered_edges = [
+            (source, target, data.get('relationship_type', 'UNKNOWN'), data)
+            for source, target, data in self.graph.edges(data=True)
+            if (
+                all(self.graph.nodes[source].get(attr) in values for attr, values in where_clauses) and
+                all(self.graph.nodes[target].get(attr) in values for attr, values in where_clauses)
+            )
+        ]
+
+        return filtered_nodes, filtered_edges
\ No newline at end of file
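Both adapters consume the same filter shape, so a single usage sketch covers them; the adapter instance and the "community" attribute are assumptions for illustration:

```python
# Hypothetical usage of the new filtered projection. Assumes an adapter
# (Neo4jAdapter or NetworkXAdapter) is already configured. Every listed
# attribute must match (AND across keys); a node passes when its value is
# in the given list.
attribute_filters = [{"community": ["1", "2"]}]

async def project_filtered(adapter):
    nodes, edges = await adapter.get_filtered_graph_data(attribute_filters)
    # nodes: [(node_id, node_properties), ...]
    # edges: [(source_id, target_id, relationship_type, edge_properties), ...]
    return nodes, edges
```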
diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
index 96f026b4f..6cbe45655 100644
--- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
+++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
@@ -10,6 +10,7 @@ from cognee.infrastructure.files.storage import LocalStorage
 from cognee.modules.storage.utils import copy_model, get_own_properties
 from ..models.ScoredResult import ScoredResult
 from ..vector_db_interface import VectorDBInterface
+from ..utils import normalize_distances
 from ..embeddings.EmbeddingEngine import EmbeddingEngine
 
 class IndexSchema(DataPoint):
@@ -141,6 +142,34 @@ class LanceDBAdapter(VectorDBInterface):
             score = 0,
         ) for result in results.to_dict("index").values()]
 
+    async def get_distances_of_collection(
+        self,
+        collection_name: str,
+        query_text: str = None,
+        query_vector: List[float] = None,
+        with_vector: bool = False
+    ):
+        if query_text is None and query_vector is None:
+            raise ValueError("One of query_text or query_vector must be provided!")
+
+        if query_text and not query_vector:
+            query_vector = (await self.embedding_engine.embed_text([query_text]))[0]
+
+        connection = await self.get_connection()
+        collection = await connection.open_table(collection_name)
+
+        results = await collection.vector_search(query_vector).to_pandas()
+
+        result_values = list(results.to_dict("index").values())
+
+        normalized_values = normalize_distances(result_values)
+
+        return [ScoredResult(
+            id=UUID(result["id"]),
+            payload=result["payload"],
+            score=normalized_values[value_index],
+        ) for value_index, result in enumerate(result_values)]
+
     async def search(
         self,
         collection_name: str,
@@ -148,6 +177,7 @@ class LanceDBAdapter(VectorDBInterface):
         query_vector: List[float] = None,
         limit: int = 5,
         with_vector: bool = False,
+        normalized: bool = True
     ):
         if query_text is None and query_vector is None:
             raise ValueError("One of query_text or query_vector must be provided!")
@@ -162,26 +192,7 @@ class LanceDBAdapter(VectorDBInterface):
 
         result_values = list(results.to_dict("index").values())
 
-        min_value = 100
-        max_value = 0
-
-        for result in result_values:
-            value = float(result["_distance"])
-            if value > max_value:
-                max_value = value
-            if value < min_value:
-                min_value = value
-
-        normalized_values = []
-        min_value = min(result["_distance"] for result in result_values)
-        max_value = max(result["_distance"] for result in result_values)
-
-        if max_value == min_value:
-            # Avoid division by zero: Assign all normalized values to 0 (or any constant value like 1)
-            normalized_values = [0 for _ in result_values]
-        else:
-            normalized_values = [(result["_distance"] - min_value) / (max_value - min_value) for result in
-                result_values]
+        normalized_values = normalize_distances(result_values)
 
         return [ScoredResult(
             id = UUID(result["id"]),
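For orientation, a hedged sketch of how the new scoring entry point might be called; the engine setup is elided and "Entity_name" is one of the collections the retriever later in this patch queries:

```python
# Hypothetical call into the new scoring path. Assumes a configured vector
# engine from get_vector_engine(). Scores come back min-max normalized to
# [0, 1] by normalize_distances (lower = closer to the query).
from cognee.infrastructure.databases.vector import get_vector_engine

async def score_collection(query: str):
    vector_engine = get_vector_engine()
    results = await vector_engine.get_distances_of_collection(
        "Entity_name", query_text=query
    )
    return {str(result.id): result.score for result in results}
```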
diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py
index 01691714b..97571a274 100644
--- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py
+++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py
@@ -11,6 +11,7 @@ from cognee.infrastructure.engine import DataPoint
 from .serialize_data import serialize_data
 from ..models.ScoredResult import ScoredResult
 from ..vector_db_interface import VectorDBInterface
+from ..utils import normalize_distances
 from ..embeddings.EmbeddingEngine import EmbeddingEngine
 from ...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
 from ...relational.ModelBase import Base
@@ -22,6 +23,19 @@ class IndexSchema(DataPoint):
         "index_fields": ["text"]
     }
 
+def singleton(class_):
+    # Note: Using this singleton as a decorator on a class removes
+    # the option to use class methods for that class
+    instances = {}
+
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+
+    return getinstance
+
+@singleton
 class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
 
     def __init__(
@@ -162,6 +176,53 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
             ) for result in results
         ]
 
+    async def get_distances_of_collection(
+        self,
+        collection_name: str,
+        query_text: str = None,
+        query_vector: List[float] = None,
+        with_vector: bool = False
+    ) -> List[ScoredResult]:
+        if query_text is None and query_vector is None:
+            raise ValueError("One of query_text or query_vector must be provided!")
+
+        if query_text and not query_vector:
+            query_vector = (await self.embedding_engine.embed_text([query_text]))[0]
+
+        # Get PGVectorDataPoint Table from database
+        PGVectorDataPoint = await self.get_table(collection_name)
+
+        closest_items = []
+
+        # Use async session to connect to the database
+        async with self.get_async_session() as session:
+            # Find closest vectors to query_vector
+            closest_items = await session.execute(
+                select(
+                    PGVectorDataPoint,
+                    PGVectorDataPoint.c.vector.cosine_distance(query_vector).label(
+                        "similarity"
+                    ),
+                )
+                .order_by("similarity")
+            )
+
+        vector_list = []
+
+        # Extract distances and find min/max for normalization
+        for vector in closest_items:
+            # TODO: Add normalization of similarity score
+            vector_list.append(vector)
+
+        # Create and return ScoredResult objects
+        return [
+            ScoredResult(
+                id = UUID(str(row.id)),
+                payload = row.payload,
+                score = row.similarity
+            ) for row in vector_list
+        ]
+
     async def search(
         self,
         collection_name: str,
diff --git a/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py b/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py
index ef27e2889..f40299939 100644
--- a/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py
+++ b/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py
@@ -10,5 +10,3 @@ async def create_db_and_tables():
     await vector_engine.create_database()
     async with vector_engine.engine.begin() as connection:
         await connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector;"))
-
-
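The singleton wrapper above replaces the class name with a factory closure, which is why the inline note warns that classmethods become unreachable. A small self-contained illustration of that behavior, using an invented `Config` class:

```python
# Standalone illustration of the singleton decorator applied to
# PGVectorAdapter: the decorated name now refers to a closure, so the
# second constructor call returns the cached instance and ignores its
# arguments. "Config" is invented for the example.
def singleton(class_):
    instances = {}

    def getinstance(*args, **kwargs):
        if class_ not in instances:
            instances[class_] = class_(*args, **kwargs)
        return instances[class_]

    return getinstance

@singleton
class Config:
    def __init__(self, name):
        self.name = name

a, b = Config("a"), Config("b")
assert a is b and a.name == "a"  # Config("b") reused the first instance
```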
diff --git a/cognee/infrastructure/databases/vector/utils.py b/cognee/infrastructure/databases/vector/utils.py
new file mode 100644
index 000000000..ced161ea3
--- /dev/null
+++ b/cognee/infrastructure/databases/vector/utils.py
@@ -0,0 +1,17 @@
+from typing import List
+
+
+def normalize_distances(result_values: List[dict]) -> List[float]:
+    min_value = min(float(result["_distance"]) for result in result_values)
+    max_value = max(float(result["_distance"]) for result in result_values)
+
+    if max_value == min_value:
+        # Avoid division by zero: assign all normalized values to 0 (or any constant value like 1)
+        normalized_values = [0 for _ in result_values]
+    else:
+        normalized_values = [
+            (result["_distance"] - min_value) / (max_value - min_value)
+            for result in result_values
+        ]
+
+    return normalized_values
\ No newline at end of file
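A quick check of what this helper produces, as a usage sketch with invented input rows:

```python
# Illustration of normalize_distances: plain min-max scaling over the
# "_distance" field, with a constant fallback when every distance is equal
# (the division-by-zero guard above). Input values are invented.
from cognee.infrastructure.databases.vector.utils import normalize_distances

rows = [{"_distance": 0.0}, {"_distance": 0.5}, {"_distance": 1.0}]
print(normalize_distances(rows))  # [0.0, 0.5, 1.0]

same = [{"_distance": 0.4}, {"_distance": 0.4}]
print(normalize_distances(same))  # [0, 0]
```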
diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py
index f8ea1c9f0..d08c52fa8 100644
--- a/cognee/infrastructure/engine/models/DataPoint.py
+++ b/cognee/infrastructure/engine/models/DataPoint.py
@@ -11,6 +11,7 @@ class DataPoint(BaseModel):
     __tablename__ = "data_point"
     id: UUID = Field(default_factory = uuid4)
     updated_at: Optional[datetime] = datetime.now(timezone.utc)
+    topological_rank: Optional[int] = 0
     _metadata: Optional[MetaData] = {
         "index_fields": []
     }
diff --git a/cognee/infrastructure/llm/openai/adapter.py b/cognee/infrastructure/llm/openai/adapter.py
index 28cdfff4e..1dc9b70f5 100644
--- a/cognee/infrastructure/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/openai/adapter.py
@@ -87,6 +87,9 @@ class OpenAIAdapter(LLMInterface):
         transcription = litellm.transcription(
             model = self.transcription_model,
             file = Path(input),
+            api_key=self.api_key,
+            api_base=self.endpoint,
+            api_version=self.api_version,
             max_retries = 5,
         )
 
@@ -112,6 +115,9 @@ class OpenAIAdapter(LLMInterface):
                     },
                 ],
             }],
+            api_key=self.api_key,
+            api_base=self.endpoint,
+            api_version=self.api_version,
             max_tokens = 300,
             max_retries = 5,
         )
diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py
index d15d93b73..158fb9d07 100644
--- a/cognee/modules/graph/cognee_graph/CogneeGraph.py
+++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py
@@ -1,9 +1,10 @@
+import heapq
+import numpy as np
 from typing import List, Dict, Union
 
 from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
 from cognee.modules.graph.cognee_graph.CogneeGraphElements import Node, Edge
 from cognee.modules.graph.cognee_graph.CogneeAbstractGraph import CogneeAbstractGraph
-from cognee.infrastructure.databases.graph import get_graph_engine
 
 class CogneeGraph(CogneeAbstractGraph):
     """
@@ -39,26 +42,33 @@ class CogneeGraph(CogneeAbstractGraph):
     def get_node(self, node_id: str) -> Node:
         return self.nodes.get(node_id, None)
 
-    def get_edges(self, node_id: str) -> List[Edge]:
+    def get_edges_of_node(self, node_id: str) -> List[Edge]:
         node = self.get_node(node_id)
         if node:
             return node.skeleton_edges
         else:
             raise ValueError(f"Node with id {node_id} does not exist.")
 
+    def get_edges(self) -> List[Edge]:
+        return self.edges
+
     async def project_graph_from_db(self, adapter: Union[GraphDBInterface],
                                     node_properties_to_project: List[str],
                                     edge_properties_to_project: List[str],
                                     directed = True,
                                     node_dimension = 1,
-                                    edge_dimension = 1) -> None:
+                                    edge_dimension = 1,
+                                    memory_fragment_filter = []) -> None:
 
         if node_dimension < 1 or edge_dimension < 1:
             raise ValueError("Dimensions must be positive integers")
 
         try:
-            nodes_data, edges_data = await adapter.get_graph_data()
+            if len(memory_fragment_filter) == 0:
+                nodes_data, edges_data = await adapter.get_graph_data()
+            else:
+                nodes_data, edges_data = await adapter.get_filtered_graph_data(attribute_filters = memory_fragment_filter)
 
             if not nodes_data:
                 raise ValueError("No node data retrieved from the database.")
@@ -89,3 +99,81 @@ class CogneeGraph(CogneeAbstractGraph):
             print(f"Error projecting graph: {e}")
         except Exception as ex:
             print(f"Unexpected error: {ex}")
+
+    async def map_vector_distances_to_graph_nodes(self, node_distances) -> None:
+        for category, scored_results in node_distances.items():
+            for scored_result in scored_results:
+                node_id = str(scored_result.id)
+                score = scored_result.score
+                node = self.get_node(node_id)
+                if node:
+                    node.add_attribute("vector_distance", score)
+                else:
+                    print(f"Node with id {node_id} not found in the graph.")
+
+    async def map_vector_distances_to_graph_edges(self, vector_engine, query) -> None:
+        # TODO: When we calculate edge embeddings in the vector db, change this similarly to node mapping
+        try:
+            # Step 1: Generate the query embedding
+            query_vector = await vector_engine.embed_data([query])
+            query_vector = query_vector[0]
+            if query_vector is None or len(query_vector) == 0:
+                raise ValueError("Failed to generate query embedding.")
+
+            # Step 2: Collect all unique relationship types
+            unique_relationship_types = set()
+            for edge in self.edges:
+                relationship_type = edge.attributes.get('relationship_type')
+                if relationship_type:
+                    unique_relationship_types.add(relationship_type)
+
+            # Step 3: Embed all unique relationship types
+            unique_relationship_types = list(unique_relationship_types)
+            relationship_type_embeddings = await vector_engine.embed_data(unique_relationship_types)
+
+            # Step 4: Map relationship types to their embeddings and calculate distances
+            embedding_map = {}
+            for relationship_type, embedding in zip(unique_relationship_types, relationship_type_embeddings):
+                edge_vector = np.array(embedding)
+
+                # Calculate cosine similarity
+                similarity = np.dot(query_vector, edge_vector) / (
+                    np.linalg.norm(query_vector) * np.linalg.norm(edge_vector)
+                )
+                distance = 1 - similarity
+
+                # Round the distance to 4 decimal places and store it
+                embedding_map[relationship_type] = round(distance, 4)
+
+            # Step 5: Assign precomputed distances to edges
+            for edge in self.edges:
+                relationship_type = edge.attributes.get('relationship_type')
+                if not relationship_type or relationship_type not in embedding_map:
+                    print(f"Edge {edge} has an unknown or missing relationship type.")
+                    continue
+
+                # Assign the precomputed distance
+                edge.attributes["vector_distance"] = embedding_map[relationship_type]
+
+        except Exception as ex:
+            print(f"Error mapping vector distances to edges: {ex}")
+
+    async def calculate_top_triplet_importances(self, k: int) -> List:
+        min_heap = []
+        for i, edge in enumerate(self.edges):
+            source_node = self.get_node(edge.node1.id)
+            target_node = self.get_node(edge.node2.id)
+
+            source_distance = source_node.attributes.get("vector_distance", 0) if source_node else 0
+            target_distance = target_node.attributes.get("vector_distance", 0) if target_node else 0
+            edge_distance = edge.attributes.get("vector_distance", 0)
+
+            total_distance = source_distance + target_distance + edge_distance
+
+            heapq.heappush(min_heap, (-total_distance, i, edge))
+            if len(min_heap) > k:
+                heapq.heappop(min_heap)
+
+        return [edge for _, _, edge in sorted(min_heap)]
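The bounded-heap pattern in `calculate_top_triplet_importances` is worth seeing in isolation; a minimal sketch with invented distances:

```python
# Standalone sketch of the bounded-heap selection above: pushing negated
# scores and popping whenever the heap exceeds k retains the k smallest
# total distances, i.e. the triplets closest to the query. Distances are
# invented for illustration.
import heapq

distances = [0.9, 0.1, 0.5, 0.3, 0.7]
k, min_heap = 3, []
for i, distance in enumerate(distances):
    heapq.heappush(min_heap, (-distance, i))
    if len(min_heap) > k:
        heapq.heappop(min_heap)  # drops the largest remaining distance

print(sorted(-neg for neg, _ in min_heap))  # [0.1, 0.3, 0.5]
```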
diff --git a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py
index 8235cb24d..cecb0a272 100644
--- a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py
+++ b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py
@@ -1,5 +1,5 @@
 import numpy as np
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Union
 
 class Node:
     """
@@ -21,6 +21,7 @@ class Node:
             raise ValueError("Dimension must be a positive integer")
         self.id = node_id
         self.attributes = attributes if attributes is not None else {}
+        self.attributes["vector_distance"] = float('inf')
         self.skeleton_neighbours = []
         self.skeleton_edges = []
         self.status = np.ones(dimension, dtype=int)
@@ -55,6 +56,12 @@ class Node:
             raise ValueError(f"Dimension {dimension} is out of range. Valid range is 0 to {len(self.status) - 1}.")
         return self.status[dimension] == 1
 
+    def add_attribute(self, key: str, value: Any) -> None:
+        self.attributes[key] = value
+
+    def get_attribute(self, key: str) -> Union[str, int, float]:
+        return self.attributes[key]
+
     def __repr__(self) -> str:
         return f"Node({self.id}, attributes={self.attributes})"
 
@@ -87,6 +94,7 @@ class Edge:
         self.node1 = node1
         self.node2 = node2
         self.attributes = attributes if attributes is not None else {}
+        self.attributes["vector_distance"] = float('inf')
         self.directed = directed
         self.status = np.ones(dimension, dtype=int)
 
@@ -95,6 +103,12 @@ class Edge:
             raise ValueError(f"Dimension {dimension} is out of range. Valid range is 0 to {len(self.status) - 1}.")
         return self.status[dimension] == 1
 
+    def add_attribute(self, key: str, value: Any) -> None:
+        self.attributes[key] = value
+
+    def get_attribute(self, key: str) -> Union[str, int, float]:
+        return self.attributes[key]
+
     def __repr__(self) -> str:
         direction = "->" if self.directed else "--"
         return f"Edge({self.node1.id} {direction} {self.node2.id}, attributes={self.attributes})"
diff --git a/cognee/modules/graph/utils/__init__.py b/cognee/modules/graph/utils/__init__.py
index 6fbe2ee99..c4fa0d654 100644
--- a/cognee/modules/graph/utils/__init__.py
+++ b/cognee/modules/graph/utils/__init__.py
@@ -2,3 +2,4 @@ from .expand_with_nodes_and_edges import expand_with_nodes_and_edges
 from .get_graph_from_model import get_graph_from_model
 from .get_model_instance_from_graph import get_model_instance_from_graph
 from .retrieve_existing_edges import retrieve_existing_edges
+from .convert_node_to_data_point import convert_node_to_data_point
diff --git a/cognee/modules/graph/utils/convert_node_to_data_point.py b/cognee/modules/graph/utils/convert_node_to_data_point.py
new file mode 100644
index 000000000..292f53733
--- /dev/null
+++ b/cognee/modules/graph/utils/convert_node_to_data_point.py
@@ -0,0 +1,23 @@
+from cognee.infrastructure.engine import DataPoint
+
+
+def convert_node_to_data_point(node_data: dict) -> DataPoint:
+    subclass = find_subclass_by_name(DataPoint, node_data["type"])
+
+    return subclass(**node_data)
+
+
+def get_all_subclasses(cls):
+    subclasses = []
+    for subclass in cls.__subclasses__():
+        subclasses.append(subclass)
+        subclasses.extend(get_all_subclasses(subclass))  # Recursively get subclasses
+
+    return subclasses
+
+def find_subclass_by_name(cls, name):
+    for subclass in get_all_subclasses(cls):
+        if subclass.__name__ == name:
+            return subclass
+
+    return None
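This helper resolves the stored `type` string back to a `DataPoint` subclass via a recursive `__subclasses__` walk. A hedged sketch of the round trip, reusing the `CodeFile` model that this patch extends later on:

```python
# Hypothetical rehydration through convert_node_to_data_point. Assumes the
# CodeFile model from cognee/shared/CodeGraphEntities.py has been imported,
# so it is registered as a DataPoint subclass; the node payload is invented.
from cognee.modules.graph.utils import convert_node_to_data_point
from cognee.shared.CodeGraphEntities import CodeFile  # registers the subclass

node_data = {"type": "CodeFile", "extracted_id": "cognee/api/v1/add.py"}
data_point = convert_node_to_data_point(node_data)
assert isinstance(data_point, CodeFile)  # "type" picked the subclass
```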
diff --git a/cognee/modules/graph/utils/get_graph_from_model.py b/cognee/modules/graph/utils/get_graph_from_model.py
index 29137ddc7..d2c6269ab 100644
--- a/cognee/modules/graph/utils/get_graph_from_model.py
+++ b/cognee/modules/graph/utils/get_graph_from_model.py
@@ -2,9 +2,18 @@ from datetime import datetime, timezone
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.storage.utils import copy_model
 
-def get_graph_from_model(data_point: DataPoint, include_root = True, added_nodes = {}, added_edges = {}):
+async def get_graph_from_model(
+    data_point: DataPoint,
+    include_root = True,
+    added_nodes = None,
+    added_edges = None,
+    visited_properties = None,
+):
     nodes = []
     edges = []
+    added_nodes = added_nodes or {}
+    added_edges = added_edges or {}
+    visited_properties = visited_properties or {}
 
     data_point_properties = {}
     excluded_properties = set()
@@ -13,10 +22,27 @@
         if field_name == "_metadata":
             continue
 
+        if field_value is None:
+            excluded_properties.add(field_name)
+            continue
+
         if isinstance(field_value, DataPoint):
             excluded_properties.add(field_name)
 
-            property_nodes, property_edges = get_graph_from_model(field_value, True, added_nodes, added_edges)
+            property_key = f"{str(data_point.id)}{field_name}{str(field_value.id)}"
+
+            if property_key in visited_properties:
+                return [], []
+
+            visited_properties[property_key] = 0
+
+            property_nodes, property_edges = await get_graph_from_model(
+                field_value,
+                True,
+                added_nodes,
+                added_edges,
+                visited_properties,
+            )
 
             for node in property_nodes:
                 if str(node.id) not in added_nodes:
@@ -47,7 +73,20 @@
             excluded_properties.add(field_name)
 
             for item in field_value:
-                property_nodes, property_edges = get_graph_from_model(item, True, added_nodes, added_edges)
+                property_key = f"{str(data_point.id)}{field_name}{str(item.id)}"
+
+                if property_key in visited_properties:
+                    return [], []
+
+                visited_properties[property_key] = 0
+
+                property_nodes, property_edges = await get_graph_from_model(
+                    item,
+                    True,
+                    added_nodes,
+                    added_edges,
+                    visited_properties,
+                )
 
                 for node in property_nodes:
                     if str(node.id) not in added_nodes:
diff --git a/cognee/pipelines/__init__.py b/cognee/pipelines/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/cognee/pipelines/retriever/__init__.py b/cognee/pipelines/retriever/__init__.py
new file mode 100644
index 000000000..e69de29bb
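The new `visited_properties` map is what lets the traversal survive self-referential models such as `CodeFile.depends_on` pointing back at an ancestor. A simplified sketch of the pattern, not the real signature:

```python
# Simplified illustration of the cycle guard added to get_graph_from_model:
# each (parent id, field, child id) property edge is recorded once, so a
# model graph with a back-reference stops recursing instead of looping
# forever. Plain dicts stand in for DataPoint instances.
def walk(obj, visited=None):
    visited = visited or set()
    edges = []
    for field, child in obj.get("children", {}).items():
        key = (obj["id"], field, child["id"])
        if key in visited:
            continue  # this property edge was already expanded
        visited.add(key)
        edges.append(key)
        edges.extend(walk(child, visited))
    return edges

a = {"id": "a", "children": {}}
b = {"id": "b", "children": {"parent": a}}
a["children"]["child"] = b  # cycle: a -> b -> a
print(walk(a))              # terminates with exactly two edges
```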
+
+    own_document_ids = await get_document_ids_for_user(user.id)
+    retrieved_results = await run_diffusion_retriever(query, user)
+
+    return retrieved_results
+
+async def run_diffusion_retriever(query: str, user, community_filter = []) -> list:
+    raise NotImplementedError
\ No newline at end of file
diff --git a/cognee/pipelines/retriever/g_retriever.py b/cognee/pipelines/retriever/g_retriever.py
new file mode 100644
index 000000000..4b319acd9
--- /dev/null
+++ b/cognee/pipelines/retriever/g_retriever.py
@@ -0,0 +1,25 @@
+from uuid import UUID
+from enum import Enum
+from typing import Callable, Dict
+from cognee.shared.utils import send_telemetry
+from cognee.modules.users.models import User
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.users.permissions.methods import get_document_ids_for_user
+
+async def g_retriever(query: Dict[str, str], user: User = None) -> list:
+    if user is None:
+        user = await get_default_user()
+
+    if user is None:
+        raise PermissionError("No user found in the system. Please create a user.")
+
+    own_document_ids = await get_document_ids_for_user(user.id)
+    retrieved_results = await run_g_retriever(query, user)
+
+    return retrieved_results
+
+async def run_g_retriever(query: str, user, community_filter = []) -> list:
+    raise NotImplementedError
\ No newline at end of file
diff --git a/cognee/pipelines/retriever/two_steps_retriever.py b/cognee/pipelines/retriever/two_steps_retriever.py
new file mode 100644
index 000000000..92ef2be2e
--- /dev/null
+++ b/cognee/pipelines/retriever/two_steps_retriever.py
@@ -0,0 +1,119 @@
+import asyncio
+from uuid import UUID
+from enum import Enum
+from typing import Callable, Dict
+from cognee.shared.utils import send_telemetry
+from cognee.modules.users.models import User
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.users.permissions.methods import get_document_ids_for_user
+from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
+from cognee.infrastructure.databases.vector import get_vector_engine
+from cognee.infrastructure.databases.graph import get_graph_engine
+
+
+def format_triplets(edges):
+    def filter_attributes(obj, attributes):
+        """Helper function to filter out non-None properties, including nested dicts."""
+        result = {}
+        for attr in attributes:
+            value = getattr(obj, attr, None)
+            if value is not None:
+                # If the value is a dict, extract relevant keys from it
+                if isinstance(value, dict):
+                    nested_values = {k: v for k, v in value.items() if k in attributes and v is not None}
+                    result[attr] = nested_values
+                else:
+                    result[attr] = value
+        return result
+
+    triplets = []
+    for edge in edges:
+        node1 = edge.node1
+        node2 = edge.node2
+        edge_attributes = edge.attributes
+        node1_attributes = node1.attributes
+        node2_attributes = node2.attributes
+
+        # Keep only non-None properties
+        node1_info = {key: value for key, value in node1_attributes.items() if value is not None}
+        node2_info = {key: value for key, value in node2_attributes.items() if value is not None}
+        edge_info = {key: value for key, value in edge_attributes.items() if value is not None}
+
+        # Create the formatted triplet
+        triplet = (
+            f"Node1: {node1_info}\n"
+            f"Edge: {edge_info}\n"
+            f"Node2: {node2_info}\n\n\n"  # Three blank lines separate consecutive triplets
+        )
+        triplets.append(triplet)
+
+    return "".join(triplets)
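For illustration, one retrieved edge renders roughly as follows (values hypothetical):

# Node1: {'id': '...', 'name': 'natural language processing', 'vector_distance': 0.12}
# Edge: {'relationship_name': 'is_a_subfield_of', 'vector_distance': 0.2}
# Node2: {'id': '...', 'name': 'computer science', 'vector_distance': 0.31}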
+
+
+async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
+    if user is None:
+        user = await get_default_user()
+
+    if user is None:
+        raise PermissionError("No user found in the system. Please create a user.")
+
+    own_document_ids = await get_document_ids_for_user(user.id)
+    retrieved_results = await run_two_step_retriever(query, user)
+
+    return retrieved_results
+
+
+def delete_duplicated_vector_db_elements(collections, results):  # TODO: temporary workaround for vector db duplicates
+    results_dict = {}
+    for collection, collection_results in zip(collections, results):
+        seen_ids = set()
+        unique_results = []
+        for result in collection_results:
+            if result.id not in seen_ids:
+                unique_results.append(result)
+                seen_ids.add(result.id)
+            else:
+                print(f"Duplicate found in collection '{collection}': {result.id}")
+        results_dict[collection] = unique_results
+
+    return results_dict
+
+
+async def run_two_step_retriever(query: str, user, community_filter = []) -> list:
+    vector_engine = get_vector_engine()
+    graph_engine = await get_graph_engine()
+
+    collections = ["Entity_name", "TextSummary_text", "EntityType_name", "DocumentChunk_text"]
+    results = await asyncio.gather(
+        *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections]
+    )
+
+    # Quick fix until the vector db duplication issue is resolved;
+    # TODO: build the dict directly from results once the vector db is fixed
+    node_distances = delete_duplicated_vector_db_elements(collections, results)
+
+    memory_fragment = CogneeGraph()
+
+    await memory_fragment.project_graph_from_db(
+        graph_engine,
+        node_properties_to_project=["id", "description", "name", "type", "text"],
+        edge_properties_to_project=["id", "relationship_name"],
+    )
+
+    await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances)
+
+    # TODO: these distances should also come from the vector db
+    await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)
+
+    results = await memory_fragment.calculate_top_triplet_importances(k=5)
+
+    print(format_triplets(results))
+    print(f"Query was the following: {query}")
+
+    return results
diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py
index 9052cf89e..4811106e5 100644
--- a/cognee/shared/CodeGraphEntities.py
+++ b/cognee/shared/CodeGraphEntities.py
@@ -1,13 +1,28 @@
+from typing import List, Optional
 from cognee.infrastructure.engine import DataPoint

 class Repository(DataPoint):
     path: str
+    type: Optional[str] = "Repository"

 class CodeFile(DataPoint):
     extracted_id: str  # actually file path
+    type: Optional[str] = "CodeFile"
+    source_code: Optional[str] = None
+    part_of: Optional[Repository] = None
+    depends_on: Optional[List["CodeFile"]] = None
+    depends_directly_on: Optional[List["CodeFile"]] = None
+    contains: Optional[List["CodePart"]] = None
+
+    _metadata: dict = {
+        "index_fields": ["source_code"]
+    }
+
+class CodePart(DataPoint):
-    type: str
+    # part_of: Optional[CodeFile]
     source_code: str
-    part_of: Repository
+    type: Optional[str] = "CodePart"

     _metadata: dict = {
         "index_fields": ["source_code"]
@@ -18,3 +33,6 @@ class CodeRelationship(DataPoint):
     target_id: str
     type: str  # between files
     relation: str  # depends on or depends directly
+
+CodeFile.model_rebuild()
+CodePart.model_rebuild()
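Because CodeFile and CodePart reference each other through forward references, the model_rebuild() calls above must run once both classes exist. A minimal construction sketch (ids and paths hypothetical, assuming DataPoint supplies a default id):

from uuid import NAMESPACE_OID, uuid5
from cognee.shared.CodeGraphEntities import CodeFile, CodePart, Repository

repo = Repository(id=uuid5(NAMESPACE_OID, "repo"), path="/tmp/repo")
code_file = CodeFile(
    id=uuid5(NAMESPACE_OID, "main.py"),
    extracted_id="main.py",
    source_code="print('hello')",
    part_of=repo,
)
code_file.contains = [CodePart(source_code="print('hello')")]  # id defaulted by DataPoint

diff --git a/cognee/tasks/code/enrich_dependency_graph_checker.py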
b/cognee/tasks/code/enrich_dependency_graph_checker.py index 3e56fab59..7b04e0357 100644 --- a/cognee/tasks/code/enrich_dependency_graph_checker.py +++ b/cognee/tasks/code/enrich_dependency_graph_checker.py @@ -1,7 +1,7 @@ import os import asyncio import argparse -from cognee.tasks.repo_processor.get_repo_dependency_graph import get_repo_dependency_graph +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph @@ -15,7 +15,7 @@ def main(): print(f"Error: The provided repository path does not exist: {repo_path}") return - graph = asyncio.run(get_repo_dependency_graph(repo_path)) + graph = asyncio.run(get_repo_file_dependencies(repo_path)) graph = asyncio.run(enrich_dependency_graph(graph)) for node in graph.nodes: print(f"Node: {node}") diff --git a/cognee/tasks/code/expand_dependency_graph_checker.py b/cognee/tasks/code/expand_dependency_graph_checker.py index 1b3ce3246..bdef34162 100644 --- a/cognee/tasks/code/expand_dependency_graph_checker.py +++ b/cognee/tasks/code/expand_dependency_graph_checker.py @@ -1,7 +1,7 @@ import os import asyncio import argparse -from cognee.tasks.repo_processor.get_repo_dependency_graph import get_repo_dependency_graph +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph from cognee.tasks.repo_processor.expand_dependency_graph import expand_dependency_graph @@ -16,7 +16,7 @@ def main(): print(f"Error: The provided repository path does not exist: {repo_path}") return - graph = asyncio.run(get_repo_dependency_graph(repo_path)) + graph = asyncio.run(get_repo_file_dependencies(repo_path)) graph = asyncio.run(enrich_dependency_graph(graph)) graph = expand_dependency_graph(graph) for node in graph.nodes: diff --git a/cognee/tasks/code/get_repo_dependency_graph_checker.py b/cognee/tasks/code/get_repo_dependency_graph_checker.py index 57afc7274..3a393d3f3 100644 --- a/cognee/tasks/code/get_repo_dependency_graph_checker.py +++ b/cognee/tasks/code/get_repo_dependency_graph_checker.py @@ -1,7 +1,7 @@ import os import asyncio import argparse -from cognee.tasks.repo_processor.get_repo_dependency_graph import get_repo_dependency_graph +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies def main(): @@ -14,7 +14,7 @@ def main(): print(f"Error: The provided repository path does not exist: {repo_path}") return - graph = asyncio.run(get_repo_dependency_graph(repo_path)) + graph = asyncio.run(get_repo_file_dependencies(repo_path)) for node in graph.nodes: print(f"Node: {node}") diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index d881514a2..8ee87bcad 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -1,16 +1,51 @@ from cognee.modules.data.models import Data -from cognee.modules.data.processing.document_types import Document, PdfDocument, AudioDocument, ImageDocument, TextDocument +from cognee.modules.data.processing.document_types import ( + Document, + PdfDocument, + AudioDocument, + ImageDocument, + TextDocument, +) EXTENSION_TO_DOCUMENT_CLASS = { - "pdf": PdfDocument, - "audio": AudioDocument, - "image": ImageDocument, - "txt": TextDocument + "pdf": PdfDocument, # Text documents + "txt": TextDocument, + "png": ImageDocument, # Image documents + "dwg": 
ImageDocument, + "xcf": ImageDocument, + "jpg": ImageDocument, + "jpx": ImageDocument, + "apng": ImageDocument, + "gif": ImageDocument, + "webp": ImageDocument, + "cr2": ImageDocument, + "tif": ImageDocument, + "bmp": ImageDocument, + "jxr": ImageDocument, + "psd": ImageDocument, + "ico": ImageDocument, + "heic": ImageDocument, + "avif": ImageDocument, + "aac": AudioDocument, # Audio documents + "mid": AudioDocument, + "mp3": AudioDocument, + "m4a": AudioDocument, + "ogg": AudioDocument, + "flac": AudioDocument, + "wav": AudioDocument, + "amr": AudioDocument, + "aiff": AudioDocument, } + def classify_documents(data_documents: list[Data]) -> list[Document]: documents = [ - EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location, name=data_item.name) + EXTENSION_TO_DOCUMENT_CLASS[data_item.extension]( + id=data_item.id, + title=f"{data_item.name}.{data_item.extension}", + raw_data_location=data_item.raw_data_location, + name=data_item.name, + ) for data_item in data_documents ] return documents diff --git a/cognee/tasks/graph/convert_graph_from_code_graph.py b/cognee/tasks/graph/convert_graph_from_code_graph.py deleted file mode 100644 index bc8544994..000000000 --- a/cognee/tasks/graph/convert_graph_from_code_graph.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -import networkx as nx - -from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository -from cognee.tasks.storage import add_data_points - - -async def convert_graph_from_code_graph( - graph: nx.DiGraph, repo_path: str -) -> tuple[str, list[CodeFile], list[CodeRelationship]]: - code_objects = code_objects_from_di_graph(graph, repo_path) - - add_data_points(code_objects) - - return code_objects - - -def create_code_file(path, type, repo): - abspath = os.path.abspath(path) - - with open(abspath, "r") as f: - source_code = f.read() - - code_file = CodeFile( - extracted_id = abspath, - type = type, - source_code = source_code, - part_of = repo, - ) - - return code_file - - -def code_objects_from_di_graph( - graph: nx.DiGraph, repo_path: str -) -> tuple[Repository, list[CodeFile], list[CodeRelationship]]: - repo = Repository(path=repo_path) - - code_files = [ - create_code_file(os.path.join(repo_path, path), "python_file", repo) - for path in graph.nodes - ] - - code_relationships = [ - CodeRelationship( - os.path.join(repo_path, source), - os.path.join(repo_path, target), - "python_file", - graph.get_edge_data(source, target)["relation"], - ) - for source, target in graph.edges - ] - - return (repo, code_files, code_relationships) diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index ad6ae19d2..2bf4aeba4 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -24,11 +24,13 @@ async def extract_graph_from_data( (chunk, chunk_graph) for chunk, chunk_graph in zip(data_chunks, chunk_graphs) ] existing_edges_map = await retrieve_existing_edges( - chunk_and_chunk_graphs, graph_engine + chunk_and_chunk_graphs, + graph_engine, ) graph_nodes, graph_edges = expand_with_nodes_and_edges( - chunk_and_chunk_graphs, existing_edges_map + chunk_and_chunk_graphs, + existing_edges_map, ) if len(graph_nodes) > 0: diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index a1aeabcdc..05e111b29 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ 
b/cognee/tasks/repo_processor/__init__.py @@ -4,4 +4,4 @@ logger = logging.getLogger("task:repo_processor") from .enrich_dependency_graph import enrich_dependency_graph from .expand_dependency_graph import expand_dependency_graph -from .get_repo_dependency_graph import get_repo_dependency_graph +from .get_repo_file_dependencies import get_repo_file_dependencies diff --git a/cognee/tasks/repo_processor/enrich_dependency_graph.py b/cognee/tasks/repo_processor/enrich_dependency_graph.py index c3aa9fd74..ba222ef3f 100644 --- a/cognee/tasks/repo_processor/enrich_dependency_graph.py +++ b/cognee/tasks/repo_processor/enrich_dependency_graph.py @@ -1,25 +1,38 @@ +import asyncio import networkx as nx from typing import Dict, List +from tqdm.asyncio import tqdm + +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodeFile +from cognee.modules.graph.utils import get_graph_from_model, convert_node_to_data_point +from cognee.infrastructure.databases.graph import get_graph_engine def topologically_sort_subgraph(subgraph_node_to_indegree: Dict[str, int], graph: nx.DiGraph) -> List[str]: """Performs a topological sort on a subgraph based on node indegrees.""" results = [] remaining_nodes = subgraph_node_to_indegree.copy() + while remaining_nodes: next_node = min(remaining_nodes, key=remaining_nodes.get) results.append(next_node) + for successor in graph.successors(next_node): if successor in remaining_nodes: remaining_nodes[successor] -= 1 + remaining_nodes.pop(next_node) + return results def topologically_sort(graph: nx.DiGraph) -> List[str]: """Performs a topological sort on the entire graph.""" subgraphs = (graph.subgraph(c).copy() for c in nx.weakly_connected_components(graph)) + topological_order = [] + for subgraph in subgraphs: node_to_indegree = { node: len(list(subgraph.successors(node))) @@ -28,29 +41,84 @@ def topologically_sort(graph: nx.DiGraph) -> List[str]: topological_order.extend( topologically_sort_subgraph(node_to_indegree, subgraph) ) + return topological_order -def node_enrich_and_connect(graph: nx.MultiDiGraph, topological_order: List[str], node: str) -> None: +async def node_enrich_and_connect( + graph: nx.MultiDiGraph, + topological_order: List[str], + node: CodeFile, + data_points_map: Dict[str, DataPoint], +) -> None: """Adds 'depends_on' edges to the graph based on topological order.""" - topological_rank = topological_order.index(node) - graph.nodes[node]['topological_rank'] = topological_rank - node_descendants = nx.descendants(graph, node) - if graph.has_edge(node,node): - node_descendants.add(node) - for desc in node_descendants: - if desc not in topological_order[:topological_rank+1]: + topological_rank = topological_order.index(node.id) + node.topological_rank = topological_rank + node_descendants = nx.descendants(graph, node.id) + + if graph.has_edge(node.id, node.id): + node_descendants.add(node.id) + + new_connections = [] + graph_engine = await get_graph_engine() + + for desc_id in node_descendants: + if desc_id not in topological_order[:topological_rank + 1]: continue - graph.add_edge(node, desc, relation='depends_on') -async def enrich_dependency_graph(graph: nx.DiGraph) -> nx.MultiDiGraph: + if desc_id in data_points_map: + desc = data_points_map[desc_id] + else: + node_data = await graph_engine.extract_node(desc_id) + desc = convert_node_to_data_point(node_data) + + new_connections.append(desc) + + node.depends_directly_on = node.depends_directly_on or [] + node.depends_directly_on.extend(new_connections) + + +async def 
enrich_dependency_graph(data_points: list[DataPoint]) -> list[DataPoint]: """Enriches the graph with topological ranks and 'depends_on' edges.""" - graph = nx.MultiDiGraph(graph) + nodes = [] + edges = [] + + for data_point in data_points: + graph_nodes, graph_edges = await get_graph_from_model(data_point) + nodes.extend(graph_nodes) + edges.extend(graph_edges) + + graph = nx.MultiDiGraph() + + simple_nodes = [(node.id, node.model_dump()) for node in nodes] + + graph.add_nodes_from(simple_nodes) + + graph.add_edges_from(edges) + topological_order = topologically_sort(graph) + node_rank_map = {node: idx for idx, node in enumerate(topological_order)} - for node in graph.nodes: - if node not in node_rank_map: + + # for node_id, node in tqdm(graph.nodes(data = True), desc = "Enriching dependency graph", unit = "node"): + # if node_id not in node_rank_map: + # continue + + # data_points.append(node_enrich_and_connect(graph, topological_order, node)) + + data_points_map = {data_point.id: data_point for data_point in data_points} + data_points_futures = [] + + for data_point in tqdm(data_points, desc = "Enriching dependency graph", unit = "data_point"): + if data_point.id not in node_rank_map: continue - node_enrich_and_connect(graph, topological_order, node) - return graph + + if isinstance(data_point, CodeFile): + data_points_futures.append(node_enrich_and_connect(graph, topological_order, data_point, data_points_map)) + + # yield data_point + + await asyncio.gather(*data_points_futures) + + return data_points diff --git a/cognee/tasks/repo_processor/expand_dependency_graph.py b/cognee/tasks/repo_processor/expand_dependency_graph.py index a6b724302..722bfa5c6 100644 --- a/cognee/tasks/repo_processor/expand_dependency_graph.py +++ b/cognee/tasks/repo_processor/expand_dependency_graph.py @@ -1,28 +1,43 @@ -import networkx as nx - +from uuid import NAMESPACE_OID, uuid5 +# from tqdm import tqdm +from cognee.infrastructure.engine import DataPoint +from cognee.shared.CodeGraphEntities import CodeFile, CodePart from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts from cognee.tasks.repo_processor import logger - -def _add_code_parts_nodes_and_edges(graph, parent_node_id, part_type, code_parts): +def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None: """Add code part nodes and edges for a specific part type.""" if not code_parts: - logger.debug(f"No code parts to add for parent_node_id {parent_node_id} and part_type {part_type}.") + logger.debug(f"No code parts to add for node {code_file.id} and part_type {part_type}.") return + part_nodes = [] + for idx, code_part in enumerate(code_parts): if not code_part.strip(): - logger.warning(f"Empty code part in parent_node_id {parent_node_id} and part_type {part_type}.") + logger.warning(f"Empty code part in node {code_file.id} and part_type {part_type}.") continue - part_node_id = f"{parent_node_id}_{part_type}_{idx}" - graph.add_node(part_node_id, source_code=code_part, node_type=part_type) - graph.add_edge(parent_node_id, part_node_id, relation="contains") + + part_node_id = uuid5(NAMESPACE_OID, f"{code_file.id}_{part_type}_{idx}") + + part_nodes.append(CodePart( + id = part_node_id, + type = part_type, + # part_of = code_file, + source_code = code_part, + )) + + # graph.add_node(part_node_id, source_code=code_part, node_type=part_type) + # graph.add_edge(parent_node_id, part_node_id, relation="contains") + + code_file.contains = code_file.contains or [] + code_file.contains.extend(part_nodes) -def 
_process_single_node(graph, node_id, node_data): +def _process_single_node(code_file: CodeFile) -> None: """Process a single Python file node.""" - graph.nodes[node_id]["node_type"] = "python_file" - source_code = node_data.get("source_code", "") + node_id = code_file.id + source_code = code_file.source_code if not source_code.strip(): logger.warning(f"Node {node_id} has no or empty 'source_code'. Skipping.") @@ -35,15 +50,14 @@ def _process_single_node(graph, node_id, node_data): return for part_type, code_parts in code_parts_dict.items(): - _add_code_parts_nodes_and_edges(graph, node_id, part_type, code_parts) + _add_code_parts_nodes_and_edges(code_file, part_type, code_parts) -def expand_dependency_graph(graph: nx.MultiDiGraph) -> nx.MultiDiGraph: +async def expand_dependency_graph(data_points: list[DataPoint]) -> list[DataPoint]: """Process Python file nodes, adding code part nodes and edges.""" - expanded_graph = graph.copy() - for node_id, node_data in graph.nodes(data=True): - if not node_data: # Check if node_data is empty - logger.warning(f"Node {node_id} has no data. Skipping.") - continue - _process_single_node(expanded_graph, node_id, node_data) - return expanded_graph + # for data_point in tqdm(data_points, desc = "Expand dependency graph", unit = "data_point"): + for data_point in data_points: + if isinstance(data_point, CodeFile): + _process_single_node(data_point) + + return data_points diff --git a/cognee/tasks/repo_processor/get_repo_dependency_graph.py b/cognee/tasks/repo_processor/get_repo_dependency_graph.py deleted file mode 100644 index 7f96bd49c..000000000 --- a/cognee/tasks/repo_processor/get_repo_dependency_graph.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import aiofiles -import networkx as nx - -from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies - - -async def get_py_path_and_source(file_path, repo_path): - relative_path = os.path.relpath(file_path, repo_path) - try: - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - source_code = await f.read() - return relative_path, source_code - except Exception as e: - print(f"Error reading file {file_path}: {e}") - return relative_path, None - - -async def get_py_files_dict(repo_path): - """Get .py files and their source code""" - if not os.path.exists(repo_path): - return {} - - py_files_paths = ( - os.path.join(root, file) - for root, _, files in os.walk(repo_path) for file in files if file.endswith(".py") - ) - - py_files_dict = {} - for file_path in py_files_paths: - relative_path, source_code = await get_py_path_and_source(file_path, repo_path) - py_files_dict[relative_path] = {"source_code": source_code} - - return py_files_dict - - -def get_edge(file_path: str, dependency: str, repo_path: str, relative_paths: bool = True) -> tuple: - if relative_paths: - file_path = os.path.relpath(file_path, repo_path) - dependency = os.path.relpath(dependency, repo_path) - return (file_path, dependency, {"relation": "depends_directly_on"}) - - -async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph: - """Generate a dependency graph for Python files in the given repository path.""" - py_files_dict = await get_py_files_dict(repo_path) - - dependency_graph = nx.DiGraph() - - dependency_graph.add_nodes_from(py_files_dict.items()) - - for file_path, metadata in py_files_dict.items(): - source_code = metadata.get("source_code") - if source_code is None: - continue - - dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path) 
-        dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies]
-        dependency_graph.add_edges_from(dependency_edges)
-
-    return dependency_graph
diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
new file mode 100644
index 000000000..58f3857a9
--- /dev/null
+++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
@@ -0,0 +1,87 @@
+import os
+from uuid import NAMESPACE_OID, uuid5
+import aiofiles
+from tqdm import tqdm
+
+from cognee.infrastructure.engine import DataPoint
+from cognee.shared.CodeGraphEntities import CodeFile, Repository
+from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
+
+
+async def get_py_path_and_source(file_path):
+    try:
+        async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
+            source_code = await f.read()
+        return file_path, source_code
+    except Exception as e:
+        print(f"Error reading file {file_path}: {e}")
+        return file_path, None
+
+
+async def get_py_files_dict(repo_path):
+    """Get .py files and their source code"""
+    if not os.path.exists(repo_path):
+        return {}
+
+    py_files_paths = (
+        os.path.join(root, file)
+        for root, _, files in os.walk(repo_path) for file in files if file.endswith(".py")
+    )
+
+    py_files_dict = {}
+    for file_path in py_files_paths:
+        absolute_path = os.path.abspath(file_path)
+        path, source_code = await get_py_path_and_source(absolute_path)
+        py_files_dict[path] = {"source_code": source_code}
+
+    return py_files_dict
+
+
+def get_edge(file_path: str, dependency: str, repo_path: str, relative_paths: bool = False) -> tuple:
+    if relative_paths:
+        file_path = os.path.relpath(file_path, repo_path)
+        dependency = os.path.relpath(dependency, repo_path)
+    return (file_path, dependency, {"relation": "depends_directly_on"})
+
+
+async def get_repo_file_dependencies(repo_path: str) -> list[DataPoint]:
+    """Generate dependency data points for Python files in the given repository path."""
+    py_files_dict = await get_py_files_dict(repo_path)
+
+    repo = Repository(
+        id = uuid5(NAMESPACE_OID, repo_path),
+        path = repo_path,
+    )
+
+    data_points = [repo]
+
+    for file_path, metadata in tqdm(py_files_dict.items(), desc="Repo dependency graph", unit="file"):
+        source_code = metadata.get("source_code")
+        if source_code is None:
+            continue
+
+        dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path)
+
+        data_points.append(CodeFile(
+            id = uuid5(NAMESPACE_OID, file_path),
+            source_code = source_code,
+            extracted_id = file_path,
+            part_of = repo,
+            depends_on = [
+                CodeFile(
+                    id = uuid5(NAMESPACE_OID, dependency),
+                    extracted_id = dependency,
+                    part_of = repo,
+                ) for dependency in dependencies
+            ] if len(dependencies) else None,
+        ))
+
+    return data_points
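A minimal driver sketch for the new entry point (the repository path is hypothetical):

import asyncio
from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies

async def main():
    data_points = await get_repo_file_dependencies("/path/to/repo")
    # The first data point is the Repository; the rest are CodeFile nodes.
    for data_point in data_points:
        print(type(data_point).__name__)

asyncio.run(main())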
diff --git a/cognee/tasks/storage/add_data_points.py b/cognee/tasks/storage/add_data_points.py
index b803c9dfd..33f9d7a70 100644
--- a/cognee/tasks/storage/add_data_points.py
+++ b/cognee/tasks/storage/add_data_points.py
@@ -1,3 +1,4 @@
+import asyncio
 from cognee.infrastructure.engine import DataPoint
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.utils import get_graph_from_model
@@ -8,11 +9,13 @@ async def add_data_points(data_points: list[DataPoint]):
     nodes = []
     edges = []

-    for data_point in data_points:
-        property_nodes, property_edges = get_graph_from_model(data_point)
+    results = await asyncio.gather(*[
+        get_graph_from_model(data_point) for data_point in data_points
+    ])

-        nodes.extend(property_nodes)
-        edges.extend(property_edges)
+    for result_nodes, result_edges in results:
+        nodes.extend(result_nodes)
+        edges.extend(result_edges)

     graph_engine = await get_graph_engine()
diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py
index 12903173a..03ad30f9d 100644
--- a/cognee/tasks/storage/index_data_points.py
+++ b/cognee/tasks/storage/index_data_points.py
@@ -16,6 +16,9 @@ async def index_data_points(data_points: list[DataPoint]):
         data_point_type = type(data_point)

         for field_name in data_point._metadata["index_fields"]:
+            if getattr(data_point, field_name, None) is None:
+                continue
+
             index_name = f"{data_point_type.__tablename__}.{field_name}"

             if index_name not in created_indexes:
@@ -35,12 +38,21 @@ async def index_data_points(data_points: list[DataPoint]):
     return data_points

-def get_data_points_from_model(data_point: DataPoint, added_data_points = {}) -> list[DataPoint]:
+def get_data_points_from_model(data_point: DataPoint, added_data_points = None, visited_properties = None) -> list[DataPoint]:
     data_points = []
+    added_data_points = added_data_points or {}
+    visited_properties = visited_properties or {}

     for field_name, field_value in data_point:
         if isinstance(field_value, DataPoint):
-            new_data_points = get_data_points_from_model(field_value, added_data_points)
+            property_key = f"{str(data_point.id)}{field_name}{str(field_value.id)}"
+
+            if property_key in visited_properties:
+                continue
+
+            visited_properties[property_key] = True
+
+            new_data_points = get_data_points_from_model(field_value, added_data_points, visited_properties)

             for new_point in new_data_points:
                 if str(new_point.id) not in added_data_points:
@@ -49,7 +61,14 @@
         if isinstance(field_value, list) and len(field_value) > 0 and isinstance(field_value[0], DataPoint):
             for field_value_item in field_value:
-                new_data_points = get_data_points_from_model(field_value_item, added_data_points)
+                property_key = f"{str(data_point.id)}{field_name}{str(field_value_item.id)}"
+
+                if property_key in visited_properties:
+                    continue
+
+                visited_properties[property_key] = True
+
+                new_data_points = get_data_points_from_model(field_value_item, added_data_points, visited_properties)

                 for new_point in new_data_points:
                     if str(new_point.id) not in added_data_points:
@@ -79,4 +98,3 @@ if __name__ == "__main__":
     data_points = get_data_points_from_model(person)

     print(data_points)
-
\ No newline at end of file
diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py
index 31b86d325..277081f40 100644
--- a/cognee/tasks/summarization/summarize_code.py
+++ b/cognee/tasks/summarization/summarize_code.py
@@ -4,6 +4,7 @@ from uuid import uuid5

 from pydantic import BaseModel

+from cognee.infrastructure.engine import DataPoint
 from cognee.modules.data.extraction.extract_summary import extract_summary
 from cognee.shared.CodeGraphEntities import CodeFile
 from cognee.tasks.storage import add_data_points
@@ -12,13 +13,16 @@ from .models import CodeSummary

 async def summarize_code(
-    code_files: list[CodeFile],
summarization_model: Type[BaseModel] -) -> list[CodeFile]: + code_files: list[DataPoint], + summarization_model: Type[BaseModel], +) -> list[DataPoint]: if len(code_files) == 0: return code_files + code_files_data_points = [file for file in code_files if isinstance(file, CodeFile)] + file_summaries = await asyncio.gather( - *[extract_summary(file.source_code, summarization_model) for file in code_files] + *[extract_summary(file.source_code, summarization_model) for file in code_files_data_points] ) summaries = [ @@ -27,9 +31,9 @@ async def summarize_code( made_from = file, text = file_summaries[file_index].summary, ) - for (file_index, file) in enumerate(code_files) + for (file_index, file) in enumerate(code_files_data_points) ] await add_data_points(summaries) - return code_files, summaries + return code_files diff --git a/cognee/tests/tasks/graph/code_graph_test_data_generation.py b/cognee/tests/tasks/graph/code_graph_test_data_generation.py deleted file mode 100644 index 74ca2de71..000000000 --- a/cognee/tests/tasks/graph/code_graph_test_data_generation.py +++ /dev/null @@ -1,51 +0,0 @@ -import random -import string - -import numpy as np - -from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship - - -def random_str(n, spaces=True): - candidates = string.ascii_letters + string.digits - if spaces: - candidates += " " - return "".join(random.choice(candidates) for _ in range(n)) - - -def code_graph_test_data_generation(): - nodes = [ - CodeFile( - extracted_id=random_str(10, spaces=False), - type="file", - source_code=random_str(random.randrange(50, 500)), - ) - for _ in range(100) - ] - n_nodes = len(nodes) - first_source = np.random.randint(0, n_nodes) - reached_nodes = {first_source} - last_iteration = [first_source] - edges = [] - while len(reached_nodes) < n_nodes: - for source in last_iteration: - last_iteration = [] - tries = 0 - while ((len(last_iteration) == 0 or tries < 500)) and ( - len(reached_nodes) < n_nodes - ): - tries += 1 - target = np.random.randint(n_nodes) - if target not in reached_nodes: - last_iteration.append(target) - edges.append( - CodeRelationship( - source_id=nodes[source].extracted_id, - target_id=nodes[target].extracted_id, - type="files", - relation="depends", - ) - ) - reached_nodes = reached_nodes.union(set(last_iteration)) - - return (nodes, edges) diff --git a/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py b/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py deleted file mode 100644 index 755840b01..000000000 --- a/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py +++ /dev/null @@ -1,27 +0,0 @@ -import asyncio - -import pytest - -from cognee.shared.CodeGraphEntities import Repository -from cognee.tasks.graph.convert_graph_from_code_graph import ( - convert_graph_from_code_graph, -) -from cognee.tests.tasks.graph.code_graph_test_data_generation import ( - code_graph_test_data_generation, -) - - -def test_convert_graph_from_code_graph(): - repo = Repository(path="test/repo/path") - nodes, edges = code_graph_test_data_generation() - repo_out, nodes_out, edges_out = asyncio.run( - convert_graph_from_code_graph(repo, nodes, edges) - ) - - assert repo == repo_out, f"{repo = } != {repo_out = }" - - for node_in, node_out in zip(nodes, nodes_out): - assert node_in == node_out, f"{node_in = } != {node_out = }" - - for edge_in, edge_out in zip(edges, edges_out): - assert edge_in == edge_out, f"{edge_in = } != {edge_out = }" diff --git a/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py 
b/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py new file mode 100644 index 000000000..f75b84826 --- /dev/null +++ b/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py @@ -0,0 +1,100 @@ +import asyncio +import random +import time +from typing import List +from uuid import uuid5, NAMESPACE_OID + +from cognee.infrastructure.engine import DataPoint +from cognee.modules.graph.utils import get_graph_from_model + +random.seed(1500) + +class Repository(DataPoint): + path: str + +class CodeFile(DataPoint): + part_of: Repository + contains: List["CodePart"] = [] + depends_on: List["CodeFile"] = [] + source_code: str + +class CodePart(DataPoint): + part_of: CodeFile + source_code: str + +CodeFile.model_rebuild() +CodePart.model_rebuild() + + +def nanoseconds_to_largest_unit(nanoseconds): + # Define conversion factors + conversion_factors = { + 'weeks': 7 * 24 * 60 * 60 * 1e9, + 'days': 24 * 60 * 60 * 1e9, + 'hours': 60 * 60 * 1e9, + 'minutes': 60 * 1e9, + 'seconds': 1e9, + 'miliseconds': 1e6, + 'microseconds': 1e3, + } + + # Iterate through conversion factors to find the largest unit + for unit, factor in conversion_factors.items(): + converted_value = nanoseconds / factor + if converted_value >= 1: + return converted_value, unit + + # If nanoseconds is smaller than a second + return nanoseconds, 'nanoseconds' + + +async def test_circular_reference_extraction(): + repo = Repository(path = "repo1") + + code_files = [CodeFile( + id = uuid5(NAMESPACE_OID, f"file{file_index}"), + source_code = "source code", + part_of = repo, + contains = [], + depends_on = [CodeFile( + id = uuid5(NAMESPACE_OID, f"file{random_id}"), + source_code = "source code", + part_of = repo, + depends_on = [], + ) for random_id in [random.randint(0, 1499) for _ in range(random.randint(0, 5))]], + ) for file_index in range(1500)] + + for code_file in code_files: + code_file.contains.extend([CodePart( + part_of = code_file, + source_code = f"Part {part_index}", + ) for part_index in range(random.randint(1, 20))]) + + nodes = [] + edges = [] + + start = time.perf_counter_ns() + + results = await asyncio.gather(*[ + get_graph_from_model(code_file) for code_file in code_files + ]) + + time_to_run = time.perf_counter_ns() - start + + print(nanoseconds_to_largest_unit(time_to_run)) + + for result_nodes, result_edges in results: + nodes.extend(result_nodes) + edges.extend(result_edges) + + # for code_file in code_files: + # model_nodes, model_edges = get_graph_from_model(code_file) + + # nodes.extend(model_nodes) + # edges.extend(model_edges) + + assert len(nodes) == 1501 + assert len(edges) == 1501 * 20 + 1500 * 5 + +if __name__ == "__main__": + asyncio.run(test_circular_reference_extraction()) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py index d2a1b6c59..a3755a58f 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py @@ -8,7 +8,7 @@ def test_node_initialization(): """Test that a Node is initialized correctly.""" node = Node("node1", {"attr1": "value1"}, dimension=2) assert node.id == "node1" - assert node.attributes == {"attr1": "value1"} + assert node.attributes == {"attr1": "value1", 'vector_distance': np.inf} assert len(node.status) == 2 assert np.all(node.status == 1) @@ -95,7 +95,7 @@ def test_edge_initialization(): edge = Edge(node1, node2, {"weight": 10}, directed=False, dimension=2) assert edge.node1 == node1 
assert edge.node2 == node2 - assert edge.attributes == {"weight": 10} + assert edge.attributes == {'vector_distance': np.inf,"weight": 10} assert edge.directed is False assert len(edge.status) == 2 assert np.all(edge.status == 1) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_test.py b/cognee/tests/unit/modules/graph/cognee_graph_test.py index d05292d75..bad474023 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_test.py @@ -77,11 +77,11 @@ def test_get_edges_success(setup_graph): graph.add_node(node2) edge = Edge(node1, node2) graph.add_edge(edge) - assert edge in graph.get_edges("node1") + assert edge in graph.get_edges_of_node("node1") def test_get_edges_nonexistent_node(setup_graph): """Test retrieving edges for a nonexistent node raises an exception.""" graph = setup_graph with pytest.raises(ValueError, match="Node with id nonexistent does not exist."): - graph.get_edges("nonexistent") + graph.get_edges_of_node("nonexistent") diff --git a/docker-compose.yml b/docker-compose.yml index afb216169..9c40979bc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -46,7 +46,7 @@ services: - 7687:7687 environment: - NEO4J_AUTH=neo4j/pleaseletmein - - NEO4J_PLUGINS=["apoc"] + - NEO4J_PLUGINS=["apoc", "graph-data-science"] networks: - cognee-network diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index ec93bda07..0a4806e3f 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -8,30 +8,63 @@ from swebench.harness.utils import load_swebench_dataset from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE import cognee + +from cognee.shared.data_models import SummarizedContent +from cognee.shared.utils import render_graph +from cognee.tasks.repo_processor import ( + enrich_dependency_graph, + expand_dependency_graph, + get_repo_file_dependencies, +) +from cognee.tasks.storage import add_data_points +from cognee.tasks.summarization import summarize_code +from cognee.modules.pipelines import Task, run_tasks from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt from evals.eval_utils import download_instances +from evals.eval_utils import ingest_repos +from evals.eval_utils import download_github_repo +from evals.eval_utils import delete_repo - -async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): +async def generate_patch_with_cognee(instance): await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) + await cognee.prune.prune_system() - dataset_name = "SWE_test_data" - code_text = instance["text"] - await cognee.add([code_text], dataset_name) - await code_graph_pipeline([dataset_name]) - graph_engine = await get_graph_engine() - with open(graph_engine.filename, "r") as f: - graph_str = f.read() + #dataset_name = "SWE_test_data" + + #await cognee.add('', dataset_name = dataset_name) + + # repo_path = download_github_repo(instance, '../RAW_GIT_REPOS') + + repo_path = '/Users/borisarzentar/Projects/graphrag' + + tasks = [ + Task(get_repo_file_dependencies), + Task(add_data_points), + Task(enrich_dependency_graph), + Task(expand_dependency_graph), + Task(add_data_points), + # Task(summarize_code, summarization_model = SummarizedContent), + ] + + pipeline = 
run_tasks(tasks, repo_path, "cognify_code_pipeline") + + async for result in pipeline: + print(result) + + print('Here we have the repo under the repo_path') + + await render_graph() problem_statement = instance['problem_statement'] instructions = read_query_prompt("patch_gen_instructions.txt") + graph_str = 'HERE WE SHOULD PASS THE TRIPLETS FROM GRAPHRAG' + prompt = "\n".join([ instructions, "", @@ -41,14 +74,18 @@ async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): graph_str ]) + return 0 + + ''' :TODO: We have to find out how do we do the generation llm_client = get_llm_client() answer_prediction = await llm_client.acreate_structured_output( text_input=problem_statement, system_prompt=prompt, response_model=str, ) - return answer_prediction + return answer_prediction + ''' async def generate_patch_without_cognee(instance): problem_statement = instance['problem_statement'] @@ -71,11 +108,16 @@ async def get_preds(dataset, with_cognee=True): model_name = "without_cognee" pred_func = generate_patch_without_cognee + + for instance in dataset: + await pred_func(instance) + + ''' preds = [{"instance_id": instance["instance_id"], "model_patch": await pred_func(instance), "model_name_or_path": model_name} for instance in dataset] - - return preds + ''' + return 0 async def main(): @@ -115,4 +157,5 @@ async def main(): if __name__ == "__main__": import asyncio + asyncio.run(main(), debug=True) diff --git a/evals/eval_utils.py b/evals/eval_utils.py index e95a84cec..3192127dc 100644 --- a/evals/eval_utils.py +++ b/evals/eval_utils.py @@ -8,7 +8,8 @@ from swebench.inference.make_datasets.create_instance import make_code_text from swebench.inference.make_datasets.utils import (AutoContextManager, ingest_directory_contents) from tqdm.auto import tqdm - +from git import Repo +import shutil def ingest_files(filenames): files_dict = dict() @@ -101,3 +102,56 @@ def download_instances( dataset = create_dataset(input_instances_with_text) dataset.save_to_disk(path) return dataset + + +def download_github_repo(instance, output_dir): + """ + Downloads a GitHub repository and checks out the specified commit. + + Args: + instance (dict): Dictionary containing 'repo', 'base_commit', and 'instance_id'. + output_dir (str): Directory to store the downloaded repositories. + + Returns: + str: Path to the downloaded repository. + """ + repo_owner_repo = instance['repo'] + base_commit = instance['base_commit'] + instance_id = instance['instance_id'] + + repo_url = f"https://github.com/{repo_owner_repo}.git" + + repo_path = os.path.abspath(os.path.join(output_dir, instance_id)) + + # Clone repository if it doesn't already exist + if not os.path.exists(repo_path): + print(f"Cloning {repo_url} to {repo_path}...") + Repo.clone_from(repo_url, repo_path) + else: + print(f"Repository already exists at {repo_path}.") + + + repo = Repo(repo_path) + repo.git.checkout(base_commit) + + return repo_path + + +def delete_repo(repo_path): + """ + Deletes the specified repository directory. + + Args: + repo_path (str): Path to the repository to delete. + + Returns: + None + """ + try: + if os.path.exists(repo_path): + shutil.rmtree(repo_path) + print(f"Deleted repository at {repo_path}.") + else: + print(f"Repository path {repo_path} does not exist. 
Nothing to delete.") + except Exception as e: + print(f"Error deleting repository at {repo_path}: {e}") \ No newline at end of file diff --git a/examples/python/code_graph_pipeline.py b/examples/python/code_graph_pipeline.py index fe4be371b..52c1e0474 100644 --- a/examples/python/code_graph_pipeline.py +++ b/examples/python/code_graph_pipeline.py @@ -1,22 +1,11 @@ -import argparse import asyncio -import os - from cognee.modules.pipelines import Task, run_tasks -from cognee.shared.CodeGraphEntities import CodeRelationship, Repository -from cognee.shared.data_models import SummarizedContent -from cognee.tasks.code.get_local_dependencies_checker import ( - get_local_script_dependencies, -) -from cognee.tasks.graph.convert_graph_from_code_graph import ( - create_code_file, - convert_graph_from_code_graph, -) from cognee.tasks.repo_processor import ( enrich_dependency_graph, expand_dependency_graph, - get_repo_dependency_graph, + get_repo_file_dependencies, ) +from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_code @@ -24,58 +13,24 @@ async def print_results(pipeline): async for result in pipeline: print(result) - -async def get_local_script_dependencies_wrapper(script_path, repo_path): - dependencies = await get_local_script_dependencies(script_path, repo_path) - return (script_path, dependencies) - - -async def scan_repo(path, condition): - futures = [] - for root, dirs, files in os.walk(path): - for file in files: - if condition(file): - futures.append( - get_local_script_dependencies_wrapper( - os.path.abspath(f"{root}/{file}"), path - ) - ) - results = await asyncio.gather(*futures) - - code_files = {} - code_relationships = [] - for abspath, dependencies in results: - code_file, abspath = create_code_file(abspath, "python_file") - code_files[abspath] = code_file - - for dependency in dependencies: - dependency_code_file, dependency_abspath = create_code_file( - dependency, "python_file" - ) - code_files[dependency_abspath] = dependency_code_file - code_relationship = CodeRelationship( - source_id=abspath, - target_id=dependency_abspath, - type="files", - relation="depends_on", - ) - code_relationships.append(code_relationship) - - return (Repository(path=path), list(code_files.values()), code_relationships) - - if __name__ == "__main__": + ''' parser = argparse.ArgumentParser(description="Process a file path") parser.add_argument("path", help="Path to the file") args = parser.parse_args() abspath = os.path.abspath(args.path or ".") + ''' + + abspath = '/Users/laszlohajdu/Documents/Github/RAW_GIT_REPOS/astropy__astropy-12907' tasks = [ - Task(get_repo_dependency_graph), + Task(get_repo_file_dependencies), + Task(add_data_points), Task(enrich_dependency_graph), Task(expand_dependency_graph), - Task(convert_graph_from_code_graph), - Task(summarize_code, summarization_model = SummarizedContent), + Task(add_data_points), + # Task(summarize_code, summarization_model = SummarizedContent), ] pipeline = run_tasks(tasks, abspath, "cognify_code_pipeline") + asyncio.run(print_results(pipeline)) diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 309aea82c..49b41db1c 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -1,32 +1,6 @@ import cognee import asyncio -from cognee.api.v1.search import SearchType - -job_position = """0:Senior Data Scientist (Machine Learning) - -Company: TechNova Solutions -Location: San Francisco, CA - -Job Description: - 
-TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights. - -Responsibilities: - -Develop and implement advanced machine learning algorithms and models. -Analyze large, complex datasets to extract meaningful patterns and insights. -Collaborate with cross-functional teams to integrate predictive models into products. -Stay updated with the latest advancements in machine learning and data science. -Mentor junior data scientists and provide technical guidance. -Qualifications: - -Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field. -5+ years of experience in data science and machine learning. -Proficient in Python, R, and SQL. -Experience with deep learning frameworks (e.g., TensorFlow, PyTorch). -Strong problem-solving skills and attention to detail. -Candidate CVs -""" +from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever job_1 = """ CV 1: Relevant @@ -195,7 +169,7 @@ async def main(enable_steps): # Step 2: Add text if enable_steps.get("add_text"): - text_list = [job_position, job_1, job_2, job_3, job_4, job_5] + text_list = [job_1, job_2, job_3, job_4, job_5] for text in text_list: await cognee.add(text) print(f"Added text: {text[:35]}...") @@ -206,24 +180,21 @@ async def main(enable_steps): print("Knowledge graph created.") # Step 4: Query insights - if enable_steps.get("search_insights"): - search_results = await cognee.search( - SearchType.INSIGHTS, - {'query': 'Which applicant has the most relevant experience in data science?'} - ) - print("Search results:") - for result_text in search_results: - print(result_text) + if enable_steps.get("retriever"): + await two_step_retriever('Who has Phd?') if __name__ == '__main__': # Flags to enable/disable steps + + rebuild_kg = True + retrieve = True steps_to_enable = { - "prune_data": True, - "prune_system": True, - "add_text": True, - "cognify": True, - "search_insights": True + "prune_data": rebuild_kg, + "prune_system": rebuild_kg, + "add_text": rebuild_kg, + "cognify": rebuild_kg, + "retriever": retrieve } asyncio.run(main(steps_to_enable)) diff --git a/examples/python/multimedia_example.py b/examples/python/multimedia_example.py new file mode 100644 index 000000000..6c8bc5995 --- /dev/null +++ b/examples/python/multimedia_example.py @@ -0,0 +1,48 @@ +import os +import asyncio +import pathlib + +import cognee +from cognee.api.v1.search import SearchType + +# Prerequisites: +# 1. Copy `.env.template` and rename it to `.env`. +# 2. 
Add your OpenAI API key to the `.env` file in the `LLM_API_KEY` field: +# LLM_API_KEY = "your_key_here" + + +async def main(): + # Create a clean slate for cognee -- reset data and system state + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + # cognee knowledge graph will be created based on the text + # and description of these files + mp3_file_path = os.path.join( + pathlib.Path(__file__).parent.parent.parent, + ".data/multimedia/text_to_speech.mp3", + ) + png_file_path = os.path.join( + pathlib.Path(__file__).parent.parent.parent, + ".data/multimedia/example.png", + ) + + # Add the files, and make it available for cognify + await cognee.add([mp3_file_path, png_file_path]) + + # Use LLMs and cognee to create knowledge graph + await cognee.cognify() + + # Query cognee for summaries of the data in the multimedia files + search_results = await cognee.search( + SearchType.SUMMARIES, + query_text="What is in the multimedia files?", + ) + + # Display search results + for result_text in search_results: + print(result_text) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/python/simple_example.py b/examples/python/simple_example.py index 47940ca6e..55b07c4c3 100644 --- a/examples/python/simple_example.py +++ b/examples/python/simple_example.py @@ -1,5 +1,4 @@ import asyncio - import cognee from cognee.api.v1.search import SearchType @@ -11,29 +10,57 @@ from cognee.api.v1.search import SearchType async def main(): # Create a clean slate for cognee -- reset data and system state + print("Resetting cognee data...") await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) + print("Data reset complete.\n") # cognee knowledge graph will be created based on this text text = """ Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. """ - + + print("Adding text to cognee:") + print(text.strip()) # Add the text, and make it available for cognify await cognee.add(text) + print("Text added successfully.\n") + + print("Running cognify to create knowledge graph...\n") + print("Cognify process steps:") + print("1. Classifying the document: Determining the type and category of the input text.") + print("2. Checking permissions: Ensuring the user has the necessary rights to process the text.") + print("3. Extracting text chunks: Breaking down the text into sentences or phrases for analysis.") + print("4. Adding data points: Storing the extracted chunks for processing.") + print("5. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph.") + print("6. 
Summarizing text: Creating concise summaries of the content for quick insights.\n") + # Use LLMs and cognee to create knowledge graph await cognee.cognify() + print("Cognify process complete.\n") + + query_text = 'Tell me about NLP' + print(f"Searching cognee for insights with query: '{query_text}'") # Query cognee for insights on the added text search_results = await cognee.search( - SearchType.INSIGHTS, query_text='Tell me about NLP' + SearchType.INSIGHTS, query_text=query_text ) - - # Display search results + + print("Search results:") + # Display results for result_text in search_results: print(result_text) + # Example output: + # ({'id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 'natural language processing', 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'is_a_subfield_of', 'source_node_id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'target_node_id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 15, 473137, tzinfo=datetime.timezone.utc)}, {'id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 'computer science', 'description': 'The study of computation and information processing.'}) + # (...) + # It represents nodes and relationships in the knowledge graph: + # - The first element is the source node (e.g., 'natural language processing'). + # - The second element is the relationship between nodes (e.g., 'is_a_subfield_of'). + # - The third element is the target node (e.g., 'computer science'). + if __name__ == '__main__': asyncio.run(main()) diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb index 67bb4e07f..13fcb8cb4 100644 --- a/notebooks/cognee_demo.ipynb +++ b/notebooks/cognee_demo.ipynb @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "df16431d0f48b006", "metadata": { "ExecuteTime": { @@ -304,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9086abf3af077ab4", "metadata": { "ExecuteTime": { @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a9de0cc07f798b7f", "metadata": { "ExecuteTime": { @@ -393,7 +393,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "185ff1c102d06111", "metadata": { "ExecuteTime": { @@ -437,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "d55ce4c58f8efb67", "metadata": { "ExecuteTime": { @@ -479,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "ca4ecc32721ad332", "metadata": { "ExecuteTime": { @@ -529,14 +529,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "bce39dc6", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", - "# # Setting environment variables\n", + "# Setting environment variables\n", "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n", " os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n", "\n", @@ -546,24 +546,26 @@ "if \"LLM_API_KEY\" not in os.environ:\n", " os.environ[\"LLM_API_KEY\"] = \"\"\n", "\n", - "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n", + "# \"neo4j\" or \"networkx\"\n", + 
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n", "# Not needed if using networkx\n", - "#GRAPH_DATABASE_URL=\"\"\n", - "#GRAPH_DATABASE_USERNAME=\"\"\n", - "#GRAPH_DATABASE_PASSWORD=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n", "\n", - "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" # \"qdrant\", \"weaviate\" or \"lancedb\"\n", - "# Not needed if using \"lancedb\"\n", + "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n", + "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n", + "# Not needed if using \"lancedb\" or \"pgvector\"\n", "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n", "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n", "\n", - "# Database provider\n", - "os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n", + "# Relational Database provider \"sqlite\" or \"postgres\"\n", + "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n", "\n", "# Database name\n", "os.environ[\"DB_NAME\"]=\"cognee_db\"\n", "\n", - "# Postgres specific parameters (Only if Postgres is run)\n", + "# Postgres specific parameters (Only if Postgres or PGVector is used)\n", "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n", "# os.environ[\"DB_PORT\"]=\"5432\"\n", "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n", @@ -620,7 +622,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "7c431fdef4921ae0", "metadata": { "ExecuteTime": { diff --git a/notebooks/cognee_llama_index.ipynb b/notebooks/cognee_llama_index.ipynb index 742c2f51c..ec899aaea 100644 --- a/notebooks/cognee_llama_index.ipynb +++ b/notebooks/cognee_llama_index.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -90,23 +90,23 @@ "# \"neo4j\" or \"networkx\"\n", "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n", "# Not needed if using networkx\n", - "#GRAPH_DATABASE_URL=\"\"\n", - "#GRAPH_DATABASE_USERNAME=\"\"\n", - "#GRAPH_DATABASE_PASSWORD=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n", + "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n", "\n", - "# \"qdrant\", \"weaviate\" or \"lancedb\"\n", + "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n", "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n", - "# Not needed if using \"lancedb\"\n", + "# Not needed if using \"lancedb\" or \"pgvector\"\n", "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n", "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n", "\n", - "# Database provider\n", - "os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n", + "# Relational Database provider \"sqlite\" or \"postgres\"\n", + "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n", "\n", "# Database name\n", "os.environ[\"DB_NAME\"]=\"cognee_db\"\n", "\n", - "# Postgres specific parameters (Only if Postgres is run)\n", + "# Postgres specific parameters (Only if Postgres or PGVector is used)\n", "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n", "# os.environ[\"DB_PORT\"]=\"5432\"\n", "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n", @@ -130,8 +130,6 @@ "\n", "from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables\n", "from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables\n", - "from 
-    "from cognee.infrastructure.databases.graph import get_graph_engine\n",
-    "from cognee.shared.utils import render_graph\n",
     "from cognee.modules.users.models import User\n",
     "from cognee.modules.users.methods import get_default_user\n",
     "from cognee.tasks.ingestion.ingest_data_with_metadata import ingest_data_with_metadata\n",
@@ -196,6 +194,9 @@
    "source": [
     "import graphistry\n",
     "\n",
+    "from cognee.infrastructure.databases.graph import get_graph_engine\n",
+    "from cognee.shared.utils import render_graph\n",
+    "\n",
     "# Get graph\n",
     "graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
     "graph_engine = await get_graph_engine()\n",
diff --git a/notebooks/cognee_multimedia_demo.ipynb b/notebooks/cognee_multimedia_demo.ipynb
new file mode 100644
index 000000000..2d35132f6
--- /dev/null
+++ b/notebooks/cognee_multimedia_demo.ipynb
@@ -0,0 +1,169 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cognee GraphRAG with Multimedia files"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "source": [
+    "## Load Data\n",
+    "\n",
+    "We will use a few sample multimedia files that are available on GitHub for easy access."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pathlib\n",
+    "\n",
+    "# cognee knowledge graph will be created based on the text\n",
+    "# and descriptions of these files\n",
+    "mp3_file_path = os.path.join(\n",
+    "    os.path.abspath(''), \"../\",\n",
+    "    \".data/multimedia/text_to_speech.mp3\",\n",
+    ")\n",
+    "png_file_path = os.path.join(\n",
+    "    os.path.abspath(''), \"../\",\n",
+    "    \".data/multimedia/example.png\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set environment variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Setting environment variables\n",
+    "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
+    "    os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
+    "\n",
+    "if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n",
+    "    os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
+    "\n",
+    "if \"LLM_API_KEY\" not in os.environ:\n",
+    "    os.environ[\"LLM_API_KEY\"] = \"\"\n",
+    "\n",
+    "# \"neo4j\" or \"networkx\"\n",
+    "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
+    "# Not needed if using networkx\n",
+    "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
+    "\n",
+    "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
+    "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
+    "# Not needed if using \"lancedb\" or \"pgvector\"\n",
+    "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
+    "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
+    "\n",
+    "# Relational database provider: \"sqlite\" or \"postgres\"\n",
+    "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n",
+    "\n",
+    "# Database name\n",
+    "os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
+    "\n",
+    "# Postgres-specific parameters (only needed if Postgres or PGVector is used)\n",
+    "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
+    "# os.environ[\"DB_PORT\"]=\"5432\"\n",
+    "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
+    "# os.environ[\"DB_PASSWORD\"]=\"cognee\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run Cognee with multimedia files"
+   ]
+  },
+  {
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cognee\n", + "\n", + "# Create a clean slate for cognee -- reset data and system state\n", + "await cognee.prune.prune_data()\n", + "await cognee.prune.prune_system(metadata=True)\n", + "\n", + "# Add multimedia files and make them available for cognify\n", + "await cognee.add([mp3_file_path, png_file_path])\n", + "\n", + "# Create knowledge graph with cognee\n", + "await cognee.cognify()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query Cognee for summaries related to multimedia files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from cognee.api.v1.search import SearchType\n", + "\n", + "# Query cognee for summaries of the data in the multimedia files\n", + "search_results = await cognee.search(\n", + " SearchType.SUMMARIES,\n", + " query_text=\"What is in the multimedia files?\",\n", + ")\n", + "\n", + "# Display search results\n", + "for result_text in search_results:\n", + " print(result_text)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index 96f9aec27..01f7e44a4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6171,11 +6171,6 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, - {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},