diff --git a/README.md b/README.md
index 6efb79b..e02e877 100644
--- a/README.md
+++ b/README.md
@@ -36,12 +36,20 @@ Whether you like this implementation or not, I highly recommend you to read Deep
 - [中文微信公众号 第一讲](https://mp.weixin.qq.com/s/-pPhHDi2nz8hp5R3Lm_mww), [第二讲](https://mp.weixin.qq.com/s/apnorBj4TZs3-Mo23xUReQ)
 - [日本語: DeepSearch/DeepResearch 実装の実践ガイド](https://jina.ai/ja/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
 
-## Test it Yourself
+## Try it Yourself
 
 We host an online deployment of this **exact** codebase, which allows you to do a vibe-check; or use it as daily productivity tools.
 
 https://search.jina.ai
 
+The official API is also available for you to use:
+
+```
+https://deepsearch.jina.ai/v1/chat/completions
+```
+
+Learn more about the API at https://jina.ai/deepsearch
+
@@ -323,31 +331,3 @@ flowchart TD
     BeastMode --> FinalAnswer[Generate final answer] --> End
 ```
 
-
-## Evaluation
-
-I kept the evaluation simple, LLM-as-a-judge and collect some [ego questions](./src/evals/ego-questions.json) for evaluation. These are the questions about Jina AI that I know 100% the answer but LLMs do not.
-
-I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer.
-
-```bash
-npm run eval ./src/evals/questions.json
-```
-
-Here's the table comparing plain `gemini-2.0-flash` and `gemini-2.0-flash + node-deepresearch` on the ego set.
-
-Plain `gemini-2.0-flash` can be run by setting `tokenBudget` to zero, skipping the while-loop and directly answering the question.
-
-It should not be surprised that plain `gemini-2.0-flash` has a 0% pass rate, as I intentionally filtered out the questions that LLMs can answer.
-
-| Metric | gemini-2.0-flash | #188f1bb |
-|--------|------------------|----------|
-| Pass Rate | 0% | 75% |
-| Average Steps | 1 | 4 |
-| Maximum Steps | 1 | 13 |
-| Minimum Steps | 1 | 2 |
-| Median Steps | 1 | 3 |
-| Average Tokens | 428 | 68,574 |
-| Median Tokens | 434 | 31,541 |
-| Maximum Tokens | 463 | 363,655 |
-| Minimum Tokens | 374 | 7,963 |
diff --git a/src/agent.ts b/src/agent.ts
index 9bd9d29..782e753 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -279,7 +279,7 @@ async function executeSearchQueries(
   const newKnowledge: KnowledgeItem[] = [];
   const searchedQueries: string[] = [];
   context.actionTracker.trackThink('search_for', SchemaGen.languageCode, {keywords: uniqQOnly.join(', ')});
-
+  let utilityScore = 0;
   for (const query of keywordsQueries) {
     let results: SearchResult[] = [];
     const oldQuery = query.q;
@@ -328,7 +328,7 @@
       .filter(Boolean) as SearchSnippet[];  // Filter out null entries and assert type
 
     minResults.forEach(r => {
-      addToAllURLs(r, allURLs);
+      utilityScore = utilityScore + addToAllURLs(r, allURLs);
     });
 
     searchedQueries.push(query.q)
@@ -340,6 +340,8 @@
       updated: query.tbs ? formatDateRange(query) : undefined
     });
   }
+
+  console.log(`Utility/Queries: ${utilityScore}/${searchedQueries.length}`);
   return {
     newKnowledge,
     searchedQueries
diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts
index 8c8a5d9..b52579f 100644
--- a/src/utils/url-tools.ts
+++ b/src/utils/url-tools.ts
@@ -275,14 +275,16 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
 
 export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
   const nURL = normalizeUrl(r.url);
-  if (!nURL) return;
+  if (!nURL) return 0;
   if (!allURLs[nURL]) {
     allURLs[nURL] = r;
     allURLs[nURL].weight = weightDelta;
+    return 1;
   } else {
     (allURLs[nURL].weight as number) += weightDelta;
     const curDesc = allURLs[nURL].description;
     allURLs[nURL].description = smartMergeStrings(curDesc, r.description);
+    return 0;
   }
 }
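
Note for reviewers: the `src/agent.ts` and `src/utils/url-tools.ts` hunks together add a small dedup-utility metric. `addToAllURLs` now reports whether a URL was genuinely new to the map (`1`) or a duplicate/unparseable one (`0`), and `executeSearchQueries` sums those returns into the `Utility/Queries` log line. The sketch below shows that contract end to end; it is self-contained, so `SearchSnippet` is trimmed down and `normalizeUrl` / `smartMergeStrings` are simplified stand-ins, not the repo's implementations.

```ts
type SearchSnippet = { url: string; description: string; weight?: number };

// Stand-in: canonicalize via the URL constructor; the real normalizeUrl does more.
const normalizeUrl = (url: string): string | null => {
  try {
    return new URL(url).href;
  } catch {
    return null; // unparseable URLs contribute no utility
  }
};

// Stand-in: avoid embedding one description verbatim inside the other.
const smartMergeStrings = (a: string, b: string): string =>
  a.includes(b) ? a : `${a} ${b}`;

// Returns 1 only when the normalized URL is new to the map, so the caller
// can count how many previously unseen URLs each search query contributed.
const addToAllURLs = (
  r: SearchSnippet,
  allURLs: Record<string, SearchSnippet>,
  weightDelta = 1
): number => {
  const nURL = normalizeUrl(r.url);
  if (!nURL) return 0;
  if (!allURLs[nURL]) {
    allURLs[nURL] = { ...r, weight: weightDelta };
    return 1;
  }
  // Duplicate: bump its weight and merge descriptions instead of counting it.
  allURLs[nURL].weight = (allURLs[nURL].weight ?? 0) + weightDelta;
  allURLs[nURL].description = smartMergeStrings(allURLs[nURL].description, r.description);
  return 0;
};

// Mirrors the aggregation in executeSearchQueries: two unique URLs out of
// three results from a single query yields "Utility/Queries: 2/1".
const allURLs: Record<string, SearchSnippet> = {};
let utilityScore = 0;
const minResults: SearchSnippet[] = [
  { url: 'https://example.com/a', description: 'first hit' },
  { url: 'https://example.com/a', description: 'first hit, reworded' },
  { url: 'https://example.com/b', description: 'second hit' },
];
minResults.forEach(r => {
  utilityScore = utilityScore + addToAllURLs(r, allURLs);
});
console.log(`Utility/Queries: ${utilityScore}/1`); // -> Utility/Queries: 2/1
```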
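
Since the README hunk above now advertises the official endpoint, a quick smoke test could look like the sketch below. Everything beyond the URL itself is an assumption, not something this patch guarantees: the OpenAI-style request body is inferred from the `/v1/chat/completions` path, and the `jina-deepsearch-v1` model id, Bearer auth scheme, and `JINA_API_KEY` variable are guesses.

```ts
// Hypothetical smoke test for the endpoint added to the README.
async function smokeTest(): Promise<void> {
  const res = await fetch('https://deepsearch.jina.ai/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${process.env.JINA_API_KEY}`, // assumed auth scheme
    },
    body: JSON.stringify({
      model: 'jina-deepsearch-v1', // assumed model id
      messages: [{ role: 'user', content: 'When was Jina AI founded?' }],
    }),
  });
  console.log(JSON.stringify(await res.json(), null, 2));
}

smokeTest().catch(console.error);
```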