mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
feat: improve prompting
This commit is contained in:
@@ -289,7 +289,7 @@ I kept the evaluation simple, LLM-as-a-judge and collect some [ego questions](./
|
||||
I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer.
|
||||
|
||||
```bash
|
||||
npm run eval ./src/evals/ego-questions
|
||||
npm run eval ./src/evals/questions.json
|
||||
```
|
||||
|
||||
Here's the table comparing plain `gemini-2.0-flash` and `gemini-2.0-flash + node-deepresearch` on the ego set.
|
||||
|
||||
3
package-lock.json
generated
3
package-lock.json
generated
@@ -20,7 +20,8 @@
|
||||
"express": "^4.21.2",
|
||||
"node-fetch": "^3.3.2",
|
||||
"undici": "^7.3.0",
|
||||
"zod": "^3.22.4"
|
||||
"zod": "^3.22.4",
|
||||
"zod-to-json-schema": "^3.24.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/commander": "^2.12.0",
|
||||
|
||||
@@ -37,7 +37,8 @@
|
||||
"express": "^4.21.2",
|
||||
"node-fetch": "^3.3.2",
|
||||
"undici": "^7.3.0",
|
||||
"zod": "^3.22.4"
|
||||
"zod": "^3.22.4",
|
||||
"zod-to-json-schema": "^3.24.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/commander": "^2.12.0",
|
||||
|
||||
60
src/agent.ts
60
src/agent.ts
@@ -1,6 +1,6 @@
|
||||
import {z} from 'zod';
|
||||
import {z, ZodObject} from 'zod';
|
||||
import {generateObject} from 'ai';
|
||||
import {getModel, getMaxTokens, SEARCH_PROVIDER, STEP_SLEEP, LLM_PROVIDER} from "./config";
|
||||
import {getModel, getMaxTokens, SEARCH_PROVIDER, STEP_SLEEP} from "./config";
|
||||
import {readUrl} from "./tools/read";
|
||||
import {handleGenerateObjectError} from './utils/error-handling';
|
||||
import fs from 'fs/promises';
|
||||
@@ -8,14 +8,15 @@ import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
|
||||
import {braveSearch} from "./tools/brave-search";
|
||||
import {rewriteQuery} from "./tools/query-rewriter";
|
||||
import {dedupQueries} from "./tools/jina-dedup";
|
||||
import {evaluateAnswer} from "./tools/evaluator";
|
||||
import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
|
||||
import {analyzeSteps} from "./tools/error-analyzer";
|
||||
import {TokenTracker} from "./utils/token-tracker";
|
||||
import {ActionTracker} from "./utils/action-tracker";
|
||||
import {StepAction, AnswerAction} from "./types";
|
||||
import {TrackerContext} from "./types";
|
||||
import {search} from "./tools/jina-search";
|
||||
import {grounding} from "./tools/grounding";
|
||||
// import {grounding} from "./tools/grounding";
|
||||
import { zodToJsonSchema } from "zod-to-json-schema";
|
||||
|
||||
async function sleep(ms: number) {
|
||||
const seconds = Math.ceil(ms / 1000);
|
||||
@@ -43,7 +44,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
|
||||
properties.references = z.array(
|
||||
z.object({
|
||||
exactQuote: z.string().describe("Exact relevant quote from the document"),
|
||||
url: z.string().describe("URL of the document; must be directly from the context")
|
||||
url: z.string().describe("source URL; must be directly from the context")
|
||||
}).required()
|
||||
).describe("Must be an array of references that support the answer, each reference must contain an exact quote and the URL of the document").optional();
|
||||
}
|
||||
@@ -291,6 +292,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
|
||||
let step = 0;
|
||||
let totalStep = 0;
|
||||
let badAttempts = 0;
|
||||
let schema: ZodObject<any> = getSchema(true, true, true, true)
|
||||
const gaps: string[] = [question]; // All questions to be answered including the original question
|
||||
const allQuestions = [question];
|
||||
const allKeywords = [];
|
||||
@@ -307,6 +309,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
|
||||
|
||||
const allURLs: Record<string, string> = {};
|
||||
const visitedURLs: string[] = [];
|
||||
const evaluationMetrics: Record<string, any[]> = {};
|
||||
while (context.tokenTracker.getTotalUsage() < tokenBudget && badAttempts <= maxBadAttempts) {
|
||||
// add 1s delay to avoid rate limiting
|
||||
await sleep(STEP_SLEEP);
|
||||
@@ -317,6 +320,10 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
|
||||
console.log('Gaps:', gaps);
|
||||
allowReflect = allowReflect && (gaps.length <= 1);
|
||||
const currentQuestion = gaps.length > 0 ? gaps.shift()! : question;
|
||||
if (!evaluationMetrics[currentQuestion]) {
|
||||
evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context.tokenTracker)
|
||||
}
|
||||
|
||||
// update all urls with buildURLMap
|
||||
allowRead = allowRead && (Object.keys(allURLs).length > 0);
|
||||
allowSearch = allowSearch && (Object.keys(allURLs).length < 50); // disable search when too many urls already
|
||||
@@ -336,14 +343,14 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
|
||||
allURLs,
|
||||
false
|
||||
);
|
||||
|
||||
schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch)
|
||||
const model = getModel('agent');
|
||||
let object;
|
||||
let totalTokens = 0;
|
||||
try {
|
||||
const result = await generateObject({
|
||||
model,
|
||||
schema: getSchema(allowReflect, allowRead, allowAnswer, allowSearch),
|
||||
schema,
|
||||
prompt,
|
||||
maxTokens: getMaxTokens('agent')
|
||||
});
|
||||
@@ -384,7 +391,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
|
||||
});
|
||||
|
||||
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
|
||||
['definitive', 'freshness', 'plurality'], context.tokenTracker);
|
||||
evaluationMetrics[currentQuestion], context.tokenTracker);
|
||||
|
||||
if (currentQuestion === question) {
|
||||
if (evaluation.pass) {
|
||||
@@ -437,6 +444,13 @@ ${evaluation.think}
|
||||
evaluation: evaluation.think,
|
||||
...errorAnalysis
|
||||
});
|
||||
|
||||
if (errorAnalysis.questionsToAnswer) {
|
||||
gaps.push(...errorAnalysis.questionsToAnswer.slice(0, 2));
|
||||
allQuestions.push(...errorAnalysis.questionsToAnswer.slice(0, 2));
|
||||
gaps.push(question); // always keep the original question in the gaps
|
||||
}
|
||||
|
||||
badAttempts++;
|
||||
allowAnswer = false; // disable answer action in the immediate next step
|
||||
diaryContext = [];
|
||||
@@ -504,7 +518,7 @@ But then you realized you have asked them before. You decided to to think out of
|
||||
keywordsQueries = dedupedQueries;
|
||||
|
||||
if (keywordsQueries.length > 0) {
|
||||
let googleGrounded = '';
|
||||
// let googleGrounded = '';
|
||||
const searchResults = [];
|
||||
for (const query of keywordsQueries) {
|
||||
console.log(`Search query: ${query}`);
|
||||
@@ -515,9 +529,9 @@ But then you realized you have asked them before. You decided to to think out of
|
||||
case 'jina':
|
||||
// use jinaSearch
|
||||
results = {results: (await search(query, context.tokenTracker)).response?.data || []};
|
||||
if (LLM_PROVIDER === 'gemini') {
|
||||
googleGrounded = await grounding(query, context.tokenTracker);
|
||||
}
|
||||
// if (LLM_PROVIDER === 'gemini') {
|
||||
// googleGrounded = await grounding(query, context.tokenTracker);
|
||||
// }
|
||||
break;
|
||||
case 'duck':
|
||||
results = await duckSearch(query, {safeSearch: SafeSearchType.STRICT});
|
||||
@@ -556,7 +570,8 @@ But then you realized you have asked them before. You decided to to think out of
|
||||
|
||||
allKnowledge.push({
|
||||
question: `What do Internet say about ${thisStep.searchQuery}?`,
|
||||
answer: googleGrounded + removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')),
|
||||
answer: removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')),
|
||||
// answer: googleGrounded + removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')),
|
||||
// flatten into one url list, and take unique urls
|
||||
references: searchResults.map(r => r.results.map(r => r.url)).flat().filter((v, i, a) => a.indexOf(v) === i),
|
||||
type: 'side-info'
|
||||
@@ -645,10 +660,10 @@ You decided to think out of the box or cut from a completely different angle.`);
|
||||
}
|
||||
}
|
||||
|
||||
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
}
|
||||
|
||||
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
if (isAnswered) {
|
||||
return {result: thisStep, context};
|
||||
} else {
|
||||
@@ -671,13 +686,14 @@ You decided to think out of the box or cut from a completely different angle.`);
|
||||
true
|
||||
);
|
||||
|
||||
schema = getSchema(false, false, true, false);
|
||||
const model = getModel('agentBeastMode');
|
||||
let object;
|
||||
let totalTokens;
|
||||
try {
|
||||
const result = await generateObject({
|
||||
model,
|
||||
schema: getSchema(false, false, allowAnswer, false),
|
||||
schema: schema,
|
||||
prompt,
|
||||
maxTokens: getMaxTokens('agentBeastMode')
|
||||
});
|
||||
@@ -688,7 +704,7 @@ You decided to think out of the box or cut from a completely different angle.`);
|
||||
object = result.object;
|
||||
totalTokens = result.totalTokens;
|
||||
}
|
||||
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
thisStep = object as StepAction;
|
||||
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
|
||||
context.tokenTracker.trackUsage('agent', totalTokens);
|
||||
@@ -697,9 +713,15 @@ You decided to think out of the box or cut from a completely different angle.`);
|
||||
}
|
||||
}
|
||||
|
||||
async function storeContext(prompt: string, memory: any[][], step: number) {
|
||||
async function storeContext(prompt: string, schema: any, memory: any[][], step: number) {
|
||||
try {
|
||||
await fs.writeFile(`prompt-${step}.txt`, prompt);
|
||||
await fs.writeFile(`prompt-${step}.txt`, `
|
||||
Prompt:
|
||||
${prompt}
|
||||
|
||||
JSONSchema:
|
||||
${JSON.stringify(zodToJsonSchema(schema), null, 2)}
|
||||
`);
|
||||
const [context, keywords, questions, knowledge] = memory;
|
||||
await fs.writeFile('context.json', JSON.stringify(context, null, 2));
|
||||
await fs.writeFile('queries.json', JSON.stringify(keywords, null, 2));
|
||||
|
||||
@@ -36,6 +36,19 @@ interface QueryRequest extends Request {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Render an answer action as Markdown for the chat-completions response.
 *
 * Every `(REF_n)` marker in the answer text is rewritten to the footnote-style
 * marker `[^n]`. When the answer carries at least one reference, a
 * `## References` section is appended with one numbered entry per reference,
 * each linking the quoted text to its source URL.
 *
 * @param answer - Answer action whose `answer` text and (possibly absent)
 *   `references` are rendered.
 * @returns The Markdown string; only the rewritten answer text when there are
 *   no references.
 */
function buildMdFromAnswer(answer: AnswerAction) {
  const body = answer.answer.replace(/\(REF_(\d+)\)/g, (_, num) => `[^${num}]`);

  // `references` may be missing; fall back to an empty list instead of
  // comparing `undefined` with a number.
  const references = answer.references ?? [];
  if (references.length === 0) {
    return body;
  }

  const entries = references
    .map((ref, i) => {
      // A quote is free text: a line break would end the list item and an
      // unescaped bracket would end the link text early, so normalise both.
      const linkText = ref.exactQuote
        .replace(/\s+/g, ' ')
        .trim()
        .replace(/([\[\]])/g, '\\$1');
      return `\n${i + 1}. [${linkText}](${ref.url})`;
    })
    .join('');

  return `${body}\n\n## References\n${entries}`;
}
|
||||
|
||||
|
||||
// OpenAI-compatible chat completions endpoint
|
||||
app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
||||
// Check authentication only if secret is set
|
||||
@@ -175,7 +188,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
||||
system_fingerprint: 'fp_' + requestId,
|
||||
choices: [{
|
||||
index: 0,
|
||||
delta: { content: '</think>\n\n' },
|
||||
delta: { content: `</think>\n\n` },
|
||||
logprobs: null,
|
||||
finish_reason: null
|
||||
}]
|
||||
@@ -191,7 +204,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
||||
system_fingerprint: 'fp_' + requestId,
|
||||
choices: [{
|
||||
index: 0,
|
||||
delta: { content: result.action === 'answer' ? (result as AnswerAction).answer : result.think },
|
||||
delta: { content: result.action === 'answer' ? buildMdFromAnswer(result) : result.think },
|
||||
logprobs: null,
|
||||
finish_reason: 'stop'
|
||||
}]
|
||||
@@ -210,7 +223,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
||||
index: 0,
|
||||
message: {
|
||||
role: 'assistant',
|
||||
content: result.action === 'answer' ? (result as AnswerAction).answer : result.think
|
||||
content: result.action === 'answer' ? buildMdFromAnswer(result): result.think
|
||||
},
|
||||
logprobs: null,
|
||||
finish_reason: 'stop'
|
||||
|
||||
@@ -10,7 +10,11 @@ const model = getModel('errorAnalyzer');
|
||||
const responseSchema = z.object({
|
||||
recap: z.string().describe('Recap of the actions taken and the steps conducted'),
|
||||
blame: z.string().describe('Which action or the step was the root cause of the answer rejection'),
|
||||
improvement: z.string().describe('Suggested key improvement for the next iteration, do not use bullet points, be concise and hot-take vibe.')
|
||||
improvement: z.string().describe('Suggested key improvement for the next iteration, do not use bullet points, be concise and hot-take vibe.'),
|
||||
questionsToAnswer: z.array(
|
||||
z.string().describe("each question must be a single line, concise and clear. not composite or compound, less than 20 words.")
|
||||
).max(2)
|
||||
.describe("List of most important reflect questions to fill the knowledge gaps"),
|
||||
});
|
||||
|
||||
|
||||
@@ -93,7 +97,12 @@ The answer is not definitive and fails to provide the requested information. La
|
||||
|
||||
"blame": "The root cause of failure was getting stuck in a repetitive search pattern without adapting the strategy. Steps 4-5 repeated the same search, and step 6 deviated to less reliable entertainment sources instead of exploring business journals, news articles, or professional databases. Additionally, the process didn't attempt to triangulate age through indirect information like education history or career milestones.",
|
||||
|
||||
"improvement": "1. Avoid repeating identical searches and implement a strategy to track previously searched terms. 2. When direct age/birthdate searches fail, try indirect approaches like: searching for earliest career mentions, finding university graduation years, or identifying first company founding dates. 3. Focus on high-quality business sources and avoid entertainment websites for professional information. 4. Consider using industry event appearances or conference presentations where age-related context might be mentioned. 5. If exact age cannot be determined, provide an estimated range based on career timeline and professional achievements."
|
||||
"improvement": "1. Avoid repeating identical searches and implement a strategy to track previously searched terms. 2. When direct age/birthdate searches fail, try indirect approaches like: searching for earliest career mentions, finding university graduation years, or identifying first company founding dates. 3. Focus on high-quality business sources and avoid entertainment websites for professional information. 4. Consider using industry event appearances or conference presentations where age-related context might be mentioned. 5. If exact age cannot be determined, provide an estimated range based on career timeline and professional achievements.",
|
||||
|
||||
"questionsToAnswer": [
|
||||
"What alternative professional databases or news archives could provide reliable biographical information?",
|
||||
"How can we use education history or career milestones to estimate age range?"
|
||||
]
|
||||
}
|
||||
</output>
|
||||
</example>
|
||||
|
||||
@@ -231,6 +231,107 @@ Question: ${JSON.stringify(question)}
|
||||
Answer: ${JSON.stringify(answer)}`;
|
||||
}
|
||||
|
||||
|
||||
// Builds the boolean flag describing whether one optional check applies.
const requiresCheck = (check: string) =>
  z.boolean().describe(`Whether the question requires ${check} check`);

/**
 * Structured verdict requested from the evaluator model for a question:
 * one flag per optional check plus a free-text justification.
 */
const questionEvaluationSchema = z.object({
  needsFreshness: requiresCheck('freshness'),
  needsPlurality: requiresCheck('plurality'),
  reasoning: z.string().describe('Explanation of why these checks are needed or not needed'),
});
|
||||
|
||||
/**
 * Build the prompt that asks the evaluator model whether a question needs the
 * optional freshness and/or plurality checks.
 *
 * The prompt only covers what the response schema can express
 * (`needsFreshness`, `needsPlurality`, `reasoning`). The order in which checks
 * run is fixed by the caller, so no ordering instructions are given here.
 *
 * @param question - The user question; embedded JSON-encoded so quotes and
 *   line breaks cannot break out of the prompt structure.
 * @returns The complete prompt text.
 */
function getQuestionEvaluationPrompt(question: string): string {
  return `You are an evaluator that determines if a question requires freshness and/or plurality checks in addition to the required definitiveness check.

<evaluation_types>
1. freshness - Checks if the answer needs to be current and up-to-date
2. plurality - Checks if the answer needs to provide multiple items or a specific count
Note: Definitiveness check is always applied regardless of the question type
</evaluation_types>

<rules>
1. Freshness Evaluation:
   - Required for questions about current state, recent events, or time-sensitive information
   - Required for: prices, versions, leadership positions, status updates
   - Look for terms: "current", "latest", "recent", "now", "today", "new"
   - Treat company positions, product versions, and market data as time-sensitive

2. Plurality Evaluation:
   - Required when question asks for multiple items or specific counts
   - Check for: numbers ("5 examples"), plural nouns, list requests
   - Look for: "all", "list", "enumerate", "examples", plural forms
   - Required when question implies completeness ("all the reasons", "every factor")
</rules>

<examples>
Question: "Who is the current CEO of OpenAI?"
Evaluation: {
    "needsFreshness": true,
    "needsPlurality": false,
    "reasoning": "Question asks about current leadership position which requires freshness check. No plurality check needed as it asks for a single position."
}

Question: "List all the AI companies in Berlin"
Evaluation: {
    "needsFreshness": false,
    "needsPlurality": true,
    "reasoning": "Question asks for a comprehensive list ('all') which requires plurality check. No freshness check needed as it's not time-sensitive."
}

Question: "What are the top 5 latest AI models released by OpenAI?"
Evaluation: {
    "needsFreshness": true,
    "needsPlurality": true,
    "reasoning": "Question requires freshness check for 'latest' releases and plurality check for 'top 5' items."
}

Question: "Who created Python?"
Evaluation: {
    "needsFreshness": false,
    "needsPlurality": false,
    "reasoning": "Simple factual question requiring only definitiveness check. No time sensitivity or multiple items needed."
}
</examples>

Now evaluate this question:
Question: ${JSON.stringify(question)}`;
}
|
||||
|
||||
/**
 * Ask the evaluator model which answer checks a question calls for.
 *
 * The returned list always starts with `'definitive'`; `'freshness'` and
 * `'plurality'` follow, in that order, when the model flags them as needed.
 * Token usage of the call is recorded on `tracker`, or on a throwaway
 * `TokenTracker` when none is supplied.
 *
 * @param question - The question to classify.
 * @param tracker - Optional tracker that receives the evaluator token usage.
 * @returns The checks to apply; all three checks if anything in the
 *   evaluation throws (the error is logged, not rethrown).
 */
export async function evaluateQuestion(
  question: string,
  tracker?: TokenTracker
): Promise<EvaluationType[]> {
  try {
    const model = getModel('evaluator');
    const prompt = getQuestionEvaluationPrompt(question);
    const maxTokens = getMaxTokens('evaluator');
    const result = await generateObject({
      model,
      schema: questionEvaluationSchema,
      prompt,
      maxTokens
    });

    const usageTracker = tracker || new TokenTracker();
    usageTracker.trackUsage('evaluator', result.usage?.totalTokens || 0);

    const verdict = result.object;
    console.log('Question Evaluation:', verdict);

    // 'definitive' is unconditional; the optional checks keep a fixed order.
    const metrics: EvaluationType[] = ['definitive'];
    const optionalChecks: Array<[unknown, EvaluationType]> = [
      [verdict.needsFreshness, 'freshness'],
      [verdict.needsPlurality, 'plurality'],
    ];
    for (const [needed, metric] of optionalChecks) {
      if (needed) {
        metrics.push(metric);
      }
    }

    console.log('Question Metrics:', metrics)

    return metrics;
  } catch (error) {
    // Fall back to running every check when the classification itself fails.
    console.error('Question evaluation failed:', error);
    return ['definitive', 'freshness', 'plurality'];
  }
}
|
||||
|
||||
export async function evaluateAnswer(
|
||||
question: string,
|
||||
answer: string,
|
||||
|
||||
@@ -97,8 +97,6 @@ export interface ReadResponse {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
export type EvaluationResponse = {
|
||||
pass: boolean;
|
||||
think: string;
|
||||
@@ -121,6 +119,7 @@ export type ErrorAnalysisResponse = {
|
||||
recap: string;
|
||||
blame: string;
|
||||
improvement: string;
|
||||
questionsToAnswer: string[];
|
||||
};
|
||||
|
||||
export interface SearchResult {
|
||||
@@ -214,8 +213,8 @@ export interface ChatCompletionChunk {
|
||||
}
|
||||
|
||||
// Tracker Types
|
||||
import { TokenTracker } from './utils/token-tracker';
|
||||
import { ActionTracker } from './utils/action-tracker';
|
||||
import {TokenTracker} from './utils/token-tracker';
|
||||
import {ActionTracker} from './utils/action-tracker';
|
||||
|
||||
export interface TrackerContext {
|
||||
tokenTracker: TokenTracker;
|
||||
|
||||
Reference in New Issue
Block a user