feat: improve prompting

This commit is contained in:
Han Xiao
2025-02-10 12:10:46 +08:00
parent 18f0312c38
commit 441654ac5d
9 changed files with 177 additions and 31 deletions

View File

@@ -289,7 +289,7 @@ I kept the evaluation simple, LLM-as-a-judge and collect some [ego questions](./
I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer.
```bash
npm run eval ./src/evals/ego-questions
npm run eval ./src/evals/questions.json
```
Here's the table comparing plain `gemini-2.0-flash` and `gemini-2.0-flash + node-deepresearch` on the ego set.

3
package-lock.json generated
View File

@@ -20,7 +20,8 @@
"express": "^4.21.2",
"node-fetch": "^3.3.2",
"undici": "^7.3.0",
"zod": "^3.22.4"
"zod": "^3.22.4",
"zod-to-json-schema": "^3.24.1"
},
"devDependencies": {
"@types/commander": "^2.12.0",

View File

@@ -37,7 +37,8 @@
"express": "^4.21.2",
"node-fetch": "^3.3.2",
"undici": "^7.3.0",
"zod": "^3.22.4"
"zod": "^3.22.4",
"zod-to-json-schema": "^3.24.1"
},
"devDependencies": {
"@types/commander": "^2.12.0",

View File

@@ -1,6 +1,6 @@
import {z} from 'zod';
import {z, ZodObject} from 'zod';
import {generateObject} from 'ai';
import {getModel, getMaxTokens, SEARCH_PROVIDER, STEP_SLEEP, LLM_PROVIDER} from "./config";
import {getModel, getMaxTokens, SEARCH_PROVIDER, STEP_SLEEP} from "./config";
import {readUrl} from "./tools/read";
import {handleGenerateObjectError} from './utils/error-handling';
import fs from 'fs/promises';
@@ -8,14 +8,15 @@ import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
import {braveSearch} from "./tools/brave-search";
import {rewriteQuery} from "./tools/query-rewriter";
import {dedupQueries} from "./tools/jina-dedup";
import {evaluateAnswer} from "./tools/evaluator";
import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
import {analyzeSteps} from "./tools/error-analyzer";
import {TokenTracker} from "./utils/token-tracker";
import {ActionTracker} from "./utils/action-tracker";
import {StepAction, AnswerAction} from "./types";
import {TrackerContext} from "./types";
import {search} from "./tools/jina-search";
import {grounding} from "./tools/grounding";
// import {grounding} from "./tools/grounding";
import { zodToJsonSchema } from "zod-to-json-schema";
async function sleep(ms: number) {
const seconds = Math.ceil(ms / 1000);
@@ -43,7 +44,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
properties.references = z.array(
z.object({
exactQuote: z.string().describe("Exact relevant quote from the document"),
url: z.string().describe("URL of the document; must be directly from the context")
url: z.string().describe("source URL; must be directly from the context")
}).required()
).describe("Must be an array of references that support the answer, each reference must contain an exact quote and the URL of the document").optional();
}
@@ -291,6 +292,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
let step = 0;
let totalStep = 0;
let badAttempts = 0;
let schema: ZodObject<any> = getSchema(true, true, true, true)
const gaps: string[] = [question]; // All questions to be answered including the orginal question
const allQuestions = [question];
const allKeywords = [];
@@ -307,6 +309,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
const allURLs: Record<string, string> = {};
const visitedURLs: string[] = [];
const evaluationMetrics: Record<string, any[]> = {};
while (context.tokenTracker.getTotalUsage() < tokenBudget && badAttempts <= maxBadAttempts) {
// add 1s delay to avoid rate limiting
await sleep(STEP_SLEEP);
@@ -317,6 +320,10 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
console.log('Gaps:', gaps);
allowReflect = allowReflect && (gaps.length <= 1);
const currentQuestion = gaps.length > 0 ? gaps.shift()! : question;
if (!evaluationMetrics[currentQuestion]) {
evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context.tokenTracker)
}
// update all urls with buildURLMap
allowRead = allowRead && (Object.keys(allURLs).length > 0);
allowSearch = allowSearch && (Object.keys(allURLs).length < 50); // disable search when too many urls already
@@ -336,14 +343,14 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
allURLs,
false
);
schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch)
const model = getModel('agent');
let object;
let totalTokens = 0;
try {
const result = await generateObject({
model,
schema: getSchema(allowReflect, allowRead, allowAnswer, allowSearch),
schema,
prompt,
maxTokens: getMaxTokens('agent')
});
@@ -384,7 +391,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
});
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
['definitive', 'freshness', 'plurality'], context.tokenTracker);
evaluationMetrics[currentQuestion], context.tokenTracker);
if (currentQuestion === question) {
if (evaluation.pass) {
@@ -437,6 +444,13 @@ ${evaluation.think}
evaluation: evaluation.think,
...errorAnalysis
});
if (errorAnalysis.questionsToAnswer) {
gaps.push(...errorAnalysis.questionsToAnswer.slice(0, 2));
allQuestions.push(...errorAnalysis.questionsToAnswer.slice(0, 2));
gaps.push(question); // always keep the original question in the gaps
}
badAttempts++;
allowAnswer = false; // disable answer action in the immediate next step
diaryContext = [];
@@ -504,7 +518,7 @@ But then you realized you have asked them before. You decided to to think out of
keywordsQueries = dedupedQueries;
if (keywordsQueries.length > 0) {
let googleGrounded = '';
// let googleGrounded = '';
const searchResults = [];
for (const query of keywordsQueries) {
console.log(`Search query: ${query}`);
@@ -515,9 +529,9 @@ But then you realized you have asked them before. You decided to to think out of
case 'jina':
// use jinaSearch
results = {results: (await search(query, context.tokenTracker)).response?.data || []};
if (LLM_PROVIDER === 'gemini') {
googleGrounded = await grounding(query, context.tokenTracker);
}
// if (LLM_PROVIDER === 'gemini') {
// googleGrounded = await grounding(query, context.tokenTracker);
// }
break;
case 'duck':
results = await duckSearch(query, {safeSearch: SafeSearchType.STRICT});
@@ -556,7 +570,8 @@ But then you realized you have asked them before. You decided to to think out of
allKnowledge.push({
question: `What do Internet say about ${thisStep.searchQuery}?`,
answer: googleGrounded + removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')),
answer: removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')),
// answer: googleGrounded + removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')),
// flatten into one url list, and take unique urls
references: searchResults.map(r => r.results.map(r => r.url)).flat().filter((v, i, a) => a.indexOf(v) === i),
type: 'side-info'
@@ -645,10 +660,10 @@ You decided to think out of the box or cut from a completely different angle.`);
}
}
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
}
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
if (isAnswered) {
return {result: thisStep, context};
} else {
@@ -671,13 +686,14 @@ You decided to think out of the box or cut from a completely different angle.`);
true
);
schema = getSchema(false, false, true, false);
const model = getModel('agentBeastMode');
let object;
let totalTokens;
try {
const result = await generateObject({
model,
schema: getSchema(false, false, allowAnswer, false),
schema: schema,
prompt,
maxTokens: getMaxTokens('agentBeastMode')
});
@@ -688,7 +704,7 @@ You decided to think out of the box or cut from a completely different angle.`);
object = result.object;
totalTokens = result.totalTokens;
}
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
thisStep = object as StepAction;
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
context.tokenTracker.trackUsage('agent', totalTokens);
@@ -697,9 +713,15 @@ You decided to think out of the box or cut from a completely different angle.`);
}
}
async function storeContext(prompt: string, memory: any[][], step: number) {
async function storeContext(prompt: string, schema: any, memory: any[][], step: number) {
try {
await fs.writeFile(`prompt-${step}.txt`, prompt);
await fs.writeFile(`prompt-${step}.txt`, `
Prompt:
${prompt}
JSONSchema:
${JSON.stringify(zodToJsonSchema(schema), null, 2)}
`);
const [context, keywords, questions, knowledge] = memory;
await fs.writeFile('context.json', JSON.stringify(context, null, 2));
await fs.writeFile('queries.json', JSON.stringify(keywords, null, 2));

View File

@@ -36,6 +36,19 @@ interface QueryRequest extends Request {
};
}
/**
 * Renders an AnswerAction as markdown for the chat-completions response:
 * inline `(REF_n)` markers in the answer text are rewritten to `[^n]`,
 * and any references are appended as a numbered "## References" list of
 * [exactQuote](url) links. Returns just the rewritten answer when there
 * are no references.
 */
function buildMdFromAnswer(answer: AnswerAction) {
  const refs = answer.references ?? [];
  let referencesBlock = '';
  if (refs.length > 0) {
    const items = refs
      .map((ref, idx) => `
${idx + 1}. [${ref.exactQuote}](${ref.url})`)
      .join('');
    referencesBlock = `
## References
${items}`;
  }
  // Convert each (REF_n) marker embedded in the answer body to [^n].
  const body = answer.answer.replace(/\(REF_(\d+)\)/g, (_, num) => `[^${num}]`);
  return `${body}${referencesBlock}`;
}
// OpenAI-compatible chat completions endpoint
app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
// Check authentication only if secret is set
@@ -175,7 +188,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
system_fingerprint: 'fp_' + requestId,
choices: [{
index: 0,
delta: { content: '</think>\n\n' },
delta: { content: `</think>\n\n` },
logprobs: null,
finish_reason: null
}]
@@ -191,7 +204,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
system_fingerprint: 'fp_' + requestId,
choices: [{
index: 0,
delta: { content: result.action === 'answer' ? (result as AnswerAction).answer : result.think },
delta: { content: result.action === 'answer' ? buildMdFromAnswer(result) : result.think },
logprobs: null,
finish_reason: 'stop'
}]
@@ -210,7 +223,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
index: 0,
message: {
role: 'assistant',
content: result.action === 'answer' ? (result as AnswerAction).answer : result.think
content: result.action === 'answer' ? buildMdFromAnswer(result): result.think
},
logprobs: null,
finish_reason: 'stop'

View File

@@ -10,7 +10,11 @@ const model = getModel('errorAnalyzer');
const responseSchema = z.object({
recap: z.string().describe('Recap of the actions taken and the steps conducted'),
blame: z.string().describe('Which action or the step was the root cause of the answer rejection'),
improvement: z.string().describe('Suggested key improvement for the next iteration, do not use bullet points, be concise and hot-take vibe.')
improvement: z.string().describe('Suggested key improvement for the next iteration, do not use bullet points, be concise and hot-take vibe.'),
questionsToAnswer: z.array(
z.string().describe("each question must be a single line, concise and clear. not composite or compound, less than 20 words.")
).max(2)
.describe("List of most important reflect questions to fill the knowledge gaps"),
});
@@ -93,7 +97,12 @@ The answer is not definitive and fails to provide the requested information. La
"blame": "The root cause of failure was getting stuck in a repetitive search pattern without adapting the strategy. Steps 4-5 repeated the same search, and step 6 deviated to less reliable entertainment sources instead of exploring business journals, news articles, or professional databases. Additionally, the process didn't attempt to triangulate age through indirect information like education history or career milestones.",
"improvement": "1. Avoid repeating identical searches and implement a strategy to track previously searched terms. 2. When direct age/birthdate searches fail, try indirect approaches like: searching for earliest career mentions, finding university graduation years, or identifying first company founding dates. 3. Focus on high-quality business sources and avoid entertainment websites for professional information. 4. Consider using industry event appearances or conference presentations where age-related context might be mentioned. 5. If exact age cannot be determined, provide an estimated range based on career timeline and professional achievements."
"improvement": "1. Avoid repeating identical searches and implement a strategy to track previously searched terms. 2. When direct age/birthdate searches fail, try indirect approaches like: searching for earliest career mentions, finding university graduation years, or identifying first company founding dates. 3. Focus on high-quality business sources and avoid entertainment websites for professional information. 4. Consider using industry event appearances or conference presentations where age-related context might be mentioned. 5. If exact age cannot be determined, provide an estimated range based on career timeline and professional achievements.",
"questionsToAnswer": [
"What alternative professional databases or news archives could provide reliable biographical information?",
"How can we use education history or career milestones to estimate age range?"
]
}
</output>
</example>

View File

@@ -231,6 +231,107 @@ Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
}
// Zod schema for the evaluator model's structured verdict on which optional
// checks a question needs. Definitiveness is always checked separately, so
// only freshness and plurality are decided here; `reasoning` captures the
// model's justification for logging/debugging.
const questionEvaluationSchema = z.object({
needsFreshness: z.boolean().describe('Whether the question requires freshness check'),
needsPlurality: z.boolean().describe('Whether the question requires plurality check'),
reasoning: z.string().describe('Explanation of why these checks are needed or not needed')
});
/**
 * Builds the LLM prompt that classifies a question as needing freshness
 * and/or plurality evaluation (definitiveness is always applied).
 *
 * The prompt embeds rules and few-shot examples; the question itself is
 * JSON.stringify-ed so quotes/newlines inside it cannot break the template.
 * The entire template literal is runtime output — do not reformat it.
 *
 * @param question - the user question to classify
 * @returns the complete prompt string for the evaluator model
 */
function getQuestionEvaluationPrompt(question: string): string {
return `You are an evaluator that determines if a question requires freshness and/or plurality checks in addition to the required definitiveness check.
<evaluation_types>
1. freshness - Checks if the answer needs to be current and up-to-date
2. plurality - Checks if the answer needs to provide multiple items or a specific count
Note: Definitiveness check is always applied regardless of the question type
</evaluation_types>
<rules>
1. Freshness Evaluation:
- Required for questions about current state, recent events, or time-sensitive information
- Required for: prices, versions, leadership positions, status updates
- Look for terms: "current", "latest", "recent", "now", "today", "new"
- Consider company positions, product versions, market data time-sensitive
2. Plurality Evaluation:
- Required when question asks for multiple items or specific counts
- Check for: numbers ("5 examples"), plural nouns, list requests
- Look for: "all", "list", "enumerate", "examples", plural forms
- Required when question implies completeness ("all the reasons", "every factor")
3. Ordering Rules:
- Always include definitive check in the order
- Prioritize freshness for "current/latest" queries as outdated info invalidates other aspects
- Prioritize plurality for explicit numbered requests when freshness isn't critical
- Default order is: definitive -> freshness -> plurality
</rules>
<examples>
Question: "What is the current CEO of OpenAI?"
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"reasoning": "Question asks about current leadership position which requires freshness check. No plurality check needed as it asks for a single position."
}
Question: "List all the AI companies in Berlin"
Evaluation: {
"needsFreshness": false,
"needsPlurality": true,
"reasoning": "Question asks for a comprehensive list ('all') which requires plurality check. No freshness check needed as it's not time-sensitive."
}
Question: "What are the top 5 latest AI models released by OpenAI?"
Evaluation: {
"needsFreshness": true,
"needsPlurality": true,
"reasoning": "Question requires freshness check for 'latest' releases and plurality check for 'top 5' items."
}
Question: "Who created Python?"
Evaluation: {
"needsFreshness": false,
"needsPlurality": false,
"reasoning": "Simple factual question requiring only definitiveness check. No time sensitivity or multiple items needed."
}
</examples>
Now evaluate this question:
Question: ${JSON.stringify(question)}`;
}
/**
 * Asks the evaluator model which checks a given question requires on top
 * of the always-applied definitiveness check (freshness and/or plurality).
 *
 * Token usage is recorded on the supplied tracker (or a throwaway one when
 * none is given). If the LLM call fails for any reason, falls back to the
 * full set of checks in the standard order.
 *
 * @param question - the question to classify
 * @param tracker - optional token tracker to charge usage against
 * @returns ordered list of evaluation types to run for this question
 */
export async function evaluateQuestion(
  question: string,
  tracker?: TokenTracker
): Promise<EvaluationType[]> {
  try {
    const evaluation = await generateObject({
      model: getModel('evaluator'),
      schema: questionEvaluationSchema,
      prompt: getQuestionEvaluationPrompt(question),
      maxTokens: getMaxTokens('evaluator')
    });

    const usedTokens = evaluation.usage?.totalTokens || 0;
    (tracker ?? new TokenTracker()).trackUsage('evaluator', usedTokens);
    console.log('Question Evaluation:', evaluation.object);

    // Definitiveness is always first; the other checks are opt-in per verdict,
    // which fixes the order as definitive -> freshness -> plurality.
    const verdict = evaluation.object;
    const checks: EvaluationType[] = ['definitive'];
    if (verdict.needsFreshness) {
      checks.push('freshness');
    }
    if (verdict.needsPlurality) {
      checks.push('plurality');
    }
    console.log('Question Metrics:', checks)
    return checks;
  } catch (error) {
    // Be conservative on failure: run every check in the standard order.
    console.error('Question evaluation failed:', error);
    return ['definitive', 'freshness', 'plurality'];
  }
}
export async function evaluateAnswer(
question: string,
answer: string,

View File

@@ -97,8 +97,6 @@ export interface ReadResponse {
}
export type EvaluationResponse = {
pass: boolean;
think: string;
@@ -121,6 +119,7 @@ export type ErrorAnalysisResponse = {
recap: string;
blame: string;
improvement: string;
questionsToAnswer: string[];
};
export interface SearchResult {
@@ -214,8 +213,8 @@ export interface ChatCompletionChunk {
}
// Tracker Types
import { TokenTracker } from './utils/token-tracker';
import { ActionTracker } from './utils/action-tracker';
import {TokenTracker} from './utils/token-tracker';
import {ActionTracker} from './utils/action-tracker';
export interface TrackerContext {
tokenTracker: TokenTracker;