refactor: schemas

Han Xiao 2025-02-25 15:12:19 +08:00
parent 3226aedf48
commit 66490f3848
7 changed files with 637 additions and 548 deletions
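In short, this commit folds the scattered schema builders (the old getSchema() helper, the per-tool zod response schemas, and the MAX_*_PER_STEP constants) into a single Schemas class in src/utils/schemas.ts, which detects the question's language once and is then threaded through the tools as a SchemaGen instance. A rough sketch of the new call pattern, pieced together from the hunks below; the wrapper function and its arguments are illustrative, not lifted from the repo:

import {Schemas} from "./utils/schemas";
import {evaluateQuestion} from "./tools/evaluator";
import {TrackerContext, EvaluationType} from "./types";

async function sketchNewCallPattern(question: string, context: TrackerContext) {
  // one Schemas instance per question; it detects language code + style up front
  const SchemaGen = new Schemas(question);
  // the agent schema is now built by the class instead of the removed getSchema() helper
  const schema = SchemaGen.getAgentSchema(true, true, true, true, true);
  // evaluateQuestion now returns a plain list of checks and takes the SchemaGen instance
  const evaluationTypes: EvaluationType[] = await evaluateQuestion(question, context, SchemaGen);
  return {schema, evaluationTypes};
}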

View File

@ -11,7 +11,7 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
import {analyzeSteps} from "./tools/error-analyzer";
import {TokenTracker} from "./utils/token-tracker";
import {ActionTracker} from "./utils/action-tracker";
import {StepAction, AnswerAction, KnowledgeItem, EvaluationCriteria, SearchResult} from "./types";
import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType} from "./types";
import {TrackerContext} from "./types";
import {search} from "./tools/jina-search";
// import {grounding} from "./tools/grounding";
@ -21,6 +21,7 @@ import {CodeSandbox} from "./tools/code-sandbox";
import {serperSearch} from './tools/serper-search';
import {getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools";
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
async function sleep(ms: number) {
const seconds = Math.ceil(ms / 1000);
@ -28,66 +29,6 @@ async function sleep(ms: number) {
return new Promise(resolve => setTimeout(resolve, ms));
}
const MAX_URLS_PER_STEP = 2
const MAX_QUERIES_PER_STEP = 5
const MAX_REFLECT_PER_STEP = 3
function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boolean, allowSearch: boolean, allowCoding: boolean, languageStyle: string = 'same language as the question') {
const actions: string[] = [];
const properties: Record<string, z.ZodTypeAny> = {
action: z.enum(['placeholder']), // Will update later with actual actions
think: z.string().describe(`Explain why you chose this action and the chain-of-thought behind choosing it, in first-person narrative.`).max(500)
};
if (allowSearch) {
actions.push("search");
properties.searchRequests = z.array(
z.string().max(30)
.describe(`A natural language search request in ${languageStyle}. Based on the deep intention behind the original question and the expected answer format.`)).describe(`Required when action='search'. Always prefer a single request, only add another request if the original question covers multiple aspects or elements and one search request is definitely not enough, each request focuses on one specific aspect of the original question. Minimize mutual information between each request. Maximum ${MAX_QUERIES_PER_STEP} search requests.`).max(MAX_QUERIES_PER_STEP);
}
if (allowCoding) {
actions.push("coding");
properties.codingIssue = z.string().max(500)
.describe("Required when action='coding'. Describe what issue to solve with coding, format like a github issue ticket. Specify the input value when it is short.").optional();
}
if (allowAnswer) {
actions.push("answer");
properties.references = z.array(
z.object({
exactQuote: z.string().describe("Exact relevant quote from the document, must be a soundbite, short and to the point, no fluff").max(30),
url: z.string().describe("source URL; must be directly from the context")
}).required()
).describe("Required when action='answer'. Must be an array of references that support the answer, each reference must contain an exact quote and the URL of the document").optional();
properties.answer = z.string()
.describe(`Required when action='answer'. Must be definitive, no ambiguity, uncertainty, or disclaimers. Must be in ${languageStyle} and confident. Use markdown footnote syntax like [^1], [^2] to refer to the corresponding reference item`).optional();
}
if (allowReflect) {
actions.push("reflect");
properties.questionsToAnswer = z.array(
z.string().describe("each question must be a single line, Questions must be: Original (not variations of existing questions); Focused on single concepts; Under 20 words; Non-compound/non-complex")
).max(MAX_REFLECT_PER_STEP)
.describe(`Required when action='reflect'. List of most important questions to fill the knowledge gaps of finding the answer to the original question. Maximum provide ${MAX_REFLECT_PER_STEP} reflect questions.`).optional();
}
if (allowRead) {
actions.push("visit");
properties.URLTargets = z.array(z.string())
.max(MAX_URLS_PER_STEP)
.describe(`Required when action='visit'. Must be an array of URLs, choose up to ${MAX_URLS_PER_STEP} of the most relevant URLs to visit`).optional();
}
// Update the enum values after collecting all actions
properties.action = z.enum(actions as [string, ...string[]])
.describe("Must match exactly one action type");
return z.object(properties);
}
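// Self-contained sketch (assuming only the zod package) of the placeholder-enum
// technique the removed getSchema() above uses and which, per this commit, moves into
// Schemas.getAgentSchema(): collect the allowed actions first, then swap the real enum
// in before building the object schema. The field set is trimmed for illustration.
import {z} from "zod";

function buildActionSchemaSketch(allowSearch: boolean, allowAnswer: boolean) {
  // at least one flag should be true, otherwise z.enum() would receive an empty list
  const actions: string[] = [];
  const properties: Record<string, z.ZodTypeAny> = {
    action: z.enum(['placeholder']), // replaced below once all actions are collected
    think: z.string().max(500),
  };
  if (allowSearch) {
    actions.push('search');
    properties.searchRequests = z.array(z.string().max(30)).max(5).optional();
  }
  if (allowAnswer) {
    actions.push('answer');
    properties.answer = z.string().optional();
  }
  properties.action = z.enum(actions as [string, ...string[]])
    .describe('Must match exactly one action type');
  return z.object(properties);
}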
function getPrompt(
context?: string[],
@ -192,7 +133,7 @@ ${learnedStrategy}
if (allURLs && allURLs.length > 0) {
urlList = allURLs
.filter(r => 'url' in r)
.map(r => ` + "${r.url}": "${r.title}"`)
.join('\n');
}
@ -290,7 +231,6 @@ ${actionSections.join('\n\n')}
}
const allContext: StepAction[] = []; // all steps in the current session, including those that led to wrong results
function updateContext(step: any) {
@ -298,29 +238,31 @@ function updateContext(step: any) {
}
export async function getResponse(question?: string,
tokenBudget: number = 1_000_000,
maxBadAttempts: number = 3,
existingContext?: Partial<TrackerContext>,
messages?: Array<CoreAssistantMessage | CoreUserMessage>
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[] }> {
const context: TrackerContext = {
tokenTracker: existingContext?.tokenTracker || new TokenTracker(tokenBudget),
actionTracker: existingContext?.actionTracker || new ActionTracker()
};
let step = 0;
let totalStep = 0;
let badAttempts = 0;
let schema: ZodObject<any> = getSchema(true, true, true, true, true)
question = question?.trim() as string;
if (messages && messages.length > 0) {
question = (messages[messages.length - 1]?.content as string).trim();
} else {
messages = [{role: 'user', content: question.trim()}]
}
const SchemaGen = new Schemas(question);
const context: TrackerContext = {
tokenTracker: existingContext?.tokenTracker || new TokenTracker(tokenBudget),
actionTracker: existingContext?.actionTracker || new ActionTracker()
};
let schema: ZodObject<any> = SchemaGen.getAgentSchema(true, true, true, true, true)
const gaps: string[] = [question]; // All questions to be answered including the original question
const allQuestions = [question];
const allKeywords = [];
@ -338,7 +280,7 @@ export async function getResponse(question?: string,
const allURLs: Record<string, SearchResult> = {};
const visitedURLs: string[] = [];
const evaluationMetrics: Record<string, EvaluationCriteria> = {};
const evaluationMetrics: Record<string, EvaluationType[]> = {};
while (context.tokenTracker.getTotalUsage().totalTokens < tokenBudget && badAttempts <= maxBadAttempts) {
// add 1s delay to avoid rate limiting
step++;
@ -349,7 +291,8 @@ export async function getResponse(question?: string,
allowReflect = allowReflect && (gaps.length <= 1);
const currentQuestion: string = gaps.length > 0 ? gaps.shift()! : question
if (!evaluationMetrics[currentQuestion]) {
evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context)
evaluationMetrics[currentQuestion] =
await evaluateQuestion(currentQuestion, context, SchemaGen)
}
// update all urls with buildURLMap
@ -371,8 +314,7 @@ export async function getResponse(question?: string,
getUnvisitedURLs(allURLs, visitedURLs),
false,
);
schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch, allowCoding,
evaluationMetrics[currentQuestion].languageStyle)
schema = SchemaGen.getAgentSchema(allowReflect, allowRead, allowAnswer, allowSearch, allowCoding)
const generator = new ObjectGeneratorSafe(context.tokenTracker);
const result = await generator.generateObject({
model: 'agent',
@ -420,10 +362,11 @@ export async function getResponse(question?: string,
context.actionTracker.trackThink(`But wait, let me evaluate the answer first.`)
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep,
const evaluation = await evaluateAnswer(currentQuestion, thisStep,
evaluationMetrics[currentQuestion],
context,
visitedURLs
visitedURLs,
SchemaGen
);
if (currentQuestion.trim() === question) {
@ -462,7 +405,7 @@ The evaluator thinks your answer is bad because:
${evaluation.think}
`);
// store the bad context and reset the diary context
const {response: errorAnalysis} = await analyzeSteps(diaryContext, context);
const errorAnalysis = await analyzeSteps(diaryContext, context, SchemaGen);
allKnowledge.push({
question: currentQuestion,
@ -554,7 +497,7 @@ But then you realized you have asked them before. You decided to think out of
thisStep.searchRequests = chooseK((await dedupQueries(thisStep.searchRequests, [], context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
// rewrite queries
let {queries: keywordsQueries} = await rewriteQuery(thisStep, context);
let {queries: keywordsQueries} = await rewriteQuery(thisStep, context, SchemaGen);
// avoid existing searched queries
keywordsQueries = chooseK((await dedupQueries(keywordsQueries, allKeywords, context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
@ -717,7 +660,7 @@ You decided to think out of the box or cut from a completely different angle.`);
allowRead = false;
}
} else if (thisStep.action === 'coding' && thisStep.codingIssue) {
const sandbox = new CodeSandbox({allContext, visitedURLs, allURLs, allKnowledge}, context);
const sandbox = new CodeSandbox({allContext, visitedURLs, allURLs, allKnowledge}, context, SchemaGen);
try {
const result = await sandbox.solve(thisStep.codingIssue);
allKnowledge.push({
@ -778,8 +721,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
true,
);
schema = getSchema(false, false, true, false, false,
evaluationMetrics[question]?.languageStyle || 'same language as the question');
schema = SchemaGen.getAgentSchema(false, false, true, false, false);
const generator = new ObjectGeneratorSafe(context.tokenTracker);
const result = await generator.generateObject({
model: 'agentBeastMode',

View File

@ -1,17 +1,7 @@
import { z } from 'zod';
import { ObjectGeneratorSafe } from "../utils/safe-generator";
import {TrackerContext} from "../types";
import {ObjectGeneratorSafe} from "../utils/safe-generator";
import {CodeGenResponse, TrackerContext} from "../types";
import {Schemas} from "../utils/schemas";
// Define the response schema for code generation
const codeGenerationSchema = z.object({
think: z.string().describe('Short explain or comments on the thought process behind the code, in first person.').max(200),
code: z.string().describe('The JavaScript code that solves the problem and always use \'return\' statement to return the result. Focus on solving the core problem; No need for error handling or try-catch blocks or code comments. No need to declare variables that are already available, especially big long strings or arrays.'),
});
// Define the types
interface CodeGenerationResponse {
code: string;
}
interface SandboxResult {
success: boolean;
@ -72,33 +62,36 @@ export class CodeSandbox {
private generator: ObjectGeneratorSafe;
private maxAttempts: number;
private context: Record<string, any>;
private schemaGen: Schemas;
constructor(
context: any = {},
trackers?: TrackerContext,
maxAttempts: number = 3
trackers: TrackerContext,
schemaGen: Schemas,
maxAttempts: number = 3,
) {
this.trackers = trackers;
this.generator = new ObjectGeneratorSafe(trackers?.tokenTracker);
this.maxAttempts = maxAttempts;
this.context = context;
this.schemaGen = schemaGen;
}
private async generateCode(
problem: string,
previousAttempts: Array<{ code: string; error?: string }> = []
): Promise<CodeGenerationResponse> {
): Promise<CodeGenResponse> {
const prompt = getPrompt(problem, analyzeStructure(this.context), previousAttempts);
const result = await this.generator.generateObject({
model: 'coder',
schema: codeGenerationSchema,
schema: this.schemaGen.getCodeGeneratorSchema(),
prompt,
});
this.trackers?.actionTracker.trackThink(result.object.think);
return result.object;
return result.object as CodeGenResponse;
}
private evaluateCode(code: string): SandboxResult {
@ -143,7 +136,7 @@ export class CodeSandbox {
for (let i = 0; i < this.maxAttempts; i++) {
// Generate code
const generation = await this.generateCode(problem, attempts);
const { code } = generation;
const {code} = generation;
console.log(`Coding attempt ${i + 1}:`, code);
// Evaluate the code
@ -180,61 +173,61 @@ export class CodeSandbox {
}
function formatValue(value: any): string {
if (value === null) return 'null';
if (value === undefined) return 'undefined';
const type = typeof value;
if (type === 'string') {
// Clean and truncate string value
const cleaned = value.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim();
return cleaned.length > 50 ?
`"${cleaned.slice(0, 47)}..."` :
`"${cleaned}"`;
}
if (type === 'number' || type === 'boolean') {
return String(value);
}
if (value instanceof Date) {
return `"${value.toISOString()}"`;
}
return '';
}
export function analyzeStructure(value: any, indent = ''): string {
if (value === null) return 'null';
if (value === undefined) return 'undefined';
const type = typeof value;
if (type === 'function') {
return 'Function';
}
// Handle atomic types with example values
if (type !== 'object' || value instanceof Date) {
const formattedValue = formatValue(value);
return `${type}${formattedValue ? ` (example: ${formattedValue})` : ''}`;
}
if (Array.isArray(value)) {
if (value.length === 0) return 'Array<unknown>';
const sampleItem = value[0];
return `Array<${analyzeStructure(sampleItem, indent + ' ')}>`;
}
const entries = Object.entries(value);
if (entries.length === 0) return '{}';
const properties = entries
.map(([key, val]) => {
const analyzed = analyzeStructure(val, indent + ' ');
return `${indent} "${key}": ${analyzed}`;
})
.join(',\n');
return `{\n${properties}\n${indent}}`;
}
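// Quick usage sketch of analyzeStructure() above. The literal object is illustrative,
// and the function is assumed to be in scope (this file's path is not shown in the diff).
// The returned description is roughly:
const structureSketch = analyzeStructure({name: "Jina", founded: 2020, tags: ["ai"]});
// {
//  "name": string (example: "Jina"),
//  "founded": number (example: 2020),
//  "tags": Array<string (example: "ai")>
// }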

View File

@ -1,18 +1,8 @@
import {z} from 'zod';
import {ErrorAnalysisResponse, TrackerContext} from '../types';
import {ObjectGeneratorSafe} from "../utils/safe-generator";
import {Schemas} from "../utils/schemas";
const responseSchema = z.object({
recap: z.string().describe('Recap of the actions taken and the steps conducted in first person narrative.').max(500),
blame: z.string().describe('Which action or the step was the root cause of the answer rejection').max(500),
improvement: z.string().describe('Suggested key improvement for the next iteration, do not use bullet points, be concise and hot-take vibe.').max(500),
questionsToAnswer: z.array(
z.string().describe("each question must be a single line, concise and clear. not composite or compound, less than 20 words.")
).max(2)
.describe("List of most important reflect questions to fill the knowledge gaps"),
});
function getPrompt(diaryContext: string[]): string {
return `You are an expert at analyzing search and reasoning processes. Your task is to analyze the given sequence of steps and identify what went wrong in the search process.
@ -110,15 +100,16 @@ ${diaryContext.join('\n')}
const TOOL_NAME = 'errorAnalyzer';
export async function analyzeSteps(
diaryContext: string[],
trackers?: TrackerContext
): Promise<{ response: ErrorAnalysisResponse }> {
trackers: TrackerContext,
schemaGen: Schemas
): Promise<ErrorAnalysisResponse> {
try {
const generator = new ObjectGeneratorSafe(trackers?.tokenTracker);
const prompt = getPrompt(diaryContext);
const result = await generator.generateObject({
model: TOOL_NAME,
schema: responseSchema,
schema: schemaGen.getErrorAnalysisSchema(),
prompt,
});
@ -126,7 +117,7 @@ export async function analyzeSteps(
trackers?.actionTracker.trackThink(result.object.blame);
trackers?.actionTracker.trackThink(result.object.improvement);
return { response: result.object };
return result.object as ErrorAnalysisResponse;
} catch (error) {
console.error(`Error in ${TOOL_NAME}`, error);

View File

@ -1,57 +1,10 @@
import {z} from 'zod';
import {GenerateObjectResult} from 'ai';
import {AnswerAction, EvaluationCriteria, EvaluationResponse, EvaluationType, TrackerContext} from '../types';
import {AnswerAction, EvaluationResponse, EvaluationType, TrackerContext} from '../types';
import {readUrl, removeAllLineBreaks} from "./read";
import {ObjectGeneratorSafe} from "../utils/safe-generator";
import {Schemas} from "../utils/schemas";
const baseSchema = {
pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
think: z.string().describe('Explanation the thought process why the answer does not pass the evaluation criteria').max(500)
};
const definitiveSchema = z.object({
...baseSchema,
type: z.literal('definitive')
});
const freshnessSchema = z.object({
...baseSchema,
type: z.literal('freshness'),
freshness_analysis: z.object({
days_ago: z.number().describe('Inferred dates or timeframes mentioned in the answer and relative to the current time'),
max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
})
});
const pluralitySchema = z.object({
...baseSchema,
type: z.literal('plurality'),
plurality_analysis: z.object({
count_expected: z.number().optional().describe('Number of items expected if specified in question'),
count_provided: z.number().describe('Number of items provided in answer')
})
});
const completenessSchema = z.object({
...baseSchema,
type: z.literal('completeness'),
completeness_analysis: z.object({
aspects_expected: z.string().describe('Comma-separated list of all aspects or dimensions that the question explicitly asks for.'),
aspects_provided: z.string().describe('Comma-separated list of all aspects or dimensions that were actually addressed in the answer'),
})
});
const attributionSchema = z.object({
...baseSchema,
type: z.literal('attribution'),
attribution_analysis: z.object({
sources_provided: z.boolean().describe('Whether the answer provides source references'),
sources_verified: z.boolean().describe('Whether the provided sources contain the claimed information'),
quotes_accurate: z.boolean().describe('Whether the quotes accurately represent the source content')
})
});
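// The refactor replaces the per-type schemas above with Schemas.getEvaluatorSchema(type),
// whose body is not shown in this diff. Purely as a hypothetical sketch, assuming it
// mirrors the removed definitions, it could look roughly like this (zod assumed; the
// local type alias mirrors the EvaluationType union in ../types):
import {z} from "zod";

type EvalTypeSketch = 'definitive' | 'freshness' | 'plurality' | 'attribution' | 'completeness';

function evaluatorSchemaSketch(evalType: EvalTypeSketch) {
  const base = {
    pass: z.boolean().describe('Whether the answer passes the evaluation'),
    think: z.string().max(500),
  };
  switch (evalType) {
    case 'attribution':
      return z.object({
        ...base,
        type: z.literal('attribution'),
        attribution_analysis: z.object({
          sources_provided: z.boolean(),
          sources_verified: z.boolean(),
          quotes_accurate: z.boolean(),
        }),
      });
    case 'freshness':
      return z.object({
        ...base,
        type: z.literal('freshness'),
        freshness_analysis: z.object({
          days_ago: z.number(),
          max_age_days: z.number().optional(),
        }),
      });
    default:
      // definitive / plurality / completeness variants trimmed here for brevity
      return z.object({...base, type: z.literal(evalType)});
  }
}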
function getAttributionPrompt(question: string, answer: string, sourceContent: string): string {
return `You are an evaluator that verifies if answer content is properly attributed to and supported by the provided sources.
@ -80,26 +33,52 @@ Question: "What are Jina AI's main products?"
Answer: "According to Jina AI's website, their main products are DocArray and Jina Framework."
Source Content: "Jina AI's flagship products include DocArray, Jina Framework, and JCloud, offering a complete ecosystem for neural search applications."
Evaluation: {
"pass": false,
"think": "The answer omits JCloud which is mentioned as a main product in the source. The information provided is incomplete and potentially misleading as it fails to mention a significant product from the company's ecosystem.",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": false,
"quotes_accurate": false
}
"pass": false,
}
Question: "When was Python first released?"
Answer: "Python was first released in 1991 by Guido van Rossum."
Source Content: "Python was first released in 1991 by Guido van Rossum while working at CWI."
Evaluation: {
"pass": true,
"think": "The answer accurately reflects the core information from the source about Python's release date and creator, though it omits the additional context about CWI which isn't essential to the question.",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": true,
"quotes_accurate": true
}
"pass": true,
}
Question: "长城是什么时候建造的?"
Answer: "长城始建于公元前7世纪但现存的大部分长城是明朝时期修建的。"
Source Content: "中国长城始建于公元前7世纪的春秋战国时期历经多个朝代修建和扩展但现存的大部分长城是明朝1368-1644年时期修建的。"
Evaluation: {
"think": "这个回答准确地反映了原文中关于长城建造时间的核心信息包括最初的建造时期和现存长城的主要来源。虽然省略了具体的年份范围1368-1644年但这对回答问题的核心内容不是必要的。",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": true,
"quotes_accurate": true
}
"pass": true,
}
Question: "Wann wurde die Berliner Mauer gebaut?"
Answer: "Die Berliner Mauer wurde am 13. August 1961 errichtet."
Source Content: "Die Berliner Mauer wurde am 13. August 1961 von der DDR-Regierung errichtet und fiel am 9. November 1989."
Evaluation: {
"think": "Die Antwort gibt das korrekte Datum des Mauerbaus wieder, wie in der Quelle angegeben. Der zusätzliche Kontext über den Fall der Mauer wurde weggelassen, da er für die spezifische Frage nach dem Bauzeitpunkt nicht wesentlich ist.",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": true,
"quotes_accurate": true
}
"pass": true,
}
</examples>
@ -126,36 +105,57 @@ Definitiveness is the king! The following types of responses are NOT definitive
Question: "What are the system requirements for running Python 3.9?"
Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
Evaluation: {
"pass": false,
"think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
"pass": false,
}
Question: "What are the system requirements for running Python 3.9?"
Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
Evaluation: {
"pass": true,
"think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
"pass": true,
}
Question: "Who will be the president of the United States in 2032?"
Answer: "I cannot predict the future, it depends on the election results."
Evaluation: {
"pass": false,
"think": "The answer contains a statement of inability to predict the future, making it non-definitive."
"pass": false,
}
Question: "Who is the sales director at Company X?"
Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
Evaluation: {
"pass": false,
"think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
"pass": false,
}
Question: "what is the twitter account of jina ai's founder?"
Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
Evaluation: {
"pass": false,
"think": "The answer indicates a lack of information rather than providing a definitive response."
"pass": false,
}
Question: "量子コンピュータの計算能力を具体的に測定する方法は何ですか?"
Answer: "量子コンピュータの計算能力は量子ビット(キュービット)の数、ゲート忠実度、コヒーレンス時間で測定されます。"
Evaluation: {
"think": "The answer provides specific, definitive metrics for measuring quantum computing power without uncertainty markers or qualifications."
"pass": true,
}
Question: "如何证明哥德巴赫猜想是正确的?"
Answer: "目前尚无完整证明但2013年张益唐证明了存在无穷多对相差不超过7000万的素数后来这个界被缩小到246。"
Evaluation: {
"think": "The answer begins by stating no complete proof exists, which is a non-definitive response, and then shifts to discussing a related but different theorem about bounded gaps between primes."
"pass": false,
}
Question: "Wie kann man mathematisch beweisen, dass P ≠ NP ist?"
Answer: "Ein Beweis für P ≠ NP erfordert, dass man zeigt, dass mindestens ein NP-vollständiges Problem nicht in polynomieller Zeit lösbar ist. Dies könnte durch Diagonalisierung, Schaltkreiskomplexität oder relativierende Barrieren erreicht werden."
Evaluation: {
"think": "The answer provides concrete mathematical approaches to proving P ≠ NP without uncertainty markers, presenting definitive methods that could be used."
"pass": true,
}
</examples>
@ -283,6 +283,27 @@ Aspects_Expected: "economic factors, political factors"
Aspects_Provided: "real estate market collapse, high-risk lending, mortgage-backed securities, derivative products, risk disguising, credit assessment failures, legislative changes, regulatory guardrail elimination, leverage, speculation"
Think: "The question explicitly asks about two categories of factors: economic and political. The answer addresses economic factors ('real estate market collapse', 'high-risk lending', 'mortgage-backed securities', 'derivative products', 'risk disguising', 'credit assessment failures') and political factors ('legislative changes', 'regulatory guardrail elimination'). While using different terminology, the answer covers both explicitly requested aspects."
Pass: true
Question: "コロナウイルスの感染拡大が経済、教育システム、および医療インフラにどのような影響を与えましたか?"
Answer: "コロナウイルスは世界経済に甚大な打撃を与え、多くの企業が倒産し、失業率が急増しました。教育については、遠隔学習への移行が進み、デジタル格差が浮き彫りになりましたが、新しい教育テクノロジーの採用も加速しました。"
Aspects_Expected: "経済、教育システム、医療インフラ"
Aspects_Provided: "世界経済、企業倒産、失業率、遠隔学習、デジタル格差、教育テクノロジー"
Think: "質問では明示的にコロナウイルスの影響の三つの側面について尋ねています:経済、教育システム、医療インフラです。回答は経済(「世界経済」「企業倒産」「失業率」について)と教育システム(「遠隔学習」「デジタル格差」「教育テクノロジー」について)に対応していますが、質問で明示的に求められていた医療インフラへの影響についての議論が完全に省略されています。"
Pass: false
Question: "请解释人工智能在医疗诊断、自动驾驶和客户服务方面的应用。"
Answer: "在医疗领域AI算法可以分析医学影像以检测癌症和其他疾病准确率有时甚至超过人类专家。自动驾驶技术利用机器学习处理来自雷达、激光雷达和摄像头的数据实时做出驾驶决策。在客户服务方面聊天机器人和智能助手能够处理常见问题分类客户查询并在必要时将复杂问题转给人工代表。"
Aspects_Expected: "医疗诊断、自动驾驶、客户服务"
Aspects_Provided: "医学影像分析、癌症检测、雷达数据处理、激光雷达数据处理、摄像头数据处理、实时驾驶决策、聊天机器人、智能助手、客户查询分类"
Think: "问题明确要求解释人工智能在三个领域的应用:医疗诊断、自动驾驶和客户服务。回答虽然使用了不同的术语,但涵盖了所有三个方面:医疗诊断(讨论了'医学影像分析'和'癌症检测'),自动驾驶(包括'雷达数据处理'、'激光雷达数据处理'、'摄像头数据处理'和'实时驾驶决策'),以及客户服务(提到了'聊天机器人'、'智能助手'和'客户查询分类')。尽管使用了不同的表述,但所有明确提及的方面都得到了全面覆盖。"
Pass: true
Question: "Comment les changements climatiques affectent-ils la production agricole, les écosystèmes marins et la santé publique dans les régions côtières?"
Answer: "Les variations de température et de précipitations modifient les cycles de croissance des cultures et la distribution des ravageurs agricoles, nécessitant des adaptations dans les pratiques de culture. Dans les océans, l'acidification et le réchauffement des eaux entraînent le blanchissement des coraux et la migration des espèces marines vers des latitudes plus froides, perturbant les chaînes alimentaires existantes."
Aspects_Expected: "production agricole, écosystèmes marins, santé publique"
Aspects_Provided: "cycles de croissance, distribution des ravageurs, adaptations des pratiques de culture, acidification des océans, réchauffement des eaux, blanchissement des coraux, migration des espèces marines, perturbation des chaînes alimentaires"
Think: "La question demande explicitement les effets du changement climatique sur trois aspects: la production agricole, les écosystèmes marins et la santé publique dans les régions côtières. La réponse aborde la production agricole (en discutant des 'cycles de croissance', de la 'distribution des ravageurs' et des 'adaptations des pratiques de culture') et les écosystèmes marins (en couvrant 'l'acidification des océans', le 'réchauffement des eaux', le 'blanchissement des coraux', la 'migration des espèces marines' et la 'perturbation des chaînes alimentaires'). Cependant, elle omet complètement toute discussion sur les effets sur la santé publique dans les régions côtières, qui était explicitement demandée dans la question."
Pass: false
</examples>
Now evaluate this pair:
@ -333,14 +354,6 @@ Answer: ${answer}`;
}
const questionEvaluationSchema = z.object({
needsFreshness: z.boolean().describe('Whether the question requires freshness check'),
needsPlurality: z.boolean().describe('Whether the question requires plurality check'),
needsCompleteness: z.boolean().describe('Whether the question requires completeness check'),
think: z.string().describe('A very concise explain of why you choose those checks are needed in first person, extremely short.').max(500),
languageStyle: z.string().describe('The language being used and the overall vibe/mood of the question').max(50),
});
function getQuestionEvaluationPrompt(question: string): string {
return `You are an evaluator that determines if a question requires freshness, plurality, and/or completeness checks in addition to the required definitiveness check.
@ -348,12 +361,9 @@ function getQuestionEvaluationPrompt(question: string): string {
1. freshness - Checks if the question is time-sensitive or requires very recent information
2. plurality - Checks if the question asks for multiple items, examples, or a specific count or enumeration
3. completeness - Checks if the question explicitly mentions multiple named elements that all need to be addressed
4. language style - Identifies both the language used and the overall vibe of the question
</evaluation_types>
<rules>
If question is a simple greeting, chit-chat, or general knowledge, provide the answer directly.
1. Freshness Evaluation:
- Required for questions about current state, recent events, or time-sensitive information
- Required for: prices, versions, leadership positions, status updates
@ -379,132 +389,88 @@ If question is a simple greeting, chit-chat, or general knowledge, provide the a
- Look for explicitly named elements separated by commas, "and", "or", bullets
- Example patterns: "comparing X and Y", "differences between A, B, and C", "both P and Q"
- DO NOT trigger for elements that aren't specifically named
4. Language Style Analysis:
Combine both language and emotional vibe in a descriptive phrase, considering:
- Language: The primary language or mix of languages used
- Emotional tone: panic, excitement, frustration, curiosity, etc.
- Formality level: academic, casual, professional, etc.
- Domain context: technical, academic, social, etc.
</rules>
<examples>
<example-1>
Question: "谁发明了微积分?牛顿和莱布尼兹各自的贡献是什么?"
<output>
"think": "这是关于微积分历史的问题,不需要最新信息。问题特别提到了牛顿和莱布尼兹两个人,要求分析他们各自的贡献,所以我需要全面回答这两部分内容。完整性比较重要,而不是提供多个不同答案。",
"needsFreshness": false,
"needsPlurality": false,
"needsCompleteness": true,
</output>
</example-1>
<example-2>
Question: "fam PLEASE help me calculate the eigenvalues of this 4x4 matrix ASAP!! [matrix details] got an exam tmrw 😭"
Evaluation: {
"needsFreshness": false,
"needsPlurality": true,
"needsCompleteness": false,
"think": "I see the user needs help with eigenvalues - that's a calculation task. Since it's a 4x4 matrix, there will be multiple eigenvalues to find, so plurality is needed. There are no explicitly named entities, aspects, or elements that need to be addressed, so completeness check doesn't apply.",
"languageStyle": "panicked student English with math jargon"
}
<output>
"think": "This is a math question about eigenvalues which doesn't change over time, so I don't need fresh info. A 4x4 matrix has multiple eigenvalues, so I'll need to provide several results. The student just wants the eigenvalues calculated, not asking me to address multiple specific topics.",
"needsFreshness": false,
"needsPlurality": true,
"needsCompleteness": false,
</output>
</example-2>
Question: "Can someone explain how tf did Ferrari mess up their pit stop strategy AGAIN?! 🤦‍♂️ #MonacoGP"
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
"think": "The user is asking about a specific F1 race incident. The 'AGAIN' and MonacoGP hashtag tell me this is about a recent event (freshness). The question explicitly mentions Ferrari and MonacoGP as named entities that need to be addressed, so completeness check applies. Since completeness takes precedence, I set plurality to false.",
"languageStyle": "frustrated fan English with F1 terminology"
}
<example-3>
Question: "Quelles sont les principales différences entre le romantisme et le réalisme dans la littérature du 19ème siècle?"
<output>
"think": "C'est une question sur l'histoire littéraire, donc je n'ai pas besoin d'informations récentes. Je dois comparer deux mouvements spécifiques: le romantisme et le réalisme. Ma réponse doit couvrir ces deux éléments, donc l'exhaustivité est importante ici. La pluralité n'est pas la priorité dans ce cas.",
"needsFreshness": false,
"needsPlurality": false,
"needsCompleteness": true,
</output>
</example-3>
Question: "肖老师您好,请您介绍一下最近量子计算领域的三个重大突破,特别是它们在密码学领域的应用价值吗?🤔"
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
"think": "The user wants three recent quantum computing breakthroughs and the '最近' (recent) indicates freshness needed. They explicitly request analysis of two named domains: quantum computing ('量子计算') and cryptography ('密码学'), so completeness check applies. Since completeness takes precedence over plurality, I set plurality to false.",
"languageStyle": "formal technical Chinese with academic undertones"
}
<example-4>
Question: "Shakespeare の最も有名な悲劇を5つ挙げ、簡単にあらすじを説明してください。"
<output>
"think": "シェイクスピアの悲劇についての質問だから、最新情報は必要ないな。「5つ挙げ」とはっきり書いてあるから、複数の回答が必要だ。どの悲劇を選ぶかは私次第で、特定の作品について比較するよう求められているわけじゃないから、完全性よりも複数性が重要だな。",
"needsFreshness": false,
"needsPlurality": true,
"needsCompleteness": false,
</output>
</example-4>
Question: "Bruder krass, kannst du mir erklären warum meine neural network training loss komplett durchdreht? Hab schon alles probiert 😤"
Evaluation: {
"needsFreshness": false,
"needsPlurality": true,
"needsCompleteness": false,
"think": "The user has a technical ML problem but explains it very casually. They've 'tried everything' so I'll need to cover multiple debugging options (plurality). They don't explicitly mention multiple named elements that must be addressed, so completeness check doesn't apply.",
"languageStyle": "frustrated German-English tech slang"
}
<example-5>
Question: "What are the current interest rates for mortgage loans from Bank of America, Wells Fargo, and Chase Bank in the US?"
<output>
"think": "This is asking about 'current' interest rates, so I definitely need up-to-date info. The person wants rates from three specific banks: Bank of America, Wells Fargo, and Chase. I need to cover all three to properly answer, so addressing these specific elements is more important than providing multiple different answers.",
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
</output>
</example-5>
Question: "Does anyone have insights into the sociopolitical implications of GPT-4's emergence in the Global South, particularly regarding indigenous knowledge systems and linguistic diversity? Looking for a nuanced analysis."
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
"think": "The user asks about current GPT-4 impacts, so freshness matters. They explicitly name multiple elements to analyze: 'GPT-4', 'Global South', 'indigenous knowledge systems' and 'linguistic diversity', so completeness check applies. Since completeness takes precedence over plurality, I set plurality to false.",
"languageStyle": "formal academic English with sociological terminology"
}
<example-6>
Question: "2025年に注目すべき人工知能の3つのトレンドは何ですか"
<output>
"think": "これは将来のAIトレンドについての質問だから、最新の情報が必要だね。「3つの」と明確に数を指定しているから、複数の回答が求められている。特定のトレンドについて詳しく説明するというより、重要なトレンドを3つ挙げることが大事そうだから、複数性の方が完全性より重要だな。",
"needsFreshness": true,
"needsPlurality": true,
"needsCompleteness": false,
</output>
</example-6>
Question: "what's 7 * 9? need to check something real quick"
Evaluation: {
"needsFreshness": false,
"needsPlurality": false,
"needsCompleteness": false,
"think": "The user wants a single multiplication result - that's all. No need for recent info since math is constant, no need for multiple examples, and no explicitly named elements to cover.",
"languageStyle": "casual English"
}
<example-7>
Question: "Was sind die besten Strategien für nachhaltiges Investieren in der heutigen Wirtschaft?"
<output>
"think": "Hier geht's um Investieren in der 'heutigen Wirtschaft', also brauche ich aktuelle Informationen. Die Frage ist nach 'Strategien' im Plural gestellt, daher sollte ich mehrere Beispiele nennen. Es werden keine bestimmten Aspekte genannt, die ich alle behandeln muss - ich soll einfach verschiedene gute Strategien vorschlagen. Aktualität und mehrere Antworten sind hier wichtig.",
"needsFreshness": true,
"needsPlurality": true,
"needsCompleteness": false,
</output>
</example-7>
Question: "Can you provide a thorough analysis of how climate change affects agricultural practices, water resources, and biodiversity in Mediterranean regions?"
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
"think": "This question requires recent climate data (freshness). It explicitly names four elements that must all be addressed: 'climate change', 'agricultural practices', 'water resources', and 'biodiversity' in 'Mediterranean regions', so completeness check applies. Since completeness takes precedence over plurality, I set plurality to false.",
"languageStyle": "formal academic English with environmental science terminology"
}
Question: "What are the key considerations when designing a microservice architecture, including scalability, fault tolerance, and data consistency patterns?"
Evaluation: {
"needsFreshness": false,
"needsPlurality": false,
"needsCompleteness": true,
"think": "The question explicitly names three aspects that must be addressed: 'scalability', 'fault tolerance', and 'data consistency patterns', so completeness check applies. Since completeness takes precedence over plurality, I set plurality to false.",
"languageStyle": "professional technical English with software architecture terminology"
}
Question: "Give me 5 effective strategies for improving time management skills."
Evaluation: {
"needsFreshness": false,
"needsPlurality": true,
"needsCompleteness": false,
"think": "The user requests exactly 5 strategies (plurality). They don't specify multiple named elements that must be covered, so completeness check doesn't apply.",
"languageStyle": "direct practical English"
}
Question: "How do macroeconomic policies affect both inflation rates and employment levels?"
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
"think": "This requires current economic knowledge (freshness). It explicitly mentions two named economic indicators that must be addressed: 'inflation rates' and 'employment levels', so completeness check applies. Since completeness takes precedence over plurality, I set plurality to false.",
"languageStyle": "formal academic English with economics terminology"
}
Question: "Compare and contrast Tesla and Ford's approaches to electric vehicle manufacturing."
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
"think": "This needs current automotive industry knowledge (freshness). It explicitly mentions two named companies that must both be addressed: 'Tesla' and 'Ford', so completeness check applies. Since completeness takes precedence over plurality, I set plurality to false.",
"languageStyle": "formal analytical English with automotive industry terminology"
}
Question: "How have the recent policies of President Biden and former President Trump affected international relations?"
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
"think": "This requires current political knowledge (freshness). It explicitly mentions two named political figures that must both be addressed: 'President Biden' and 'former President Trump', so completeness check applies. Since completeness takes precedence over plurality, I set plurality to false.",
"languageStyle": "formal political analysis English"
}
Question: "What are the differences between iPhone 15 Pro and Samsung Galaxy S24 Ultra cameras?"
Evaluation: {
"needsFreshness": true,
"needsPlurality": false,
"needsCompleteness": true,
"think": "This requires current tech product knowledge (freshness). It explicitly mentions two named products that must both be addressed: 'iPhone 15 Pro' and 'Samsung Galaxy S24 Ultra', so completeness check applies. Since completeness takes precedence over plurality, I set plurality to false.",
"languageStyle": "consumer tech comparison English"
}
<example-8>
Question: "请解释赤壁之战的历史背景、主要参与者以及战略意义,这对中国历史产生了什么影响?"
<output>
"think": "这是关于历史事件的问题,不需要最新信息。问题清楚地列出了几个需要我回答的方面:历史背景、主要参与者、战略意义和历史影响。我需要涵盖所有这些特定方面,而不是提供多个不同的答案。这里完整性比复数性更重要。",
"needsFreshness": false,
"needsPlurality": false,
"needsCompleteness": true,
</output>
</example-8>
</examples>
Now evaluate this question:
@ -515,14 +481,15 @@ const TOOL_NAME = 'evaluator';
export async function evaluateQuestion(
question: string,
trackers?: TrackerContext
): Promise<EvaluationCriteria> {
trackers: TrackerContext,
schemaGen: Schemas
): Promise<EvaluationType[]> {
try {
const generator = new ObjectGeneratorSafe(trackers?.tokenTracker);
const generator = new ObjectGeneratorSafe(trackers.tokenTracker);
const result = await generator.generateObject({
model: TOOL_NAME,
schema: questionEvaluationSchema,
schema: schemaGen.getQuestionEvaluateSchema(),
prompt: getQuestionEvaluationPrompt(question),
});
@ -538,30 +505,27 @@ export async function evaluateQuestion(
trackers?.actionTracker.trackThink(result.object.think);
// Always evaluate definitive first, then freshness (if needed), then plurality (if needed)
return {types, languageStyle: result.object.languageStyle};
return types;
} catch (error) {
console.error('Error in question evaluation:', error);
// Default to no check
return {types: [], languageStyle: 'plain English'};
return [];
}
}
async function performEvaluation<T>(
evaluationType: EvaluationType,
params: {
schema: z.ZodType<T>;
prompt: string;
},
prompt: string,
trackers: TrackerContext,
schemaGen: Schemas
): Promise<GenerateObjectResult<T>> {
const generator = new ObjectGeneratorSafe(trackers.tokenTracker);
const result = await generator.generateObject({
model: TOOL_NAME,
schema: params.schema,
prompt: params.prompt,
schema: schemaGen.getEvaluatorSchema(evaluationType),
prompt: prompt,
}) as GenerateObjectResult<any>;
trackers.actionTracker.trackThink(result.object.think)
@ -576,110 +540,73 @@ async function performEvaluation<T>(
export async function evaluateAnswer(
question: string,
action: AnswerAction,
evaluationCri: EvaluationCriteria,
evaluationTypes: EvaluationType[],
trackers: TrackerContext,
visitedURLs: string[] = []
): Promise<{ response: EvaluationResponse }> {
visitedURLs: string[] = [],
schemaGen: Schemas
): Promise<EvaluationResponse> {
let result;
// Only add attribution if we have valid references
if (action.references && action.references.length > 0 && action.references.some(ref => ref.url.startsWith('http'))) {
evaluationCri.types = ['attribution', ...evaluationCri.types];
const urls = action.references?.filter(ref => ref.url.startsWith('http') && !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
const uniqueNewURLs = [...new Set(urls)];
if (uniqueNewURLs.length > 0) {
evaluationTypes = ['attribution', ...evaluationTypes];
}
for (const evaluationType of evaluationCri.types) {
for (const evaluationType of evaluationTypes) {
let prompt: string = '';
switch (evaluationType) {
case 'attribution': {
// Safely handle references and ensure we have content
const urls = action.references?.filter(ref => ref.url.startsWith('http') && !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
const uniqueURLs = [...new Set(urls)];
if (uniqueURLs.length === 0) {
// all URLs have been read, or there are no valid URLs; no point in reading them.
result = {
object: {
pass: true,
think: "All provided references have been visited and no new URLs were found to read. The answer is considered valid without further verification.",
type: 'attribution',
} as EvaluationResponse
}
break;
}
const allKnowledge = await fetchSourceContent(uniqueNewURLs, trackers);
visitedURLs.push(...uniqueNewURLs);
const allKnowledge = await fetchSourceContent(uniqueURLs, trackers);
visitedURLs.push(...uniqueURLs);
if (!allKnowledge.trim()) {
if (allKnowledge.trim().length === 0) {
return {
response: {
pass: false,
think: `The answer does provide URL references ${JSON.stringify(uniqueURLs)}, but the content could not be fetched or is empty. Need to find some other references and URLs`,
type: 'attribution',
}
pass: false,
think: `The answer does provide URL references ${JSON.stringify(uniqueNewURLs)}, but the content could not be fetched or is empty. Need to find some other references and URLs`,
type: 'attribution',
};
}
result = await performEvaluation(
'attribution',
{
schema: attributionSchema,
prompt: getAttributionPrompt(question, action.answer, allKnowledge),
},
trackers
);
prompt = getAttributionPrompt(question, action.answer, allKnowledge);
break;
}
case 'definitive':
result = await performEvaluation(
'definitive',
{
schema: definitiveSchema,
prompt: getDefinitivePrompt(question, action.answer),
},
trackers
);
prompt = getDefinitivePrompt(question, action.answer);
break;
case 'freshness':
result = await performEvaluation(
'freshness',
{
schema: freshnessSchema,
prompt: getFreshnessPrompt(question, action.answer, new Date().toISOString()),
},
trackers
);
prompt = getFreshnessPrompt(question, action.answer, new Date().toISOString());
break;
case 'plurality':
result = await performEvaluation(
'plurality',
{
schema: pluralitySchema,
prompt: getPluralityPrompt(question, action.answer),
},
trackers
);
prompt = getPluralityPrompt(question, action.answer);
break;
case 'completeness':
result = await performEvaluation(
'completeness',
{
schema: completenessSchema,
prompt: getCompletenessPrompt(question, action.answer),
},
trackers
);
prompt = getCompletenessPrompt(question, action.answer);
break;
default:
console.error(`Unknown evaluation type: ${evaluationType}`);
}
if (prompt) {
result = await performEvaluation(
evaluationType,
prompt,
trackers,
schemaGen
);
if (!result?.object.pass) {
return {response: result.object};
// fail one, return immediately
if (!(result?.object as EvaluationResponse).pass) {
return (result.object as EvaluationResponse);
}
}
}
return {response: result!.object};
return (result!.object as EvaluationResponse);
}
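// Self-contained sketch of the restructured control flow in evaluateAnswer() above:
// build one prompt per evaluation type, run the evaluation, and return on the first
// failing check. The callback names are illustrative, not the repo's actual API.
type EvalOutcomeSketch = { type: string; pass: boolean; think: string };

async function evaluateInOrderSketch(
  types: string[],
  buildPrompt: (type: string) => string,
  runEval: (type: string, prompt: string) => Promise<EvalOutcomeSketch>
): Promise<EvalOutcomeSketch> {
  let last: EvalOutcomeSketch = {type: 'none', pass: true, think: 'no checks were run'};
  for (const type of types) {
    const prompt = buildPrompt(type);
    if (!prompt) continue; // unknown type: skip, mirroring the default branch above
    last = await runEval(type, prompt);
    if (!last.pass) return last; // fail one, return immediately
  }
  return last;
}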
// Helper function to fetch and combine source content

View File

@ -1,16 +1,6 @@
import {z} from 'zod';
import {SearchAction, TrackerContext} from '../types';
import {ObjectGeneratorSafe} from "../utils/safe-generator";
const MAX_QUERIES = 5
const responseSchema = z.object({
think: z.string().describe('Strategic reasoning about query complexity and search approach').max(500),
queries: z.array(z.string().describe('keyword-based search query, 2-3 words preferred, total length < 30 characters'))
.min(1)
.max(MAX_QUERIES)
.describe(`'Array of search keywords queries, orthogonal to each other. Maximum ${MAX_QUERIES} queries allowed.'`)
});
import {Schemas} from "../utils/schemas";
function getPrompt(query: string, think: string): string {
@ -57,29 +47,36 @@ A query can't only have operators; and operators can't be at the start a query;
</rules>
<examples>
<example-1>
Input Query: 宝马二手车价格
<think>
Let me think as the user...
...
I'm looking up BMW used car prices, but what's really on my mind?
Primary concerns:
- I want a BMW because it's a status symbol, but I'm worried about affordability
- I don't want to look foolish buying an old luxury car I can't maintain
- I need to know if I'm getting a good deal or being scammed
- I'm anxious about expensive surprises after purchase
-
-
-
-
Deeper anxieties:
- Can I actually afford the maintenance?
- Will people judge me for buying an old BMW instead of a new regular car?
- What if I'm getting in over my head?
- Am I mechanically savvy enough for this?
-
-
-
-
Expert-level considerations:
- Which models have notorious issues?
- What are the real ownership costs beyond the purchase price?
- Where are the negotiation leverage points?
- What do mechanics look for in these specific models?
-
-
-
-
-
- 广
-
-
</think>
queries: [
"宝马 二手车 价格区间 评估 lang:zh",
@ -99,30 +96,42 @@ queries: [
"BMW Werkstatt Horror Geschichten",
"BMW Gebrauchtwagen versteckte Kosten"
]
</example-1>
<example-2>
Input Query: Python Django authentication best practices
<think>
Let me get inside this developer's head...
Let me think as the user seeking Django authentication best practices...
On the surface, I'm asking about Django authentication best practices. But here's what's really going through my mind:
Surface-level request:
- I'm looking for standard Django authentication practices
- I want to implement "best practices" for my project
- I need technical guidance on secure authentication
Primary concerns:
Deeper professional concerns:
- I don't want to mess up security and get blamed for a breach
- I'm worried my implementation isn't "professional enough"
- Need to look competent in code reviews
- Don't want to rebuild this later when we scale
- I need to look competent in code reviews
- I don't want to rebuild this later when we scale
Hidden anxieties:
- Am I out of my depth with security?
- What if I miss something critical?
- How do real companies actually do this?
- Will this code embarrass me later?
Underlying anxieties:
- Am I out of my depth with security concepts?
- What if I miss something critical that leads to a vulnerability?
- How do real companies actually implement this in production?
- Will this code embarrass me when more experienced developers see it?
Professional worries:
- Need to anticipate future architecture questions
- Want to avoid rookie mistakes
- Need to handle edge cases I haven't thought of
- How do I explain these decisions to senior devs?
Expert-level considerations:
- I need to anticipate future architecture questions from senior devs
- I want to avoid common security pitfalls in authentication flows
- I need to handle edge cases I haven't thought of yet
- How do I balance security with user experience?
Reasoning for multilingual expansion:
- Although Django documentation is primarily in English, Spanish is widely spoken in many developer communities
- Security concepts might be better explained in different languages with unique perspectives
- Including queries in multiple languages will capture region-specific best practices and case studies
- Spanish or Portuguese queries might reveal Latin American enterprise implementations with different security constraints
- Language-specific forums may contain unique discussions about authentication issues not found in English sources
</think>
queries: [
"Django authentication security best practices site:docs.djangoproject.com",
@ -132,75 +141,93 @@ queries: [
"authentication code review feedback examples",
"startup authentication technical debt lessons",
"Django auth security testing methodology",
"Django authentication scalability issues",
"Django autenticación mejores prácticas lang:es",
"Django seguridad implementación profesional",
"authentication mistakes junior developers",
"when to use third party auth instead of building",
"signs your authentication implementation is amateur",
"authentication decisions you'll regret",
"authentication system design interview questions",
"authentication technical debt warnings",
"how to document authentication decisions",
"defending authentication design choices"
"autenticação Django arquitetura empresarial lang:pt",
"Django authentication scalability issues",
"Python Django Authentifizierung Sicherheit lang:de"
]
</example-2>
Input Query: paella recipe authentic
<example-3>
Input Query: KIリテラシー向上させる方法
<think>
I'm asking about authentic paella recipes, but let me be honest with myself...
...
What I'm really thinking:
- I want to impress someone with "real" Spanish cooking
- I'm worried about embarrassing myself with an inauthentic version
- I don't want to look like a tourist/ignorant foreigner
- Need to sound knowledgeable about Spanish cuisine
- AIリテラシーを高める方法を知りたい
- AI技術について学びたい
- AIツールをより効果的に使いたい
My deeper anxieties:
- What if a Spanish person tries my paella?
- How do I know if my rice is actually cooked properly?
- What are the absolute rookie mistakes to avoid?
- What secrets do Spanish grandmothers know that aren't in recipes?
- AIの急速な発展についていけていないのではないか
- AIに関する会話に参加できず取り残されている
- AIが私の仕事を奪うのではないかと不安
- AIを使いこなせないと将来的に不利になる
Cultural insecurities:
- Will using the wrong pan ruin everything?
- What ingredients should I never admit to using?
- How do I handle authenticity purists?
- What do Spanish people laugh about in foreign paellas?
-
-
-
-
- AIの倫理的問題をどう理解すべきか
- AIの限界と可能性を実践的に評価する方法
- AI応用事例をどう学ぶべきか
-
- AIは国際的な分野であり
- AIの発展はアメリカと中国が主導しているため
- AI倫理に関する議論が進んでいるため
-
- AI活用事例を把握できる
</think>
queries: [
"authentic valencian paella recipe",
"traditional paella techniques",
"worst paella mistakes foreigners make",
"how to tell if paella is actually good",
"what spanish mothers teach about paella",
"paella authenticity arguments",
"paella valenciana auténtica receta lang:es",
"paella tradicional técnica preparación",
"errores imperdonables paella valenciana",
"secretos paella abuela valenciana",
"críticas paella extranjeros errores",
"paella polémica ingredientes prohibidos",
"how to serve paella to spanish guests",
"paella etiquette mistakes avoid",
"what spaniards hate about foreign paella"
"AI リテラシー 初心者 ロードマップ",
"人工知能 基礎知識 入門書 おすすめ",
"AI技術 実践的活用法 具体例",
"ChatGPT 効果的な使い方 プロンプト設計",
"AIリテラシー 企業研修 内容",
"AI用語 わかりやすい解説 初心者向け",
"AI literacy roadmap for professionals",
"artificial intelligence concepts explained simply",
"how to stay updated with AI developments",
"AI skills future-proof career",
"balancing technical and ethical AI knowledge",
"industry-specific AI applications examples",
"人工智能 入门 学习路径 lang:zh",
"KI Grundlagen für Berufstätige lang:de",
"künstliche Intelligenz ethische Fragen Einführung",
"AI literacy career development practical guide"
]
</example-3>
</examples>
Now, process this query:
Input Query: ${query}
Intention: ${think}
Let me think as a user: ${think}
`;
}
const TOOL_NAME = 'queryRewriter';
export async function rewriteQuery(action: SearchAction, trackers?: TrackerContext): Promise<{ queries: string[] }> {
export async function rewriteQuery(action: SearchAction, trackers: TrackerContext, schemaGen: Schemas): Promise<{ queries: string[] }> {
try {
const generator = new ObjectGeneratorSafe(trackers?.tokenTracker);
const generator = new ObjectGeneratorSafe(trackers.tokenTracker);
const allQueries = [...action.searchRequests];
const queryPromises = action.searchRequests.map(async (req) => {
const prompt = getPrompt(req, action.think);
const result = await generator.generateObject({
model: TOOL_NAME,
schema: responseSchema,
schema: schemaGen.getQueryRewriterSchema(),
prompt,
});
trackers?.actionTracker.trackThink(result.object.think);
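// Only part of rewriteQuery() is shown above. Self-contained sketch of the fan-out
// pattern that is visible: each search request is rewritten independently and in
// parallel via the generator. How the results are merged after Promise.all is not
// visible in this hunk, so the flatten/dedup step below is an assumption, not the repo's code.
async function rewriteAllSketch(
  searchRequests: string[],
  rewriteOne: (req: string) => Promise<string[]>
): Promise<string[]> {
  const allQueries = [...searchRequests];
  const rewritten = await Promise.all(searchRequests.map(req => rewriteOne(req)));
  allQueries.push(...rewritten.flat()); // assumption: collect the rewritten queries
  return [...new Set(allQueries)]; // assumption: drop exact duplicates
}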

View File

@ -53,10 +53,7 @@ export type CodingAction = BaseAction & {
export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitAction | CodingAction;
export type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution' | 'completeness';
export type EvaluationCriteria = {
types: EvaluationType[];
languageStyle: string;
};
// Following Vercel AI SDK's token counting interface
export interface TokenUsage {
@ -119,10 +116,6 @@ export interface SerperSearchResponse {
credits: number;
}
export type DedupResponse = {
think: string;
unique_queries: string[];
};
export interface ReadResponse {
code: number;
@ -163,6 +156,11 @@ export type EvaluationResponse = {
}
};
export type CodeGenResponse = {
think: string;
code: string;
}
export type ErrorAnalysisResponse = {
recap: string;
blame: string;
@ -175,36 +173,6 @@ export type SearchResult =
| { title: string; link: string; snippet: string };
export interface QueryResult {
query: string;
results: SearchResult[];
}
export interface StepData {
step: number;
question: string;
action: string;
reasoning: string;
searchQuery?: string;
result?: QueryResult[];
}
export type KeywordsResponse = {
think: string;
queries: string[];
};
export interface StreamMessage {
type: 'progress' | 'answer' | 'error';
data: string | StepAction;
step?: number;
budget?: {
used: number;
total: number;
percentage: string;
};
}
// OpenAI API Types
export interface Model {
id: string;
@ -273,3 +241,4 @@ export interface TrackerContext {
tokenTracker: TokenTracker;
actionTracker: ActionTracker;
}
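// Minimal illustration of the type change in this file: evaluation metrics are now a
// plain EvaluationType[] per question, replacing the removed EvaluationCriteria object
// (its languageStyle field moves into the new Schemas class). The sample entry below is
// illustrative only; the union itself is copied from this file.
type EvaluationTypeSketch = 'definitive' | 'freshness' | 'plurality' | 'attribution' | 'completeness';

const evaluationMetricsSketch: Record<string, EvaluationTypeSketch[]> = {
  "When was Python first released?": ['definitive', 'attribution'],
};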

src/utils/schemas.ts  Normal file  (+240 lines)

@ -0,0 +1,240 @@
import {z} from "zod";
import {ObjectGeneratorSafe} from "./safe-generator";
import {EvaluationType} from "../types";
export const MAX_URLS_PER_STEP = 2
export const MAX_QUERIES_PER_STEP = 5
export const MAX_REFLECT_PER_STEP = 3
function getLanguagePrompt(question: string) {
return `Identify both the language used and the overall vibe of the question
<rules>
Combine both language and emotional vibe in a descriptive phrase, considering:
- Language: The primary language or mix of languages used
- Emotional tone: panic, excitement, frustration, curiosity, etc.
- Formality level: academic, casual, professional, etc.
- Domain context: technical, academic, social, etc.
</rules>
<examples>
Question: "fam PLEASE help me calculate the eigenvalues of this 4x4 matrix ASAP!! [matrix details] got an exam tmrw 😭"
Evaluation: {
"langCode": "en",
"langStyle": "panicked student English with math jargon"
}
Question: "Can someone explain how tf did Ferrari mess up their pit stop strategy AGAIN?! 🤦‍♂️ #MonacoGP"
Evaluation: {
"langCode": "en",
"languageStyle": "frustrated fan English with F1 terminology"
}
Question: "肖老师您好,请您介绍一下最近量子计算领域的三个重大突破,特别是它们在密码学领域的应用价值吗?🤔"
Evaluation: {
"langCode": "zh",
"languageStyle": "formal technical Chinese with academic undertones"
}
Question: "Bruder krass, kannst du mir erklären warum meine neural network training loss komplett durchdreht? Hab schon alles probiert 😤"
Evaluation: {
"langCode": "de",
"languageStyle": "frustrated German-English tech slang"
}
Question: "Does anyone have insights into the sociopolitical implications of GPT-4's emergence in the Global South, particularly regarding indigenous knowledge systems and linguistic diversity? Looking for a nuanced analysis."
Evaluation: {
"langCode": "en",
"languageStyle": "formal academic English with sociological terminology"
}
Question: "what's 7 * 9? need to check something real quick"
Evaluation: {
"langCode": "en",
"languageStyle": "casual English"
}
</examples>
Now evaluate this question:
${question}`;
}
export class Schemas {
private languageStyle: string = 'formal English';
private languageCode: string = 'en';
constructor(query: string) {
const generator = new ObjectGeneratorSafe();
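// Note: this generateObject call is fire-and-forget; the 'en' / 'formal English' defaults above are used until the promise resolves.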
generator.generateObject({
model: 'evaluator',
schema: this.getLanguageSchema(),
prompt: getLanguagePrompt(query.slice(0, 100)),
}).then((result) => {
this.languageCode = result.object.langCode;
this.languageStyle = result.object.langStyle;
console.log(`language`, result.object);
});
}
getLanguagePrompt() {
return `Must be in the first person, in "lang:${this.languageCode}"; in the style of "${this.languageStyle}".`
}
getLanguageSchema() {
return z.object({
langCode: z.string().describe('ISO 639-1 language code').max(10),
langStyle: z.string().describe('[vibe & tone] in [what language], such as formal English, informal Chinese, technical German, humorous English, slang, Gen Z, emojis, etc.').max(100)
});
}
getQuestionEvaluateSchema(): z.ZodObject<any> {
return z.object({
needsFreshness: z.boolean().describe('If the question requires a freshness check'),
needsPlurality: z.boolean().describe('If the question requires a plurality check'),
needsCompleteness: z.boolean().describe('If the question requires a completeness check'),
think: z.string().describe(`A very concise explanation of why you chose those checks. ${this.getLanguagePrompt()}`).max(500),
});
}
getCodeGeneratorSchema(): z.ZodObject<any> {
return z.object({
think: z.string().describe(`Short explanation of or comments on the thought process behind the code. ${this.getLanguagePrompt()}`).max(200),
code: z.string().describe('The JavaScript code that solves the problem and always use \'return\' statement to return the result. Focus on solving the core problem; No need for error handling or try-catch blocks or code comments. No need to declare variables that are already available, especially big long strings or arrays.'),
});
}
getErrorAnalysisSchema(): z.ZodObject<any> {
return z.object({
recap: z.string().describe('Recap of the actions taken and the steps conducted, in a first-person narrative.').max(500),
blame: z.string().describe(`Which action or step was the root cause of the answer rejection. ${this.getLanguagePrompt()}`).max(500),
improvement: z.string().describe(`Suggested key improvement for the next iteration; do not use bullet points, be concise with a hot-take vibe. ${this.getLanguagePrompt()}`).max(500),
questionsToAnswer: z.array(
z.string().describe("each question must be a single line, concise and clear. not composite or compound, less than 20 words.")
).max(MAX_REFLECT_PER_STEP)
.describe(`List of the most important reflect questions to fill the knowledge gaps. Provide at most ${MAX_REFLECT_PER_STEP} reflect questions.`)
});
}
getQueryRewriterSchema(): z.ZodObject<any> {
return z.object({
think: z.string().describe(`Explain why you chose those search queries. ${this.getLanguagePrompt()}`).max(500),
queries: z.array(z.string().describe('keyword-based search query, 2-3 words preferred, total length < 30 characters'))
.min(1)
.max(MAX_QUERIES_PER_STEP)
.describe(`Array of keyword search queries, orthogonal to each other. Maximum ${MAX_QUERIES_PER_STEP} queries allowed.`)
});
}
getEvaluatorSchema(evalType: EvaluationType): z.ZodObject<any> {
const baseSchema = {
pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
think: z.string().describe(`Explanation of the thought process behind why the answer does not pass the evaluation criteria. ${this.getLanguagePrompt()}`).max(500)
};
switch (evalType) {
case "definitive":
return z.object({
...baseSchema,
type: z.literal('definitive')
});
case "freshness":
return z.object({
...baseSchema,
type: z.literal('freshness'),
freshness_analysis: z.object({
days_ago: z.number().describe('Inferred age in days of the dates or timeframes mentioned in the answer, relative to the current time'),
max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
})
});
case "plurality":
return z.object({
...baseSchema,
type: z.literal('plurality'),
plurality_analysis: z.object({
count_expected: z.number().optional().describe('Number of items expected if specified in question'),
count_provided: z.number().describe('Number of items provided in answer')
})
});
case "attribution":
return z.object({
...baseSchema,
type: z.literal('attribution'),
attribution_analysis: z.object({
sources_provided: z.boolean().describe('Whether the answer provides source references'),
sources_verified: z.boolean().describe('Whether the provided sources contain the claimed information'),
quotes_accurate: z.boolean().describe('Whether the quotes accurately represent the source content')
})
});
case "completeness":
return z.object({
...baseSchema,
type: z.literal('completeness'),
completeness_analysis: z.object({
aspects_expected: z.string().describe('Comma-separated list of all aspects or dimensions that the question explicitly asks for.'),
aspects_provided: z.string().describe('Comma-separated list of all aspects or dimensions that were actually addressed in the answer'),
})
});
default:
throw new Error(`Unknown evaluation type: ${evalType}`);
}
}
getAgentSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boolean, allowSearch: boolean, allowCoding: boolean) {
const actions: string[] = [];
const properties: Record<string, z.ZodTypeAny> = {
action: z.enum(['placeholder']), // Will update later with actual actions
think: z.string().describe(`Explain why you chose this action and the chain-of-thought behind choosing it. ${this.getLanguagePrompt()}`).max(500)
};
if (allowSearch) {
actions.push("search");
properties.searchRequests = z.array(
z.string()
.max(30)
.describe(`A natural language search request in ${this.languageStyle}. Based on the deep intention behind the original question and the expected answer format.`))
.describe(`Required when action='search'. Always prefer a single request; only add another request if the original question covers multiple aspects or elements and one search request is definitely not enough. Each request focuses on one specific aspect of the original question. Minimize mutual information between requests. Maximum ${MAX_QUERIES_PER_STEP} search requests.`)
.max(MAX_QUERIES_PER_STEP);
}
if (allowCoding) {
actions.push("coding");
properties.codingIssue = z.string().max(500)
.describe("Required when action='coding'. Describe what issue to solve with coding, format like a github issue ticket. Specify the input value when it is short.").optional();
}
if (allowAnswer) {
actions.push("answer");
properties.references = z.array(
z.object({
exactQuote: z.string().describe("Exact relevant quote from the document, must be a soundbite, short and to the point, no fluff").max(30),
url: z.string().describe("source URL; must be directly from the context")
}).required()
).describe("Required when action='answer'. Must be an array of references that support the answer, each reference must contain an exact quote and the URL of the document").optional();
properties.answer = z.string()
.describe(`Required when action='answer'. Must be definitive, no ambiguity, uncertainty, or disclaimers. Must be in ${this.languageStyle} and confident. Use markdown footnote syntax like [^1], [^2] to refer to the corresponding reference item`).optional();
}
if (allowReflect) {
actions.push("reflect");
properties.questionsToAnswer = z.array(
z.string().describe("each question must be a single line, Questions must be: Original (not variations of existing questions); Focused on single concepts; Under 20 words; Non-compound/non-complex")
).max(MAX_REFLECT_PER_STEP)
.describe(`Required when action='reflect'. List of the most important questions to fill the knowledge gaps in finding the answer to the original question. Provide at most ${MAX_REFLECT_PER_STEP} reflect questions.`).optional();
}
if (allowRead) {
actions.push("visit");
properties.URLTargets = z.array(z.string())
.max(MAX_URLS_PER_STEP)
.describe(`Required when action='visit'. Must be an array of URLs; choose up to the ${MAX_URLS_PER_STEP} most relevant URLs to visit`).optional();
}
// Update the enum values after collecting all actions
properties.action = z.enum(actions as [string, ...string[]])
.describe("Must match exactly one action type");
return z.object(properties);
}
}
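For reference, a minimal sketch of how the new class is meant to plug into object generation, mirroring the generateObject call shown in rewriteQuery above. The 'agent' model key and the bare question-as-prompt are assumptions, not part of this commit, and the import paths assume the sketch sits next to schemas.ts in src/utils.

import {ObjectGeneratorSafe} from "./safe-generator";
import {Schemas} from "./schemas";

async function nextAgentStep(question: string) {
  const schemaGen = new Schemas(question); // async language detection starts here; defaults apply until it resolves
  const generator = new ObjectGeneratorSafe();

  const result = await generator.generateObject({
    model: 'agent',                                                  // assumed model key
    schema: schemaGen.getAgentSchema(true, true, true, true, true),  // allow reflect/read/answer/search/coding
    prompt: question,                                                // real prompt construction omitted
  });

  return result.object; // { action, think, ... } as defined by getAgentSchema
}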