From f48c84d207e5e62d2eb5d5e13b20f5c9a6d6e1ac Mon Sep 17 00:00:00 2001
From: Han Xiao <han.xiao@jina.ai>
Date: Wed, 5 Feb 2025 11:41:29 +0800
Subject: [PATCH] fix: use xml prompt

---
 src/agent.ts       | 148 ++++++++++++++++++++++++++++++++-------------
 src/tools/dedup.ts | 104 ++++++++++++-------------------
 2 files changed, 144 insertions(+), 108 deletions(-)
diff --git a/src/agent.ts b/src/agent.ts
index 3abcced..549d0fa 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -111,7 +111,7 @@ function getPrompt(
   allowRead: boolean = true,
   allowSearch: boolean = true,
   badContext?: { question: string, answer: string, evaluation: string, recap: string; blame: string; improvement: string; }[],
-  knowledge?: { question: string; answer: string; }[],
+  knowledge?: { question: string; answer: string; references: any[]}[],
   allURLs?: Record<string, string>,
   beastMode?: boolean
 ): string {
@@ -122,49 +122,78 @@ function getPrompt(
 
 You are an advanced AI research analyst specializing in multi-step reasoning. Using your training data and prior lessons learned, answer the following question with absolute certainty:
 
-## Question
-${question}`);
+<question>
+${question}
+</question>
+`);
 
   // Add context section if exists
   if (context?.length) {
-    sections.push(`## Context
+    sections.push(`
+<context>
 You have conducted the following actions:
 
-${context.join('\n')}`);
+${context.join('\n')}
+
+</context>
+`);
   }
 
   // Add knowledge section if exists
   if (knowledge?.length) {
     const knowledgeItems = knowledge
-      .map((k, i) => `### Knowledge ${i + 1}: ${k.question}\n${k.answer}`)
+      .map((k, i) => `
+<knowledge-${i + 1}>
+<question>
+${k.question}
+</question>
+<answer>
+${k.answer}
+</answer>
+<references>
+${JSON.stringify(k.references)}
+</references>
+</knowledge-${i + 1}>
+`)
       .join('\n\n');
 
-    sections.push(`## Knowledge
+    sections.push(`
+<knowledge>
 You have successfully gathered some knowledge which might be useful for answering the original question. Here is the knowledge you have gathered so far
 
-${knowledgeItems}`);
+${knowledgeItems}
+
+</knowledge>
+`);
   }
 
   // Add bad context section if exists
   if (badContext?.length) {
     const attempts = badContext
-      .map((c, i) => `### Attempt ${i + 1}
+      .map((c, i) => `
+<attempt-${i + 1}>
 - Question: ${c.question}
 - Answer: ${c.answer}
 - Reject Reason: ${c.evaluation}
 - Actions Recap: ${c.recap}
-- Actions Blame: ${c.blame}`)
+- Actions Blame: ${c.blame}
+</attempt-${i + 1}>
+`)
       .join('\n\n');
 
     const learnedStrategy = badContext.map(c => c.improvement).join('\n');
 
-    sections.push(`## Unsuccessful Attempts
+    sections.push(`
+<bad-attempts>    
 Your have tried the following actions but failed to find the answer to the question.
 
 ${attempts}
 
-## Learned Strategy
+</bad-attempts>
+
+<learned-strategy>
 ${learnedStrategy}
+</learned-strategy>
 `);
   }
 
@@ -176,50 +205,69 @@ ${learnedStrategy}
       .map(([url, desc]) => `  + "${url}": "${desc}"`)
       .join('\n');
 
-    actions.push(`**visit**:
+    actions.push(`
+<action-visit>    
 - Visit any URLs from below to gather external knowledge, choose the most relevant URLs that might contain the answer
+<url-list>
 ${urlList}
+</url-list>
 - When you have enough search result in the context and want to deep dive into specific URLs
-- It allows you to access the full content behind any URLs`);
+- It allows you to access the full content behind any URLs
+
+</action-visit>
+`);
   }
 
   if (allowSearch) {
-    actions.push(`**search**:
+    actions.push(`
+<action-search>    
 - Query external sources using a public search engine
 - Focus on solving one specific aspect of the question
-- Only give keywords search query, not full sentences`);
+- Only give keywords search query, not full sentences
+</action-search>
+`);
   }
 
   if (allowAnswer) {
-    actions.push(`**answer**:
+    actions.push(`
+<action-answer>
 - Provide final response only when 100% certain
-- Responses must be definitive (no ambiguity, uncertainty, or disclaimers)${allowReflect ? '\n- If doubts remain, use "reflect" instead' : ''}`);
+- Responses must be definitive (no ambiguity, uncertainty, or disclaimers)${allowReflect ? '\n- If doubts remain, use <action-reflect> instead' : ''}
+</action-answer>
+`);
   }
 
   if (beastMode) {
-   actions.push(`**answer**:
-- You have gathered enough information to answer the question; they may not be perfect, but this is your very last chance to answer the question.
-- Try the best of the best reasoning ability, investigate every details in the context and provide the best answer you can think of.
-- When uncertain, educated guess is allowed and encouraged, but make sure it is based on the context and knowledge you have gathered.
-- Responses must be definitive (no ambiguity, uncertainty, or disclaimers`);
+    actions.push(`
+<action-answer>
+- Any answer is better than no answer
+- Partial answers are allowed, but make sure they are based on the context and knowledge you have gathered    
+- When uncertain, educated guess based on the context and knowledge is allowed and encouraged.
+- Responses must be definitive (no ambiguity, uncertainty, or disclaimers)
+</action-answer>
+`);
   }
 
   if (allowReflect) {
-    actions.push(`**reflect**:
+    actions.push(`
+<action-reflect>    
 - Perform critical analysis through hypothetical scenarios or systematic breakdowns
 - Identify knowledge gaps and formulate essential clarifying questions
 - Questions must be:
   - Original (not variations of existing questions)
   - Focused on single concepts
   - Under 20 words
-  - Non-compound/non-complex`);
+  - Non-compound/non-complex
+</action-reflect>
+`);
   }
 
-  sections.push(`## Actions
-
+  sections.push(`
+<actions>
 Based on the current context, you must choose one of the following actions:
-
-${actions.join('\n\n')}`);
+${actions.join('\n\n')}
+</actions>
+`);
 
   // Add footer
   sections.push(`Respond exclusively in valid JSON format matching exact JSON schema.
@@ -243,6 +291,10 @@ function removeAllLineBreaks(text: string) {
   return text.replace(/(\r\n|\n|\r)/gm, " ");
 }
 
+function removeHTMLtags(text: string) { // Returns `text` with well-formed HTML/XML tags (e.g. <strong>, </a>, <br/>) removed; all other characters are kept.
+  return text.replace(/<\/?[a-zA-Z][^<>]*>/g, ''); // a tag must start with a letter and end with ">", so a stray "<" (as in "a < b") is left intact instead of swallowing the text up to the next ">"
+}
+
 export async function getResponse(question: string, tokenBudget: number = 1_000_000,
                                   maxBadAttempts: number = 3,
                                   existingContext?: Partial<TrackerContext>): Promise<{ result: StepAction; context: TrackerContext }> {
@@ -250,7 +302,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
     tokenTracker: existingContext?.tokenTracker || new TokenTracker(tokenBudget),
     actionTracker: existingContext?.actionTracker || new ActionTracker()
   };
-  context.actionTracker.trackAction({ gaps: [question], totalStep: 0, badAttempts: 0 });
+  context.actionTracker.trackAction({gaps: [question], totalStep: 0, badAttempts: 0});
   let step = 0;
   let totalStep = 0;
   let badAttempts = 0;
@@ -275,7 +327,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
     await sleep(STEP_SLEEP);
     step++;
     totalStep++;
-    context.actionTracker.trackAction({ totalStep, thisStep, gaps, badAttempts });
+    context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
     const budgetPercentage = (context.tokenTracker.getTotalUsage() / tokenBudget * 100).toFixed(2);
     console.log(`Step ${totalStep} / Budget used ${budgetPercentage}%`);
     console.log('Gaps:', gaps);
@@ -298,7 +350,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
       allKnowledge,
       allURLs,
       false
-      );
+    );
 
     const model = genAI.getGenerativeModel({
       model: modelConfigs.agent.model,
@@ -439,15 +491,14 @@ Although you solved a sub-question, you still need to find the answer to the ori
         allKnowledge.push({
           question: currentQuestion,
           answer: thisStep.answer,
+          references: thisStep.references,
           type: 'qa'
         });
       }
     } else if (thisStep.action === 'reflect' && thisStep.questionsToAnswer) {
       let newGapQuestions = thisStep.questionsToAnswer
       const oldQuestions = newGapQuestions;
-      if (allQuestions.length) {
-        newGapQuestions = (await dedupQueries(newGapQuestions, allQuestions)).unique_queries;
-      }
+      newGapQuestions = (await dedupQueries(newGapQuestions, allQuestions)).unique_queries;
       if (newGapQuestions.length > 0) {
         // found new gap questions
         diaryContext.push(`
@@ -479,10 +530,9 @@ But then you realized you have asked them before. You decided to to think out of
 
       const oldKeywords = keywordsQueries;
       // avoid exisitng searched queries
-      if (allKeywords.length) {
-        const {unique_queries: dedupedQueries} = await dedupQueries(keywordsQueries, allKeywords);
-        keywordsQueries = dedupedQueries;
-      }
+      const {unique_queries: dedupedQueries} = await dedupQueries(keywordsQueries, allKeywords);
+      keywordsQueries = dedupedQueries;
+
       if (keywordsQueries.length > 0) {
         const searchResults = [];
         for (const query of keywordsQueries) {
@@ -508,6 +558,14 @@ But then you realized you have asked them before. You decided to to think out of
             url: r.url,
             description: r.description,
           }));
+
+          allKnowledge.push({
+            question: `What do Internet say about ${query}?`,
+            answer: removeHTMLtags(minResults.map(r => `${r.description}`).join('; ')),
+            references: minResults.map(r => r.url),
+            type: 'side-info'
+          });
+
           for (const r of minResults) {
             allURLs[r.url] = r.title;
           }
@@ -559,6 +617,7 @@ You decided to think out of the box or cut from a completely different angle.
             allKnowledge.push({
               question: `What is in ${response.data?.url || 'the URL'}?`,
               answer: removeAllLineBreaks(response.data?.content || 'No content available'),
+              references: [response.data?.url],
               type: 'url'
             });
             visitedURLs.push(url);
@@ -602,7 +661,7 @@ You decided to think out of the box or cut from a completely different angle.`);
   totalStep++;
   await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
   if (isAnswered) {
-    return { result: thisStep, context };
+    return {result: thisStep, context};
   } else {
     console.log('Enter Beast mode!!!')
     const prompt = getPrompt(
@@ -617,7 +676,7 @@ You decided to think out of the box or cut from a completely different angle.`);
       allKnowledge,
       allURLs,
       true
-      );
+    );
 
     const model = genAI.getGenerativeModel({
       model: modelConfigs.agentBeastMode.model,
@@ -636,7 +695,7 @@ You decided to think out of the box or cut from a completely different angle.`);
     await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
     thisStep = JSON.parse(response.text());
     console.log(thisStep)
-    return { result: thisStep, context };
+    return {result: thisStep, context};
   }
 }
 
@@ -658,7 +717,10 @@ const genAI = new GoogleGenerativeAI(GEMINI_API_KEY);
 
 export async function main() {
   const question = process.argv[2] || "";
-  const { result: finalStep, context: tracker } = await getResponse(question) as { result: AnswerAction; context: TrackerContext };
+  const {
+    result: finalStep,
+    context: tracker
+  } = await getResponse(question) as { result: AnswerAction; context: TrackerContext };
   console.log('Final Answer:', finalStep.answer);
 
   tracker.tokenTracker.printSummary();
diff --git a/src/tools/dedup.ts b/src/tools/dedup.ts
index 3d6d644..8fa6cf7 100644
--- a/src/tools/dedup.ts
+++ b/src/tools/dedup.ts
@@ -33,81 +33,55 @@ const model = genAI.getGenerativeModel({
 });
 
 function getPrompt(newQueries: string[], existingQueries: string[]): string {
-  return `You are an expert in semantic similarity analysis. Given a set of new queries (A) and existing queries (B), identify which queries from set A are semantically unique when compared BOTH to other queries within A AND to queries in set B.
+  return `You are an expert in semantic similarity analysis. Given a set of queries (setA) and a set of queries (setB)
 
-Core Rules:
+<rules>
+Function FilterSetA(setA, setB, threshold):
+    filteredA = empty set
+    
+    for each candidateQuery in setA:
+        isValid = true
+        
+        // Check similarity with already accepted queries in filteredA
+        for each acceptedQuery in filteredA:
+            similarity = calculateSimilarity(candidateQuery, acceptedQuery)
+            if similarity >= threshold:
+                isValid = false
+                break
+        
+        // If passed first check, compare with set B
+        if isValid:
+            for each queryB in setB:
+                similarity = calculateSimilarity(candidateQuery, queryB)
+                if similarity >= threshold:
+                    isValid = false
+                    break
+        
+        // If passed all checks, add to filtered set
+        if isValid:
+            add candidateQuery to filteredA
+    
+    return filteredA
+</rules>    
+
+<similarity-definition>
 1. Consider semantic meaning and query intent, not just lexical similarity
 2. Account for different phrasings of the same information need
-3. A query is considered duplicate ONLY if:
-   - It has identical base keywords AND identical operators to another query in set A
-   - OR it has identical base keywords AND identical operators to a query in set B
-4. Queries with same base keywords but different operators are NOT duplicates
-5. Different aspects or perspectives of the same topic are not duplicates
-6. Consider query specificity - a more specific query is not a duplicate of a general one
-7. Search operators that make queries behave differently:
+3. Queries with same base keywords but different operators are NOT duplicates
+4. Different aspects or perspectives of the same topic are not duplicates
+5. Consider query specificity - a more specific query is not a duplicate of a general one
+6. Search operators that make queries behave differently:
    - Different site: filters (e.g., site:youtube.com vs site:github.com)
    - Different file types (e.g., filetype:pdf vs filetype:doc)
    - Different language/location filters (e.g., lang:en vs lang:es)
    - Different exact match phrases (e.g., "exact phrase" vs no quotes)
    - Different inclusion/exclusion (+/- operators)
    - Different title/body filters (intitle: vs inbody:)
+</similarity-definition>
 
-Examples:
-
-Set A: [
-  "python tutorial site:youtube.com",
-  "python tutorial site:udemy.com",
-  "python tutorial filetype:pdf",
-  "best restaurants brooklyn",
-  "best restaurants brooklyn site:yelp.com",
-  "python tutorial site:youtube.com -beginner"
-]
-Set B: [
-  "python programming guide",
-  "brooklyn dining recommendations"
-]
-Thought: Let's analyze each query in set A considering operators:
-1. First query targets YouTube tutorials - unique
-2. Second query targets Udemy - different site operator, so unique
-3. Third query targets PDF files - different filetype operator, so unique
-4. Fourth query is basic restaurant search - unique
-5. Fifth query adds Yelp filter - different site operator, so unique
-6. Sixth query has same site as first but adds exclusion - different operator combo, so unique
-None of the queries in set B have matching operators, so they don't cause duplicates.
-Unique Queries: [
-  "python tutorial site:youtube.com",
-  "python tutorial site:udemy.com",
-  "python tutorial filetype:pdf",
-  "best restaurants brooklyn",
-  "best restaurants brooklyn site:yelp.com",
-  "python tutorial site:youtube.com -beginner"
-]
-
-Set A: [
-  "machine learning +tensorflow filetype:pdf",
-  "machine learning +pytorch filetype:pdf",
-  "machine learning tutorial lang:en",
-  "machine learning tutorial lang:es"
-]
-Set B: [
-  "machine learning guide"
-]
-Thought: Analyzing queries with attention to operators:
-1. First query specifies tensorflow PDFs - unique
-2. Second query targets pytorch PDFs - different inclusion operator, so unique
-3. Third query targets English content - unique due to language filter
-4. Fourth query targets Spanish content - different language filter, so unique
-The query in set B has no operators and different base terms, so it doesn't affect our decisions.
-Unique Queries: [
-  "machine learning +tensorflow filetype:pdf",
-  "machine learning +pytorch filetype:pdf",
-  "machine learning tutorial lang:en",
-  "machine learning tutorial lang:es"
-]
-
-Now, analyze these sets:
-Set A: ${JSON.stringify(newQueries)}
-Set B: ${JSON.stringify(existingQueries)}`;
+Now, run FilterSetA on the following:
+SetA: ${JSON.stringify(newQueries)}
+SetB: ${JSON.stringify(existingQueries)}`; // Prompt = FilterSetA pseudo-code + similarity definition + JSON-serialized newQueries (SetA) and existingQueries (SetB). NOTE(review): the prompt never gives `threshold` a value and its opening sentence is left unfinished - confirm this is intended.
 }
 
 export async function dedupQueries(newQueries: string[], existingQueries: string[], tracker?: TokenTracker): Promise<{ unique_queries: string[], tokens: number }> {