chore: first commit

2025-12-26 06:28:56 +08:00 · 2025-02-02 18:47:29 +08:00 · 2025-02-02 18:47:29 +08:00 · 8c8484593f
commit 8c8484593f
parent f1b4d2681e
2 changed files with 35 additions and 35 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -351,12 +351,10 @@ ${evaluation.reasoning}

 Your journey ends here.
 `);
-          console.log('Final Answer:', thisStep.answer);
-          tokenTracker.printSummary();
          await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
          return thisStep;
        }
-        if (evaluation.is_valid_answer) {
+        if (evaluation.is_definitive) {
          if (thisStep.references.length > 0 || Object.keys(allURLs).length === 0) {
            // EXIT POINT OF THE PROGRAM!!!!
            diaryContext.push(`
@@ -373,8 +371,6 @@ ${evaluation.reasoning}

 Your journey ends here. You have successfully answered the original question. Congratulations! 🎉
 `);
-            console.log('Final Answer:', thisStep.answer);
-            tokenTracker.printSummary();
            await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
            return thisStep;
          } else {
@@ -391,6 +387,9 @@ Unfortunately, you did not provide any references to support your answer.
 You need to find more URL references to support your answer.`);
          }

+          await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
+          return thisStep;
+
        } else {
          diaryContext.push(`
 At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
@@ -418,7 +417,7 @@ ${evaluation.reasoning}
          diaryContext = [];
          step = 0;
        }
-      } else if (evaluation.is_valid_answer) {
+      } else if (evaluation.is_definitive) {
        diaryContext.push(`
 At step ${step}, you took **answer** action. You found a good answer to the sub-question:

@@ -436,7 +435,8 @@ Although you solved a sub-question, you still need to find the answer to the ori
        allKnowledge.push({
          question: currentQuestion,
          answer: thisStep.answer,
-          type: 'qa'});
+          type: 'qa'
+        });
      }
    } else if (thisStep.action === 'reflect' && thisStep.questionsToAnswer) {
      let newGapQuestions = thisStep.questionsToAnswer
@@ -611,5 +611,14 @@ async function storeContext(prompt: string, memory: any[][], step: number) {

 const genAI = new GoogleGenerativeAI(GEMINI_API_KEY);

-const question = process.argv[2] || "";
-getResponse(question);
+
+export async function main() {
+  const question = process.argv[2] || "";
+  const finalStep = await getResponse(question);
+  console.log('Final Answer:', finalStep.answer);
+  tokenTracker.printSummary();
+}
+
+if (require.main === module) {
+  main().catch(console.error);
+}
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -3,23 +3,23 @@ import { GEMINI_API_KEY, MODEL_NAME } from "../config";
 import { tokenTracker } from "../utils/token-tracker";

 type EvaluationResponse = {
-  is_valid_answer: boolean;
+  is_definitive: boolean;
  reasoning: string;
 };

 const responseSchema = {
  type: SchemaType.OBJECT,
  properties: {
-    is_valid_answer: {
+    is_definitive: {
      type: SchemaType.BOOLEAN,
-      description: "Whether the answer provides any useful information to the question"
+      description: "Whether the answer provides a definitive response without uncertainty or 'I don't know' type statements"
    },
    reasoning: {
      type: SchemaType.STRING,
-      description: "Detailed explanation of the evaluation"
+      description: "Explanation of why the answer is or isn't definitive"
    }
  },
-  required: ["is_valid_answer", "reasoning"]
+  required: ["is_definitive", "reasoning"]
 };

 const genAI = new GoogleGenerativeAI(GEMINI_API_KEY);
@@ -33,41 +33,32 @@ const model = genAI.getGenerativeModel({
 });

 function getPrompt(question: string, answer: string): string {
-  return `You are an expert evaluator of question-answer pairs. Analyze if the given answer based on the following criteria is valid or not.
+  return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.

-Core Evaluation Criteria:
- Definitiveness: "I don't know", "lack of information", "doesn't exist" or highly uncertain ambiguous responses are **not** valid answers, must return false!
- Informativeness: Answer must provide substantial, useful information
- Completeness: When question mentions multiple aspects or elements, the answer should cover all of them
+Core Evaluation Criterion:
+- Definitiveness: "I don't know", "lack of information", "doesn't exist", "not sure" or highly uncertain/ambiguous responses are **not** definitive, must return false!

 Examples:

 Question: "What are the system requirements for running Python 3.9?"
 Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
 Evaluation: {
-  "is_valid_answer": false,
-  "reasoning": "The answer is vague, uncertain, and lacks specific information about actual system requirements. It fails the specificity and informativeness criteria."
+  "is_definitive": false,
+  "reasoning": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
 }

 Question: "What are the system requirements for running Python 3.9?"
-Answer: "Python 3.9 requires: Windows 7 or later, macOS 10.11 or later, or Linux. Minimum 4GB RAM recommended, 2GB disk space, and x86-64 processor. For Windows, you'll need Microsoft Visual C++ 2015 or later."
+Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
 Evaluation: {
-  "is_valid_answer": true,
-  "reasoning": "The answer is comprehensive, specific, and covers all key system requirements across different operating systems. It provides concrete numbers and necessary additional components."
+  "is_definitive": true,
+  "reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
 }

 Question: "what is the twitter account of jina ai's founder?"
 Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
 Evaluation: {
-  "is_valid_answer": false,
-  "reasoning": "The answer is not definitive and fails to provide the requested information. Don't know, can't derive, lack of information is unacceptable,"
-}
-
-Question: "who owns jina ai?"
-Answer: "The ownership structure of Jina AI is not publicly disclosed."
-Evaluation: {
-  "is_valid_answer": false,
-  "reasoning": "The answer is not definitive and fails to provide the requested information. Lack of information is unacceptable, more search and deep reasoning is needed."
+  "is_definitive": false,
+  "reasoning": "The answer indicates a lack of information rather than providing a definitive response."
 }

 Now evaluate this pair:
@@ -83,7 +74,7 @@ export async function evaluateAnswer(question: string, answer: string): Promise<
    const usage = response.usageMetadata;
    const json = JSON.parse(response.text()) as EvaluationResponse;
    console.log('Evaluation:', {
-      valid: json.is_valid_answer,
+      definitive: json.is_definitive,
      reason: json.reasoning
    });
    const tokens = usage?.totalTokenCount || 0;
@@ -114,4 +105,4 @@ async function main() {

 if (require.main === module) {
  main().catch(console.error);
-}
+}