fix: tighten evaluator definitiveness rules and add ego-questions eval set

2026-03-22 07:29:35 +08:00 · 2025-02-06 20:03:36 +08:00
parent deff7235b2
commit a5e5627823
2 changed files with 49 additions and 8 deletions
--- a/src/evals/ego-questions.json
+++ b/src/evals/ego-questions.json
@@ -0,0 +1,22 @@
+[
+  {
+    "question": "what is jina ai ceo's twitter account",
+    "answer": "hxiao"
+  },
+  {
+    "question": "what is the latest model published by jina ai?",
+    "answer": "ReaderLM-2.0"
+  },
+  {
+    "question": "what is the latest blog post that jina ai published?",
+    "answer": "A Practical Guide to Deploying Search Foundation Models in Production"
+  },
+  {
+    "question": "what is the context length of readerlm-v2?",
+    "answer": "512K"
+  },
+  {
+    "question": "how many employees does jina ai have right now?",
+    "answer": "30"
+  }
+]
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -8,20 +8,25 @@ import { handleGenerateObjectError } from '../utils/error-handling';
 const model = getModel('evaluator');

 const responseSchema = z.object({
-  is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or \'I don\'t know\' type statements'),
+  is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or negative statements'),
  reasoning: z.string().describe('Explanation of why the answer is or isn\'t definitive')
 });

-
-
 function getPrompt(question: string, answer: string): string {
  return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.

-Core Evaluation Criterion:
- Definitiveness: "I don't know", "lack of information", "doesn't exist", "not sure" or highly uncertain/ambiguous responses are **not** definitive, must return false!
+<rules>
+First, if the answer is not a direct response to the question, you must return false.
+Definitiveness is king! The following types of responses are NOT definitive and must be marked false:
+  1. Expressions of uncertainty: "I don't know", "not sure", "might be", "probably"
+  2. Lack of information statements: "doesn't exist", "lack of information", "could not find"
+  3. Inability statements: "I cannot provide", "I am unable to", "we cannot"
+  4. Negative statements that redirect: "However, you can...", "Instead, try..."
+  5. Non-answers that suggest alternatives
+</rules>

-Examples:

+<examples>
 Question: "What are the system requirements for running Python 3.9?"
 Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
 Evaluation: {
@@ -36,13 +41,27 @@ Evaluation: {
  "reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
 }

+Question: "Who will be the president of the United States in 2032?"
+Answer: "I cannot predict the future, it depends on the election results."
+Evaluation: {
+  "is_definitive": false,
+  "reasoning": "The answer contains a statement of inability to predict the future, making it non-definitive."
+}
+
+Question: "Who is the sales director at Company X?"
+Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
+Evaluation: {
+  "is_definitive": false,
+  "reasoning": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
+}
+
 Question: "what is the twitter account of jina ai's founder?"
 Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
 Evaluation: {
  "is_definitive": false,
  "reasoning": "The answer indicates a lack of information rather than providing a definitive response."
 }
-
+</examples>
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;
@@ -98,4 +117,4 @@ async function main() {

 if (require.main === module) {
  main().catch(console.error);
-}
+}