refactor: evaluator (#66)

* refactor: evaluator * refactor: evaluator * refactor: evaluator * refactor: evaluator
2026-03-22 07:29:35 +08:00 · 2025-02-20 15:25:10 +08:00
parent 40ee042e56
commit 3c0dffcdd1
5 changed files with 93 additions and 141 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # DeepResearch
-[UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [API](https://jina.ai/deepsearch) | [Evaluation](#evaluation)
+[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Official API](https://jina.ai/deepsearch) | [Evaluation](#evaluation)
 Keep searching, reading webpages, reasoning until an answer is found (or the token budget is exceeded). Useful for deeply investigating a query.
@@ -54,15 +54,21 @@ export JINA_API_KEY=jina_...  # free jina api key, get from https://jina.ai/read
 npm run dev $QUERY
 ```
 ### Official Site
 You can try it on [our official site](https://search.jina.ai).
 ### Official API
-You can also use our official DeepSearch API, hosted and optimized by Jina AI:
+You can also use [our official DeepSearch API](https://jina.ai/deepsearch):
 ```
 https://deepsearch.jina.ai/v1/chat/completions
 ```
-You can use it with any OpenAI-compatible client. For the authentication Bearer, get your Jina API key from https://jina.ai
+You can use it with any OpenAI-compatible client. 
 For the authentication Bearer, API key, rate limit, get from https://jina.ai/deepsearch.
 #### Client integration guidelines
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -576,7 +576,7 @@ But then you realized you have asked them before. You decided to to think out of
      let {queries: keywordsQueries} = await rewriteQuery(thisStep, context.tokenTracker);
      // add the original query before rewrite to the keywordsQueries
-      keywordsQueries.push(thisStep.searchQuery)
+      keywordsQueries.push(question)
      const oldKeywords = keywordsQueries;
      // avoid exisitng searched queries
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -21,9 +21,7 @@ const freshnessSchema = z.object({
  ...baseSchema,
  type: z.literal('freshness'),
  freshness_analysis: z.object({
-    likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
+    days_ago: z.number().describe('Inferred dates or timeframes mentioned in the answer and relative to the current time'),
    dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
    current_time: z.string().describe('Current system time when evaluation was performed'),
    max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
  })
 });
@@ -32,8 +30,6 @@ const pluralitySchema = z.object({
  ...baseSchema,
  type: z.literal('plurality'),
  plurality_analysis: z.object({
    expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
    provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
    count_expected: z.number().optional().describe('Number of items expected if specified in question'),
    count_provided: z.number().describe('Number of items provided in answer')
  })
@@ -162,69 +158,61 @@ Answer: ${JSON.stringify(answer)}`;
 }
 function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
-  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
+  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates (or implied datetime) and current system time: ${currentTime}
 <rules>
-1. Date Analysis:
+Question-Answer Freshness Checker Guidelines
   - Extract all dates mentioned in the answer
   - Compare against current system time: ${currentTime}
   - Consider content outdated if:
     * It refers to a "latest" or "current" state from more than 30 days ago
     * It mentions specific dates/events that have been superseded
     * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
   - For product versions, releases, or announcements, max age is 30 days
   - For company positions, leadership, or general facts, max age is 90 days
-2. Context Hints:
+# Revised QA Type Maximum Age Table
-   - Words indicating recency: "latest", "current", "newest", "just released", "recently"
+
-   - Time-sensitive terms: "CEO", "price", "version", "release"
+| QA Type                  | Max Age (Days) | Notes                                                                 |
-   - Future dates should be ignored in outdated calculation
+|--------------------------|--------------|-----------------------------------------------------------------------|
 | Financial Data (Real-time)| 0.1        | Stock prices, exchange rates, crypto (real-time preferred)             |
 | Breaking News            | 1           | Immediate coverage of major events                                     |
 | News/Current Events      | 1           | Time-sensitive news, politics, or global events                        |
 | Weather Forecasts        | 1           | Accuracy drops significantly after 24 hours                            |
 | Sports Scores/Events     | 1           | Live updates required for ongoing matches                              |
 | Security Advisories      | 1           | Critical security updates and patches                                  |
 | Social Media Trends      | 1           | Viral content, hashtags, memes                                         |
 | Cybersecurity Threats    | 7           | Rapidly evolving vulnerabilities/patches                               |
 | Tech News                | 7           | Technology industry updates and announcements                          |
 | Political Developments   | 7           | Legislative changes, political statements                              |
 | Political Elections      | 7           | Poll results, candidate updates                                        |
 | Sales/Promotions         | 7           | Limited-time offers and marketing campaigns                            |
 | Travel Restrictions      | 7           | Visa rules, pandemic-related policies                                  |
 | Entertainment News       | 14          | Celebrity updates, industry announcements                              |
 | Product Launches         | 14          | New product announcements and releases                                 |
 | Market Analysis          | 14          | Market trends and competitive landscape                                |
 | Competitive Intelligence | 21          | Analysis of competitor activities and market position                  |
 | Product Recalls          | 30          | Safety alerts or recalls from manufacturers                            |
 | Industry Reports         | 30          | Sector-specific analysis and forecasting                               |
 | Software Version Info    | 30          | Updates, patches, and compatibility information                        |
 | Legal/Regulatory Updates | 30          | Laws, compliance rules (jurisdiction-dependent)                        |
 | Economic Forecasts       | 30          | Macroeconomic predictions and analysis                                 |
 | Consumer Trends          | 45          | Shifting consumer preferences and behaviors                            |
 | Scientific Discoveries   | 60          | New research findings and breakthroughs (includes all scientific research) |
 | Healthcare Guidelines    | 60          | Medical recommendations and best practices (includes medical guidelines)|
 | Environmental Reports    | 60          | Climate and environmental status updates                               |
 | Best Practices           | 90          | Industry standards and recommended procedures                          |
 | API Documentation        | 90          | Technical specifications and implementation guides                     |
 | Tutorial Content         | 180         | How-to guides and instructional materials (includes educational content)|
 | Tech Product Info        | 180         | Product specs, release dates, or pricing                               |
 | Statistical Data         | 180         | Demographic and statistical information                                |
 | Reference Material       | 180         | General reference information and resources                            |
 | Historical Content       | 365         | Events and information from the past year                              |
 | Cultural Trends          | 730         | Shifts in language, fashion, or social norms                           |
 | Entertainment Releases   | 730         | Movie/TV show schedules, media catalogs                                |
 | Factual Knowledge        | ∞           | Static facts (e.g., historical events, geography, physical constants)   |
 ### Implementation Notes:
 1. **Contextual Adjustment**: Freshness requirements may change during crises or rapid developments in specific domains.
 2. **Tiered Approach**: Consider implementing urgency levels (critical, important, standard) alongside age thresholds.
 3. **User Preferences**: Allow customization of thresholds for specific query types or user needs.
 4. **Source Reliability**: Pair freshness metrics with source credibility scores for better quality assessment.
 5. **Domain Specificity**: Some specialized fields (medical research during pandemics, financial data during market volatility) may require dynamically adjusted thresholds.
 6. **Geographic Relevance**: Regional considerations may alter freshness requirements for local regulations or events.
 </rules>
 <examples>
 Question: "What was Jina AI's closing stock price yesterday?"
 Answer: "Jina AI's stock closed at $45.30 per share at yesterday's market close."
 Current Time: "2024-03-07T14:30:00Z"
 Evaluation: {
  "pass": true,
  "think": "The question specifically asks for yesterday's closing price, and the answer provides exactly that information. Since it's asking for a historical data point rather than current price, yesterday's closing price is the correct timeframe.",
  "freshness_analysis": {
    "likely_outdated": false,
    "dates_mentioned": ["2024-03-06"],
    "current_time": "2024-03-07T14:30:00Z",
    "max_age_days": 1
  }
 }
 Question: "What is Jina AI's latest embedding model?"
 Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
 Current Time: "2024-10-06T00:00:00Z"
 Evaluation: {
  "pass": false,
  "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
  "freshness_analysis": {
    "likely_outdated": true,
    "dates_mentioned": ["2024-03-15"],
    "current_time": "2024-10-06T00:00:00Z",
    "max_age_days": 30
  }
 }
 Question: "Who is OpenAI's CEO?"
 Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
 Current Time: "2024-02-06T00:00:00Z"
 Evaluation: {
  "pass": true,
  "think": "The answer is about company leadership and is within the 60-day threshold for such information",
  "freshness_analysis": {
    "likely_outdated": false,
    "dates_mentioned": ["2023-12"],
    "current_time": "2024-02-06T00:00:00Z",
    "max_age_days": 90
  }
 }
 </examples>
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;
@@ -234,77 +222,38 @@ function getPluralityPrompt(question: string, answer: string): string {
  return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
 <rules>
-1. Question Analysis:
+Question Type Reference Table
   - Check if question asks for multiple items using indicators like:
     * Plural nouns: "companies", "people", "names"
     * Quantifiers: "all", "many", "several", "various", "multiple"
     * List requests: "list", "enumerate", "name all", "give me all"
     * Numbers: "5 examples", "top 10"
   - Otherwise skip the analysis and return pass to true
-2. Answer Analysis:
+| Question Type | Expected Items | Evaluation Rules |
-   - Count distinct items provided in the answer
+|---------------|----------------|------------------|
-   - Check if answer uses limiting words like "only", "just", "single"
+| Explicit Count | Exact match to number specified | Provide exactly the requested number of distinct, non-redundant items relevant to the query. |
-   - Identify if answer acknowledges there are more items but only provides some
+| Numeric Range | Any number within specified range | Ensure count falls within given range with distinct, non-redundant items. For "at least N" queries, meet minimum threshold. |
-
+| Implied Multiple | ≥ 2 | Provide multiple items (typically 2-4 unless context suggests more) with balanced detail and importance. |
-3. Definitiveness Rules:
+| "Few" | 2-4 | Offer 2-4 substantive items prioritizing quality over quantity. |
-   - If question asks for multiple items but answer provides only one → NOT definitive
+| "Several" | 3-7 | Include 3-7 items with comprehensive yet focused coverage, each with brief explanation. |
-   - If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive
+| "Many" | 7+ | Present 7+ items demonstrating breadth, with concise descriptions per item. |
-   - If answer clearly states it's providing a partial list → NOT definitive
+| "Most important" | Top 3-5 by relevance | Prioritize by importance, explain ranking criteria, and order items by significance. |
-   - If question asks for "all" or "every" but answer seems incomplete → NOT definitive
+| "Top N" | Exactly N, ranked | Provide exactly N items ordered by importance/relevance with clear ranking criteria. |
 | "Pros and Cons" | ≥ 2 of each category | Present balanced perspectives with at least 2 items per category addressing different aspects. |
 | "Compare X and Y" | ≥ 3 comparison points | Address at least 3 distinct comparison dimensions with balanced treatment covering major differences/similarities. |
 | "Steps" or "Process" | All essential steps | Include all critical steps in logical order without missing dependencies. |
 | "Examples" | ≥ 3 unless specified | Provide at least 3 diverse, representative, concrete examples unless count specified. |
 | "Comprehensive" | 10+ | Deliver extensive coverage (10+ items) across major categories/subcategories demonstrating domain expertise. |
 | "Brief" or "Quick" | 1-3 | Present concise content (1-3 items) focusing on most important elements described efficiently. |
 | "Complete" | All relevant items | Provide exhaustive coverage within reasonable scope without major omissions, using categorization if needed. |
 | "Thorough" | 7-10 | Offer detailed coverage addressing main topics and subtopics with both breadth and depth. |
 | "Overview" | 3-5 | Cover main concepts/aspects with balanced coverage focused on fundamental understanding. |
 | "Summary" | 3-5 key points | Distill essential information capturing main takeaways concisely yet comprehensively. |
 | "Main" or "Key" | 3-7 | Focus on most significant elements fundamental to understanding, covering distinct aspects. |
 | "Essential" | 3-7 | Include only critical, necessary items without peripheral or optional elements. |
 | "Basic" | 2-5 | Present foundational concepts accessible to beginners focusing on core principles. |
 | "Detailed" | 5-10 with elaboration | Provide in-depth coverage with explanations beyond listing, including specific information and nuance. |
 | "Common" | 4-8 most frequent | Focus on typical or prevalent items, ordered by frequency when possible, that are widely recognized. |
 | "Primary" | 2-5 most important | Focus on dominant factors with explanation of their primacy and outsized impact. |
 | "Secondary" | 3-7 supporting items | Present important but not critical items that complement primary factors and provide additional context. |
 | Unspecified Analysis | 3-5 key points | Default to 3-5 main points covering primary aspects with balanced breadth and depth. |
 </rules>
 <examples>
 Question: "Who works in Jina AI's sales team?"
 Answer: "John Smith is a sales representative at Jina AI."
 Evaluation: {
  "pass": true,
  "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
  "plurality_analysis": {
    "expects_multiple": false,
    "provides_multiple": false,
    "count_provided": 1
  }
 }
 Question: "List all the salespeople who work at Jina AI"
 Answer: "John Smith is a sales representative at Jina AI."
 Evaluation: {
  "pass": false,
  "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": false,
    "count_provided": 1
  }
 }
 Question: "Name the top 3 products sold by Jina AI"
 Answer: "Jina AI's product lineup includes DocArray and Jina."
 Evaluation: {
  "pass": false,
  "think": "The question asks for top 3 products but only 2 are provided.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": true,
    "count_expected": 3,
    "count_provided": 2
  }
 }
 Question: "List as many AI companies in Berlin as you can find"
 Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
 Evaluation: {
  "pass": false,
  "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": true,
    "count_provided": 5
  }
 }
 </examples>
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;
--- a/src/tools/read.ts
+++ b/src/tools/read.ts
@@ -23,7 +23,8 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response
        'Content-Type': 'application/json',
        'Content-Length': data.length,
        'X-Retain-Images': 'none',
-        'X-Return-Format': 'markdown'
+        'X-Return-Format': 'markdown',
        'X-Engine': 'direct'
      }
    };
--- a/src/types.ts
+++ b/src/types.ts
@@ -143,14 +143,10 @@ export type EvaluationResponse = {
  think: string;
  type?: 'definitive' | 'freshness' | 'plurality' | 'attribution';
  freshness_analysis?: {
-    likely_outdated: boolean;
+    days_ago: number;
    dates_mentioned: string[];
    current_time: string;
    max_age_days?: number;
  };
  plurality_analysis?: {
    expects_multiple: boolean;
    provides_multiple: boolean;
    count_expected?: number;
    count_provided: number;
  };