mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
refactor: evaluator (#66)
* refactor: evaluator * refactor: evaluator * refactor: evaluator * refactor: evaluator
This commit is contained in:
12
README.md
12
README.md
@@ -1,6 +1,6 @@
|
|||||||
# DeepResearch
|
# DeepResearch
|
||||||
|
|
||||||
[UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [API](https://jina.ai/deepsearch) | [Evaluation](#evaluation)
|
[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Official API](https://jina.ai/deepsearch) | [Evaluation](#evaluation)
|
||||||
|
|
||||||
Keep searching, reading webpages, reasoning until an answer is found (or the token budget is exceeded). Useful for deeply investigating a query.
|
Keep searching, reading webpages, reasoning until an answer is found (or the token budget is exceeded). Useful for deeply investigating a query.
|
||||||
|
|
||||||
@@ -54,15 +54,21 @@ export JINA_API_KEY=jina_... # free jina api key, get from https://jina.ai/read
|
|||||||
npm run dev $QUERY
|
npm run dev $QUERY
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Official Site
|
||||||
|
|
||||||
|
You can try it on [our official site](https://search.jina.ai).
|
||||||
|
|
||||||
### Official API
|
### Official API
|
||||||
|
|
||||||
You can also use our official DeepSearch API, hosted and optimized by Jina AI:
|
You can also use [our official DeepSearch API](https://jina.ai/deepsearch):
|
||||||
|
|
||||||
```
|
```
|
||||||
https://deepsearch.jina.ai/v1/chat/completions
|
https://deepsearch.jina.ai/v1/chat/completions
|
||||||
```
|
```
|
||||||
|
|
||||||
You can use it with any OpenAI-compatible client. For the authentication Bearer, get your Jina API key from https://jina.ai
|
You can use it with any OpenAI-compatible client.
|
||||||
|
|
||||||
|
For the authentication Bearer token, API key, and rate limits, see https://jina.ai/deepsearch.
|
||||||
|
|
||||||
#### Client integration guidelines
|
#### Client integration guidelines
|
||||||
|
|
||||||
|
|||||||
@@ -576,7 +576,7 @@ But then you realized you have asked them before. You decided to to think out of
|
|||||||
let {queries: keywordsQueries} = await rewriteQuery(thisStep, context.tokenTracker);
|
let {queries: keywordsQueries} = await rewriteQuery(thisStep, context.tokenTracker);
|
||||||
|
|
||||||
// add the original query before rewrite to the keywordsQueries
|
// add the original query before rewrite to the keywordsQueries
|
||||||
keywordsQueries.push(thisStep.searchQuery)
|
keywordsQueries.push(question)
|
||||||
|
|
||||||
const oldKeywords = keywordsQueries;
|
const oldKeywords = keywordsQueries;
|
||||||
// avoid existing searched queries
|
// avoid existing searched queries
|
||||||
|
|||||||
@@ -21,9 +21,7 @@ const freshnessSchema = z.object({
|
|||||||
...baseSchema,
|
...baseSchema,
|
||||||
type: z.literal('freshness'),
|
type: z.literal('freshness'),
|
||||||
freshness_analysis: z.object({
|
freshness_analysis: z.object({
|
||||||
likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
|
days_ago: z.number().describe('Inferred dates or timeframes mentioned in the answer and relative to the current time'),
|
||||||
dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
|
|
||||||
current_time: z.string().describe('Current system time when evaluation was performed'),
|
|
||||||
max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
|
max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
|
||||||
})
|
})
|
||||||
});
|
});
|
||||||
@@ -32,8 +30,6 @@ const pluralitySchema = z.object({
|
|||||||
...baseSchema,
|
...baseSchema,
|
||||||
type: z.literal('plurality'),
|
type: z.literal('plurality'),
|
||||||
plurality_analysis: z.object({
|
plurality_analysis: z.object({
|
||||||
expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
|
|
||||||
provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
|
|
||||||
count_expected: z.number().optional().describe('Number of items expected if specified in question'),
|
count_expected: z.number().optional().describe('Number of items expected if specified in question'),
|
||||||
count_provided: z.number().describe('Number of items provided in answer')
|
count_provided: z.number().describe('Number of items provided in answer')
|
||||||
})
|
})
|
||||||
@@ -162,69 +158,61 @@ Answer: ${JSON.stringify(answer)}`;
|
|||||||
}
|
}
|
||||||
|
|
||||||
function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
|
function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
|
||||||
return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
|
return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates (or implied datetime) and current system time: ${currentTime}
|
||||||
|
|
||||||
<rules>
|
<rules>
|
||||||
1. Date Analysis:
|
Question-Answer Freshness Checker Guidelines
|
||||||
- Extract all dates mentioned in the answer
|
|
||||||
- Compare against current system time: ${currentTime}
|
|
||||||
- Consider content outdated if:
|
|
||||||
* It refers to a "latest" or "current" state from more than 30 days ago
|
|
||||||
* It mentions specific dates/events that have been superseded
|
|
||||||
* It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
|
|
||||||
- For product versions, releases, or announcements, max age is 30 days
|
|
||||||
- For company positions, leadership, or general facts, max age is 90 days
|
|
||||||
|
|
||||||
2. Context Hints:
|
# Revised QA Type Maximum Age Table
|
||||||
- Words indicating recency: "latest", "current", "newest", "just released", "recently"
|
|
||||||
- Time-sensitive terms: "CEO", "price", "version", "release"
|
| QA Type | Max Age (Days) | Notes |
|
||||||
- Future dates should be ignored in outdated calculation
|
|--------------------------|--------------|-----------------------------------------------------------------------|
|
||||||
|
| Financial Data (Real-time)| 0.1 | Stock prices, exchange rates, crypto (real-time preferred) |
|
||||||
|
| Breaking News | 1 | Immediate coverage of major events |
|
||||||
|
| News/Current Events | 1 | Time-sensitive news, politics, or global events |
|
||||||
|
| Weather Forecasts | 1 | Accuracy drops significantly after 24 hours |
|
||||||
|
| Sports Scores/Events | 1 | Live updates required for ongoing matches |
|
||||||
|
| Security Advisories | 1 | Critical security updates and patches |
|
||||||
|
| Social Media Trends | 1 | Viral content, hashtags, memes |
|
||||||
|
| Cybersecurity Threats | 7 | Rapidly evolving vulnerabilities/patches |
|
||||||
|
| Tech News | 7 | Technology industry updates and announcements |
|
||||||
|
| Political Developments | 7 | Legislative changes, political statements |
|
||||||
|
| Political Elections | 7 | Poll results, candidate updates |
|
||||||
|
| Sales/Promotions | 7 | Limited-time offers and marketing campaigns |
|
||||||
|
| Travel Restrictions | 7 | Visa rules, pandemic-related policies |
|
||||||
|
| Entertainment News | 14 | Celebrity updates, industry announcements |
|
||||||
|
| Product Launches | 14 | New product announcements and releases |
|
||||||
|
| Market Analysis | 14 | Market trends and competitive landscape |
|
||||||
|
| Competitive Intelligence | 21 | Analysis of competitor activities and market position |
|
||||||
|
| Product Recalls | 30 | Safety alerts or recalls from manufacturers |
|
||||||
|
| Industry Reports | 30 | Sector-specific analysis and forecasting |
|
||||||
|
| Software Version Info | 30 | Updates, patches, and compatibility information |
|
||||||
|
| Legal/Regulatory Updates | 30 | Laws, compliance rules (jurisdiction-dependent) |
|
||||||
|
| Economic Forecasts | 30 | Macroeconomic predictions and analysis |
|
||||||
|
| Consumer Trends | 45 | Shifting consumer preferences and behaviors |
|
||||||
|
| Scientific Discoveries | 60 | New research findings and breakthroughs (includes all scientific research) |
|
||||||
|
| Healthcare Guidelines | 60 | Medical recommendations and best practices (includes medical guidelines)|
|
||||||
|
| Environmental Reports | 60 | Climate and environmental status updates |
|
||||||
|
| Best Practices | 90 | Industry standards and recommended procedures |
|
||||||
|
| API Documentation | 90 | Technical specifications and implementation guides |
|
||||||
|
| Tutorial Content | 180 | How-to guides and instructional materials (includes educational content)|
|
||||||
|
| Tech Product Info | 180 | Product specs, release dates, or pricing |
|
||||||
|
| Statistical Data | 180 | Demographic and statistical information |
|
||||||
|
| Reference Material | 180 | General reference information and resources |
|
||||||
|
| Historical Content | 365 | Events and information from the past year |
|
||||||
|
| Cultural Trends | 730 | Shifts in language, fashion, or social norms |
|
||||||
|
| Entertainment Releases | 730 | Movie/TV show schedules, media catalogs |
|
||||||
|
| Factual Knowledge | ∞ | Static facts (e.g., historical events, geography, physical constants) |
|
||||||
|
|
||||||
|
### Implementation Notes:
|
||||||
|
1. **Contextual Adjustment**: Freshness requirements may change during crises or rapid developments in specific domains.
|
||||||
|
2. **Tiered Approach**: Consider implementing urgency levels (critical, important, standard) alongside age thresholds.
|
||||||
|
3. **User Preferences**: Allow customization of thresholds for specific query types or user needs.
|
||||||
|
4. **Source Reliability**: Pair freshness metrics with source credibility scores for better quality assessment.
|
||||||
|
5. **Domain Specificity**: Some specialized fields (medical research during pandemics, financial data during market volatility) may require dynamically adjusted thresholds.
|
||||||
|
6. **Geographic Relevance**: Regional considerations may alter freshness requirements for local regulations or events.
|
||||||
</rules>
|
</rules>
|
||||||
|
|
||||||
<examples>
|
|
||||||
Question: "What was Jina AI's closing stock price yesterday?"
|
|
||||||
Answer: "Jina AI's stock closed at $45.30 per share at yesterday's market close."
|
|
||||||
Current Time: "2024-03-07T14:30:00Z"
|
|
||||||
Evaluation: {
|
|
||||||
"pass": true,
|
|
||||||
"think": "The question specifically asks for yesterday's closing price, and the answer provides exactly that information. Since it's asking for a historical data point rather than current price, yesterday's closing price is the correct timeframe.",
|
|
||||||
"freshness_analysis": {
|
|
||||||
"likely_outdated": false,
|
|
||||||
"dates_mentioned": ["2024-03-06"],
|
|
||||||
"current_time": "2024-03-07T14:30:00Z",
|
|
||||||
"max_age_days": 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Question: "What is Jina AI's latest embedding model?"
|
|
||||||
Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
|
|
||||||
Current Time: "2024-10-06T00:00:00Z"
|
|
||||||
Evaluation: {
|
|
||||||
"pass": false,
|
|
||||||
"think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
|
|
||||||
"freshness_analysis": {
|
|
||||||
"likely_outdated": true,
|
|
||||||
"dates_mentioned": ["2024-03-15"],
|
|
||||||
"current_time": "2024-10-06T00:00:00Z",
|
|
||||||
"max_age_days": 30
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Question: "Who is OpenAI's CEO?"
|
|
||||||
Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
|
|
||||||
Current Time: "2024-02-06T00:00:00Z"
|
|
||||||
Evaluation: {
|
|
||||||
"pass": true,
|
|
||||||
"think": "The answer is about company leadership and is within the 60-day threshold for such information",
|
|
||||||
"freshness_analysis": {
|
|
||||||
"likely_outdated": false,
|
|
||||||
"dates_mentioned": ["2023-12"],
|
|
||||||
"current_time": "2024-02-06T00:00:00Z",
|
|
||||||
"max_age_days": 90
|
|
||||||
}
|
|
||||||
}
|
|
||||||
</examples>
|
|
||||||
|
|
||||||
Now evaluate this pair:
|
Now evaluate this pair:
|
||||||
Question: ${JSON.stringify(question)}
|
Question: ${JSON.stringify(question)}
|
||||||
Answer: ${JSON.stringify(answer)}`;
|
Answer: ${JSON.stringify(answer)}`;
|
||||||
@@ -234,77 +222,38 @@ function getPluralityPrompt(question: string, answer: string): string {
|
|||||||
return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
|
return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
|
||||||
|
|
||||||
<rules>
|
<rules>
|
||||||
1. Question Analysis:
|
Question Type Reference Table
|
||||||
- Check if question asks for multiple items using indicators like:
|
|
||||||
* Plural nouns: "companies", "people", "names"
|
|
||||||
* Quantifiers: "all", "many", "several", "various", "multiple"
|
|
||||||
* List requests: "list", "enumerate", "name all", "give me all"
|
|
||||||
* Numbers: "5 examples", "top 10"
|
|
||||||
- Otherwise skip the analysis and return pass to true
|
|
||||||
|
|
||||||
2. Answer Analysis:
|
| Question Type | Expected Items | Evaluation Rules |
|
||||||
- Count distinct items provided in the answer
|
|---------------|----------------|------------------|
|
||||||
- Check if answer uses limiting words like "only", "just", "single"
|
| Explicit Count | Exact match to number specified | Provide exactly the requested number of distinct, non-redundant items relevant to the query. |
|
||||||
- Identify if answer acknowledges there are more items but only provides some
|
| Numeric Range | Any number within specified range | Ensure count falls within given range with distinct, non-redundant items. For "at least N" queries, meet minimum threshold. |
|
||||||
|
| Implied Multiple | ≥ 2 | Provide multiple items (typically 2-4 unless context suggests more) with balanced detail and importance. |
|
||||||
3. Definitiveness Rules:
|
| "Few" | 2-4 | Offer 2-4 substantive items prioritizing quality over quantity. |
|
||||||
- If question asks for multiple items but answer provides only one → NOT definitive
|
| "Several" | 3-7 | Include 3-7 items with comprehensive yet focused coverage, each with brief explanation. |
|
||||||
- If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive
|
| "Many" | 7+ | Present 7+ items demonstrating breadth, with concise descriptions per item. |
|
||||||
- If answer clearly states it's providing a partial list → NOT definitive
|
| "Most important" | Top 3-5 by relevance | Prioritize by importance, explain ranking criteria, and order items by significance. |
|
||||||
- If question asks for "all" or "every" but answer seems incomplete → NOT definitive
|
| "Top N" | Exactly N, ranked | Provide exactly N items ordered by importance/relevance with clear ranking criteria. |
|
||||||
|
| "Pros and Cons" | ≥ 2 of each category | Present balanced perspectives with at least 2 items per category addressing different aspects. |
|
||||||
|
| "Compare X and Y" | ≥ 3 comparison points | Address at least 3 distinct comparison dimensions with balanced treatment covering major differences/similarities. |
|
||||||
|
| "Steps" or "Process" | All essential steps | Include all critical steps in logical order without missing dependencies. |
|
||||||
|
| "Examples" | ≥ 3 unless specified | Provide at least 3 diverse, representative, concrete examples unless count specified. |
|
||||||
|
| "Comprehensive" | 10+ | Deliver extensive coverage (10+ items) across major categories/subcategories demonstrating domain expertise. |
|
||||||
|
| "Brief" or "Quick" | 1-3 | Present concise content (1-3 items) focusing on most important elements described efficiently. |
|
||||||
|
| "Complete" | All relevant items | Provide exhaustive coverage within reasonable scope without major omissions, using categorization if needed. |
|
||||||
|
| "Thorough" | 7-10 | Offer detailed coverage addressing main topics and subtopics with both breadth and depth. |
|
||||||
|
| "Overview" | 3-5 | Cover main concepts/aspects with balanced coverage focused on fundamental understanding. |
|
||||||
|
| "Summary" | 3-5 key points | Distill essential information capturing main takeaways concisely yet comprehensively. |
|
||||||
|
| "Main" or "Key" | 3-7 | Focus on most significant elements fundamental to understanding, covering distinct aspects. |
|
||||||
|
| "Essential" | 3-7 | Include only critical, necessary items without peripheral or optional elements. |
|
||||||
|
| "Basic" | 2-5 | Present foundational concepts accessible to beginners focusing on core principles. |
|
||||||
|
| "Detailed" | 5-10 with elaboration | Provide in-depth coverage with explanations beyond listing, including specific information and nuance. |
|
||||||
|
| "Common" | 4-8 most frequent | Focus on typical or prevalent items, ordered by frequency when possible, that are widely recognized. |
|
||||||
|
| "Primary" | 2-5 most important | Focus on dominant factors with explanation of their primacy and outsized impact. |
|
||||||
|
| "Secondary" | 3-7 supporting items | Present important but not critical items that complement primary factors and provide additional context. |
|
||||||
|
| Unspecified Analysis | 3-5 key points | Default to 3-5 main points covering primary aspects with balanced breadth and depth. |
|
||||||
</rules>
|
</rules>
|
||||||
|
|
||||||
<examples>
|
|
||||||
Question: "Who works in Jina AI's sales team?"
|
|
||||||
Answer: "John Smith is a sales representative at Jina AI."
|
|
||||||
Evaluation: {
|
|
||||||
"pass": true,
|
|
||||||
"think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
|
|
||||||
"plurality_analysis": {
|
|
||||||
"expects_multiple": false,
|
|
||||||
"provides_multiple": false,
|
|
||||||
"count_provided": 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Question: "List all the salespeople who work at Jina AI"
|
|
||||||
Answer: "John Smith is a sales representative at Jina AI."
|
|
||||||
Evaluation: {
|
|
||||||
"pass": false,
|
|
||||||
"think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
|
|
||||||
"plurality_analysis": {
|
|
||||||
"expects_multiple": true,
|
|
||||||
"provides_multiple": false,
|
|
||||||
"count_provided": 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Question: "Name the top 3 products sold by Jina AI"
|
|
||||||
Answer: "Jina AI's product lineup includes DocArray and Jina."
|
|
||||||
Evaluation: {
|
|
||||||
"pass": false,
|
|
||||||
"think": "The question asks for top 3 products but only 2 are provided.",
|
|
||||||
"plurality_analysis": {
|
|
||||||
"expects_multiple": true,
|
|
||||||
"provides_multiple": true,
|
|
||||||
"count_expected": 3,
|
|
||||||
"count_provided": 2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Question: "List as many AI companies in Berlin as you can find"
|
|
||||||
Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
|
|
||||||
Evaluation: {
|
|
||||||
"pass": false,
|
|
||||||
"think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
|
|
||||||
"plurality_analysis": {
|
|
||||||
"expects_multiple": true,
|
|
||||||
"provides_multiple": true,
|
|
||||||
"count_provided": 5
|
|
||||||
}
|
|
||||||
}
|
|
||||||
</examples>
|
|
||||||
|
|
||||||
Now evaluate this pair:
|
Now evaluate this pair:
|
||||||
Question: ${JSON.stringify(question)}
|
Question: ${JSON.stringify(question)}
|
||||||
Answer: ${JSON.stringify(answer)}`;
|
Answer: ${JSON.stringify(answer)}`;
|
||||||
|
|||||||
@@ -23,7 +23,8 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response
|
|||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'Content-Length': data.length,
|
'Content-Length': data.length,
|
||||||
'X-Retain-Images': 'none',
|
'X-Retain-Images': 'none',
|
||||||
'X-Return-Format': 'markdown'
|
'X-Return-Format': 'markdown',
|
||||||
|
'X-Engine': 'direct'
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -143,14 +143,10 @@ export type EvaluationResponse = {
|
|||||||
think: string;
|
think: string;
|
||||||
type?: 'definitive' | 'freshness' | 'plurality' | 'attribution';
|
type?: 'definitive' | 'freshness' | 'plurality' | 'attribution';
|
||||||
freshness_analysis?: {
|
freshness_analysis?: {
|
||||||
likely_outdated: boolean;
|
days_ago: number;
|
||||||
dates_mentioned: string[];
|
|
||||||
current_time: string;
|
|
||||||
max_age_days?: number;
|
max_age_days?: number;
|
||||||
};
|
};
|
||||||
plurality_analysis?: {
|
plurality_analysis?: {
|
||||||
expects_multiple: boolean;
|
|
||||||
provides_multiple: boolean;
|
|
||||||
count_expected?: number;
|
count_expected?: number;
|
||||||
count_provided: number;
|
count_provided: number;
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user