diff --git a/README.md b/README.md
index b6036fd..eba2feb 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# DeepResearch
-[UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [API](https://jina.ai/deepsearch) | [Evaluation](#evaluation)
+[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Official API](https://jina.ai/deepsearch) | [Evaluation](#evaluation)
Keep searching, reading webpages, reasoning until an answer is found (or the token budget is exceeded). Useful for deeply investigating a query.
@@ -54,15 +54,21 @@ export JINA_API_KEY=jina_... # free jina api key, get from https://jina.ai/read
npm run dev $QUERY
```
+### Official Site
+
+You can try it on [our official site](https://search.jina.ai).
+
### Official API
-You can also use our official DeepSearch API, hosted and optimized by Jina AI:
+You can also use [our official DeepSearch API](https://jina.ai/deepsearch):
```
https://deepsearch.jina.ai/v1/chat/completions
```
-You can use it with any OpenAI-compatible client. For the authentication Bearer, get your Jina API key from https://jina.ai
+You can use it with any OpenAI-compatible client.
+
+To get an API key for the authentication Bearer token and to check the rate limits, visit https://jina.ai/deepsearch.
#### Client integration guidelines
diff --git a/src/agent.ts b/src/agent.ts
index 2e7e1c8..91fb82c 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -576,7 +576,7 @@ But then you realized you have asked them before. You decided to to think out of
let {queries: keywordsQueries} = await rewriteQuery(thisStep, context.tokenTracker);
// add the original query before rewrite to the keywordsQueries
- keywordsQueries.push(thisStep.searchQuery)
+ keywordsQueries.push(question)
const oldKeywords = keywordsQueries;
// avoid exisitng searched queries
diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts
index 45745fb..e68bdfd 100644
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -21,9 +21,7 @@ const freshnessSchema = z.object({
...baseSchema,
type: z.literal('freshness'),
freshness_analysis: z.object({
- likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
- dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
- current_time: z.string().describe('Current system time when evaluation was performed'),
+ days_ago: z.number().describe('Inferred dates or timeframes mentioned in the answer and relative to the current time'),
max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
})
});
@@ -32,8 +30,6 @@ const pluralitySchema = z.object({
...baseSchema,
type: z.literal('plurality'),
plurality_analysis: z.object({
- expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
- provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
count_expected: z.number().optional().describe('Number of items expected if specified in question'),
count_provided: z.number().describe('Number of items provided in answer')
})
@@ -162,69 +158,61 @@ Answer: ${JSON.stringify(answer)}`;
}
function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
- return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
+ return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates (or implied datetime) and current system time: ${currentTime}
-1. Date Analysis:
- - Extract all dates mentioned in the answer
- - Compare against current system time: ${currentTime}
- - Consider content outdated if:
- * It refers to a "latest" or "current" state from more than 30 days ago
- * It mentions specific dates/events that have been superseded
- * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
- - For product versions, releases, or announcements, max age is 30 days
- - For company positions, leadership, or general facts, max age is 90 days
+Question-Answer Freshness Checker Guidelines
-2. Context Hints:
- - Words indicating recency: "latest", "current", "newest", "just released", "recently"
- - Time-sensitive terms: "CEO", "price", "version", "release"
- - Future dates should be ignored in outdated calculation
+# Revised QA Type Maximum Age Table
+
+| QA Type | Max Age (Days) | Notes |
+|--------------------------|--------------|-----------------------------------------------------------------------|
+| Financial Data (Real-time)| 0.1 | Stock prices, exchange rates, crypto (real-time preferred) |
+| Breaking News | 1 | Immediate coverage of major events |
+| News/Current Events | 1 | Time-sensitive news, politics, or global events |
+| Weather Forecasts | 1 | Accuracy drops significantly after 24 hours |
+| Sports Scores/Events | 1 | Live updates required for ongoing matches |
+| Security Advisories | 1 | Critical security updates and patches |
+| Social Media Trends | 1 | Viral content, hashtags, memes |
+| Cybersecurity Threats | 7 | Rapidly evolving vulnerabilities/patches |
+| Tech News | 7 | Technology industry updates and announcements |
+| Political Developments | 7 | Legislative changes, political statements |
+| Political Elections | 7 | Poll results, candidate updates |
+| Sales/Promotions | 7 | Limited-time offers and marketing campaigns |
+| Travel Restrictions | 7 | Visa rules, pandemic-related policies |
+| Entertainment News | 14 | Celebrity updates, industry announcements |
+| Product Launches | 14 | New product announcements and releases |
+| Market Analysis | 14 | Market trends and competitive landscape |
+| Competitive Intelligence | 21 | Analysis of competitor activities and market position |
+| Product Recalls | 30 | Safety alerts or recalls from manufacturers |
+| Industry Reports | 30 | Sector-specific analysis and forecasting |
+| Software Version Info | 30 | Updates, patches, and compatibility information |
+| Legal/Regulatory Updates | 30 | Laws, compliance rules (jurisdiction-dependent) |
+| Economic Forecasts | 30 | Macroeconomic predictions and analysis |
+| Consumer Trends | 45 | Shifting consumer preferences and behaviors |
+| Scientific Discoveries | 60 | New research findings and breakthroughs (includes all scientific research) |
+| Healthcare Guidelines | 60 | Medical recommendations and best practices (includes medical guidelines)|
+| Environmental Reports | 60 | Climate and environmental status updates |
+| Best Practices | 90 | Industry standards and recommended procedures |
+| API Documentation | 90 | Technical specifications and implementation guides |
+| Tutorial Content | 180 | How-to guides and instructional materials (includes educational content)|
+| Tech Product Info | 180 | Product specs, release dates, or pricing |
+| Statistical Data | 180 | Demographic and statistical information |
+| Reference Material | 180 | General reference information and resources |
+| Historical Content | 365 | Events and information from the past year |
+| Cultural Trends | 730 | Shifts in language, fashion, or social norms |
+| Entertainment Releases | 730 | Movie/TV show schedules, media catalogs |
+| Factual Knowledge | ∞ | Static facts (e.g., historical events, geography, physical constants) |
+
+### Implementation Notes:
+1. **Contextual Adjustment**: Freshness requirements may change during crises or rapid developments in specific domains.
+2. **Tiered Approach**: Consider implementing urgency levels (critical, important, standard) alongside age thresholds.
+3. **User Preferences**: Allow customization of thresholds for specific query types or user needs.
+4. **Source Reliability**: Pair freshness metrics with source credibility scores for better quality assessment.
+5. **Domain Specificity**: Some specialized fields (medical research during pandemics, financial data during market volatility) may require dynamically adjusted thresholds.
+6. **Geographic Relevance**: Regional considerations may alter freshness requirements for local regulations or events.
-
-Question: "What was Jina AI's closing stock price yesterday?"
-Answer: "Jina AI's stock closed at $45.30 per share at yesterday's market close."
-Current Time: "2024-03-07T14:30:00Z"
-Evaluation: {
- "pass": true,
- "think": "The question specifically asks for yesterday's closing price, and the answer provides exactly that information. Since it's asking for a historical data point rather than current price, yesterday's closing price is the correct timeframe.",
- "freshness_analysis": {
- "likely_outdated": false,
- "dates_mentioned": ["2024-03-06"],
- "current_time": "2024-03-07T14:30:00Z",
- "max_age_days": 1
- }
-}
-
-Question: "What is Jina AI's latest embedding model?"
-Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
-Current Time: "2024-10-06T00:00:00Z"
-Evaluation: {
- "pass": false,
- "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
- "freshness_analysis": {
- "likely_outdated": true,
- "dates_mentioned": ["2024-03-15"],
- "current_time": "2024-10-06T00:00:00Z",
- "max_age_days": 30
- }
-}
-
-Question: "Who is OpenAI's CEO?"
-Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
-Current Time: "2024-02-06T00:00:00Z"
-Evaluation: {
- "pass": true,
- "think": "The answer is about company leadership and is within the 60-day threshold for such information",
- "freshness_analysis": {
- "likely_outdated": false,
- "dates_mentioned": ["2023-12"],
- "current_time": "2024-02-06T00:00:00Z",
- "max_age_days": 90
- }
-}
-
-
Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
@@ -234,77 +222,38 @@ function getPluralityPrompt(question: string, answer: string): string {
return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
-1. Question Analysis:
- - Check if question asks for multiple items using indicators like:
- * Plural nouns: "companies", "people", "names"
- * Quantifiers: "all", "many", "several", "various", "multiple"
- * List requests: "list", "enumerate", "name all", "give me all"
- * Numbers: "5 examples", "top 10"
- - Otherwise skip the analysis and return pass to true
+Question Type Reference Table
-2. Answer Analysis:
- - Count distinct items provided in the answer
- - Check if answer uses limiting words like "only", "just", "single"
- - Identify if answer acknowledges there are more items but only provides some
-
-3. Definitiveness Rules:
- - If question asks for multiple items but answer provides only one → NOT definitive
- - If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive
- - If answer clearly states it's providing a partial list → NOT definitive
- - If question asks for "all" or "every" but answer seems incomplete → NOT definitive
+| Question Type | Expected Items | Evaluation Rules |
+|---------------|----------------|------------------|
+| Explicit Count | Exact match to number specified | Provide exactly the requested number of distinct, non-redundant items relevant to the query. |
+| Numeric Range | Any number within specified range | Ensure count falls within given range with distinct, non-redundant items. For "at least N" queries, meet minimum threshold. |
+| Implied Multiple | ≥ 2 | Provide multiple items (typically 2-4 unless context suggests more) with balanced detail and importance. |
+| "Few" | 2-4 | Offer 2-4 substantive items prioritizing quality over quantity. |
+| "Several" | 3-7 | Include 3-7 items with comprehensive yet focused coverage, each with brief explanation. |
+| "Many" | 7+ | Present 7+ items demonstrating breadth, with concise descriptions per item. |
+| "Most important" | Top 3-5 by relevance | Prioritize by importance, explain ranking criteria, and order items by significance. |
+| "Top N" | Exactly N, ranked | Provide exactly N items ordered by importance/relevance with clear ranking criteria. |
+| "Pros and Cons" | ≥ 2 of each category | Present balanced perspectives with at least 2 items per category addressing different aspects. |
+| "Compare X and Y" | ≥ 3 comparison points | Address at least 3 distinct comparison dimensions with balanced treatment covering major differences/similarities. |
+| "Steps" or "Process" | All essential steps | Include all critical steps in logical order without missing dependencies. |
+| "Examples" | ≥ 3 unless specified | Provide at least 3 diverse, representative, concrete examples unless count specified. |
+| "Comprehensive" | 10+ | Deliver extensive coverage (10+ items) across major categories/subcategories demonstrating domain expertise. |
+| "Brief" or "Quick" | 1-3 | Present concise content (1-3 items) focusing on most important elements described efficiently. |
+| "Complete" | All relevant items | Provide exhaustive coverage within reasonable scope without major omissions, using categorization if needed. |
+| "Thorough" | 7-10 | Offer detailed coverage addressing main topics and subtopics with both breadth and depth. |
+| "Overview" | 3-5 | Cover main concepts/aspects with balanced coverage focused on fundamental understanding. |
+| "Summary" | 3-5 key points | Distill essential information capturing main takeaways concisely yet comprehensively. |
+| "Main" or "Key" | 3-7 | Focus on most significant elements fundamental to understanding, covering distinct aspects. |
+| "Essential" | 3-7 | Include only critical, necessary items without peripheral or optional elements. |
+| "Basic" | 2-5 | Present foundational concepts accessible to beginners focusing on core principles. |
+| "Detailed" | 5-10 with elaboration | Provide in-depth coverage with explanations beyond listing, including specific information and nuance. |
+| "Common" | 4-8 most frequent | Focus on typical or prevalent items, ordered by frequency when possible, that are widely recognized. |
+| "Primary" | 2-5 most important | Focus on dominant factors with explanation of their primacy and outsized impact. |
+| "Secondary" | 3-7 supporting items | Present important but not critical items that complement primary factors and provide additional context. |
+| Unspecified Analysis | 3-5 key points | Default to 3-5 main points covering primary aspects with balanced breadth and depth. |
-
-Question: "Who works in Jina AI's sales team?"
-Answer: "John Smith is a sales representative at Jina AI."
-Evaluation: {
- "pass": true,
- "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
- "plurality_analysis": {
- "expects_multiple": false,
- "provides_multiple": false,
- "count_provided": 1
- }
-}
-
-Question: "List all the salespeople who work at Jina AI"
-Answer: "John Smith is a sales representative at Jina AI."
-Evaluation: {
- "pass": false,
- "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
- "plurality_analysis": {
- "expects_multiple": true,
- "provides_multiple": false,
- "count_provided": 1
- }
-}
-
-Question: "Name the top 3 products sold by Jina AI"
-Answer: "Jina AI's product lineup includes DocArray and Jina."
-Evaluation: {
- "pass": false,
- "think": "The question asks for top 3 products but only 2 are provided.",
- "plurality_analysis": {
- "expects_multiple": true,
- "provides_multiple": true,
- "count_expected": 3,
- "count_provided": 2
- }
-}
-
-Question: "List as many AI companies in Berlin as you can find"
-Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
-Evaluation: {
- "pass": false,
- "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
- "plurality_analysis": {
- "expects_multiple": true,
- "provides_multiple": true,
- "count_provided": 5
- }
-}
-
-
Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
diff --git a/src/tools/read.ts b/src/tools/read.ts
index fe076be..89a8c29 100644
--- a/src/tools/read.ts
+++ b/src/tools/read.ts
@@ -23,7 +23,8 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response
'Content-Type': 'application/json',
'Content-Length': data.length,
'X-Retain-Images': 'none',
- 'X-Return-Format': 'markdown'
+ 'X-Return-Format': 'markdown',
+ 'X-Engine': 'direct'
}
};
diff --git a/src/types.ts b/src/types.ts
index 89e797c..21d8a05 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -143,14 +143,10 @@ export type EvaluationResponse = {
think: string;
type?: 'definitive' | 'freshness' | 'plurality' | 'attribution';
freshness_analysis?: {
- likely_outdated: boolean;
- dates_mentioned: string[];
- current_time: string;
+ days_ago: number;
max_age_days?: number;
};
plurality_analysis?: {
- expects_multiple: boolean;
- provides_multiple: boolean;
count_expected?: number;
count_provided: number;
};