feat: add OpenAI-compatible chat completions endpoint (#48)

2026-03-22 07:29:35 +08:00 · 2025-02-09 09:25:01 +08:00
parent a9008ae0dd
commit 39579d560e
9 changed files with 1061 additions and 44 deletions
--- a/src/tests/server.test.ts
+++ b/src/tests/server.test.ts
@@ -0,0 +1,353 @@
+import request from 'supertest';
+import { EventEmitter } from 'events';
+import type { Express } from 'express';
+
+const TEST_SECRET = 'test-secret';
+let app: Express;
+
+describe('/v1/chat/completions', () => {
+  jest.setTimeout(120000); // Increase timeout for all tests in this suite
+  
+  beforeEach(async () => {
+    // Set NODE_ENV to test to prevent server from auto-starting
+    process.env.NODE_ENV = 'test';
+    
+    // Clean up any existing secret
+    const existingSecretIndex = process.argv.findIndex(arg => arg.startsWith('--secret='));
+    if (existingSecretIndex !== -1) {
+      process.argv.splice(existingSecretIndex, 1);
+    }
+    
+    // Set up test secret and import server module
+    process.argv.push(`--secret=${TEST_SECRET}`);
+    
+    // Import server module (jest.resetModules() is called automatically before each test)
+    const { default: serverModule } = await import('../server');
+    app = serverModule;
+  });
+  
+  afterEach(async () => {
+    // Clean up any remaining event listeners
+    const emitter = EventEmitter.prototype;
+    emitter.removeAllListeners();
+    emitter.setMaxListeners(emitter.getMaxListeners() + 1);
+    
+    // Clean up test secret
+    const secretIndex = process.argv.findIndex(arg => arg.startsWith('--secret='));
+    if (secretIndex !== -1) {
+      process.argv.splice(secretIndex, 1);
+    }
+
+    // Wait for any pending promises to settle
+    await new Promise(resolve => setTimeout(resolve, 500));
+
+    // Reset module cache to ensure clean state
+    jest.resetModules();
+  });
+  it('should require authentication when secret is set', async () => {
+    // Note: secret is already set in beforeEach
+    
+    const response = await request(app)
+      .post('/v1/chat/completions')
+      .send({
+        model: 'test-model',
+        messages: [{ role: 'user', content: 'test' }]
+      });
+    expect(response.status).toBe(401);
+  });
+
+  it('should allow requests without auth when no secret is set', async () => {
+    // Remove secret for this test
+    const secretIndex = process.argv.findIndex(arg => arg.startsWith('--secret='));
+    if (secretIndex !== -1) {
+      process.argv.splice(secretIndex, 1);
+    }
+    
+    // Reload server module without secret
+    const { default: serverModule } = await import('../server');
+    app = serverModule;
+    
+    const response = await request(app)
+      .post('/v1/chat/completions')
+      .send({
+        model: 'test-model',
+        messages: [{ role: 'user', content: 'test' }]
+      });
+    expect(response.status).toBe(200);
+  });
+
+  it('should reject requests without user message', async () => {
+    const response = await request(app)
+      .post('/v1/chat/completions')
+      .set('Authorization', `Bearer ${TEST_SECRET}`)
+      .send({
+        model: 'test-model',
+        messages: [{ role: 'developer', content: 'test' }]
+      });
+    expect(response.status).toBe(400);
+    expect(response.body.error).toBe('Last message must be from user');
+  });
+
+  it('should handle non-streaming request', async () => {
+    const response = await request(app)
+      .post('/v1/chat/completions')
+      .set('Authorization', `Bearer ${TEST_SECRET}`)
+      .send({
+        model: 'test-model',
+        messages: [{ role: 'user', content: 'test' }]
+      });
+    expect(response.status).toBe(200);
+    expect(response.body).toMatchObject({
+      object: 'chat.completion',
+      choices: [{
+        message: {
+          role: 'assistant'
+        }
+      }]
+    });
+  });
+
+  it('should track tokens correctly in non-streaming response', async () => {
+    // Create a promise that resolves when token tracking is complete
+    const tokenTrackingPromise = new Promise<void>((resolve) => {
+      const emitter = EventEmitter.prototype;
+      const originalEmit = emitter.emit;
+      
+      // Override emit to detect when token tracking is done
+      emitter.emit = function(event: string, ...args: any[]) {
+        if (event === 'usage') {
+          // Wait for next tick to ensure all token tracking is complete
+          process.nextTick(() => {
+            emitter.emit = originalEmit;
+            resolve();
+          });
+        }
+        return originalEmit.apply(this, [event, ...args]);
+      };
+    });
+
+    const response = await request(app)
+      .post('/v1/chat/completions')
+      .set('Authorization', `Bearer ${TEST_SECRET}`)
+      .send({
+        model: 'test-model',
+        messages: [{ role: 'user', content: 'test' }]
+      });
+    
+    // Wait for token tracking to complete
+    await tokenTrackingPromise;
+
+    expect(response.body.usage).toMatchObject({
+      prompt_tokens: expect.any(Number),
+      completion_tokens: expect.any(Number),
+      total_tokens: expect.any(Number),
+      completion_tokens_details: {
+        reasoning_tokens: expect.any(Number),
+        accepted_prediction_tokens: expect.any(Number),
+        rejected_prediction_tokens: expect.any(Number)
+      }
+    });
+
+    // Verify token counts are reasonable
+    expect(response.body.usage.prompt_tokens).toBeGreaterThan(0);
+    expect(response.body.usage.completion_tokens).toBeGreaterThan(0);
+    expect(response.body.usage.total_tokens).toBe(
+      response.body.usage.prompt_tokens + response.body.usage.completion_tokens
+    );
+  });
+
+  it('should handle streaming request and track tokens correctly', async () => {
+    return new Promise<void>((resolve, reject) => {
+      let isDone = false;
+      let totalCompletionTokens = 0;
+
+      const cleanup = () => {
+        clearTimeout(timeoutHandle);
+        isDone = true;
+        resolve();
+      };
+
+      const timeoutHandle = setTimeout(() => {
+        if (!isDone) {
+          cleanup();
+          reject(new Error('Test timed out'));
+        }
+      }, 30000);
+
+      request(app)
+        .post('/v1/chat/completions')
+        .set('Authorization', `Bearer ${TEST_SECRET}`)
+        .send({
+          model: 'test-model',
+          messages: [{ role: 'user', content: 'test' }],
+          stream: true
+        })
+        .buffer(true)
+        .parse((res, callback) => {
+          const response = res as unknown as {
+            on(event: 'data', listener: (chunk: Buffer) => void): void;
+            on(event: 'end', listener: () => void): void;
+            on(event: 'error', listener: (err: Error) => void): void;
+          };
+          let responseData = '';
+          
+          response.on('error', (err) => {
+            cleanup();
+            callback(err, null);
+          });
+
+          response.on('data', (chunk) => {
+            responseData += chunk.toString();
+          });
+
+          response.on('end', () => {
+            try {
+              callback(null, responseData);
+            } catch (err) {
+              cleanup();
+              callback(err instanceof Error ? err : new Error(String(err)), null);
+            }
+          });
+        })
+        .end((err, res) => {
+          if (err) return reject(err);
+          
+          expect(res.status).toBe(200);
+          expect(res.headers['content-type']).toBe('text/event-stream');
+          
+          // Verify stream format and content
+          if (isDone) return; // Prevent multiple resolves
+          
+          const responseText = res.body as string;
+          const chunks = responseText
+            .split('\n\n')
+            .filter((line: string) => line.startsWith('data: '))
+            .map((line: string) => JSON.parse(line.replace('data: ', '')));
+          
+          // Process all chunks
+          expect(chunks.length).toBeGreaterThan(0);
+          
+          // Verify initial chunk format
+          expect(chunks[0]).toMatchObject({
+            id: expect.any(String),
+            object: 'chat.completion.chunk',
+            choices: [{
+              index: 0,
+              delta: { role: 'assistant' },
+              logprobs: null,
+              finish_reason: null
+            }]
+          });
+
+          // Verify content chunks have content
+          chunks.slice(1).forEach(chunk => {
+            const content = chunk.choices[0].delta.content;
+            if (content && content.trim()) {
+              totalCompletionTokens += 1; // Count 1 token per chunk as per Vercel convention
+            }
+            expect(chunk).toMatchObject({
+              object: 'chat.completion.chunk',
+              choices: [{
+                delta: expect.objectContaining({
+                  content: expect.any(String)
+                })
+              }]
+            });
+          });
+
+          // Verify final chunk format if present
+          const lastChunk = chunks[chunks.length - 1];
+          if (lastChunk?.choices?.[0]?.finish_reason === 'stop') {
+            expect(lastChunk).toMatchObject({
+              object: 'chat.completion.chunk',
+              choices: [{
+                delta: {},
+                finish_reason: 'stop'
+              }]
+            });
+          }
+
+          // Verify we tracked some completion tokens
+          expect(totalCompletionTokens).toBeGreaterThan(0);
+          
+          // Clean up and resolve
+          if (!isDone) {
+            cleanup();
+          }
+        });
+    });
+  });
+
+  it('should track tokens correctly in error response', async () => {
+    const response = await request(app)
+      .post('/v1/chat/completions')
+      .set('Authorization', `Bearer ${TEST_SECRET}`)
+      .send({
+        model: 'test-model',
+        messages: [] // Invalid messages array
+      });
+    
+    expect(response.status).toBe(400);
+    expect(response.body).toHaveProperty('error');
+    expect(response.body.error).toBe('Messages array is required and must not be empty');
+    
+    // Make another request to verify token tracking after error
+    const validResponse = await request(app)
+      .post('/v1/chat/completions')
+      .set('Authorization', `Bearer ${TEST_SECRET}`)
+      .send({
+        model: 'test-model',
+        messages: [{ role: 'user', content: 'test' }]
+      });
+    
+    // Verify token tracking still works after error
+    expect(validResponse.body.usage).toMatchObject({
+      prompt_tokens: expect.any(Number),
+      completion_tokens: expect.any(Number),
+      total_tokens: expect.any(Number),
+      completion_tokens_details: {
+        reasoning_tokens: expect.any(Number),
+        accepted_prediction_tokens: expect.any(Number),
+        rejected_prediction_tokens: expect.any(Number)
+      }
+    });
+
+    // Verify token counts are reasonable
+    expect(validResponse.body.usage.prompt_tokens).toBeGreaterThan(0);
+    expect(validResponse.body.usage.completion_tokens).toBeGreaterThan(0);
+    expect(validResponse.body.usage.total_tokens).toBe(
+      validResponse.body.usage.prompt_tokens + validResponse.body.usage.completion_tokens
+    );
+  });
+
+  it('should provide token usage in Vercel AI SDK format', async () => {
+    const response = await request(app)
+      .post('/v1/chat/completions')
+      .set('Authorization', `Bearer ${TEST_SECRET}`)
+      .send({
+        model: 'test-model',
+        messages: [{ role: 'user', content: 'test' }]
+      });
+    
+    expect(response.status).toBe(200);
+    const usage = response.body.usage;
+
+    expect(usage).toMatchObject({
+      prompt_tokens: expect.any(Number),
+      completion_tokens: expect.any(Number),
+      total_tokens: expect.any(Number),
+      completion_tokens_details: {
+        reasoning_tokens: expect.any(Number),
+        accepted_prediction_tokens: expect.any(Number),
+        rejected_prediction_tokens: expect.any(Number)
+      }
+    });
+
+    // Verify token counts are reasonable
+    expect(usage.prompt_tokens).toBeGreaterThan(0);
+    expect(usage.completion_tokens).toBeGreaterThan(0);
+    expect(usage.total_tokens).toBe(
+      usage.prompt_tokens + usage.completion_tokens
+    );
+  });
+});
--- a/src/evals/batch-evals.ts
+++ b/src/evals/batch-evals.ts
@@ -7,8 +7,6 @@ import {GEMINI_API_KEY} from '../config';
 import {z} from 'zod';
 import {AnswerAction, TrackerContext} from "../types";
 import {createGoogleGenerativeAI} from "@ai-sdk/google";
-import {TokenTracker} from "../utils/token-tracker";
-import {ActionTracker} from "../utils/action-tracker";

 const execAsync = promisify(exec);

@@ -184,27 +182,6 @@ async function batchEvaluate(inputFile: string): Promise<void> {
    }
  }

-  async function getResponseStreamingAgent(query: string) {
-    const res = await fetch("http://localhost:3000/chat", {
-      method: "POST",
-      headers: {"Content-Type": "application/json"},
-      body: JSON.stringify({query})
-    })
-    const text = await res.text()
-    return {
-      result: {
-        think: '',
-        action: 'answer',
-        answer: text.split("RESPONSE_START")[1].split("RESPONSE_END")[0].trim(),
-        references: []
-      },
-      context: {
-         tokenTracker: new TokenTracker(),
-         actionTracker: new ActionTracker()
-      }
-    }
-  }
-
  // Calculate and print statistics
  const stats = calculateStats(results, modelName);
  printStats(stats);
@@ -229,4 +206,4 @@ if (require.main === module) {
  batchEvaluate(inputFile).catch(console.error);
 }

-export {batchEvaluate};
+export {batchEvaluate};
--- a/src/server.ts
+++ b/src/server.ts
@@ -2,7 +2,16 @@ import express, {Request, Response, RequestHandler} from 'express';
 import cors from 'cors';
 import {EventEmitter} from 'events';
 import {getResponse} from './agent';
-import {StepAction, StreamMessage, TrackerContext} from './types';
+import {
+  StepAction,
+  StreamMessage,
+  TrackerContext,
+  ChatCompletionRequest,
+  ChatCompletionResponse,
+  ChatCompletionChunk,
+  AnswerAction,
+  TOKEN_CATEGORIES
+} from './types';
 import fs from 'fs/promises';
 import path from 'path';
 import {TokenTracker} from "./utils/token-tracker";
@@ -11,6 +20,9 @@ import {ActionTracker} from "./utils/action-tracker";
 const app = express();
 const port = process.env.PORT || 3000;

+// Get secret from command line args for optional authentication
+const secret = process.argv.find(arg => arg.startsWith('--secret='))?.split('=')[1];
+
 app.use(cors());
 app.use(express.json());

@@ -24,6 +36,278 @@ interface QueryRequest extends Request {
  };
 }

+// OpenAI-compatible chat completions endpoint
+app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
+  // Check authentication if secret is set
+  if (secret) {
+    const authHeader = req.headers.authorization;
+    if (!authHeader || !authHeader.startsWith('Bearer ') || authHeader.split(' ')[1] !== secret) {
+      console.log('[chat/completions] Unauthorized request');
+      res.status(401).json({ error: 'Unauthorized' });
+      return;
+    }
+  }
+
+  // Log request details (excluding sensitive data)
+  console.log('[chat/completions] Request:', {
+    model: req.body.model,
+    stream: req.body.stream,
+    messageCount: req.body.messages?.length,
+    hasAuth: !!req.headers.authorization,
+    requestId: Date.now().toString()
+  });
+
+  const body = req.body as ChatCompletionRequest;
+  if (!body.messages?.length) {
+    return res.status(400).json({ error: 'Messages array is required and must not be empty' });
+  }
+  const lastMessage = body.messages[body.messages.length - 1];
+  if (lastMessage.role !== 'user') {
+    return res.status(400).json({ error: 'Last message must be from user' });
+  }
+
+  const requestId = Date.now().toString();
+  const context: TrackerContext = {
+    tokenTracker: new TokenTracker(),
+    actionTracker: new ActionTracker()
+  };
+
+  // Track prompt tokens for the initial message
+  // Use Vercel's token counting convention - 1 token per message
+  const messageTokens = body.messages.length;
+  context.tokenTracker.trackUsage('agent', messageTokens, TOKEN_CATEGORIES.PROMPT);
+
+  if (body.stream) {
+    res.setHeader('Content-Type', 'text/event-stream');
+    res.setHeader('Cache-Control', 'no-cache');
+    res.setHeader('Connection', 'keep-alive');
+
+    // Send initial chunk with opening think tag
+    const initialChunk: ChatCompletionChunk = {
+      id: requestId,
+      object: 'chat.completion.chunk',
+      created: Math.floor(Date.now() / 1000),
+      model: body.model,
+      system_fingerprint: 'fp_' + requestId,
+      choices: [{
+        index: 0,
+        delta: { role: 'assistant', content: '<think>' },
+        logprobs: null,
+        finish_reason: null
+      }]
+    };
+    res.write(`data: ${JSON.stringify(initialChunk)}\n\n`);
+
+    // Set up progress listener with cleanup
+    const actionListener = (action: any) => {
+      // Track reasoning tokens for each chunk using Vercel's convention
+      const chunkTokens = 1; // Default to 1 token per chunk
+      context.tokenTracker.trackUsage('evaluator', chunkTokens, TOKEN_CATEGORIES.REASONING);
+      
+      // Only send chunk if there's content to send
+      if (action.think) {
+        const chunk: ChatCompletionChunk = {
+          id: requestId,
+          object: 'chat.completion.chunk',
+          created: Math.floor(Date.now() / 1000),
+          model: body.model,
+          system_fingerprint: 'fp_' + requestId,
+          choices: [{
+            index: 0,
+            delta: { content: `<think>${action.think}</think>` },
+            logprobs: null,
+            finish_reason: null
+          }]
+        };
+        const chunkStr = `data: ${JSON.stringify(chunk)}\n\n`;
+        console.log('[chat/completions] Sending chunk:', {
+          id: chunk.id,
+          content: chunk.choices[0].delta.content,
+          finish_reason: chunk.choices[0].finish_reason
+        });
+        res.write(chunkStr);
+      }
+    };
+    context.actionTracker.on('action', actionListener);
+    
+    // Clean up listener on response finish
+    res.on('finish', () => {
+      context.actionTracker.removeListener('action', actionListener);
+    });
+  }
+
+  try {
+    // Track initial query tokens - already tracked above
+    // const queryTokens = Buffer.byteLength(lastMessage.content, 'utf-8');
+    // context.tokenTracker.trackUsage('agent', queryTokens, 'prompt');
+
+    let result;
+    try {
+      ({ result } = await getResponse(lastMessage.content, undefined, undefined, context));
+    } catch (error: any) {
+      // If deduplication fails, retry without it
+      if (error?.response?.status === 402) {
+        // If deduplication fails, retry with maxBadAttempt=3 to skip dedup
+        ({ result } = await getResponse(lastMessage.content, undefined, 3, context));
+      } else {
+        throw error;
+      }
+    }
+    
+    // Track tokens based on action type
+    if (result.action === 'answer') {
+      // Track accepted prediction tokens for the final answer using Vercel's convention
+      const answerTokens = 1; // Default to 1 token per answer
+      context.tokenTracker.trackUsage('evaluator', answerTokens, TOKEN_CATEGORIES.ACCEPTED);
+    } else {
+      // Track rejected prediction tokens for non-answer responses
+      const rejectedTokens = 1; // Default to 1 token per rejected response
+      context.tokenTracker.trackUsage('evaluator', rejectedTokens, TOKEN_CATEGORIES.REJECTED);
+    }
+
+    if (body.stream) {
+      // Send closing think tag
+      const closeThinkChunk: ChatCompletionChunk = {
+        id: requestId,
+        object: 'chat.completion.chunk',
+        created: Math.floor(Date.now() / 1000),
+        model: body.model,
+        system_fingerprint: 'fp_' + requestId,
+        choices: [{
+          index: 0,
+          delta: { content: '</think>' },
+          logprobs: null,
+          finish_reason: null
+        }]
+      };
+      res.write(`data: ${JSON.stringify(closeThinkChunk)}\n\n`);
+
+      // Send final answer as separate chunk
+      const answerChunk: ChatCompletionChunk = {
+        id: requestId,
+        object: 'chat.completion.chunk',
+        created: Math.floor(Date.now() / 1000),
+        model: body.model,
+        system_fingerprint: 'fp_' + requestId,
+        choices: [{
+          index: 0,
+          delta: { content: result.action === 'answer' ? (result as AnswerAction).answer : result.think },
+          logprobs: null,
+          finish_reason: 'stop'
+        }]
+      };
+      res.write(`data: ${JSON.stringify(answerChunk)}\n\n`);
+      res.end();
+    } else {
+      const usage = context.tokenTracker.getUsageDetails();
+      const response: ChatCompletionResponse = {
+        id: requestId,
+        object: 'chat.completion',
+        created: Math.floor(Date.now() / 1000),
+        model: body.model,
+        system_fingerprint: 'fp_' + requestId,
+        choices: [{
+          index: 0,
+          message: {
+            role: 'assistant',
+            content: result.action === 'answer' ? (result as AnswerAction).answer : result.think
+          },
+          logprobs: null,
+          finish_reason: 'stop'
+        }],
+        usage
+      };
+      
+      // Log final response (excluding full content for brevity)
+      console.log('[chat/completions] Response:', {
+        id: response.id,
+        status: 200,
+        contentLength: response.choices[0].message.content.length,
+        usage: response.usage
+      });
+      
+      res.json(response);
+    }
+  } catch (error: any) {
+    // Log error details
+    console.error('[chat/completions] Error:', {
+      message: error?.message || 'An error occurred',
+      stack: error?.stack,
+      type: error?.constructor?.name,
+      requestId
+    });
+
+    // Track error as rejected tokens with Vercel token counting
+    const errorMessage = error?.message || 'An error occurred';
+    // Default to 1 token for errors as per Vercel AI SDK convention
+    const errorTokens = 1;
+    context.tokenTracker.trackUsage('evaluator', errorTokens, TOKEN_CATEGORIES.REJECTED);
+
+    // Clean up event listeners
+    context.actionTracker.removeAllListeners('action');
+
+    // Get token usage in OpenAI API format
+    const usage = context.tokenTracker.getUsageDetails();
+
+    if (body.stream && res.headersSent) {
+      // For streaming responses that have already started, send error as a chunk
+      // First send closing think tag if we're in the middle of thinking
+      const closeThinkChunk: ChatCompletionChunk = {
+        id: requestId,
+        object: 'chat.completion.chunk',
+        created: Math.floor(Date.now() / 1000),
+        model: body.model,
+        system_fingerprint: 'fp_' + requestId,
+        choices: [{
+          index: 0,
+          delta: { content: '</think>' },
+          logprobs: null,
+          finish_reason: null
+        }]
+      };
+      res.write(`data: ${JSON.stringify(closeThinkChunk)}\n\n`);
+
+      // Track error token and send error message
+      context.tokenTracker.trackUsage('evaluator', 1, TOKEN_CATEGORIES.REJECTED);
+      const errorChunk: ChatCompletionChunk = {
+        id: requestId,
+        object: 'chat.completion.chunk',
+        created: Math.floor(Date.now() / 1000),
+        model: body.model,
+        system_fingerprint: 'fp_' + requestId,
+        choices: [{
+          index: 0,
+          delta: { content: errorMessage },
+          logprobs: null,
+          finish_reason: 'stop'
+        }]
+      };
+      res.write(`data: ${JSON.stringify(errorChunk)}\n\n`);
+      res.end();
+    } else {
+      // For non-streaming or not-yet-started responses, send error as JSON
+      const response: ChatCompletionResponse = {
+        id: requestId,
+        object: 'chat.completion',
+        created: Math.floor(Date.now() / 1000),
+        model: body.model,
+        system_fingerprint: 'fp_' + requestId,
+        choices: [{
+          index: 0,
+          message: {
+            role: 'assistant',
+            content: `Error: ${errorMessage}`
+          },
+          logprobs: null,
+          finish_reason: 'stop'
+        }],
+        usage
+      };
+      res.json(response);
+    }
+  }
+}) as RequestHandler);
+
 interface StreamResponse extends Response {
  write: (chunk: string) => boolean;
 }
@@ -185,8 +469,16 @@ app.get('/api/v1/task/:requestId', (async (req: Request, res: Response) => {
  }
 }) as RequestHandler);

-app.listen(port, () => {
-  console.log(`Server running at http://localhost:${port}`);
-});
+// Export server startup function for better testing
+export function startServer() {
+  return app.listen(port, () => {
+    console.log(`Server running at http://localhost:${port}`);
+  });
+}
+
+// Start server if running directly
+if (process.env.NODE_ENV !== 'test') {
+  startServer();
+}

 export default app;
--- a/src/tools/jina-dedup.ts
+++ b/src/tools/jina-dedup.ts
@@ -1,13 +1,25 @@
-import axios from 'axios';
+import axios, { AxiosError } from 'axios';
 import { TokenTracker } from "../utils/token-tracker";
 import {JINA_API_KEY} from "../config";

 const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
 const SIMILARITY_THRESHOLD = 0.93; // Adjustable threshold for cosine similarity

+const JINA_API_CONFIG = {
+  MODEL: 'jina-embeddings-v3',
+  TASK: 'text-matching',
+  DIMENSIONS: 1024,
+  EMBEDDING_TYPE: 'float',
+  LATE_CHUNKING: false
+} as const;
+
 // Types for Jina API
 interface JinaEmbeddingRequest {
  model: string;
+  task: string;
+  late_chunking: boolean;
+  dimensions: number;
+  embedding_type: string;
  input: string[];
 }

@@ -41,7 +53,11 @@ async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][
  }

  const request: JinaEmbeddingRequest = {
-    model: 'jina-embeddings-v3',
+    model: JINA_API_CONFIG.MODEL,
+    task: JINA_API_CONFIG.TASK,
+    late_chunking: JINA_API_CONFIG.LATE_CHUNKING,
+    dimensions: JINA_API_CONFIG.DIMENSIONS,
+    embedding_type: JINA_API_CONFIG.EMBEDDING_TYPE,
    input: queries
  };

@@ -57,6 +73,15 @@ async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][
      }
    );

+    // Validate response format
+    if (!response.data.data || response.data.data.length !== queries.length) {
+      console.error('Invalid response from Jina API:', response.data);
+      return {
+        embeddings: [],
+        tokens: 0
+      };
+    }
+
    // Sort embeddings by index to maintain original order
    const embeddings = response.data.data
      .sort((a, b) => a.index - b.index)
@@ -68,6 +93,12 @@ async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][
    };
  } catch (error) {
    console.error('Error getting embeddings from Jina:', error);
+    if (error instanceof AxiosError && error.response?.status === 402) {
+      return {
+        embeddings: [],
+        tokens: 0
+      };
+    }
    throw error;
  }
 }
@@ -91,6 +122,15 @@ export async function dedupQueries(
    const allQueries = [...newQueries, ...existingQueries];
    const { embeddings: allEmbeddings, tokens } = await getEmbeddings(allQueries);

+    // If embeddings is empty (due to 402 error), return all new queries
+    if (!allEmbeddings.length) {
+      console.log('Dedup (no embeddings):', newQueries);
+      return {
+        unique_queries: newQueries,
+        tokens: 0
+      };
+    }
+
    // Split embeddings back into new and existing
    const newEmbeddings = allEmbeddings.slice(0, newQueries.length);
    const existingEmbeddings = allEmbeddings.slice(newQueries.length);
--- a/src/types.ts
+++ b/src/types.ts
@@ -31,9 +31,24 @@ export type VisitAction = BaseAction & {
 export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitAction;

 // Response Types
+export const TOKEN_CATEGORIES = {
+  PROMPT: 'prompt',
+  REASONING: 'reasoning',
+  ACCEPTED: 'accepted',
+  REJECTED: 'rejected'
+} as const;
+
+export type TokenCategory = typeof TOKEN_CATEGORIES[keyof typeof TOKEN_CATEGORIES];
+
+// Following Vercel AI SDK's token counting interface
 export interface TokenUsage {
  tool: string;
  tokens: number;
+  category?: TokenCategory;
+  // Following Vercel AI SDK's token counting interface
+  prompt_tokens?: number;
+  completion_tokens?: number;
+  total_tokens?: number;
 }

 export interface SearchResponse {
@@ -144,6 +159,60 @@ export interface StreamMessage {
  };
 }

+// OpenAI API Types
+export interface ChatCompletionRequest {
+  model: string;
+  messages: Array<{
+    role: string;
+    content: string;
+  }>;
+  stream?: boolean;
+}
+
+export interface ChatCompletionResponse {
+  id: string;
+  object: 'chat.completion';
+  created: number;
+  model: string;
+  system_fingerprint: string;
+  choices: Array<{
+    index: number;
+    message: {
+      role: 'assistant';
+      content: string;
+    };
+    logprobs: null;
+    finish_reason: 'stop';
+  }>;
+  usage: {
+    prompt_tokens: number;
+    completion_tokens: number;
+    total_tokens: number;
+    completion_tokens_details?: {
+      reasoning_tokens: number;
+      accepted_prediction_tokens: number;
+      rejected_prediction_tokens: number;
+    };
+  };
+}
+
+export interface ChatCompletionChunk {
+  id: string;
+  object: 'chat.completion.chunk';
+  created: number;
+  model: string;
+  system_fingerprint: string;
+  choices: Array<{
+    index: number;
+    delta: {
+      role?: 'assistant';
+      content?: string;
+    };
+    logprobs: null;
+    finish_reason: null | 'stop';
+  }>;
+}
+
 // Tracker Types
 import { TokenTracker } from './utils/token-tracker';
 import { ActionTracker } from './utils/action-tracker';
--- a/src/utils/token-tracker.ts
+++ b/src/utils/token-tracker.ts
@@ -1,6 +1,6 @@
 import { EventEmitter } from 'events';

-import { TokenUsage } from '../types';
+import { TokenUsage, TokenCategory } from '../types';

 export class TokenTracker extends EventEmitter {
  private usages: TokenUsage[] = [];
@@ -11,15 +11,16 @@ export class TokenTracker extends EventEmitter {
    this.budget = budget;
  }

-  trackUsage(tool: string, tokens: number) {
+  trackUsage(tool: string, tokens: number, category?: TokenCategory) {
    const currentTotal = this.getTotalUsage();
    if (this.budget && currentTotal + tokens > this.budget) {
      console.error(`Token budget exceeded: ${currentTotal + tokens} > ${this.budget}`);
    }
    // Only track usage if we're within budget
    if (!this.budget || currentTotal + tokens <= this.budget) {
-      this.usages.push({ tool, tokens });
-      this.emit('usage', { tool, tokens });
+      const usage = { tool, tokens, category };
+      this.usages.push(usage);
+      this.emit('usage', usage);
    }
  }

@@ -34,6 +35,41 @@ export class TokenTracker extends EventEmitter {
    }, {} as Record<string, number>);
  }

+  getUsageDetails(): {
+    prompt_tokens: number;
+    completion_tokens: number;
+    total_tokens: number;
+    completion_tokens_details?: {
+      reasoning_tokens: number;
+      accepted_prediction_tokens: number;
+      rejected_prediction_tokens: number;
+    };
+  } {
+    const categoryBreakdown = this.usages.reduce((acc, { tokens, category }) => {
+      if (category) {
+        acc[category] = (acc[category] || 0) + tokens;
+      }
+      return acc;
+    }, {} as Record<string, number>);
+
+    const prompt_tokens = categoryBreakdown.prompt || 0;
+    const completion_tokens = 
+      (categoryBreakdown.reasoning || 0) + 
+      (categoryBreakdown.accepted || 0) + 
+      (categoryBreakdown.rejected || 0);
+
+    return {
+      prompt_tokens,
+      completion_tokens,
+      total_tokens: prompt_tokens + completion_tokens,
+      completion_tokens_details: {
+        reasoning_tokens: categoryBreakdown.reasoning || 0,
+        accepted_prediction_tokens: categoryBreakdown.accepted || 0,
+        rejected_prediction_tokens: categoryBreakdown.rejected || 0
+      }
+    };
+  }
+
  printSummary() {
    const breakdown = this.getUsageBreakdown();
    console.log('Token Usage Summary:', {