feat: add OpenAI-compatible chat completions endpoint (#48)

2026-03-22 07:29:35 +08:00 · 2025-02-09 09:25:01 +08:00
parent a9008ae0dd
commit 39579d560e
9 changed files with 1061 additions and 44 deletions
--- a/README.md
+++ b/README.md
@@ -107,11 +107,104 @@ export DEFAULT_MODEL_NAME=qwen2.5-7b  # your local llm model name
 Start the server:
 ```bash
 # Without authentication
 npm run serve
 # With authentication (clients must provide this secret as Bearer token)
 npm run serve -- --secret=your_secret_token
 ```
 The server will start on http://localhost:3000 with the following endpoints:
 ### POST /v1/chat/completions
 OpenAI-compatible chat completions endpoint:
 ```bash
 # Without authentication
 curl http://localhost:3000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o-mini",
    "messages": [
      {
        "role": "user",
        "content": "Hello!"
      }
    ]
  }'
 # With authentication (when server is started with --secret)
 curl http://localhost:3000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer your_secret_token" \
  -d '{
    "model": "gpt-4o-mini",
    "messages": [
      {
        "role": "user",
        "content": "Hello!"
      }
    ],
    "stream": true
  }'
 ```
 Response format:
 ```json
 {
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1677652288,
  "model": "gpt-4o-mini",
  "system_fingerprint": "fp_44709d6fcb",
  "choices": [{
    "index": 0,
    "message": {
      "role": "assistant",
      "content": "YOUR FINAL ANSWER"
    },
    "logprobs": null,
    "finish_reason": "stop"
  }],
  "usage": {
    "prompt_tokens": 9,
    "completion_tokens": 12,
    "total_tokens": 21,
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0
    }
  }
 }
 ```
 For streaming responses (stream: true), the server sends chunks in this format:
 ```json
 {
  "id": "chatcmpl-123",
  "object": "chat.completion.chunk",
  "created": 1694268190,
  "model": "gpt-4o-mini",
  "system_fingerprint": "fp_44709d6fcb",
  "choices": [{
    "index": 0,
    "delta": {
      "content": "..."
    },
    "logprobs": null,
    "finish_reason": null
  }]
 }
 ```
 Note: In streaming responses, the thinking steps are wrapped in `<think>` XML tags and are followed by the final answer:
 ```
 <think>
 [thinking steps...]
 </think>
 [final answer]
 ```
 ### POST /api/v1/query
 Submit a query to be answered:
 ```bash
@@ -248,4 +341,4 @@ It should not be surprised that plain `gemini-2.0-flash` has a 0% pass rate, as
 | Average Tokens | 428 | 59,408                                          |
 | Median Tokens | 434 | 16,001                                          |
 | Maximum Tokens | 463 | 347,222                                         |
-| Minimum Tokens | 374 | 5,594                                           |
+| Minimum Tokens | 374 | 5,594                                           |
--- a/package-lock.json
+++ b/package-lock.json
@@ -11,7 +11,7 @@
      "dependencies": {
        "@ai-sdk/google": "^1.0.0",
        "@ai-sdk/openai": "^1.1.9",
-        "ai": "^4.1.21",
+        "ai": "^4.1.26",
        "axios": "^1.7.9",
        "commander": "^13.1.0",
        "cors": "^2.8.5",
@@ -29,10 +29,12 @@
        "@types/jest": "^29.5.14",
        "@types/node": "^22.10.10",
        "@types/node-fetch": "^2.6.12",
        "@types/supertest": "^6.0.2",
        "@typescript-eslint/eslint-plugin": "^7.0.1",
        "@typescript-eslint/parser": "^7.0.1",
        "eslint": "^8.56.0",
        "jest": "^29.7.0",
        "supertest": "^7.0.0",
        "ts-jest": "^29.2.5",
        "ts-node": "^10.9.2",
        "typescript": "^5.7.3"
@@ -1552,6 +1554,13 @@
        "@types/node": "*"
      }
    },
    "node_modules/@types/cookiejar": {
      "version": "2.1.5",
      "resolved": "https://registry.npmjs.org/@types/cookiejar/-/cookiejar-2.1.5.tgz",
      "integrity": "sha512-he+DHOWReW0nghN24E1WUqM0efK4kI9oTqDm6XmK8ZPe2djZ90BSNdGnIyCLzCPw7/pogPlGbzI2wHGGmi4O/Q==",
      "dev": true,
      "license": "MIT"
    },
    "node_modules/@types/cors": {
      "version": "2.8.17",
      "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz",
@@ -1649,6 +1658,13 @@
        "pretty-format": "^29.0.0"
      }
    },
    "node_modules/@types/methods": {
      "version": "1.1.4",
      "resolved": "https://registry.npmjs.org/@types/methods/-/methods-1.1.4.tgz",
      "integrity": "sha512-ymXWVrDiCxTBE3+RIrrP533E70eA+9qu7zdWoHuOmGujkYtzf4HQF96b8nwHLqhuf4ykX61IGRIB38CC6/sImQ==",
      "dev": true,
      "license": "MIT"
    },
    "node_modules/@types/mime": {
      "version": "1.3.5",
      "resolved": "https://registry.npmjs.org/@types/mime/-/mime-1.3.5.tgz",
@@ -1721,6 +1737,30 @@
      "dev": true,
      "license": "MIT"
    },
    "node_modules/@types/superagent": {
      "version": "8.1.9",
      "resolved": "https://registry.npmjs.org/@types/superagent/-/superagent-8.1.9.tgz",
      "integrity": "sha512-pTVjI73witn+9ILmoJdajHGW2jkSaOzhiFYF1Rd3EQ94kymLqB9PjD9ISg7WaALC7+dCHT0FGe9T2LktLq/3GQ==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "@types/cookiejar": "^2.1.5",
        "@types/methods": "^1.1.4",
        "@types/node": "*",
        "form-data": "^4.0.0"
      }
    },
    "node_modules/@types/supertest": {
      "version": "6.0.2",
      "resolved": "https://registry.npmjs.org/@types/supertest/-/supertest-6.0.2.tgz",
      "integrity": "sha512-137ypx2lk/wTQbW6An6safu9hXmajAifU/s7szAHLN/FeIm5w7yR0Wkl9fdJMRSHwOn4HLAI0DaB2TOORuhPDg==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "@types/methods": "^1.1.4",
        "@types/superagent": "^8.1.0"
      }
    },
    "node_modules/@types/yargs": {
      "version": "17.0.33",
      "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.33.tgz",
@@ -1988,9 +2028,9 @@
      }
    },
    "node_modules/ai": {
-      "version": "4.1.21",
+      "version": "4.1.26",
-      "resolved": "https://registry.npmjs.org/ai/-/ai-4.1.21.tgz",
+      "resolved": "https://registry.npmjs.org/ai/-/ai-4.1.26.tgz",
-      "integrity": "sha512-w1v3T/fisoD1qRFz7CS7nE7mggeaxEpkEvWvVUWRem9lERgwh670OPhMPUSrdzTtCjMkOTrNkaecKoYAwvqM/A==",
+      "integrity": "sha512-Mww6mJbGwmMK0qAKR67WfVK1WyaUjfFlPZ2rhUUmDns3WhI+DVgMM7gLmuo0rA+I5qq69g7YE1OCgUwMRKKjMw==",
      "license": "Apache-2.0",
      "dependencies": {
        "@ai-sdk/provider": "1.0.7",
@@ -2132,6 +2172,13 @@
        "node": ">=8"
      }
    },
    "node_modules/asap": {
      "version": "2.0.6",
      "resolved": "https://registry.npmjs.org/asap/-/asap-2.0.6.tgz",
      "integrity": "sha512-BSHWgDSAiKs50o2Re8ppvp3seVHXSRM44cdSsT9FfNEUUZLOGWVCsiWaRPWM1Znn+mqZ1OfVZ3z3DWEzSp7hRA==",
      "dev": true,
      "license": "MIT"
    },
    "node_modules/async": {
      "version": "3.2.6",
      "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz",
@@ -2629,6 +2676,16 @@
        "node": ">=18"
      }
    },
    "node_modules/component-emitter": {
      "version": "1.3.1",
      "resolved": "https://registry.npmjs.org/component-emitter/-/component-emitter-1.3.1.tgz",
      "integrity": "sha512-T0+barUSQRTUQASh8bx02dl+DhF54GtIDY13Y3m9oWTklKbb3Wv974meRpeZ3lp1JpLVECWWNHC4vaG2XHXouQ==",
      "dev": true,
      "license": "MIT",
      "funding": {
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
    "node_modules/concat-map": {
      "version": "0.0.1",
      "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
@@ -2679,6 +2736,13 @@
      "integrity": "sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==",
      "license": "MIT"
    },
    "node_modules/cookiejar": {
      "version": "2.1.4",
      "resolved": "https://registry.npmjs.org/cookiejar/-/cookiejar-2.1.4.tgz",
      "integrity": "sha512-LDx6oHrK+PhzLKJU9j5S7/Y3jM/mUHvD/DeI1WQmJn652iPC5Y4TBzC9l+5OMOXlyTTA+SmVUPm0HQUwpD5Jqw==",
      "dev": true,
      "license": "MIT"
    },
    "node_modules/cors": {
      "version": "2.8.5",
      "resolved": "https://registry.npmjs.org/cors/-/cors-2.8.5.tgz",
@@ -2842,6 +2906,17 @@
        "node": ">=8"
      }
    },
    "node_modules/dezalgo": {
      "version": "1.0.4",
      "resolved": "https://registry.npmjs.org/dezalgo/-/dezalgo-1.0.4.tgz",
      "integrity": "sha512-rXSP0bf+5n0Qonsb+SVVfNfIsimO4HEtmnIpPHY8Q1UCzKlQrDMfdobr8nJOOsRgWCyMRqeSBQzmWUMq7zvVig==",
      "dev": true,
      "license": "ISC",
      "dependencies": {
        "asap": "^2.0.0",
        "wrappy": "1"
      }
    },
    "node_modules/diff": {
      "version": "4.0.2",
      "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz",
@@ -3429,6 +3504,13 @@
      "dev": true,
      "license": "MIT"
    },
    "node_modules/fast-safe-stringify": {
      "version": "2.1.1",
      "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.1.1.tgz",
      "integrity": "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA==",
      "dev": true,
      "license": "MIT"
    },
    "node_modules/fastq": {
      "version": "1.19.0",
      "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.0.tgz",
@@ -3639,6 +3721,21 @@
        "node": ">=12.20.0"
      }
    },
    "node_modules/formidable": {
      "version": "3.5.2",
      "resolved": "https://registry.npmjs.org/formidable/-/formidable-3.5.2.tgz",
      "integrity": "sha512-Jqc1btCy3QzRbJaICGwKcBfGWuLADRerLzDqi2NwSt/UkXLsHJw2TVResiaoBufHVHy9aSgClOHCeJsSsFLTbg==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "dezalgo": "^1.0.4",
        "hexoid": "^2.0.0",
        "once": "^1.4.0"
      },
      "funding": {
        "url": "https://ko-fi.com/tunnckoCore/commissions"
      }
    },
    "node_modules/forwarded": {
      "version": "0.2.0",
      "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz",
@@ -3924,6 +4021,16 @@
        "node": ">= 0.4"
      }
    },
    "node_modules/hexoid": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/hexoid/-/hexoid-2.0.0.tgz",
      "integrity": "sha512-qlspKUK7IlSQv2o+5I7yhUd7TxlOG2Vr5LTa3ve2XSNVKAL/n/u/7KLvKmFNimomDIKvZFXWHv0T12mv7rT8Aw==",
      "dev": true,
      "license": "MIT",
      "engines": {
        "node": ">=8"
      }
    },
    "node_modules/html-entities": {
      "version": "2.5.2",
      "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.5.2.tgz",
@@ -6360,6 +6467,54 @@
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
    "node_modules/superagent": {
      "version": "9.0.2",
      "resolved": "https://registry.npmjs.org/superagent/-/superagent-9.0.2.tgz",
      "integrity": "sha512-xuW7dzkUpcJq7QnhOsnNUgtYp3xRwpt2F7abdRYIpCsAt0hhUqia0EdxyXZQQpNmGtsCzYHryaKSV3q3GJnq7w==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "component-emitter": "^1.3.0",
        "cookiejar": "^2.1.4",
        "debug": "^4.3.4",
        "fast-safe-stringify": "^2.1.1",
        "form-data": "^4.0.0",
        "formidable": "^3.5.1",
        "methods": "^1.1.2",
        "mime": "2.6.0",
        "qs": "^6.11.0"
      },
      "engines": {
        "node": ">=14.18.0"
      }
    },
    "node_modules/superagent/node_modules/mime": {
      "version": "2.6.0",
      "resolved": "https://registry.npmjs.org/mime/-/mime-2.6.0.tgz",
      "integrity": "sha512-USPkMeET31rOMiarsBNIHZKLGgvKc/LrjofAnBlOttf5ajRvqiRA8QsenbcooctK6d6Ts6aqZXBA+XbkKthiQg==",
      "dev": true,
      "license": "MIT",
      "bin": {
        "mime": "cli.js"
      },
      "engines": {
        "node": ">=4.0.0"
      }
    },
    "node_modules/supertest": {
      "version": "7.0.0",
      "resolved": "https://registry.npmjs.org/supertest/-/supertest-7.0.0.tgz",
      "integrity": "sha512-qlsr7fIC0lSddmA3tzojvzubYxvlGtzumcdHgPwbFWMISQwL22MhM2Y3LNt+6w9Yyx7559VW5ab70dgphm8qQA==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "methods": "^1.1.2",
        "superagent": "^9.0.1"
      },
      "engines": {
        "node": ">=14.18.0"
      }
    },
    "node_modules/supports-color": {
      "version": "7.2.0",
      "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
--- a/package.json
+++ b/package.json
@@ -28,28 +28,30 @@
  "dependencies": {
    "@ai-sdk/google": "^1.0.0",
    "@ai-sdk/openai": "^1.1.9",
-    "ai": "^4.1.21",
+    "ai": "^4.1.26",
    "axios": "^1.7.9",
    "commander": "^13.1.0",
    "cors": "^2.8.5",
    "dotenv": "^16.4.7",
    "duck-duck-scrape": "^2.2.7",
    "express": "^4.21.2",
    "node-fetch": "^3.3.2",
    "undici": "^7.3.0",
-    "zod": "^3.22.4",
+    "zod": "^3.22.4"
    "dotenv": "^16.4.7"
  },
  "devDependencies": {
    "@types/commander": "^2.12.0",
    "@types/cors": "^2.8.17",
    "@types/express": "^5.0.0",
    "@types/node-fetch": "^2.6.12",
    "@types/commander": "^2.12.0",
    "@types/jest": "^29.5.14",
    "@types/node": "^22.10.10",
    "@types/node-fetch": "^2.6.12",
    "@types/supertest": "^6.0.2",
    "@typescript-eslint/eslint-plugin": "^7.0.1",
    "@typescript-eslint/parser": "^7.0.1",
    "eslint": "^8.56.0",
    "jest": "^29.7.0",
    "supertest": "^7.0.0",
    "ts-jest": "^29.2.5",
    "ts-node": "^10.9.2",
    "typescript": "^5.7.3"
--- a/src/tests/server.test.ts
+++ b/src/tests/server.test.ts
@@ -0,0 +1,353 @@
 import request from 'supertest';
 import { EventEmitter } from 'events';
 import type { Express } from 'express';
 const TEST_SECRET = 'test-secret';
 let app: Express;
 describe('/v1/chat/completions', () => {
  // Raise the Jest timeout for every test in this suite.
  jest.setTimeout(120000);
  /**
   * Per-test setup: puts `--secret=<TEST_SECRET>` on the command line and then
   * imports the server module, storing its default export in `app`.
   */
  beforeEach(async () => {
    // NODE_ENV=test prevents the server from auto-starting on import
    process.env.NODE_ENV = 'test';
    // Drop a stale --secret flag (if one is present) before adding ours
    const staleFlagAt = process.argv.findIndex(flag => flag.startsWith('--secret='));
    if (staleFlagAt >= 0) {
      process.argv.splice(staleFlagAt, 1);
    }
    process.argv.push(`--secret=${TEST_SECRET}`);
    // Import only after argv is prepared; afterEach resets the module cache between tests
    const serverModule = await import('../server');
    app = serverModule.default;
  });
  /**
   * Per-test teardown: clears listeners, removes the `--secret` flag from
   * argv, waits briefly for pending work, and resets Jest's module cache.
   */
  afterEach(async () => {
    // NOTE(review): these calls target EventEmitter.prototype itself, not an
    // emitter instance — confirm that this is the intended cleanup.
    const proto = EventEmitter.prototype;
    proto.removeAllListeners();
    const raisedLimit = proto.getMaxListeners() + 1;
    proto.setMaxListeners(raisedLimit);
    // Take the test secret back off the command line
    const flagAt = process.argv.findIndex(flag => flag.startsWith('--secret='));
    if (flagAt >= 0) {
      process.argv.splice(flagAt, 1);
    }
    // Give pending promises half a second to settle
    await new Promise(done => setTimeout(done, 500));
    // Ensure the next test imports a fresh server module
    jest.resetModules();
  });
  // The server is loaded with a secret in beforeEach; a request that sends no
  // Authorization header must be answered with 401.
  it('should require authentication when secret is set', async () => {
    const payload = {
      model: 'test-model',
      messages: [{ role: 'user', content: 'test' }]
    };
    const { status } = await request(app)
      .post('/v1/chat/completions')
      .send(payload);
    expect(status).toBe(401);
  });
  // With no --secret flag on the command line the endpoint must accept
  // unauthenticated requests.
  it('should allow requests without auth when no secret is set', async () => {
    // Remove secret for this test
    const secretIndex = process.argv.findIndex(arg => arg.startsWith('--secret='));
    if (secretIndex !== -1) {
      process.argv.splice(secretIndex, 1);
    }
    // beforeEach has already imported the server while the secret was present,
    // so the module is cached. Drop the cache, otherwise the import below
    // returns that same secret-protected instance.
    jest.resetModules();
    // Reload server module without secret
    const { default: serverModule } = await import('../server');
    app = serverModule;
    const response = await request(app)
      .post('/v1/chat/completions')
      .send({
        model: 'test-model',
        messages: [{ role: 'user', content: 'test' }]
      });
    expect(response.status).toBe(200);
  });
  // The only message carries the "developer" role, so the last message is not
  // from the user and the endpoint must answer 400.
  it('should reject requests without user message', async () => {
    const developerOnly = {
      model: 'test-model',
      messages: [{ role: 'developer', content: 'test' }]
    };
    const { status, body } = await request(app)
      .post('/v1/chat/completions')
      .set('Authorization', `Bearer ${TEST_SECRET}`)
      .send(developerOnly);
    expect(status).toBe(400);
    expect(body.error).toBe('Last message must be from user');
  });
  // An authenticated, non-streaming request must yield a chat.completion
  // object whose first choice is an assistant message.
  it('should handle non-streaming request', async () => {
    const expectedShape = {
      object: 'chat.completion',
      choices: [{
        message: {
          role: 'assistant'
        }
      }]
    };
    const payload = {
      model: 'test-model',
      messages: [{ role: 'user', content: 'test' }]
    };
    const { status, body } = await request(app)
      .post('/v1/chat/completions')
      .set('Authorization', `Bearer ${TEST_SECRET}`)
      .send(payload);
    expect(status).toBe(200);
    expect(body).toMatchObject(expectedShape);
  });
  /**
   * Sends a non-streaming request, waits until a 'usage' event has been
   * emitted somewhere in the process, and then checks the `usage` block of
   * the response.
   *
   * `EventEmitter.prototype.emit` is temporarily wrapped to observe the event
   * (presumably emitted by the token tracker — confirm). The wrapper is
   * always removed in `finally`, so a failed request or assertion cannot
   * leak the patched `emit` into later tests.
   */
  it('should track tokens correctly in non-streaming response', async () => {
    const emitterProto = EventEmitter.prototype;
    const originalEmit = emitterProto.emit;
    try {
      // Resolves one tick after the first 'usage' event is observed
      const tokenTrackingPromise = new Promise<void>((resolve) => {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- mirrors EventEmitter#emit's own rest parameter
        emitterProto.emit = function(event: string, ...args: any[]) {
          if (event === 'usage') {
            // Wait for next tick to ensure all token tracking is complete
            process.nextTick(() => {
              emitterProto.emit = originalEmit;
              resolve();
            });
          }
          return originalEmit.apply(this, [event, ...args]);
        };
      });
      const response = await request(app)
        .post('/v1/chat/completions')
        .set('Authorization', `Bearer ${TEST_SECRET}`)
        .send({
          model: 'test-model',
          messages: [{ role: 'user', content: 'test' }]
        });
      // Wait for token tracking to complete
      await tokenTrackingPromise;
      expect(response.body.usage).toMatchObject({
        prompt_tokens: expect.any(Number),
        completion_tokens: expect.any(Number),
        total_tokens: expect.any(Number),
        completion_tokens_details: {
          reasoning_tokens: expect.any(Number),
          accepted_prediction_tokens: expect.any(Number),
          rejected_prediction_tokens: expect.any(Number)
        }
      });
      // Verify token counts are reasonable
      expect(response.body.usage.prompt_tokens).toBeGreaterThan(0);
      expect(response.body.usage.completion_tokens).toBeGreaterThan(0);
      expect(response.body.usage.total_tokens).toBe(
        response.body.usage.prompt_tokens + response.body.usage.completion_tokens
      );
    } finally {
      // Restore the real emit on every path (success, request failure, failed assertion)
      emitterProto.emit = originalEmit;
    }
  });
  /**
   * Streams a completion, buffers the raw event-stream body, and verifies the
   * chunk sequence: an initial assistant-role chunk, content chunks, and (if
   * present) a final chunk with finish_reason 'stop'.
   *
   * The test promise is settled exactly once through `finish`, so a timeout,
   * a stream error, or a failed assertion rejects the test instead of being
   * swallowed or reported as a pass.
   */
  it('should handle streaming request and track tokens correctly', async () => {
    return new Promise<void>((resolve, reject) => {
      let settled = false;
      let totalCompletionTokens = 0;
      // Settle once: no argument resolves, any argument rejects. Always clears the watchdog.
      const finish = (failure?: unknown) => {
        if (settled) return;
        settled = true;
        clearTimeout(timeoutHandle);
        if (failure === undefined) {
          resolve();
        } else {
          reject(failure instanceof Error ? failure : new Error(String(failure)));
        }
      };
      const timeoutHandle = setTimeout(() => finish(new Error('Test timed out')), 30000);
      request(app)
        .post('/v1/chat/completions')
        .set('Authorization', `Bearer ${TEST_SECRET}`)
        .send({
          model: 'test-model',
          messages: [{ role: 'user', content: 'test' }],
          stream: true
        })
        .buffer(true)
        .parse((res, callback) => {
          // Custom parser: collect the raw stream text so the chunks can be split below
          const response = res as unknown as {
            on(event: 'data', listener: (chunk: Buffer) => void): void;
            on(event: 'end', listener: () => void): void;
            on(event: 'error', listener: (err: Error) => void): void;
          };
          let responseData = '';
          response.on('error', (err) => {
            // A broken stream is a test failure, not a pass
            finish(err);
            callback(err, null);
          });
          response.on('data', (chunk) => {
            responseData += chunk.toString();
          });
          response.on('end', () => {
            try {
              callback(null, responseData);
            } catch (err) {
              finish(err);
            }
          });
        })
        .end((err, res) => {
          if (err) {
            finish(err);
            return;
          }
          try {
            expect(res.status).toBe(200);
            expect(res.headers['content-type']).toBe('text/event-stream');
            // Verify stream format and content
            const responseText = res.body as string;
            const prefix = 'data: ';
            const chunks = responseText
              .split('\n\n')
              .filter((line: string) => line.startsWith(prefix))
              .map((line: string) => line.slice(prefix.length))
              // Skip an OpenAI-style "[DONE]" sentinel if the server sends one; it is not JSON
              .filter((payload: string) => payload.trim() !== '[DONE]')
              .map((payload: string) => JSON.parse(payload));
            // Process all chunks
            expect(chunks.length).toBeGreaterThan(0);
            // Verify initial chunk format
            expect(chunks[0]).toMatchObject({
              id: expect.any(String),
              object: 'chat.completion.chunk',
              choices: [{
                index: 0,
                delta: { role: 'assistant' },
                logprobs: null,
                finish_reason: null
              }]
            });
            // Verify content chunks have content
            chunks.slice(1).forEach(chunk => {
              const content = chunk.choices[0].delta.content;
              if (content && content.trim()) {
                totalCompletionTokens += 1; // Count 1 token per chunk as per Vercel convention
              }
              expect(chunk).toMatchObject({
                object: 'chat.completion.chunk',
                choices: [{
                  delta: expect.objectContaining({
                    content: expect.any(String)
                  })
                }]
              });
            });
            // Verify final chunk format if present
            const lastChunk = chunks[chunks.length - 1];
            if (lastChunk?.choices?.[0]?.finish_reason === 'stop') {
              expect(lastChunk).toMatchObject({
                object: 'chat.completion.chunk',
                choices: [{
                  delta: {},
                  finish_reason: 'stop'
                }]
              });
            }
            // Verify we tracked some completion tokens
            expect(totalCompletionTokens).toBeGreaterThan(0);
            finish();
          } catch (assertionError) {
            // expect() throws inside this callback; route it into the test promise
            finish(assertionError);
          }
        });
    });
  });
  // First sends an invalid request (empty messages → 400), then a valid one,
  // and checks that the valid response still carries a complete usage block.
  it('should track tokens correctly in error response', async () => {
    const postCompletion = (messages: Array<{ role: string; content: string }>) =>
      request(app)
        .post('/v1/chat/completions')
        .set('Authorization', `Bearer ${TEST_SECRET}`)
        .send({
          model: 'test-model',
          messages
        });

    // Invalid messages array
    const rejected = await postCompletion([]);
    expect(rejected.status).toBe(400);
    expect(rejected.body).toHaveProperty('error');
    expect(rejected.body.error).toBe('Messages array is required and must not be empty');

    // A follow-up request must still report token usage
    const validResponse = await postCompletion([{ role: 'user', content: 'test' }]);
    const usage = validResponse.body.usage;
    expect(usage).toMatchObject({
      prompt_tokens: expect.any(Number),
      completion_tokens: expect.any(Number),
      total_tokens: expect.any(Number),
      completion_tokens_details: {
        reasoning_tokens: expect.any(Number),
        accepted_prediction_tokens: expect.any(Number),
        rejected_prediction_tokens: expect.any(Number)
      }
    });
    // Counts must be positive and add up
    expect(validResponse.body.usage.prompt_tokens).toBeGreaterThan(0);
    expect(validResponse.body.usage.completion_tokens).toBeGreaterThan(0);
    const expectedTotal =
      validResponse.body.usage.prompt_tokens + validResponse.body.usage.completion_tokens;
    expect(validResponse.body.usage.total_tokens).toBe(expectedTotal);
  });
  // A successful response must expose a usage block with numeric prompt,
  // completion and total counts plus completion_tokens_details.
  it('should provide token usage in Vercel AI SDK format', async () => {
    const expectedUsageShape = {
      prompt_tokens: expect.any(Number),
      completion_tokens: expect.any(Number),
      total_tokens: expect.any(Number),
      completion_tokens_details: {
        reasoning_tokens: expect.any(Number),
        accepted_prediction_tokens: expect.any(Number),
        rejected_prediction_tokens: expect.any(Number)
      }
    };
    const { status, body } = await request(app)
      .post('/v1/chat/completions')
      .set('Authorization', `Bearer ${TEST_SECRET}`)
      .send({
        model: 'test-model',
        messages: [{ role: 'user', content: 'test' }]
      });
    expect(status).toBe(200);
    const { usage } = body;
    expect(usage).toMatchObject(expectedUsageShape);
    // Counts must be positive and the total must equal prompt + completion
    expect(usage.prompt_tokens).toBeGreaterThan(0);
    expect(usage.completion_tokens).toBeGreaterThan(0);
    const expectedTotal = usage.prompt_tokens + usage.completion_tokens;
    expect(usage.total_tokens).toBe(expectedTotal);
  });
 });
--- a/src/evals/batch-evals.ts
+++ b/src/evals/batch-evals.ts
@@ -7,8 +7,6 @@ import {GEMINI_API_KEY} from '../config';
 import {z} from 'zod';
 import {AnswerAction, TrackerContext} from "../types";
 import {createGoogleGenerativeAI} from "@ai-sdk/google";
 import {TokenTracker} from "../utils/token-tracker";
 import {ActionTracker} from "../utils/action-tracker";
 const execAsync = promisify(exec);
@@ -184,27 +182,6 @@ async function batchEvaluate(inputFile: string): Promise<void> {
    }
  }
  /**
   * Posts `query` to the local `/chat` endpoint and extracts the text between
   * the RESPONSE_START and RESPONSE_END markers as the answer.
   *
   * @param query - Question to send to the streaming agent.
   * @returns An answer-shaped result plus a context holding fresh
   *   TokenTracker and ActionTracker instances.
   * @throws Error when the response body has no RESPONSE_START marker.
   */
  async function getResponseStreamingAgent(query: string) {
    const res = await fetch("http://localhost:3000/chat", {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify({query})
    })
    const text = await res.text()
    // Without this guard a missing marker surfaces as an opaque
    // "cannot read properties of undefined" TypeError.
    const afterStart = text.split("RESPONSE_START")[1]
    if (afterStart === undefined) {
      throw new Error(`Streaming agent response is missing the RESPONSE_START marker (HTTP ${res.status})`)
    }
    return {
      result: {
        think: '',
        action: 'answer',
        answer: afterStart.split("RESPONSE_END")[0].trim(),
        references: []
      },
      context: {
        tokenTracker: new TokenTracker(),
        actionTracker: new ActionTracker()
      }
    }
  }
  // Calculate and print statistics
  const stats = calculateStats(results, modelName);
  printStats(stats);
@@ -229,4 +206,4 @@ if (require.main === module) {
  batchEvaluate(inputFile).catch(console.error);
 }
-export {batchEvaluate};
+export {batchEvaluate};
--- a/src/server.ts
+++ b/src/server.ts
@@ -2,7 +2,16 @@ import express, {Request, Response, RequestHandler} from 'express';
 import cors from 'cors';
 import {EventEmitter} from 'events';
 import {getResponse} from './agent';
-import {StepAction, StreamMessage, TrackerContext} from './types';
+import {
  StepAction,
  StreamMessage,
  TrackerContext,
  ChatCompletionRequest,
  ChatCompletionResponse,
  ChatCompletionChunk,
  AnswerAction,
  TOKEN_CATEGORIES
 } from './types';
 import fs from 'fs/promises';
 import path from 'path';
 import {TokenTracker} from "./utils/token-tracker";
@@ -11,6 +20,9 @@ import {ActionTracker} from "./utils/action-tracker";
 const app = express();
 const port = process.env.PORT || 3000;
 /**
  * Returns the value of the first `--secret=<value>` argument, or `undefined`
  * when no such argument is present.
  *
  * Everything after the `--secret=` prefix is kept, so secrets that themselves
  * contain `=` (for example base64 padding) are not truncated.
  *
  * @param argv - Command-line arguments to scan.
  */
 function parseSecretArg(argv: readonly string[]): string | undefined {
   const prefix = '--secret=';
   return argv.find(arg => arg.startsWith(prefix))?.slice(prefix.length);
 }
 // Get secret from command line args for optional authentication
 const secret = parseSecretArg(process.argv);
 app.use(cors());
 app.use(express.json());
@@ -24,6 +36,278 @@ interface QueryRequest extends Request {
  };
 }
 /**
  * OpenAI-compatible chat completions endpoint.
  *
  * Flow:
  *  1. When a secret is configured, require `Authorization: Bearer <secret>` (401 otherwise).
  *  2. Require a non-empty `messages` array whose last entry has role `user` (400 otherwise).
  *  3. Pass only the content of that last message to getResponse.
  *  4. Reply as a single `chat.completion` JSON object, or, when `stream` is true, as
  *     server-sent `chat.completion.chunk` events: an opening `<think>` chunk, one chunk
  *     per reported thought, a closing `</think>` chunk and a final chunk with
  *     finish_reason `stop`.
  *
  * Token accounting is nominal: one prompt token per message, one reasoning token per
  * action event and one accepted/rejected token for the outcome.
  */
 app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
  // Check authentication if secret is set
  if (secret) {
    const bearerPrefix = 'Bearer ';
    const authHeader = req.headers.authorization;
    // Compare everything after the prefix so trailing text is rejected and secrets with spaces match
    if (!authHeader || !authHeader.startsWith(bearerPrefix) || authHeader.slice(bearerPrefix.length) !== secret) {
      console.log('[chat/completions] Unauthorized request');
      res.status(401).json({ error: 'Unauthorized' });
      return;
    }
  }
  // Tolerate a missing body so the validation below answers 400 instead of throwing
  const body = (req.body ?? {}) as ChatCompletionRequest;
  // One id shared by the request log, every chunk and the final response
  const requestId = Date.now().toString();
  // Log request details (excluding sensitive data)
  console.log('[chat/completions] Request:', {
    model: body.model,
    stream: body.stream,
    messageCount: body.messages?.length,
    hasAuth: !!req.headers.authorization,
    requestId
  });
  if (!body.messages?.length) {
    res.status(400).json({ error: 'Messages array is required and must not be empty' });
    return;
  }
  const lastMessage = body.messages[body.messages.length - 1];
  if (lastMessage.role !== 'user') {
    res.status(400).json({ error: 'Last message must be from user' });
    return;
  }
  const context: TrackerContext = {
    tokenTracker: new TokenTracker(),
    actionTracker: new ActionTracker()
  };
  // Track prompt tokens for the initial message
  // Use Vercel's token counting convention - 1 token per message
  context.tokenTracker.trackUsage('agent', body.messages.length, TOKEN_CATEGORIES.PROMPT);
  // Builds one streaming chunk carrying `delta`; `created` is stamped at build time
  const buildChunk = (
    delta: { role?: 'assistant'; content?: string },
    finishReason: null | 'stop' = null
  ): ChatCompletionChunk => ({
    id: requestId,
    object: 'chat.completion.chunk',
    created: Math.floor(Date.now() / 1000),
    model: body.model,
    system_fingerprint: 'fp_' + requestId,
    choices: [{
      index: 0,
      delta,
      logprobs: null,
      finish_reason: finishReason
    }]
  });
  // Serialises a chunk as one server-sent event
  const writeChunk = (chunk: ChatCompletionChunk) => res.write(`data: ${JSON.stringify(chunk)}\n\n`);
  // Builds the non-streaming response; usage is read from the tracker at build time
  const buildCompletion = (content: string): ChatCompletionResponse => ({
    id: requestId,
    object: 'chat.completion',
    created: Math.floor(Date.now() / 1000),
    model: body.model,
    system_fingerprint: 'fp_' + requestId,
    choices: [{
      index: 0,
      message: {
        role: 'assistant',
        content
      },
      logprobs: null,
      finish_reason: 'stop'
    }],
    usage: context.tokenTracker.getUsageDetails()
  });
  if (body.stream) {
    res.setHeader('Content-Type', 'text/event-stream');
    res.setHeader('Cache-Control', 'no-cache');
    res.setHeader('Connection', 'keep-alive');
    // Send initial chunk with opening think tag
    writeChunk(buildChunk({ role: 'assistant', content: '<think>' }));
    // Set up progress listener with cleanup
    const actionListener = (action: { think?: string }) => {
      // Track reasoning tokens for each chunk using Vercel's convention (1 per chunk)
      context.tokenTracker.trackUsage('evaluator', 1, TOKEN_CATEGORIES.REASONING);
      // Only send chunk if there's content to send
      if (action.think) {
        // The think block is already open, so send the thought itself without extra tags
        const chunk = buildChunk({ content: action.think });
        console.log('[chat/completions] Sending chunk:', {
          id: chunk.id,
          content: chunk.choices[0].delta.content,
          finish_reason: chunk.choices[0].finish_reason
        });
        writeChunk(chunk);
      }
    };
    context.actionTracker.on('action', actionListener);
    // Detach on normal completion and on client disconnect
    const detachListener = () => {
      context.actionTracker.removeListener('action', actionListener);
    };
    res.on('finish', detachListener);
    res.on('close', detachListener);
  }
  try {
    let result;
    try {
      ({ result } = await getResponse(lastMessage.content, undefined, undefined, context));
    } catch (error: any) {
      // If deduplication fails (HTTP 402), retry with maxBadAttempt=3 to skip dedup
      if (error?.response?.status === 402) {
        ({ result } = await getResponse(lastMessage.content, undefined, 3, context));
      } else {
        throw error;
      }
    }
    // Track one accepted token for an answer, one rejected token for anything else
    const isAnswer = result.action === 'answer';
    context.tokenTracker.trackUsage(
      'evaluator',
      1,
      isAnswer ? TOKEN_CATEGORIES.ACCEPTED : TOKEN_CATEGORIES.REJECTED
    );
    const finalContent = isAnswer ? (result as AnswerAction).answer : result.think;
    if (body.stream) {
      // Send closing think tag, then the final answer as a separate chunk
      writeChunk(buildChunk({ content: '</think>' }));
      writeChunk(buildChunk({ content: finalContent }, 'stop'));
      res.end();
    } else {
      const response = buildCompletion(finalContent);
      // Log final response (excluding full content for brevity)
      console.log('[chat/completions] Response:', {
        id: response.id,
        status: 200,
        contentLength: response.choices[0].message.content.length,
        usage: response.usage
      });
      res.json(response);
    }
  } catch (error: any) {
    const errorMessage = error?.message || 'An error occurred';
    // Log error details
    console.error('[chat/completions] Error:', {
      message: errorMessage,
      stack: error?.stack,
      type: error?.constructor?.name,
      requestId
    });
    // Track the failure once as a single rejected token
    context.tokenTracker.trackUsage('evaluator', 1, TOKEN_CATEGORIES.REJECTED);
    // Clean up event listeners
    context.actionTracker.removeAllListeners('action');
    if (body.stream && res.headersSent) {
      // For streaming responses that have already started, close the think block
      // and send the error message as the final chunk
      writeChunk(buildChunk({ content: '</think>' }));
      writeChunk(buildChunk({ content: errorMessage }, 'stop'));
      res.end();
    } else {
      // For non-streaming or not-yet-started responses, send error as JSON
      // NOTE(review): this still answers with HTTP 200; confirm whether clients expect an error status
      res.json(buildCompletion(`Error: ${errorMessage}`));
    }
  }
 }) as RequestHandler);
 /** Express `Response` whose `write` is declared to take a string chunk and return a boolean. */
 interface StreamResponse extends Response {
  write: (chunk: string) => boolean;
 }
@@ -185,8 +469,16 @@ app.get('/api/v1/task/:requestId', (async (req: Request, res: Response) => {
  }
 }) as RequestHandler);
-app.listen(port, () => {
+// Export server startup function for better testing
-  console.log(`Server running at http://localhost:${port}`);
+export function startServer() {
-});
+  return app.listen(port, () => {
    console.log(`Server running at http://localhost:${port}`);
  });
 }
 // Launch the listener unless NODE_ENV marks this process as a test run
 const isTestEnvironment = process.env.NODE_ENV === 'test';
 if (!isTestEnvironment) {
  startServer();
 }
 export default app;
--- a/src/tools/jina-dedup.ts
+++ b/src/tools/jina-dedup.ts
@@ -1,13 +1,25 @@
-import axios from 'axios';
+import axios, { AxiosError } from 'axios';
 import { TokenTracker } from "../utils/token-tracker";
 import {JINA_API_KEY} from "../config";
 /** Fixed parameters copied into every embedding request built in this module. */
 const JINA_API_CONFIG = {
  MODEL: 'jina-embeddings-v3',
  TASK: 'text-matching',
  DIMENSIONS: 1024,
  EMBEDDING_TYPE: 'float',
  LATE_CHUNKING: false
 } as const;
 /** Jina embeddings endpoint. */
 const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
 /** Adjustable threshold for cosine similarity. */
 const SIMILARITY_THRESHOLD = 0.93;
 // Types for Jina API
 /** Shape of the embedding request object assembled in getEmbeddings. */
 interface JinaEmbeddingRequest {
  /** Texts to embed. */
  input: string[];
  /** Embedding model name. */
  model: string;
  /** Task selector. */
  task: string;
  /** Requested embedding dimensionality. */
  dimensions: number;
  /** Requested embedding value type. */
  embedding_type: string;
  /** Late-chunking switch. */
  late_chunking: boolean;
 }
@@ -41,7 +53,11 @@ async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][
  }
  const request: JinaEmbeddingRequest = {
-    model: 'jina-embeddings-v3',
+    model: JINA_API_CONFIG.MODEL,
    task: JINA_API_CONFIG.TASK,
    late_chunking: JINA_API_CONFIG.LATE_CHUNKING,
    dimensions: JINA_API_CONFIG.DIMENSIONS,
    embedding_type: JINA_API_CONFIG.EMBEDDING_TYPE,
    input: queries
  };
@@ -57,6 +73,15 @@ async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][
      }
    );
    // Validate response format
    if (!response.data.data || response.data.data.length !== queries.length) {
      console.error('Invalid response from Jina API:', response.data);
      return {
        embeddings: [],
        tokens: 0
      };
    }
    // Sort embeddings by index to maintain original order
    const embeddings = response.data.data
      .sort((a, b) => a.index - b.index)
@@ -68,6 +93,12 @@ async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][
    };
  } catch (error) {
    console.error('Error getting embeddings from Jina:', error);
    if (error instanceof AxiosError && error.response?.status === 402) {
      return {
        embeddings: [],
        tokens: 0
      };
    }
    throw error;
  }
 }
@@ -91,6 +122,15 @@ export async function dedupQueries(
    const allQueries = [...newQueries, ...existingQueries];
    const { embeddings: allEmbeddings, tokens } = await getEmbeddings(allQueries);
    // If embeddings is empty (due to 402 error), return all new queries
    if (!allEmbeddings.length) {
      console.log('Dedup (no embeddings):', newQueries);
      return {
        unique_queries: newQueries,
        tokens: 0
      };
    }
    // Split embeddings back into new and existing
    const newEmbeddings = allEmbeddings.slice(0, newQueries.length);
    const existingEmbeddings = allEmbeddings.slice(newQueries.length);
--- a/src/types.ts
+++ b/src/types.ts
@@ -31,9 +31,24 @@ export type VisitAction = BaseAction & {
 export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitAction;
 // Response Types
 /** Category labels that can be attached to a tracked token usage entry. */
 export const TOKEN_CATEGORIES = {
  PROMPT: 'prompt',
  REASONING: 'reasoning',
  ACCEPTED: 'accepted',
  REJECTED: 'rejected'
 } as const;
 /** Union of the string values of TOKEN_CATEGORIES. */
 export type TokenCategory = (typeof TOKEN_CATEGORIES)[keyof typeof TOKEN_CATEGORIES];
 /**
  * One tracked token usage entry.
  * Following Vercel AI SDK's token counting interface.
  */
 export interface TokenUsage {
  /** Label of the tool the tokens are attributed to. */
  tool: string;
  /** Number of tokens recorded for this entry. */
  tokens: number;
  /** Optional category of the entry. */
  category?: TokenCategory;
  /** Optional prompt token count. */
  prompt_tokens?: number;
  /** Optional completion token count. */
  completion_tokens?: number;
  /** Optional total token count. */
  total_tokens?: number;
 }
 export interface SearchResponse {
@@ -144,6 +159,60 @@ export interface StreamMessage {
  };
 }
 // OpenAI API Types
 /** Request body accepted by the chat completions endpoint. */
 export interface ChatCompletionRequest {
  /** Model name supplied by the client. */
  model: string;
  /** Conversation messages, each with a role and text content. */
  messages: {
    role: string;
    content: string;
  }[];
  /** When present and true, the client asks for a streamed response. */
  stream?: boolean;
 }
 /** Non-streaming chat completion response object. */
 export interface ChatCompletionResponse {
  id: string;
  /** Always the literal `chat.completion`. */
  object: 'chat.completion';
  created: number;
  model: string;
  system_fingerprint: string;
  /** Completion choices; each carries an assistant message and finish_reason `stop`. */
  choices: {
    index: number;
    message: {
      role: 'assistant';
      content: string;
    };
    logprobs: null;
    finish_reason: 'stop';
  }[];
  /** Token counts, with an optional breakdown of the completion tokens. */
  usage: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
    completion_tokens_details?: {
      reasoning_tokens: number;
      accepted_prediction_tokens: number;
      rejected_prediction_tokens: number;
    };
  };
 }
 /** One event of a streamed chat completion. */
 export interface ChatCompletionChunk {
  id: string;
  /** Always the literal `chat.completion.chunk`. */
  object: 'chat.completion.chunk';
  created: number;
  model: string;
  system_fingerprint: string;
  /** Incremental choices; finish_reason is null or `stop`. */
  choices: {
    index: number;
    delta: {
      role?: 'assistant';
      content?: string;
    };
    logprobs: null;
    finish_reason: null | 'stop';
  }[];
 }
 // Tracker Types
 import { TokenTracker } from './utils/token-tracker';
 import { ActionTracker } from './utils/action-tracker';
--- a/src/utils/token-tracker.ts
+++ b/src/utils/token-tracker.ts
@@ -1,6 +1,6 @@
 import { EventEmitter } from 'events';
-import { TokenUsage } from '../types';
+import { TokenUsage, TokenCategory } from '../types';
 export class TokenTracker extends EventEmitter {
  private usages: TokenUsage[] = [];
@@ -11,15 +11,16 @@ export class TokenTracker extends EventEmitter {
    this.budget = budget;
  }
-  trackUsage(tool: string, tokens: number) {
+  trackUsage(tool: string, tokens: number, category?: TokenCategory) {
    const currentTotal = this.getTotalUsage();
    if (this.budget && currentTotal + tokens > this.budget) {
      console.error(`Token budget exceeded: ${currentTotal + tokens} > ${this.budget}`);
    }
    // Only track usage if we're within budget
    if (!this.budget || currentTotal + tokens <= this.budget) {
-      this.usages.push({ tool, tokens });
+      const usage = { tool, tokens, category };
-      this.emit('usage', { tool, tokens });
+      this.usages.push(usage);
      this.emit('usage', usage);
    }
  }
@@ -34,6 +35,41 @@ export class TokenTracker extends EventEmitter {
    }, {} as Record<string, number>);
  }
  /**
   * Aggregates the recorded usages by category into an OpenAI-style usage object.
   *
   * Entries without a category are ignored. Completion tokens are the sum of the
   * reasoning, accepted and rejected totals; total tokens add the prompt total.
   *
   * @returns prompt, completion and total counts plus the per-category breakdown
   */
  getUsageDetails(): {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
    completion_tokens_details?: {
      reasoning_tokens: number;
      accepted_prediction_tokens: number;
      rejected_prediction_tokens: number;
    };
  } {
    // Sum tokens per category; categories never seen stay absent from the map
    const totals: Record<string, number> = {};
    for (const { tokens, category } of this.usages) {
      if (!category) {
        continue;
      }
      totals[category] = (totals[category] || 0) + tokens;
    }
    const promptTokens = totals.prompt || 0;
    const reasoningTokens = totals.reasoning || 0;
    const acceptedTokens = totals.accepted || 0;
    const rejectedTokens = totals.rejected || 0;
    const completionTokens = reasoningTokens + acceptedTokens + rejectedTokens;
    return {
      prompt_tokens: promptTokens,
      completion_tokens: completionTokens,
      total_tokens: promptTokens + completionTokens,
      completion_tokens_details: {
        reasoning_tokens: reasoningTokens,
        accepted_prediction_tokens: acceptedTokens,
        rejected_prediction_tokens: rejectedTokens
      }
    };
  }
  printSummary() {
    const breakdown = this.getUsageBreakdown();
    console.log('Token Usage Summary:', {