diff --git a/.gitignore b/.gitignore index 1496b4d..2a69342 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ owl/data owl/tmp owl/.env owl/utils/__pycache__/ +owl/results # Logs *.log diff --git a/README.md b/README.md index 5be8102..7545015 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,10 @@ python owl/run_deepseek.py # Run with other OpenAI-compatible models python owl/run_openai_compatiable_model.py + +# Run with VLLM backends (using Qwen2.5-VL-7B-Instruct w/ 4 GPUs as an example) +bash owl/scripts/serve.sh # run this in another terminal or screen session +python owl/run_vllm.py ``` For a simpler version that only requires an LLM API key, you can try our minimal example: @@ -239,7 +243,7 @@ We provided a script to reproduce the results on GAIA. You can check the `run_gaia_roleplaying.py` file and run the following command: ```bash -python run_gaia_roleplaying.py +python owl/run_gaia_roleplaying.py ``` # ⏱️ Future Plans diff --git a/owl/run_gaia_roleplaying_vllm.py b/owl/run_gaia_roleplaying_vllm.py new file mode 100644 index 0000000..fee2f21 --- /dev/null +++ b/owl/run_gaia_roleplaying_vllm.py @@ -0,0 +1,134 @@ +""" +This script demonstrates how to run the OWL system with an open-source model +on VLLM as the user agent. + +Pre-requisites: bash scripts/serve.sh (4 GPUs for qwen2.5-vl-7b-instruct). 
+""" + +from dotenv import load_dotenv +load_dotenv() + +import os +from loguru import logger + +from camel.models import ModelFactory +from camel.toolkits import ( + AudioAnalysisToolkit, + CodeExecutionToolkit, + DocumentProcessingToolkit, + ExcelToolkit, + ImageAnalysisToolkit, + SearchToolkit, + VideoAnalysisToolkit, + WebToolkit, +) +from camel.types import ModelPlatformType, ModelType +from camel.configs import ChatGPTConfig + +from utils import GAIABenchmark + + +# Configuration +LEVEL = 1 +SAVE_RESULT = True +test_idx = [0] +VLLM_MODEL_TYPE = "Qwen/Qwen2.5-VL-7B-Instruct" # set the VLLM model type +PORT = 8964 # set the port for the VLLM model +SAVE_TO = "results/result_vllm.json" + + +def main(): + """Main function to run the GAIA benchmark.""" + # Create cache directory + cache_dir = "tmp/" + os.makedirs(cache_dir, exist_ok=True) + + # Create models for different components + models = { + "user": ModelFactory.create( + model_platform=ModelPlatformType.VLLM, + model_type=VLLM_MODEL_TYPE, + model_config_dict={"temperature": 0., "top_p": 1.}, + url=f"http://localhost:{PORT}/v1", + ), + "assistant": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + "web": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + "planning": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + "video": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + "image": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, 
top_p=1).as_dict(), + ), + "search": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + } + + # Configure toolkits + tools = [ + *WebToolkit( + headless=True, # Set to True for headless mode (e.g., on remote servers) + web_agent_model=models["web"], + planning_agent_model=models["planning"], + ).get_tools(), + *DocumentProcessingToolkit().get_tools(), + *VideoAnalysisToolkit(model=models["video"]).get_tools(), # This requires OpenAI Key + *AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key + *CodeExecutionToolkit(sandbox="subprocess", verbose=True).get_tools(), + *ImageAnalysisToolkit(model=models["image"]).get_tools(), + *SearchToolkit(model=models["search"]).get_tools(), + *ExcelToolkit().get_tools(), + ] + + # Configure agent roles and parameters + user_agent_kwargs = {"model": models["user"]} + assistant_agent_kwargs = {"model": models["assistant"], "tools": tools} + + # Initialize benchmark + benchmark = GAIABenchmark( + data_dir="data/gaia", + save_to=SAVE_TO, + ) + + # Print benchmark information + print(f"Number of validation examples: {len(benchmark.valid)}") + print(f"Number of test examples: {len(benchmark.test)}") + + # Run benchmark + result = benchmark.run( + on="valid", + level=LEVEL, + idx=test_idx, + save_result=SAVE_RESULT, + user_role_name="user", + user_agent_kwargs=user_agent_kwargs, + assistant_role_name="assistant", + assistant_agent_kwargs=assistant_agent_kwargs, + ) + + # Output results + logger.success(f"Correct: {result['correct']}, Total: {result['total']}") + logger.success(f"Accuracy: {result['accuracy']}") + + +if __name__ == "__main__": + main() diff --git a/owl/run_vllm.py b/owl/run_vllm.py new file mode 100644 index 0000000..c430ed7 --- /dev/null +++ b/owl/run_vllm.py @@ -0,0 +1,136 @@ +""" +This script demonstrates how to run the OWL system with an open-source model +on VLLM as the user agent. 
+ +Pre-requisites: bash scripts/serve.sh (4 GPUs for qwen2.5-vl-7b-instruct). +""" + +from dotenv import load_dotenv +load_dotenv() + +from camel.models import ModelFactory +from camel.toolkits import ( + AudioAnalysisToolkit, + CodeExecutionToolkit, + DocumentProcessingToolkit, + ExcelToolkit, + ImageAnalysisToolkit, + SearchToolkit, + VideoAnalysisToolkit, + WebToolkit, +) +from camel.types import ModelPlatformType, ModelType + +from utils import OwlRolePlaying, run_society + +VLLM_MODEL_TYPE = "Qwen/Qwen2.5-VL-7B-Instruct" # set the VLLM model type +PORT = 8964 # set the port for the VLLM model + + +def construct_society(question: str) -> OwlRolePlaying: + r"""Construct a society of agents based on the given question. + + Args: + question (str): The task or question to be addressed by the society. + + Returns: + OwlRolePlaying: A configured society of agents ready to address the question. + """ + + # Create models for different components + models = { + "user": ModelFactory.create( + model_platform=ModelPlatformType.VLLM, + model_type=VLLM_MODEL_TYPE, + model_config_dict={"temperature": 0.}, + url=f"http://localhost:{PORT}/v1", + ), + "assistant": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "web": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "planning": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "video": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "image": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "search": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + 
model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + } + + # Configure toolkits + tools = [ + *WebToolkit( + headless=True, # Set to True for headless mode (e.g., on remote servers) + web_agent_model=models["web"], + planning_agent_model=models["planning"], + ).get_tools(), + *DocumentProcessingToolkit().get_tools(), + *VideoAnalysisToolkit(model=models["video"]).get_tools(), # This requires OpenAI Key + *AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key + *CodeExecutionToolkit(sandbox="subprocess", verbose=True).get_tools(), + *ImageAnalysisToolkit(model=models["image"]).get_tools(), + *SearchToolkit(model=models["search"]).get_tools(), + *ExcelToolkit().get_tools(), + ] + + # Configure agent roles and parameters + user_agent_kwargs = {"model": models["user"]} + assistant_agent_kwargs = {"model": models["assistant"], "tools": tools} + + # Configure task parameters + task_kwargs = { + "task_prompt": question, + "with_task_specify": False, + } + + # Create and return the society + society = OwlRolePlaying( + **task_kwargs, + user_role_name="user", + user_agent_kwargs=user_agent_kwargs, + assistant_role_name="assistant", + assistant_agent_kwargs=assistant_agent_kwargs, + ) + + return society + + +def main(): + r"""Main function to run the OWL system with an example question.""" + # Example research question + question = ( + "What was the volume in m^3 of the fish bag that was calculated in " + "the University of Leicester paper `Can Hiccup Supply Enough Fish " + "to Maintain a Dragon's Diet?`" + ) + + # Construct and run the society + society = construct_society(question) + answer, chat_history, token_count = run_society(society) + + # Output the result + print(f"Answer: {answer}") + + +if __name__ == "__main__": + main() diff --git a/owl/scripts/serve.sh b/owl/scripts/serve.sh new file mode 100644 index 0000000..3a0fe1a --- /dev/null +++ b/owl/scripts/serve.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# tested vllm version: 0.7.3 + +export 
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + +model=${1:-Qwen/Qwen2.5-VL-7B-Instruct} +max_model_len=${2:-32768} +gpu_memory_utilization=${3:-0.9} +devices=${4:-"0,1,2,3"} +tp_size=$(awk -F',' '{print NF}' <<< "$devices") +port=${5:-8964} + +CUDA_VISIBLE_DEVICES=${devices} python -m vllm.entrypoints.openai.api_server \ + --model ${model} \ + --tensor_parallel_size ${tp_size} \ + --gpu_memory_utilization ${gpu_memory_utilization} \ + --port ${port} \ + --max_model_len ${max_model_len} \ diff --git a/owl/utils/gaia.py b/owl/utils/gaia.py index eae0e10..dacac40 100644 --- a/owl/utils/gaia.py +++ b/owl/utils/gaia.py @@ -252,6 +252,7 @@ class GAIABenchmark(BaseBenchmark): if save_result: + os.makedirs(os.path.dirname(self.save_to), exist_ok=True) with open(self.save_to, 'w') as f: json.dump(self._results, f, indent=4, ensure_ascii=False) f.close() @@ -370,6 +371,10 @@ class GAIABenchmark(BaseBenchmark): def normalize_number_str(self, number_str: str) -> float: + if number_str is None: + logger.error("Received None as number string, returning infinity.") + return float("inf") + for char in ["$", "%", ","]: number_str = number_str.replace(char, "") try: