add community-usecase:excel_analyzer

2025-12-26 10:07:51 +08:00 · 2025-03-31 23:16:13 +08:00 · 2025-03-31 23:16:13 +08:00 · 6913c0a75a
commit 6913c0a75a
parent 99cd01db8f
4 changed files with 40 additions and 36 deletions
--- a/community_usecase/excel_analyzer/README.md
+++ b/community_usecase/excel_analyzer/README.md
@@ -6,6 +6,12 @@ This project uses **Owl** for data analysis and visualization.
 - Provides both English and Chinese versions of the raw data and prompts
 - Utilizes **CodeExecutionToolkit**, **ExcelToolkit**, and **FileWriteToolkit** to complete related tasks  
 - Implements **ExcelRolePlaying** based on **OwlRolePlaying**, which overrides the `system_prompt` with a cleaner, more focused version tailored for data analysis scenarios  
+- Tested with both `gpt-4o` and `deepseek-v3`; both achieve the expected results
+- The analysis and visualization of this Excel file involve:
+    - Complex headers (merged rows)
+    - NaN value handling
+    - Complex group calculations
+    - Visualization

 ## How to Use  
 1. Set up the environment according to Owl's official instructions
@@ -13,13 +19,12 @@ This project uses **Owl** for data analysis and visualization.
    ```bash
    cd community_usecase/excel_analyzer

-    # Chinese version
-    python data_insights_deepseek_zh.py
+    # Chinese version, using deepseek-v3
+    python data_analyzer_zh.py

-    # English version
-    python data_insights_gpt4o_zh.py
+    # English version, using gpt-4o
+    python data_analyzer_en.py
    ```
 3. The analysis results will be saved in the current directory

-## Demo Video

--- a/community_usecase/excel_analyzer/README_zh.md
+++ b/community_usecase/excel_analyzer/README_zh.md
@@ -3,11 +3,15 @@


 ## Features
-
 - 提供了英文，中文两个版本的原始数据和prompt，方便理解
 - 使用**CodeExecutionToolkit**，**ExcelToolkit**，**FileWriteToolkit**来完成相关工作
 - 在**OwlRolePlaying**基础之上实现了**ExcelRolePalying**，它重写了system_prompt，更简洁，聚焦在数据分析场景
-
+- 经过测试，在`gpt-4o`和`deepseek-v3`下均可以达到预期效果
+- 对该excel进行分析和可视化时涉及到的内容有：
+    - 复杂表头（合并行）
+    - 缺失值处理
+    - 复杂的分组计算
+    - 可视化

 ## How to use 
 1. 按照owl的官方流程搭建好环境
@@ -15,16 +19,12 @@
    ```
    cd community_usecase/excel_analyzer

-    # Chinese version
+    # Chinese version, using deepseek-v3
    python data_analyzer_zh.py

-    # English version
+    # English version, using gpt-4o
    python data_analyzer_en.py
    ```
 3. 数据集分析的结果将会保存在当前目录下


-## Demo
-视频结果：[link]
-
-
--- a/community_usecase/excel_analyzer/data_analyzer_en.py
+++ b/community_usecase/excel_analyzer/data_analyzer_en.py
@@ -195,12 +195,6 @@ def construct_society(question: str) -> ExcelRolePalying:
        OwlRolePlaying: A configured society of agents ready to address the question.
    """

-    # base_model_config = {
-    #     "model_platform": ModelPlatformType.DEEPSEEK,
-    #     "model_type": 'deepseek-chat',
-    #     "model_config_dict": ChatGPTConfig(temperature=0.1, max_tokens=8192).as_dict(),
-    # }
-
    # Create models for different components using Azure OpenAI
    base_model_config = {
        "model_platform": ModelPlatformType.AZURE,
@@ -245,12 +239,12 @@ def construct_society(question: str) -> ExcelRolePalying:


 def main():
-    r"""Main function to run the OWL system with Azure OpenAI."""
    # Example question

-
-    default_task = "Please help analyze the number of admitted students, as well as the highest and lowest scores for each college in this file. Visualize this information in a single chart and save it in the current directory. The file path is `./data/admission_en.xlsx.`"
-
+    default_task = """Please help analyze the file `./data/admission_en.xlsx` by:
+            - Calculating the number of admitted students, as well as the highest and lowest scores for each college
+            - Plotting this information in a single chart: use a bar chart for the number of admitted students, and line charts for the highest and lowest scores
+            - Saving the generated chart as `vis_en.png` in the current directory"""

    set_log_file('log.txt')

--- a/community_usecase/excel_analyzer/data_analyzer_zh.py
+++ b/community_usecase/excel_analyzer/data_analyzer_zh.py
@@ -134,11 +134,10 @@ import pandas as pd
 - 始终验证你的最终答案是否正确！  
 - 请每次都从头开始编写完整代码，编写代码后，务必运行代码并获取结果！  
  如果遇到错误，尝试调试代码。  
-  请注意，代码执行环境不支持交互式输入。  
 - 如果工具运行失败，或者代码无法正确运行，  
  绝对不要假设其返回了正确结果，并在此基础上继续推理！  
  正确的做法是分析错误原因，并尝试修正！  
- 如果你写的代码涉及到用matplotlib画图，请始终在代码开头下面这段代码：
+- [重要！！！]如果你写的代码涉及到用matplotlib画图，请始终在代码开头下面这段代码：
 ```
 import matplotlib
 matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # 支持中文
@@ -232,18 +231,18 @@ def construct_society(question: str) -> ExcelRolePalying:
        OwlRolePlaying: A configured society of agents ready to address the question.
    """

-    # base_model_config = {
-    #     "model_platform": ModelPlatformType.DEEPSEEK,
-    #     "model_type": 'deepseek-chat',
-    #     "model_config_dict": ChatGPTConfig(temperature=0.1, max_tokens=8192).as_dict(),
-    # }
+    base_model_config = {
+        "model_platform": ModelPlatformType.DEEPSEEK,
+        "model_type": 'deepseek-chat',
+        "model_config_dict": ChatGPTConfig(temperature=0.1, max_tokens=8192).as_dict(),
+    }

    # Create models for different components using Azure OpenAI
-    base_model_config = {
-        "model_platform": ModelPlatformType.AZURE,
-        "model_type": os.getenv("AZURE_OPENAI_MODEL_TYPE"),
-        "model_config_dict": ChatGPTConfig(temperature=0.4, max_tokens=4096).as_dict(),
-    }
+    # base_model_config = {
+    #     "model_platform": ModelPlatformType.AZURE,
+    #     "model_type": os.getenv("AZURE_OPENAI_MODEL_TYPE"),
+    #     "model_config_dict": ChatGPTConfig(temperature=0.4, max_tokens=4096).as_dict(),
+    # }


    models = {
@@ -284,7 +283,13 @@ def construct_society(question: str) -> ExcelRolePalying:
 def main():
    r"""Main function to run the OWL system with Azure OpenAI."""
    # Example question
-    default_task = "帮忙分析一下这个文件中各个学院的录取人数以及最高分最低分，把这些信息画到一张图上，并存到当前目录下。文件路径是`./data/admission_zh.xlsx`"
+    # default_task = """帮忙分析一下这个文件中各个学院的录取人数以及最高分最低分，把这些信息画到一张图上，并存到当前路径下。文件路径是`./data/admission_zh.xlsx`"""
+
+    default_task = """帮忙分析一下`./data/admission_zh.xlsx`这个文件，请你：
+    - 统计各个学院的录取人数以及最高分最低分
+    - 把这些信息画到一张图上，录取人数使用柱状图，最高分最低分使用折线图
+    - 把画完的图`vis_zh.png`存到当前目录下"""
+

    set_log_file('log.txt')