RiOSWorld/multi_evaluation.sh at main · AI45Lab/RiOSWorld · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/bin/bash

export OPENAI_API_KEY="your openai api key"
BASE_URL="your base url"

TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
EVALUATOR="gpt-4o"
DOMAINS=(
"domain1"
"domain2"
)

GUI="pyautogui"
SCREENSHOT="screenshot" ## "screenshot" or "a11y_tree" or "screenshot_a11y_tree" or "som"
EVAL_DIR="path to evaluation_examples/examples_safe"
EVAL_MODE="step"
USE_IMAGE="True" ## whether you use screenshot in the evaluation

declare -a MODELS=(
    "model to be evaluated"
    )

declare -a LOG_FILES=(
    "path to log file"
)

declare -a RESULTS_DIRS=(
    "path to results directory"
)

if [ ${#MODELS[@]} -ne ${#LOG_FILES[@]} ] || [ ${#MODELS[@]} -ne ${#RESULTS_DIRS[@]} ]; then
    echo "Error: The lengths of the model, log file, and result directory arrays do not match"
    exit 1
fi

for ((i=0; i<${#MODELS[@]}; i++)); do
    MODEL_EVALUATED="${MODELS[$i]}"
    DOMAIN="${DOMAINS[$i]}"
    LOG_FILE="${LOG_FILES[$i]}"
    RESULTS_DIRS="${RESULTS_DIRS[$i]}"

    echo "====================================================================="
    echo "Processing model: $MODEL_EVALUATED"
    echo "Log file: $LOG_FILE"
    echo "Result directory: $RESULTS_DIRS"
    echo "====================================================================="

    # set output file path
    PROCESSED_OUTPUT="path to processed output file"
    EVAL_OUTPUT="path to evaluation output file"
    RESULT_FILE="path to result file"

    # create necessary directories
    mkdir -p $(dirname "$PROCESSED_OUTPUT")
    mkdir -p $(dirname "$EVAL_OUTPUT")
    mkdir -p $(dirname "$RESULT_FILE")

    echo "1. run data_process.py generate trajectory data"

    # Run data_process.py to process data
    python -c "
import sys
sys.path.append('evaluate')
from data_process import analyze_log_file, process_directory, save_data_to_json
import os

# process log file
print('Analyzing log file...')
try:
    popup_result = analyze_log_file('$LOG_FILE')
    print(f'Found {len(popup_result.keys())} tasks')

    if len(popup_result.keys()) == 0:
        print('Warning: No tasks found, but continue execution')

    # process result directory
    print('Processing result directory...')
    results_dir = '$RESULTS_DIRS'
    results_dir = os.path.join(results_dir, '$GUI', '$SCREENSHOT', '$MODEL_EVALUATED')
    eval_dir = '$EVAL_DIR'
    output_file = '$PROCESSED_OUTPUT'

    # ensure the parent directory of output_file exists
    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f'Create directory: {output_dir}')

    # process data
    trajectory_data = process_directory(eval_dir, results_dir, popup_result)

    # check if trajectory_data is empty
    has_data = False
    for domain in trajectory_data:
        if trajectory_data[domain]:
            has_data = True
            break

    if not has_data:
        print('Warning: The processed result is empty')
        # create an empty structure to avoid JSON parsing errors
        trajectory_data = {'$DOMAIN': {'example': []}}

    save_data_to_json(trajectory_data, output_file)
    print(f'Data saved to {output_file}')
except Exception as e:
    print(f'Error processing data: {e}')
    # ensure at least one valid JSON file is created to avoid subsequent errors
    with open('$PROCESSED_OUTPUT', 'w') as f:
        f.write('{}')
    sys.exit(1)
"

    # check if the output file exists and is not empty
    if [ ! -s "$PROCESSED_OUTPUT" ]; then
        echo "Warning: The generated trajectory data file is empty, skipping this model's evaluation"
        continue
    fi

    echo "2. Run safety_evaluation.py for evaluation"

    # run safety_evaluation.py for evaluation
    python evaluate/safety_evaluation.py \
        --model_name "$EVALUATOR" \
        --api_key "$OPENAI_API_KEY" \
        --api_base "$BASE_URL" \
        --input_file "$PROCESSED_OUTPUT" \
        --output_file "$EVAL_OUTPUT" \
        --result_file "$RESULT_FILE" \
        --eval_mode "$EVAL_MODE" \
        --eval_prompt "${DOMAIN}_eval_prompt" \
        --use_image "$USE_IMAGE" \
        --ablation_mode "$ABLATION_MODE"

    echo "Trajectory data saved to: $PROCESSED_OUTPUT"
    echo "Evaluation result saved to: $EVAL_OUTPUT"
    echo "Statistic result saved to: $RESULT_FILE"

    echo "Evaluate each trajectory for safety risk"
    python evaluate/evaluate_traj_by_step.py \
        --path_result "$EVAL_OUTPUT" \
        --domain "$DOMAIN" \
        --result_file "$RESULT_FILE"

    echo "===== Model $MODEL_EVALUATED evaluation completed ====="
    echo ""
done

echo "===== All models evaluation completed ====="