4 changes: 4 additions & 0 deletions src/gui_g2/data_config/rec_internvl.yaml
@@ -0,0 +1,4 @@
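# RefCOCO, RefCOCO+ (refcocop), and RefCOCOg training splits in InternVL-style JSON.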
datasets:
- json_path: /data10/shz/dataset/rec/rec_jsons_internvl/refcoco_train.json
- json_path: /data10/shz/dataset/rec/rec_jsons_internvl/refcocop_train.json
- json_path: /data10/shz/dataset/rec/rec_jsons_internvl/refcocog_train.json
153 changes: 153 additions & 0 deletions src/gui_g2/local_scripts/create_vision_cot_data.py
@@ -0,0 +1,153 @@
import base64
import random
import time
from io import BytesIO

import requests
from datasets import load_from_disk
from openai import AzureOpenAI
from PIL import Image
from pillow_avif import AvifImagePlugin  # noqa: F401  (import registers AVIF support in Pillow)


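# Instruction template for GPT-4o: rewrite an (image, question, answer) triple into a
# CoT-style question with a short, machine-verifiable answer wrapped in <think>/<answer> tags.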
PROMPT_FORMAT = """I will provide you with an image, an original question, and its answer related to the image. Your task is to rewrite the question in such a way that answering it requires step-by-step Chain-of-Thought (CoT) reasoning with numerical or mathematical expressions where applicable. The reasoning process can include expressions like "let me think," "oh, I see," or other natural language thought expressions.

Please make sure the question asks for a specific answer with a definite value; do not ask open-ended questions. The answer must be correct and easy to verify with a simple protocol, like "2" or "A".

Strictly do not include "Answer:" in the question part, to avoid confusion and answer leakage.

Input Format:
Original Question: {original_question}
Original Answer: {original_answer}

Output Format:
Question: [rewrite the question if necessary]
Answer: [answer with reasoning steps, including calculations where applicable]
<think>step-by-step reasoning process</think>
<answer>easy to verify answer</answer>
"""


def load_image(url):
    # Minimal helper (assumed implementation): fetch a remote image and return a PIL Image.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


def get_image_data_url(image_input):
if isinstance(image_input, str) and image_input.startswith("data:"):
return image_input

if isinstance(image_input, str) and image_input.startswith("http"):
image_input = load_image(image_input)

if isinstance(image_input, str):
image_input = Image.open(image_input)

if not isinstance(image_input, Image.Image):
raise ValueError("Unsupported image input type")

if image_input.mode != "RGB":
image_input = image_input.convert("RGB")

buffer = BytesIO()
image_input.save(buffer, format="JPEG")
img_bytes = buffer.getvalue()
base64_data = base64.b64encode(img_bytes).decode("utf-8")
return f"data:image/jpeg;base64,{base64_data}"


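# Send one image plus the rewrite prompt to Azure OpenAI GPT-4o, retrying transient failures.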
def gpt4o_query(image, prompt, max_retries=5, initial_delay=3):
if image is None:
return None

data_url_list = [get_image_data_url(image)]
client = AzureOpenAI(
azure_endpoint="YOUR_AZURE_ENDPOINT",
api_version="2023-07-01-preview",
api_key="YOUR_API_KEY",
)

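    # Retry with exponential backoff plus jitter on transient API failures.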
for attempt in range(max_retries):
try:
messages = [
{
"role": "system",
"content": "You are an expert to analyze the image and provide useful information for users.",
},
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
],
},
]

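            # Prepend each image part so images appear before the text prompt.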
for data_url in data_url_list:
messages[1]["content"].insert(
0, {"type": "image_url", "image_url": {"url": data_url}}
)

response = client.chat.completions.create(
model="gpt-4o-2024-08-06",
messages=messages,
temperature=0.2,
max_tokens=8192,
)
return response.choices[0].message.content

except Exception as e:
if attempt == max_retries - 1:
                raise RuntimeError(
                    f"Failed after {max_retries} attempts. Last error: {str(e)}"
                ) from e
delay = initial_delay * (2**attempt) + random.uniform(
0, 0.1 * initial_delay * (2**attempt)
)
time.sleep(delay)


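# Worker for datasets.map: format the prompt for one example and attach the raw GPT-4o response.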
def process_single_item(example):
try:
image_path = example["image_path"]
formatted_prompt = PROMPT_FORMAT.format(
original_question=example["question"], original_answer=example["answer"]
)

response = gpt4o_query(image_path, formatted_prompt)
example["gpt4o_response"] = response
return example
except Exception as e:
print(f"Error processing item: {str(e)}")
example["gpt4o_response"] = None
return example


def main():
dataset_path = "path/to/your/dataset"
full_dataset = load_from_disk(dataset_path)

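    # The map is API-bound; num_proc=256 fans out requests, so lower it if you hit rate limits.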
    processed_dataset = full_dataset.map(
        function=process_single_item,
        num_proc=256,
        desc="Processing dataset with GPT-4o",
        keep_in_memory=True,
    )

output_path = f"{dataset_path}_processed"
processed_dataset.save_to_disk(output_path)
print(f"Processed dataset saved to: {output_path}")


if __name__ == "__main__":
main()
61 changes: 61 additions & 0 deletions src/gui_g2/local_scripts/lmms_eval_qwen2vl.sh
@@ -0,0 +1,61 @@
export HF_HOME="<CACHE_DIR>"
export HF_TOKEN="<HF_TOKEN>"
export HF_HUB_ENABLE_HF_TRANSFER="1"

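# API credentials consumed by lmms-eval's GPT-assisted scoring/answer extraction (e.g. MathVista).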
export API_TYPE="<API_TYPE>"
export AZURE_ENDPOINT="<AZURE_ENDPOINT>"
export AZURE_API_KEY="<API_KEY>"
export API_VERSION="<API_VERSION>"
export MODEL_VERSION="<MODEL_VERSION>"
export NAVIT_ATTENTION_IMPLEMENTATION="eager"

# Prompt for installation with 3-second timeout
read -t 3 -p "Do you want to install dependencies? (YES/no, timeout in 3s): " install_deps || true
if [ "$install_deps" = "YES" ]; then
# Prepare the environment
pip3 install --upgrade pip
pip3 install -U setuptools

cd <PROJECT_ROOT>
if [ ! -d "maas_engine" ]; then
git clone <REPO_URL>
else
echo "maas_engine directory already exists, skipping clone"
fi
cd maas_engine
git pull
git checkout <BRANCH_NAME>
pip3 install --no-cache-dir --no-build-isolation -e ".[standalone]"

current_version=$(pip3 show transformers | grep Version | cut -d' ' -f2)
if [ "$current_version" != "4.46.2" ]; then
echo "Installing transformers 4.46.2 (current version: $current_version)"
pip3 install transformers==4.46.2
else
echo "transformers 4.46.2 is already installed"
fi

cd <LMMS_EVAL_DIR>
rm -rf <TARGET_DIR>
pip3 install -e .
pip3 install -U pydantic
pip3 install Levenshtein
pip3 install nltk
python3 -c "import nltk; nltk.download('wordnet', quiet=True); nltk.download('punkt', quiet=True)"
fi

TASKS=mmmu_val,mathvista_testmini,mmmu_pro
MODEL_BASENAME=qwen2_vl

model_checkpoint="<MODEL_CHECKPOINT_PATH>"
echo "MODEL_BASENAME: ${MODEL_BASENAME}"
cd <LMMS_EVAL_DIR>

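# Launch lmms-eval with 8 processes (one per GPU); max_pixels caps the image resolution fed to Qwen2-VL.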
python3 -m accelerate.commands.launch --num_processes=8 --main_process_port=12345 lmms_eval \
--model qwen2_vl \
--model_args=pretrained=${model_checkpoint},max_pixels=2359296 \
--tasks ${TASKS} \
--batch_size 1 \
--log_samples \
--log_samples_suffix ${MODEL_BASENAME} \
--output_path ./logs
166 changes: 166 additions & 0 deletions src/gui_g2/local_scripts/prepare_hf_data.py
@@ -0,0 +1,166 @@
import json
import re

import datasets
from datasets import concatenate_datasets, load_from_disk
from PIL import Image
from pillow_avif import AvifImagePlugin  # noqa: F401  (import registers AVIF support in Pillow)
from tqdm import tqdm


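# Split a GPT-4o response shaped like "Question ... <think>...</think><answer>...</answer>"
# into a (problem, solution) pair.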
def extract_problem_solution(gpt4o_response):
# Split the response into parts
parts = gpt4o_response.split("<think>")

# Extract the problem (first part before any <think> tags)
problem = parts[0].strip()
# Remove "Question:" prefix if it exists
problem = re.sub(r"^Question:\s*", "", problem)
# Remove "Answer:" at the end of the problem
problem = re.sub(r"\s*Answer:\s*$", "", problem).strip()

# Combine all the reasoning steps into a single <think> block
think_parts = [p.split("</think>")[0].strip() for p in parts[1:] if "</think>" in p]
solution = f"<think>{' '.join(think_parts)}</think>"

# Add the final answer if it exists, removing "Answer:" prefix
if "<answer>" in gpt4o_response:
final_answer = (
gpt4o_response.split("<answer>")[-1].split("</answer>")[0].strip()
)
final_answer = re.sub(r"^Answer:\s*", "", final_answer)
solution += f"\n\n<answer>{final_answer}</answer>"

return problem, solution


def load_image_from_path(image_path):
try:
img = Image.open(image_path)
return img
except Exception as e:
print(f"Error loading image {image_path}: {str(e)}")
return None


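# Worker for datasets.map: convert one raw record into the target HF schema, with None fields on failure.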
def process_raw_data(raw_data):
# Parse the raw data if it's a string
if isinstance(raw_data, str):
data = json.loads(raw_data)
else:
data = raw_data

# Extract problem and solution
try:
problem, solution = extract_problem_solution(data["gpt4o_response"])
image = load_image_from_path(data["image_path"])

return {
"image": image,
"problem": problem,
"solution": solution,
"original_question": data["question"],
"original_answer": data["answer"],
}
except Exception as e:
print(f"Error processing data {data}: {str(e)}")
return {
"image": None,
"problem": None,
"solution": None,
"original_question": None,
"original_answer": None,
}


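# Script body: load the verified GPT-4o responses, reshape them into HF columns, then filter and push.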
raw_data_list = [
"/path/to/reasoning_data_with_response_90k_verified",
]

raw_data = concatenate_datasets([load_from_disk(path) for path in raw_data_list])

processed_data = raw_data.map(process_raw_data, num_proc=128).shuffle(seed=42)

hf_dict = {
"image": [],
"problem": [],
"solution": [],
"original_question": [],
"original_answer": [],
}

for item in tqdm(processed_data):
hf_dict["image"].append(item["image"])
hf_dict["problem"].append(item["problem"])
hf_dict["solution"].append(item["solution"])
hf_dict["original_question"].append(item["original_question"])
hf_dict["original_answer"].append(item["original_answer"])


features = datasets.Features(
{
"image": datasets.Image(),
"problem": datasets.Value("string"),
"solution": datasets.Value("string"),
"original_question": datasets.Value("string"),
"original_answer": datasets.Value("string"),
}
)


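# Row filters: drop solutions containing empty tag pairs and problems that leak "Answer:".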
def has_empty_tags(text):
# Pattern to match empty tags like <tag></tag>
pattern = r"<[^>]+></[^>]+>"
return bool(re.search(pattern, text))


def has_answer_pattern(text):
    return "Answer:" in text


def has_valid_image_size(example):  # Qwen2-VL-2B's processor requires images of at least 28x28
    # The image may be stored as a metadata dict or as a PIL Image, depending on the dataset.
    try:
        image = example["image"]
        if isinstance(image, dict) and "height" in image and "width" in image:
            return image["height"] >= 28 and image["width"] >= 28
        # Otherwise assume a PIL Image (anything exposing .height and .width)
        return image.height >= 28 and image.width >= 28
    except Exception:
        return False


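# Assemble the dataset with explicit features, drop malformed rows in parallel, and upload.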
ds = datasets.Dataset.from_dict(hf_dict, features=features)
ds = ds.filter(
lambda x: not has_empty_tags(x["solution"])
and not has_answer_pattern(x["problem"])
and has_valid_image_size(x)
and x["image"] is not None,
num_proc=128,
)
# Push to Hugging Face Hub
ds.push_to_hub("path/to/your/dataset")