#!/usr/bin/env python3
"""Display the first data item from the ECG QA CoT dataset as JSON."""

import csv
import json
import sys


def show_first_data(csv_file):
    """Read the first data row from *csv_file* and pretty-print it as JSON.

    Args:
        csv_file: Path to the CSV file.

    Exits with status 1 (message on stderr) if the file is missing, empty,
    or otherwise unreadable.
    """
    try:
        # newline="" is required by the csv module so that embedded newlines
        # inside quoted fields are parsed correctly.
        with open(csv_file, "r", encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f)
            first_row = next(reader)

        # Pretty print as JSON; ensure_ascii=False keeps non-ASCII text readable.
        print(json.dumps(first_row, indent=2, ensure_ascii=False))

    except FileNotFoundError:
        print(f"Error: File '{csv_file}' not found", file=sys.stderr)
        sys.exit(1)
    except StopIteration:
        print("Error: CSV file is empty", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    # Default path to the ECG QA CoT training data.
    csv_file = "data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_train.csv"

    # Allow a custom file path as a command-line argument.
    if len(sys.argv) > 1:
        csv_file = sys.argv[1]

    show_first_data(csv_file)
self.args.map_data in ["ecg_instruct_45k", "pretrain_mimic"]: text = instance["conversations"] ecg_path = "_".join(instance["ecg"].split("/")) - preprocessed_dir = f"./data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + preprocessed_dir = f"./ecg_bench/data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}" elif self.args.map_data == "ecg_instruct_pulse": text = instance["conversations"] @@ -70,9 +73,9 @@ def _process_mapping_instance(self, instance): text = [instance["question_type"], instance["question"], instance["answer"]] ecg_path = "_".join(instance["ecg_path"][0].split("/")[2:]) if self.args.map_data == "ecg-qa_ptbxl": - preprocessed_dir = f"./data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" else: - preprocessed_dir = f"./data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + preprocessed_dir = f"./ecg_bench/data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}" elif self.args.map_data == "ecg_bench_pulse": text = instance["conversations"] @@ -85,6 +88,20 @@ def _process_mapping_instance(self, instance): file_name = instance["ecg"] ecg_path, preprocessed_dir = self._get_ecg_grounding_path(file_name) + elif self.args.map_data == "ecg_qa_cot": + text = [ + instance["question_type"], + instance["question"], + instance["answer"], + instance.get("rationale", "") + ] + # Parse ecg_id from format "[13625]" to "13625" + ecg_id = instance["ecg_id"].strip("[]") + # PTB-XL path structure: records500/{subfolder}/{ecg_id}_hr + subfolder = ecg_id[:2] + "000" + ecg_path = f"records500_{subfolder}_{ecg_id}_hr" + preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + return ecg_path, text, name, preprocessed_dir def _prepare_ecg_grounding(self): @@ -123,11 +140,11 @@ def _prepare_ecg_instruct_pulse(self): return data def 
_prepare_ecg_qa_ptb(self): - preprocessed_dir = f"./data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" self.available_ecgs.update(f.stem for f in Path(preprocessed_dir).glob("*")) dataset_name = self.args.map_data.split("_")[1] - paraphrased_jsons = glob.glob(f"./data/ecg-qa/output/{dataset_name}/paraphrased/*/*.json") - template_jsons = glob.glob(f"./data/ecg-qa/output/{dataset_name}/template/*/*.json") + paraphrased_jsons = glob.glob(f"./ecg_bench/data/ecg-qa/output/{dataset_name}/paraphrased/*/*.json") + template_jsons = glob.glob(f"./ecg_bench/data/ecg-qa/output/{dataset_name}/template/*/*.json") path_to_all_jsons = paraphrased_jsons + template_jsons data = self.setup_ecg_qa(path_to_all_jsons) return data @@ -154,6 +171,13 @@ def _prepare_ecg_instruct_45k(self): data = self.fm.open_json(f"./data/{self.args.map_data}/{self.args.map_data}.json") return data + def _prepare_ecg_qa_cot(self): + """Prepare ECG-QA dataset with Chain-of-Thought rationale from CSV files""" + preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + self.available_ecgs.update(f.stem for f in Path(preprocessed_dir).glob("*")) + data = self.setup_ecg_qa_cot() + return data + def _setup_ecg_bench_pulse(self, json_path): self.list_of_hf_datasets = ["cpsc-test", "csn-test-no-cot", "code15-test", "ptb-test", "ptb-test-report", "ecgqa-test"] data = [] @@ -248,3 +272,24 @@ def setup_ecg_qa(self, glob_paths, question_types=["single-verify", "single-choo filtered_list = [item for item in loaded_file if item["question_type"] in question_types] data.extend(filtered_list) return data + + def setup_ecg_qa_cot(self): + """Load ECG-QA CoT data from CSV files (train, val, test combined)""" + data = [] + splits = { + "train": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_train.csv", + "val": 
"./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_val.csv", + "test": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_test.csv" + } + + for split_name, csv_file in splits.items(): + if os.path.exists(csv_file): + with open(csv_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + data.append(row) + print(f"Loaded {split_name} split from {csv_file}") + else: + print(f"Warning: {csv_file} not found, skipping...") + + return data diff --git a/ecg_bench/show_processed_json.py b/ecg_bench/show_processed_json.py new file mode 100644 index 0000000..3cb17a4 --- /dev/null +++ b/ecg_bench/show_processed_json.py @@ -0,0 +1,22 @@ +import json + +# Read the first entry from the JSON file +with open('./data/ecg_qa_cot_mapped_1250.json', 'r') as f: + data = json.load(f) + + # Display the first entry + if isinstance(data, list) and len(data) > 0: + print("Processed ECG-QA-COT:") + print(json.dumps(data[0], indent=2)) + else: + print(f"Unexpected data type: {type(data)}") + +with open('./data/ecg-qa_ptbxl_mapped_1250.json', 'r') as f: + data = json.load(f) + + # Display the first entry + if isinstance(data, list) and len(data) > 0: + print("Processed ECG-QA-PTBXL:") + print(json.dumps(data[0], indent=2)) + else: + print(f"Unexpected data type: {type(data)}") diff --git a/ecg_bench/utils/file_manager.py b/ecg_bench/utils/file_manager.py index 639c511..1c2f1b7 100644 --- a/ecg_bench/utils/file_manager.py +++ b/ecg_bench/utils/file_manager.py @@ -32,6 +32,8 @@ def decode_batch(batch: dict) -> dict: @staticmethod def save_config(save_path: Union[str, Path], args: argparse.Namespace): args_dict = {k: v for k, v in vars(args).items() if not k.startswith("_")} + # Create directory if it doesn't exist + Path(save_path).mkdir(parents=True, exist_ok=True) with open(f"{save_path}/config.yaml", "w") as f: yaml.dump(args_dict, f, default_flow_style=False) From b83c6ff190db6abe149459d8c9b6be7446877630 Mon Sep 17 00:00:00 2001 From: nbbb24 Date: Wed, 12 Nov 
2025 06:44:16 +0000 Subject: [PATCH 2/2] update ecgqacot --- README.md | 13 +++++++++ ecg-plot | 1 + ecg_bench/ecg_qa_cot_data.py | 45 -------------------------------- ecg_bench/show_processed_json.py | 22 ---------------- scripts/preproccess.sh | 45 ++++++++++++++++++-------------- transformers | 1 + 6 files changed, 40 insertions(+), 87 deletions(-) create mode 160000 ecg-plot delete mode 100644 ecg_bench/ecg_qa_cot_data.py delete mode 100644 ecg_bench/show_processed_json.py create mode 160000 transformers diff --git a/README.md b/README.md index 161025e..33ee75d 100644 --- a/README.md +++ b/README.md @@ -255,6 +255,19 @@ wget https://physionet.org/static/published-projects/challenge-2020/classificati 3. Unzip the file and inside of `data/cpsc/classification-of-12-lead-ecgs-the-physionetcomputing-in-cardiology-challenge-2020-1.0.2/training` move the `cpsc_2018` and `cpsc_2018_extra` folders into the `data/cpsc` directory. Then delete the `classification-of-12-lead-ecgs-the-physionetcomputing-in-cardiology-challenge-2020-1.0.2` folder. +#### ECG-QA-COT + +1. Create a `ecg-qa-cot` folder inside the `data` directory. + +2. Inside `data/ecg-qa-cot` execute the following command in the terminal: +``` +wget "https://polybox.ethz.ch/index.php/s/D5QaJSEw4dXkzXm/download/ecg_qa_cot_final.zip" -O ecg_qa_cot_final.zip +``` +3. Unzip the file using +``` +unzip ecg_qa_cot_final.zip +``` + ### Preprocessing 1. Execute the preprocessing script by `bash scripts/preprocess.sh`. We have provided default configurations for all the datasets used in our study but feel free to experiment with others! 
diff --git a/ecg-plot b/ecg-plot new file mode 160000 index 0000000..d7eb9ac --- /dev/null +++ b/ecg-plot @@ -0,0 +1 @@ +Subproject commit d7eb9ace0c83b658b41cc5d36fe68d83135bf4ac diff --git a/ecg_bench/ecg_qa_cot_data.py b/ecg_bench/ecg_qa_cot_data.py deleted file mode 100644 index 36433c4..0000000 --- a/ecg_bench/ecg_qa_cot_data.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to display the first data item from ECG QA CoT dataset as JSON -""" - -import csv -import json -import sys - - -def show_first_data(csv_file): - """ - Read the first data item from CSV and display as JSON - - Args: - csv_file: Path to the CSV file - """ - try: - with open(csv_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - first_row = next(reader) - - # Pretty print as JSON - print(json.dumps(first_row, indent=2, ensure_ascii=False)) - - except FileNotFoundError: - print(f"Error: File '{csv_file}' not found", file=sys.stderr) - sys.exit(1) - except StopIteration: - print("Error: CSV file is empty", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"Error: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - # Default path to the ECG QA CoT training data - csv_file = "data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_train.csv" - - # Allow custom file path as command line argument - if len(sys.argv) > 1: - csv_file = sys.argv[1] - - show_first_data(csv_file) \ No newline at end of file diff --git a/ecg_bench/show_processed_json.py b/ecg_bench/show_processed_json.py deleted file mode 100644 index 3cb17a4..0000000 --- a/ecg_bench/show_processed_json.py +++ /dev/null @@ -1,22 +0,0 @@ -import json - -# Read the first entry from the JSON file -with open('./data/ecg_qa_cot_mapped_1250.json', 'r') as f: - data = json.load(f) - - # Display the first entry - if isinstance(data, list) and len(data) > 0: - print("Processed ECG-QA-COT:") - print(json.dumps(data[0], indent=2)) - else: - print(f"Unexpected data type: {type(data)}") - -with 
open('./data/ecg-qa_ptbxl_mapped_1250.json', 'r') as f: - data = json.load(f) - - # Display the first entry - if isinstance(data, list) and len(data) > 0: - print("Processed ECG-QA-PTBXL:") - print(json.dumps(data[0], indent=2)) - else: - print(f"Unexpected data type: {type(data)}") diff --git a/scripts/preproccess.sh b/scripts/preproccess.sh index 46a1c74..6a4277c 100644 --- a/scripts/preproccess.sh +++ b/scripts/preproccess.sh @@ -1,21 +1,26 @@ -BASE_DATA_VALUES=("ptb" "mimic" "code15" "cpsc" "csn") -SEG_LENS=(1250 2500 500) +# BASE_DATA_VALUES=("ptb" "mimic" "code15" "cpsc" "csn") +# SEG_LENS=(1250 2500 500) -for base_data in "${BASE_DATA_VALUES[@]}"; do - for seg_len in "${SEG_LENS[@]}"; do - if [ "$base_data" = "mimic" ]; then - echo "Sampling $base_data with seg_len=$seg_len" - python preprocess_ecg.py \ - --base_data="$base_data" \ - --seg_len="$seg_len" \ - --preprocess_files \ - --sample_files --random_sampling - else - echo "Preprocessing $base_data with seg_len=$seg_len" - python preprocess_ecg.py \ - --base_data="$base_data" \ - --seg_len="$seg_len" \ - --preprocess_files - fi - done -done \ No newline at end of file +# for base_data in "${BASE_DATA_VALUES[@]}"; do +# for seg_len in "${SEG_LENS[@]}"; do +# if [ "$base_data" = "mimic" ]; then +# echo "Sampling $base_data with seg_len=$seg_len" +# python preprocess_ecg.py \ +# --base_data="$base_data" \ +# --seg_len="$seg_len" \ +# --preprocess_files \ +# --sample_files --random_sampling +# else +# echo "Preprocessing $base_data with seg_len=$seg_len" +# python preprocess_ecg.py \ +# --base_data="$base_data" \ +# --seg_len="$seg_len" \ +# --preprocess_files +# fi +# done +# done + +python ecg_bench/preprocess.py \ + --map_data="ecg-qa_ptbxl" \ + --segment_len=1250 \ + --target_sf=250 \ No newline at end of file diff --git a/transformers b/transformers new file mode 160000 index 0000000..241c04d --- /dev/null +++ b/transformers @@ -0,0 +1 @@ +Subproject commit 241c04d36867259cdf11dbb4e9d9a60f9cb65ebc