#!/usr/bin/env python3
"""Display the first data item from the ECG QA CoT dataset as JSON."""

import csv
import json
import sys


def show_first_data(csv_file):
    """Read the first data row from *csv_file* and pretty-print it as JSON.

    Args:
        csv_file: Path to the CSV file.

    Exits with status 1 (message on stderr) if the file is missing, empty,
    or otherwise unreadable.
    """
    try:
        # newline="" is required by the csv module so that embedded newlines
        # inside quoted fields are parsed correctly.
        with open(csv_file, "r", encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f)
            first_row = next(reader)

        # Pretty print as JSON; ensure_ascii=False keeps non-ASCII text readable.
        print(json.dumps(first_row, indent=2, ensure_ascii=False))

    except FileNotFoundError:
        print(f"Error: File '{csv_file}' not found", file=sys.stderr)
        sys.exit(1)
    except StopIteration:
        print("Error: CSV file is empty", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    # Default path to the ECG QA CoT training data.
    csv_file = "data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_train.csv"

    # Allow a custom file path as a command-line argument.
    if len(sys.argv) > 1:
        csv_file = sys.argv[1]

    show_first_data(csv_file)
self.args.map_data in ["ecg_instruct_45k", "pretrain_mimic"]: text = instance["conversations"] ecg_path = "_".join(instance["ecg"].split("/")) - preprocessed_dir = f"./data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + preprocessed_dir = f"./ecg_bench/data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}" elif self.args.map_data == "ecg_instruct_pulse": text = instance["conversations"] @@ -70,9 +73,9 @@ def _process_mapping_instance(self, instance): text = [instance["question_type"], instance["question"], instance["answer"]] ecg_path = "_".join(instance["ecg_path"][0].split("/")[2:]) if self.args.map_data == "ecg-qa_ptbxl": - preprocessed_dir = f"./data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" else: - preprocessed_dir = f"./data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + preprocessed_dir = f"./ecg_bench/data/mimic/preprocessed_{self.args.segment_len}_{self.args.target_sf}" elif self.args.map_data == "ecg_bench_pulse": text = instance["conversations"] @@ -85,6 +88,20 @@ def _process_mapping_instance(self, instance): file_name = instance["ecg"] ecg_path, preprocessed_dir = self._get_ecg_grounding_path(file_name) + elif self.args.map_data == "ecg_qa_cot": + text = [ + instance["question_type"], + instance["question"], + instance["answer"], + instance.get("rationale", "") + ] + # Parse ecg_id from format "[13625]" to "13625" + ecg_id = instance["ecg_id"].strip("[]") + # PTB-XL path structure: records500/{subfolder}/{ecg_id}_hr + subfolder = ecg_id[:2] + "000" + ecg_path = f"records500_{subfolder}_{ecg_id}_hr" + preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + return ecg_path, text, name, preprocessed_dir def _prepare_ecg_grounding(self): @@ -123,11 +140,11 @@ def _prepare_ecg_instruct_pulse(self): return data def 
_prepare_ecg_qa_ptb(self): - preprocessed_dir = f"./data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" self.available_ecgs.update(f.stem for f in Path(preprocessed_dir).glob("*")) dataset_name = self.args.map_data.split("_")[1] - paraphrased_jsons = glob.glob(f"./data/ecg-qa/output/{dataset_name}/paraphrased/*/*.json") - template_jsons = glob.glob(f"./data/ecg-qa/output/{dataset_name}/template/*/*.json") + paraphrased_jsons = glob.glob(f"./ecg_bench/data/ecg-qa/output/{dataset_name}/paraphrased/*/*.json") + template_jsons = glob.glob(f"./ecg_bench/data/ecg-qa/output/{dataset_name}/template/*/*.json") path_to_all_jsons = paraphrased_jsons + template_jsons data = self.setup_ecg_qa(path_to_all_jsons) return data @@ -154,6 +171,13 @@ def _prepare_ecg_instruct_45k(self): data = self.fm.open_json(f"./data/{self.args.map_data}/{self.args.map_data}.json") return data + def _prepare_ecg_qa_cot(self): + """Prepare ECG-QA dataset with Chain-of-Thought rationale from CSV files""" + preprocessed_dir = f"./ecg_bench/data/ptb/preprocessed_{self.args.segment_len}_{self.args.target_sf}" + self.available_ecgs.update(f.stem for f in Path(preprocessed_dir).glob("*")) + data = self.setup_ecg_qa_cot() + return data + def _setup_ecg_bench_pulse(self, json_path): self.list_of_hf_datasets = ["cpsc-test", "csn-test-no-cot", "code15-test", "ptb-test", "ptb-test-report", "ecgqa-test"] data = [] @@ -248,3 +272,24 @@ def setup_ecg_qa(self, glob_paths, question_types=["single-verify", "single-choo filtered_list = [item for item in loaded_file if item["question_type"] in question_types] data.extend(filtered_list) return data + + def setup_ecg_qa_cot(self): + """Load ECG-QA CoT data from CSV files (train, val, test combined)""" + data = [] + splits = { + "train": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_train.csv", + "val": 
"./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_val.csv", + "test": "./ecg_bench/data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_test.csv" + } + + for split_name, csv_file in splits.items(): + if os.path.exists(csv_file): + with open(csv_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + data.append(row) + print(f"Loaded {split_name} split from {csv_file}") + else: + print(f"Warning: {csv_file} not found, skipping...") + + return data diff --git a/ecg_bench/show_processed_json.py b/ecg_bench/show_processed_json.py new file mode 100644 index 0000000..3cb17a4 --- /dev/null +++ b/ecg_bench/show_processed_json.py @@ -0,0 +1,22 @@ +import json + +# Read the first entry from the JSON file +with open('./data/ecg_qa_cot_mapped_1250.json', 'r') as f: + data = json.load(f) + + # Display the first entry + if isinstance(data, list) and len(data) > 0: + print("Processed ECG-QA-COT:") + print(json.dumps(data[0], indent=2)) + else: + print(f"Unexpected data type: {type(data)}") + +with open('./data/ecg-qa_ptbxl_mapped_1250.json', 'r') as f: + data = json.load(f) + + # Display the first entry + if isinstance(data, list) and len(data) > 0: + print("Processed ECG-QA-PTBXL:") + print(json.dumps(data[0], indent=2)) + else: + print(f"Unexpected data type: {type(data)}") diff --git a/ecg_bench/utils/file_manager.py b/ecg_bench/utils/file_manager.py index 639c511..1c2f1b7 100644 --- a/ecg_bench/utils/file_manager.py +++ b/ecg_bench/utils/file_manager.py @@ -32,6 +32,8 @@ def decode_batch(batch: dict) -> dict: @staticmethod def save_config(save_path: Union[str, Path], args: argparse.Namespace): args_dict = {k: v for k, v in vars(args).items() if not k.startswith("_")} + # Create directory if it doesn't exist + Path(save_path).mkdir(parents=True, exist_ok=True) with open(f"{save_path}/config.yaml", "w") as f: yaml.dump(args_dict, f, default_flow_style=False) From b83c6ff190db6abe149459d8c9b6be7446877630 Mon Sep 17 00:00:00 2001 From: nbbb24 Date: Wed, 12 Nov 
2025 06:44:16 +0000 Subject: [PATCH 2/2] update ecgqacot --- README.md | 13 +++++++++ ecg-plot | 1 + ecg_bench/ecg_qa_cot_data.py | 45 -------------------------------- ecg_bench/show_processed_json.py | 22 ---------------- scripts/preproccess.sh | 45 ++++++++++++++++++-------------- transformers | 1 + 6 files changed, 40 insertions(+), 87 deletions(-) create mode 160000 ecg-plot delete mode 100644 ecg_bench/ecg_qa_cot_data.py delete mode 100644 ecg_bench/show_processed_json.py create mode 160000 transformers diff --git a/README.md b/README.md index 161025e..33ee75d 100644 --- a/README.md +++ b/README.md @@ -255,6 +255,19 @@ wget https://physionet.org/static/published-projects/challenge-2020/classificati 3. Unzip the file and inside of `data/cpsc/classification-of-12-lead-ecgs-the-physionetcomputing-in-cardiology-challenge-2020-1.0.2/training` move the `cpsc_2018` and `cpsc_2018_extra` folders into the `data/cpsc` directory. Then delete the `classification-of-12-lead-ecgs-the-physionetcomputing-in-cardiology-challenge-2020-1.0.2` folder. +#### ECG-QA-COT + +1. Create a `ecg-qa-cot` folder inside the `data` directory. + +2. Inside `data/ecg-qa-cot` execute the following command in the terminal: +``` +wget "https://polybox.ethz.ch/index.php/s/D5QaJSEw4dXkzXm/download/ecg_qa_cot_final.zip" -O ecg_qa_cot_final.zip +``` +3. Unzip the file using +``` +unzip ecg_qa_cot_final.zip +``` + ### Preprocessing 1. Execute the preprocessing script by `bash scripts/preprocess.sh`. We have provided default configurations for all the datasets used in our study but feel free to experiment with others! 
diff --git a/ecg-plot b/ecg-plot new file mode 160000 index 0000000..d7eb9ac --- /dev/null +++ b/ecg-plot @@ -0,0 +1 @@ +Subproject commit d7eb9ace0c83b658b41cc5d36fe68d83135bf4ac diff --git a/ecg_bench/ecg_qa_cot_data.py b/ecg_bench/ecg_qa_cot_data.py deleted file mode 100644 index 36433c4..0000000 --- a/ecg_bench/ecg_qa_cot_data.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to display the first data item from ECG QA CoT dataset as JSON -""" - -import csv -import json -import sys - - -def show_first_data(csv_file): - """ - Read the first data item from CSV and display as JSON - - Args: - csv_file: Path to the CSV file - """ - try: - with open(csv_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - first_row = next(reader) - - # Pretty print as JSON - print(json.dumps(first_row, indent=2, ensure_ascii=False)) - - except FileNotFoundError: - print(f"Error: File '{csv_file}' not found", file=sys.stderr) - sys.exit(1) - except StopIteration: - print("Error: CSV file is empty", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"Error: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - # Default path to the ECG QA CoT training data - csv_file = "data/ecg-qa-cot/ecg_qa_cot/ecg_qa_cot_train.csv" - - # Allow custom file path as command line argument - if len(sys.argv) > 1: - csv_file = sys.argv[1] - - show_first_data(csv_file) \ No newline at end of file diff --git a/ecg_bench/show_processed_json.py b/ecg_bench/show_processed_json.py deleted file mode 100644 index 3cb17a4..0000000 --- a/ecg_bench/show_processed_json.py +++ /dev/null @@ -1,22 +0,0 @@ -import json - -# Read the first entry from the JSON file -with open('./data/ecg_qa_cot_mapped_1250.json', 'r') as f: - data = json.load(f) - - # Display the first entry - if isinstance(data, list) and len(data) > 0: - print("Processed ECG-QA-COT:") - print(json.dumps(data[0], indent=2)) - else: - print(f"Unexpected data type: {type(data)}") - -with 
open('./data/ecg-qa_ptbxl_mapped_1250.json', 'r') as f: - data = json.load(f) - - # Display the first entry - if isinstance(data, list) and len(data) > 0: - print("Processed ECG-QA-PTBXL:") - print(json.dumps(data[0], indent=2)) - else: - print(f"Unexpected data type: {type(data)}") diff --git a/scripts/preproccess.sh b/scripts/preproccess.sh index 46a1c74..6a4277c 100644 --- a/scripts/preproccess.sh +++ b/scripts/preproccess.sh @@ -1,21 +1,26 @@ -BASE_DATA_VALUES=("ptb" "mimic" "code15" "cpsc" "csn") -SEG_LENS=(1250 2500 500) +# BASE_DATA_VALUES=("ptb" "mimic" "code15" "cpsc" "csn") +# SEG_LENS=(1250 2500 500) -for base_data in "${BASE_DATA_VALUES[@]}"; do - for seg_len in "${SEG_LENS[@]}"; do - if [ "$base_data" = "mimic" ]; then - echo "Sampling $base_data with seg_len=$seg_len" - python preprocess_ecg.py \ - --base_data="$base_data" \ - --seg_len="$seg_len" \ - --preprocess_files \ - --sample_files --random_sampling - else - echo "Preprocessing $base_data with seg_len=$seg_len" - python preprocess_ecg.py \ - --base_data="$base_data" \ - --seg_len="$seg_len" \ - --preprocess_files - fi - done -done \ No newline at end of file +# for base_data in "${BASE_DATA_VALUES[@]}"; do +# for seg_len in "${SEG_LENS[@]}"; do +# if [ "$base_data" = "mimic" ]; then +# echo "Sampling $base_data with seg_len=$seg_len" +# python preprocess_ecg.py \ +# --base_data="$base_data" \ +# --seg_len="$seg_len" \ +# --preprocess_files \ +# --sample_files --random_sampling +# else +# echo "Preprocessing $base_data with seg_len=$seg_len" +# python preprocess_ecg.py \ +# --base_data="$base_data" \ +# --seg_len="$seg_len" \ +# --preprocess_files +# fi +# done +# done + +python ecg_bench/preprocess.py \ + --map_data="ecg-qa_ptbxl" \ + --segment_len=1250 \ + --target_sf=250 \ No newline at end of file diff --git a/transformers b/transformers new file mode 160000 index 0000000..241c04d --- /dev/null +++ b/transformers @@ -0,0 +1 @@ +Subproject commit 241c04d36867259cdf11dbb4e9d9a60f9cb65ebc