def _remove_rightpad_tokens(generated_tokens, pad_token):
    """Strip the run of trailing ``pad_token`` entries from a token sequence.

    Only padding at the right-hand end is removed; pad tokens that occur
    before the last non-pad token are kept. The caller passes
    ``tokenizer.pad_token_id`` as ``pad_token`` to undo the right padding
    that, per the originating commit, comes from
    ``accelerator.pad_across_processes``.

    Args:
        generated_tokens: Indexable, sliceable sequence of token ids
            (the caller passes a 1-D NumPy row; a plain list also works).
        pad_token: Token id to strip. ``None`` means there is nothing to
            strip and the input is returned unchanged.

    Returns:
        ``generated_tokens`` without its trailing pad tokens, as the same
        container type as the input. The result is a slice of the input,
        so for a NumPy array it is a view rather than a copy.
    """
    # NOTE(review): presumably a tokenizer without a pad token reports
    # ``pad_token_id`` as None - confirm against the tokenizers in use.
    if pad_token is None:
        return generated_tokens

    # Walk back from the end to one past the last non-pad token, then
    # slice once. The previous per-token ``np.delete`` re-copied the whole
    # array for every pad token it removed.
    end = len(generated_tokens)
    while end > 0 and generated_tokens[end - 1] == pad_token:
        end -= 1
    return generated_tokens[:end]