diff --git a/automatic_prompt_engineer/ape.py b/automatic_prompt_engineer/ape.py
index f7c1798..fbd16bd 100644
--- a/automatic_prompt_engineer/ape.py
+++ b/automatic_prompt_engineer/ape.py
@@ -18,8 +18,8 @@ def simple_ape(dataset,
                eval_template='Instruction: [PROMPT]\nInput: [INPUT]\nOutput: [OUTPUT]',
                prompt_gen_template=None,
                demos_template='Input: [INPUT]\nOutput: [OUTPUT]',
-               eval_model='text-davinci-002',
-               prompt_gen_model='text-davinci-002',
+               eval_model='gpt-3.5-turbo',
+               prompt_gen_model='gpt-3.5-turbo',
                prompt_gen_mode='forward',
                num_prompts=50,
                eval_rounds=20,
@@ -60,7 +60,7 @@ def simple_eval(dataset,
                prompts,
                eval_template='Instruction: [PROMPT]\nInput: [INPUT]\nOutput: [OUTPUT]',
                demos_template='Input: [INPUT]\nOutput: [OUTPUT]',
-               eval_model='text-davinci-002',
+               eval_model='gpt-3.5-turbo',
                num_samples=50):
     """
     Function that wraps the evaluate_prompts function to make it easier to use.
@@ -87,8 +87,8 @@ def simple_estimate_cost(dataset,
                eval_template='Instruction: [PROMPT]\nInput: [INPUT]\nOutput: [OUTPUT]',
                prompt_gen_template=None,
                demos_template='Input: [INPUT]\nOutput: [OUTPUT]',
-               eval_model='text-davinci-002',
-               prompt_gen_model='text-davinci-002',
+               eval_model='gpt-3.5-turbo',
+               prompt_gen_model='gpt-3.5-turbo',
                prompt_gen_mode='forward',
                num_prompts=50,
                eval_rounds=20,
diff --git a/automatic_prompt_engineer/configs/bandits.yaml b/automatic_prompt_engineer/configs/bandits.yaml
index e3862d3..d3bbca1 100644
--- a/automatic_prompt_engineer/configs/bandits.yaml
+++ b/automatic_prompt_engineer/configs/bandits.yaml
@@ -6,7 +6,7 @@ generation:
     name: GPT_forward # the name of the model used for prompt generation
     batch_size: 500 # the maximum batch size used for prompt generation
     gpt_config: # the configuration of the GPT model used for prompt generation (these are fed directly to the openai function)
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.9
       max_tokens: 50
       top_p: 0.9
@@ -27,7 +27,7 @@ evaluation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
@@ -38,7 +38,7 @@ demo:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
diff --git a/automatic_prompt_engineer/configs/default.yaml b/automatic_prompt_engineer/configs/default.yaml
index e4e15f7..45f68c7 100644
--- a/automatic_prompt_engineer/configs/default.yaml
+++ b/automatic_prompt_engineer/configs/default.yaml
@@ -6,7 +6,7 @@ generation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.9
       max_tokens: 50
       top_p: 0.9
@@ -20,7 +20,7 @@ evaluation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
@@ -31,7 +31,7 @@ demo:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
diff --git a/automatic_prompt_engineer/evaluation/likelihood.py b/automatic_prompt_engineer/evaluation/likelihood.py
index 58a6079..4ef3c86 100644
--- a/automatic_prompt_engineer/evaluation/likelihood.py
+++ b/automatic_prompt_engineer/evaluation/likelihood.py
@@ -80,9 +80,12 @@ def __init__(self, prompts, log_probs, num_samples):
     def _compute_avg_likelihood(self, prompts, log_probs, num_samples):
         i = 0
         prompt_log_probs = []
+        # TODO: prompts and num_samples should determine these loop lengths;
+        # the chat API may return a different number of logprob lists than
+        # requested, so size the inner loop by what actually came back.
         for prompt in prompts:
             prompt_log_probs.append([])
-            for _ in range(num_samples):
+            for _ in range(len(log_probs) // len(prompts)):
                 lps = log_probs[i]
                 prompt_log_probs[-1].append(sum(lps) / len(lps))
                 i += 1
diff --git a/automatic_prompt_engineer/llm.py b/automatic_prompt_engineer/llm.py
index b194aa4..b05bb90 100644
--- a/automatic_prompt_engineer/llm.py
+++ b/automatic_prompt_engineer/llm.py
@@ -4,8 +4,12 @@
 import time
 from tqdm import tqdm
 from abc import ABC, abstractmethod
 
-import openai
+from openai import OpenAI
+
+SYSTEM_PROMPT = 'You are a kick-ass prompt engineer. You are given input variables and the output generated by an LLM. Find the right prompt for this batch of inputs and outputs.'
+
+from automatic_prompt_engineer import utils
 
 gpt_costs_per_thousand = {
     'davinci': 0.0200,
@@ -18,10 +22,11 @@
 
 def model_from_config(config, disable_tqdm=True):
     """Returns a model based on the config."""
     model_type = config["name"]
+    client = OpenAI()
     if model_type == "GPT_forward":
-        return GPT_Forward(config, disable_tqdm=disable_tqdm)
+        return GPT_Forward(config, client, disable_tqdm=disable_tqdm)
     elif model_type == "GPT_insert":
-        return GPT_Insert(config, disable_tqdm=disable_tqdm)
+        return GPT_Insert(config, client, disable_tqdm=disable_tqdm)
     raise ValueError(f"Unknown model type: {model_type}")
 
@@ -54,11 +59,12 @@ def log_probs(self, text, log_prob_range):
 class GPT_Forward(LLM):
     """Wrapper for GPT-3."""
 
-    def __init__(self, config, needs_confirmation=False, disable_tqdm=True):
+    def __init__(self, config, client, needs_confirmation=False, disable_tqdm=True):
         """Initializes the model."""
         self.config = config
         self.needs_confirmation = needs_confirmation
         self.disable_tqdm = disable_tqdm
+        self.client = client
 
     def confirm_cost(self, texts, n, max_tokens):
         total_estimated_cost = 0
@@ -155,10 +161,13 @@ def __generate_text(self, prompt, n):
         for i in range(len(prompt)):
             prompt[i] = prompt[i].replace('[APE]', '').strip()
         response = None
+
+        messages = utils.get_messages(prompt)
+
         while response is None:
             try:
-                response = openai.Completion.create(
-                    **config, prompt=prompt)
+                response = self.client.chat.completions.create(
+                    **config, messages=messages)
             except Exception as e:
                 if 'is greater than the maximum' in str(e):
                     raise BatchSizeException()
@@ -166,7 +175,7 @@
                 print('Retrying...')
                 time.sleep(5)
 
-        return [response['choices'][i]['text'] for i in range(len(response['choices']))]
+        return [response.choices[i].message.content for i in range(len(response.choices))]
 
     def __complete(self, prompt, n):
         """Generates text from the model and returns the log prob data."""
@@ -178,10 +187,13 @@
         for i in range(len(prompt)):
             prompt[i] = prompt[i].replace('[APE]', '').strip()
         response = None
+
+        messages = utils.get_messages(prompt)
+
         while response is None:
             try:
-                response = openai.Completion.create(
-                    **config, prompt=prompt)
+                response = self.client.chat.completions.create(
+                    **config, messages=messages)
             except Exception as e:
                 print(e)
                 print('Retrying...')
@@ -199,42 +211,36 @@
             assert lower_index >= 0
             assert upper_index - 1 < len(text[i])
         config = self.config['gpt_config'].copy()
-        config['logprobs'] = 1
-        config['echo'] = True
-        config['max_tokens'] = 0
-        if isinstance(text, list):
-            text = [f'\n{text[i]}' for i in range(len(text))]
-        else:
-            text = f'\n{text}'
+        config['logprobs'] = True
+        config['top_logprobs'] = 1
+        # NOTE: the chat API cannot echo the prompt with max_tokens=0, so the
+        # returned logprobs describe generated tokens rather than `text`.
+        config['max_tokens'] = 50
         response = None
+
+        messages = utils.get_messages(text)
+
         while response is None:
             try:
-                response = openai.Completion.create(
-                    **config, prompt=text)
+                response = self.client.chat.completions.create(
+                    **config, messages=messages)
             except Exception as e:
                 print(e)
                 print('Retrying...')
                 time.sleep(5)
-        log_probs = [response['choices'][i]['logprobs']['token_logprobs'][1:]
-                     for i in range(len(response['choices']))]
-        tokens = [response['choices'][i]['logprobs']['tokens'][1:]
-                  for i in range(len(response['choices']))]
-        offsets = [response['choices'][i]['logprobs']['text_offset'][1:]
-                   for i in range(len(response['choices']))]
-
-        # Subtract 1 from the offsets to account for the newline
-        for i in range(len(offsets)):
-            offsets[i] = [offset - 1 for offset in offsets[i]]
-        if log_prob_range is not None:
-            # First, we need to find the indices of the tokens in the log probs
-            # that correspond to the tokens in the log_prob_range
-            for i in range(len(log_probs)):
-                lower_index, upper_index = self.get_token_indices(
-                    offsets[i], log_prob_range[i])
-                log_probs[i] = log_probs[i][lower_index:upper_index]
-                tokens[i] = tokens[i][lower_index:upper_index]
+        log_probs = []
+        tokens = []
+        for choice in response.choices:
+            this_log_probs = [c.logprob for c in choice.logprobs.content]
+            this_tokens = [c.token for c in choice.logprobs.content]
+            log_probs.append(this_log_probs)
+            tokens.append(this_tokens)
+
+        # TODO: log_prob_range holds character offsets into the prompt; without
+        # echoed text offsets it cannot be mapped to token indices, so it is
+        # currently ignored for chat models.
         return log_probs, tokens
 
     def get_token_indices(self, offsets, log_prob_range):
@@ -258,11 +279,12 @@
 
 class GPT_Insert(LLM):
 
-    def __init__(self, config, needs_confirmation=False, disable_tqdm=True):
+    def __init__(self, config, client, needs_confirmation=False, disable_tqdm=True):
         """Initializes the model."""
         self.config = config
         self.needs_confirmation = needs_confirmation
         self.disable_tqdm = disable_tqdm
+        self.client = client
 
     def confirm_cost(self, texts, n, max_tokens):
         total_estimated_cost = 0
@@ -314,10 +336,15 @@ def __generate_text(self, prompt, n):
         prefix = prompt[0].split('[APE]')[0]
         suffix = prompt[0].split('[APE]')[1]
         response = None
+
+        # TODO: the chat completions API has no insert/suffix mode, so the
+        # prefix and suffix must be folded into the messages themselves.
+        messages = utils.get_messages(prompt)
+
         while response is None:
             try:
-                response = openai.Completion.create(
-                    **config, prompt=prefix, suffix=suffix)
+                response = self.client.chat.completions.create(
+                    **config, messages=messages)
             except Exception as e:
                 print(e)
                 print('Retrying...')
diff --git a/experiments/configs/instruction_induction.yaml b/experiments/configs/instruction_induction.yaml
index 1246742..30da6a0 100644
--- a/experiments/configs/instruction_induction.yaml
+++ b/experiments/configs/instruction_induction.yaml
@@ -6,7 +6,7 @@ generation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.9
       max_tokens: 50
       top_p: 0.9
@@ -20,7 +20,7 @@ evaluation:
     name: GPT_forward
     batch_size: 20
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
@@ -31,7 +31,7 @@ demo:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
diff --git a/experiments/configs/truthful_qa.yaml b/experiments/configs/truthful_qa.yaml
index a88f46e..2ae25c6 100644
--- a/experiments/configs/truthful_qa.yaml
+++ b/experiments/configs/truthful_qa.yaml
@@ -6,7 +6,7 @@ generation:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.9
       max_tokens: 50
       top_p: 0.9
@@ -19,7 +19,7 @@ evaluation:
     name: GPT_forward
     batch_size: 20
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
@@ -30,7 +30,7 @@ demo:
     name: GPT_forward
     batch_size: 500
     gpt_config:
-      model: text-davinci-002
+      model: gpt-3.5-turbo
       temperature: 0.7
       max_tokens: 200
       top_p: 1.0
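
Note on utils.get_messages: the patch imports `utils` from automatic_prompt_engineer and calls `utils.get_messages(...)` at every call site it converts to the chat API, but the helper itself is not part of the diff. The sketch below is only a guess at what it must do, inferred from those call sites; the module path, function name, signature, and the decision to join a batch of prompts into a single user turn are all assumptions, and the real helper presumably uses the SYSTEM_PROMPT defined in llm.py.

# automatic_prompt_engineer/utils.py -- hypothetical sketch, not part of this patch.
# Assumed contract: get_messages() accepts a prompt string or a list of prompt
# strings and returns a Chat Completions `messages` list.

DEFAULT_SYSTEM_PROMPT = (
    'You are a kick-ass prompt engineer. You are given input variables and the '
    'output generated by an LLM. Find the right prompt for this batch of inputs '
    'and outputs.'
)  # duplicated here rather than imported from llm.py to avoid a circular import


def get_messages(prompt, system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Turn a prompt (or a batch of prompts) into a chat `messages` list."""
    if isinstance(prompt, list):
        # A chat request carries a single conversation, so a batch of prompts is
        # concatenated into one user turn here (assumption; the old Completion
        # endpoint accepted a list of prompts per request).
        user_content = '\n\n'.join(prompt)
    else:
        user_content = prompt
    return [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_content},
    ]

One behavioural difference worth keeping in mind when reviewing the patch: text-davinci-002 completions accepted a whole batch of prompts per request (one choice per prompt), whereas chat.completions takes a single message list per request, so the meaning of response.choices -- and therefore the batching in __generate_text, __complete and __log_probs -- changes under this migration.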