From 98982aabbfb6deec860e6246f7215f0bebfefb64 Mon Sep 17 00:00:00 2001 From: windweller Date: Mon, 9 Sep 2024 10:59:22 -0700 Subject: [PATCH 01/10] add textgrad --- opto/optimizers/textgrad.py | 159 ++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 opto/optimizers/textgrad.py diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py new file mode 100644 index 00000000..96555f73 --- /dev/null +++ b/opto/optimizers/textgrad.py @@ -0,0 +1,159 @@ +from opto.optimizers.optimizer import Optimizer + +from textwrap import dedent, indent +from collections import defaultdict + +""" +Prompts are taken verbatim from: +https://github.com/zou-group/textgrad/blob/main/textgrad/optimizer/optimizer_prompts.py +""" + +GLOSSARY_TEXT = """ +### Glossary of tags that will be sent to you: +# - : The system prompt for the language model. +# - : The input to the language model. +# - : The output of the language model. +# - : The feedback to the variable. +# - : The conversation history. +# - : The focus of the optimization. +# - : The role description of the variable.""" + +### Optimize Prompts + +# System prompt to TGD +OPTIMIZER_SYSTEM_PROMPT = ( + "You are part of an optimization system that improves text (i.e., variable). " + "You will be asked to creatively and critically improve prompts, solutions to problems, code, or any other text-based variable. " + "You will receive some feedback, and use the feedback to improve the variable. " + "The feedback may be noisy, identify what is important and what is correct. " + "Pay attention to the role description of the variable, and the context in which it is used. " + "This is very important: You MUST give your response by sending the improved variable between {new_variable_start_tag} {{improved variable}} {new_variable_end_tag} tags. " + "The text you send between the tags will directly replace the variable.\n\n" + f"{GLOSSARY_TEXT}" +) + +# TGD update instruction +TGD_PROMPT_PREFIX = ( + "Here is the role of the variable you will improve: {variable_desc}.\n\n" + "The variable is the text within the following span: {variable_short} \n\n" + "Here is the context and feedback we got for the variable:\n\n" + "{variable_grad}\n\n" + "Improve the variable ({variable_desc}) using the feedback provided in tags.\n" +) + +# If the gradients are in a multi-part container +TGD_MULTIPART_PROMPT_INIT = ( + "Here is the role of the variable you will improve: {variable_desc}.\n\n" + "The variable is the text within the following span: {variable_short} \n\n" + "Here is the context and feedback we got for the variable:\n\n" +) + +TGD_MULTIPART_PROMPT_PREFIX = ( + "Improve the variable ({variable_desc}) using the feedback provided in tags.\n" +) + +TGD_PROMPT_SUFFIX = ( + "Send the improved variable " + "in the following format:\n\n{new_variable_start_tag}{{the improved variable}}{new_variable_end_tag}\n\n" + "Send ONLY the improved variable between the tags, and nothing else." +) + +MOMENTUM_PROMPT_ADDITION = ( + "Here are the past iterations of this variable:\n\n" + "{past_values}\n\n" + "Similar feedbacks across different steps suggests that the modifications to the variable are insufficient." 
+ "If this is the case, please make more significant changes to the variable.\n\n" +) + +CONSTRAINT_PROMPT_ADDITION = ( + "You must follow the following constraints:\n\n" + "{constraint_text}\n\n" +) + +IN_CONTEXT_EXAMPLE_PROMPT_ADDITION = ( + "You must base on the following examples when modifying the {variable_desc}:\n\n" + "{in_context_examples}\n\n" +) + + +def construct_tgd_prompt(do_momentum: bool = False, + do_constrained: bool = False, + do_in_context_examples: bool = False, + **optimizer_kwargs): + """ + Construct the textual gradient descent prompt. + + :param do_momentum: Whether to include momentum in the prompt. + :type do_momentum: bool, optional + :param do_constrained: Whether to include constraints in the prompt. + :type do_constrained: bool, optional + :param do_in_context_examples: Whether to include in-context examples in the prompt. + :type do_in_context_examples: bool, optional + :param optimizer_kwargs: Additional keyword arguments for formatting the prompt. These will be things like the variable description, gradient, past values, constraints, and in-context examples. + :return: The TGD update prompt. + :rtype: str + """ + + if isinstance(optimizer_kwargs["variable_grad"], str): + multipart = False + prompt = TGD_PROMPT_PREFIX.format(**optimizer_kwargs) + + else: + gradient_context = optimizer_kwargs["variable_grad"] + gradient_context = [TGD_MULTIPART_PROMPT_INIT.format(**optimizer_kwargs)] + gradient_context + multipart = True + prompt = TGD_MULTIPART_PROMPT_PREFIX.format(**optimizer_kwargs) + + if do_momentum: + prompt += MOMENTUM_PROMPT_ADDITION.format(**optimizer_kwargs) + + if do_constrained: + prompt += CONSTRAINT_PROMPT_ADDITION.format(**optimizer_kwargs) + + if do_in_context_examples: + prompt += IN_CONTEXT_EXAMPLE_PROMPT_ADDITION.format(**optimizer_kwargs) + + prompt += TGD_PROMPT_SUFFIX.format(**optimizer_kwargs) + + if not multipart: + return prompt + + else: + return gradient_context + [prompt] + + +GRADIENT_TEMPLATE = ( + "Here is a conversation:\n\n{context}\n\n" + "This conversation is potentially part of a larger system. The output is used as {response_desc}\n\n" + "Here is the feedback we got for {variable_desc} in the conversation:\n\n{feedback}\n\n" +) +GRADIENT_MULTIPART_TEMPLATE = ( + "Above is a conversation with a language model.\n" + "This conversation is potentially part of a larger system. The output is used as {response_desc}\n\n" + "Here is the feedback we got for {variable_desc} in the conversation:\n\n{feedback}\n\n" +) + +""" +Implementation loosely adapted from +https://github.com/zou-group/textgrad/blob/main/textgrad/optimizer/optimizer.py +""" + +class TextGrad(Optimizer): + + def __init__(self, parameters, *args, **kwargs): + super().__init__(parameters, *args, **kwargs) + + def _step(self): + trace_graph = self.trace_graph # aggregate the trace graphes into one. 
+ grads = defaultdict(str) # accumulated gradient + # trace_graph.graph is a list of nodes sorted according to the topological order + for i, (_, x) in enumerate(reversed(trace_graph.graph)): # back-propagation starts from the last node + if len(x.parents) == 0: + continue + # we take the + g = trace_graph.user_feedback if i == 0 else grads[x] + for p in self.parameters: + if p.trainable: + self._velocity[p] = self._momentum * self._velocity[p] - self._learning_rate * p.feedback + update_dict[p] = p.data + self._velocity[p] + return update_dict \ No newline at end of file From 9331d42a3edf522f169529d108afa94ccd9e2e0f Mon Sep 17 00:00:00 2001 From: windweller Date: Wed, 11 Sep 2024 12:01:02 -0700 Subject: [PATCH 02/10] update --- opto/optimizers/textgrad.py | 159 ++++++++++++++++++++++++++++++++++-- 1 file changed, 151 insertions(+), 8 deletions(-) diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py index 96555f73..2f00d346 100644 --- a/opto/optimizers/textgrad.py +++ b/opto/optimizers/textgrad.py @@ -1,4 +1,7 @@ +from typing import Any, List, Dict, Union, Tuple from opto.optimizers.optimizer import Optimizer +from opto.trace.nodes import ParameterNode, Node, MessageNode +from opto.trace.propagators import TraceGraph, GraphPropagator, Propagator from textwrap import dedent, indent from collections import defaultdict @@ -136,24 +139,164 @@ def construct_tgd_prompt(do_momentum: bool = False, """ Implementation loosely adapted from https://github.com/zou-group/textgrad/blob/main/textgrad/optimizer/optimizer.py + +Because Trace Graph is heterogeneous -- we do not treat LLM operations differently from other operations, +we don't implement specialized backward operators for LLM operations. + +TextGrad does treat LLM operations differently, and has specialized backward operators for LLM operations. +See: +https://github.com/zou-group/textgrad/blob/main/textgrad/autograd/llm_ops.py +https://github.com/zou-group/textgrad/blob/main/textgrad/autograd/llm_backward_prompts.py + +These two parts are not re-implemented here. """ +# def get_gradient_and_context_text(variable) -> Union[str, List[Union[str, bytes]]]: +# """For the variable, aggregates and returns +# i. the gradients +# ii. the context for which the gradients are computed. +# +# This is used by the optimizer. +# :return: A string containing the aggregated gradients and their corresponding context. +# :rtype: str +# """ +# +# gradient_content = [] +# for g in variable.gradients: +# if variable.gradients_context[g] is None: +# gradient_content.append(g.value) +# else: +# # If context is a list, we handle it differently. +# context = variable.gradients_context[g] +# if isinstance(context["context"], str): +# # The context could be all string. +# criticism_and_context = GRADIENT_TEMPLATE.format( +# feedback=g.value, **context) +# gradient_content.append(criticism_and_context) +# elif isinstance(context["context"], list): +# # The context may have a list of images / strings. In this case, we need to handle it differently. 
+# context_prompt = GRADIENT_MULTIPART_TEMPLATE.format(**context, feedback=g.value) +# criticism_and_context = context["context"] + [context_prompt] +# gradient_content.extend(criticism_and_context) +# else: +# raise ValueError("Context must be either a string or a list.") +# +# # Check if all instances are string +# if all(isinstance(i, str) for i in gradient_content): +# return "\n".join(gradient_content) +# else: +# return gradient_content + +def get_gradient_context(): + pass + class TextGrad(Optimizer): - def __init__(self, parameters, *args, **kwargs): + def __init__(self, parameters: List[ParameterNode], + config_list: List = None, + *args, + propagator: Propagator = None, + objective: Union[None, str] = None, + ignore_extraction_error: bool = True, + # ignore the type conversion error when extracting updated values from LLM's suggestion + include_example=False, + memory_size=0, # Memory size to store the past feedback + max_tokens=4096, + log=True, + **kwargs, ): super().__init__(parameters, *args, **kwargs) def _step(self): trace_graph = self.trace_graph # aggregate the trace graphes into one. - grads = defaultdict(str) # accumulated gradient + + # this is the same as gradient memory + grads = defaultdict(str) # accumulated gradient (same as variable.get_gradient_text()) + # trace_graph.graph is a list of nodes sorted according to the topological order for i, (_, x) in enumerate(reversed(trace_graph.graph)): # back-propagation starts from the last node if len(x.parents) == 0: continue - # we take the + # we take the gradient step-by-step g = trace_graph.user_feedback if i == 0 else grads[x] - for p in self.parameters: - if p.trainable: - self._velocity[p] = self._momentum * self._velocity[p] - self._learning_rate * p.feedback - update_dict[p] = p.data + self._velocity[p] - return update_dict \ No newline at end of file + + # TODO: compute gradient + # outputs, inputs, grad_outputs=None + propagated_grads = torch.autograd.grad(x.data, [p.data for p in x.parents], g) # propagate the gradient + + for p, pg in zip(x.parents, propagated_grads): + # TODO: accumulate gradient + grads[p] += pg # accumulate gradient + + # TODO: apply gradient + return {p: p.data - self.stepsize * grads[p] for p in self.parameters} # propose new update + + def call_llm( + self, system_prompt: str, user_prompt: str, verbose: Union[bool, str] = False, max_tokens: int = 4096 + ): + """Call the LLM with a prompt and return the response.""" + if verbose not in (False, "output"): + print("Prompt\n", system_prompt + user_prompt) + + messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] + + try: # Try tp force it to be a json object + response = self.llm.create( + messages=messages, + response_format={"type": "json_object"}, + max_tokens=max_tokens, + ) + except Exception: + response = self.llm.create(messages=messages, max_tokens=max_tokens) + response = response.choices[0].message.content + + if verbose: + print("LLM response:\n", response) + return response + + def get_label(self, x): + """Construct a label for a node based on its name, description, and content. + + Parameters + ---------- + x: The node for which the label is to be constructed. + + Note + ---------- + Using a colon in the name can cause problems in graph visualization tools like Graphviz. + To avoid issues, the label is constructed by combining the node's Python name, truncated description, and content. 
+ If the description or content exceeds the print limit, it is truncated and appended with an ellipsis. + """ + # using colon in the name causes problems in graphviz + description = x.description + if len(x.description) > self.print_limit: + description = x.description[:self.print_limit] + "..." + + text = x.py_name + "\n" + description + "\n" + content = str(x.data) + if isinstance(x.data, dict): + if "content" in x.data: + content = str(x.data["content"]) + + if len(content) > self.print_limit: + content = content[:self.print_limit] + "..." + return text + content + + def _update_prompt(self, node: Node, input_nodes, gradient_memory): + # gradient_memory: just accumulated gradient from the previous calculation + optimizer_information = { + "variable_desc": node.description, + "variable_value": node.data, + "variable_grad": get_gradient_and_context_text(variable), + "variable_short": node.py_name, + "constraint_text": self.constraint_text, + "new_variable_start_tag": self.new_variable_tags[0], + "new_variable_end_tag": self.new_variable_tags[1], + "in_context_examples": "\n".join(self.in_context_examples), + "gradient_memory": gradient_memory + } + + prompt = construct_tgd_prompt(do_constrained=self.do_constrained, + do_in_context_examples=( + self.do_in_context_examples and (len(self.in_context_examples) > 0)), + do_gradient_memory=(self.do_gradient_memory and (grad_memory != "")), + **optimizer_information) \ No newline at end of file From e19b30915f83025b504c3b499dd30af5af2896b5 Mon Sep 17 00:00:00 2001 From: windweller Date: Wed, 11 Sep 2024 16:55:36 -0700 Subject: [PATCH 03/10] update --- opto/optimizers/textgrad.py | 218 +++++++++++++++++++++++++++--------- 1 file changed, 166 insertions(+), 52 deletions(-) diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py index 2f00d346..7d11f4cb 100644 --- a/opto/optimizers/textgrad.py +++ b/opto/optimizers/textgrad.py @@ -13,9 +13,8 @@ GLOSSARY_TEXT = """ ### Glossary of tags that will be sent to you: -# - : The system prompt for the language model. -# - : The input to the language model. -# - : The output of the language model. +# - : The input to the operation. +# - : The output of the operation. # - : The feedback to the variable. # - : The conversation history. # - : The focus of the optimization. @@ -136,6 +135,113 @@ def construct_tgd_prompt(do_momentum: bool = False, "Here is the feedback we got for {variable_desc} in the conversation:\n\n{feedback}\n\n" ) +""" +https://github.com/zou-group/textgrad/blob/main/textgrad/autograd/llm_ops.py +https://github.com/zou-group/textgrad/blob/main/textgrad/autograd/llm_backward_prompts.py +""" + +GLOSSARY_TEXT_BACKWARD = """ +### Glossary of tags that will be sent to you: +# - : The input to the operation. +# - : The output of the operation. +# - : The objective of the optimization task. +# - : Specifies the span of the variable. +# - : The role description of the variable.""" + +### Backward engine prompts + +# System prompt to the backward engine. +BACKWARD_SYSTEM_PROMPT = ( + "You are part of an optimization system that improves a given text (i.e. the variable). You are the gradient (feedback) engine. " + "Your only responsibility is to give intelligent and creative feedback and constructive criticism to variables, given an objective specified in tags. " + "The variables may be solutions to problems, prompts to language models, code, or any other text-based variable. " + "Pay attention to the role description of the variable, and the context in which it is used. 
You should assume that the variable will be used in a similar context in the future. " + "Only provide strategies, explanations, and methods to change in the variable. DO NOT propose a new version of the variable, that will be the job of the optimizer. Your only job is to send feedback and criticism (compute 'gradients'). " + "For instance, feedback can be in the form of 'Since language models have the X failure mode...', 'Adding X can fix this error because...', 'Removing X can improve the objective function because...', 'Changing X to Y would fix the mistake ...', that gets at the downstream objective.\n" + "If a variable is already working well (e.g. the objective function is perfect, an evaluation shows the response is accurate), you should not give feedback.\n" + f"{GLOSSARY_TEXT_BACKWARD}") + +# First part of the prompt for the llm backward function +CONVERSATION_TEMPLATE = ( + " {prompt} \n\n" + " {response_value} \n\n" +) + +# Has the gradient on the output. +CONVERSATION_START_INSTRUCTION_CHAIN = ( + "You will give feedback to a variable with the following role: {variable_desc} . " + "Here is a conversation with a language model (LM):\n\n" + "{conversation}" +) +OBJECTIVE_INSTRUCTION_CHAIN = ( + "This conversation is part of a larger system. The was later used as {response_desc}.\n\n" + "Your goal is to give feedback to the variable to address the following feedback on the LM_OUTPUT: {response_gradient} \n\n" +) + +# Does not have gradient on the output +CONVERSATION_START_INSTRUCTION_BASE = ( + "You will give feedback to a variable with the following role: {variable_desc} . " + "Here is an evaluation of the variable using a language model:\n\n" + "{conversation}" +) + +OBJECTIVE_INSTRUCTION_BASE = ( + "Your goal is to give feedback and criticism to the variable given the above evaluation output. " + "Our only goal is to improve the above metric, and nothing else. \n\n" +) + +# Third part of the prompt for the llm backward function. +# Asks the user to evaluate a variable in the conversation. +EVALUATE_VARIABLE_INSTRUCTION = ( + "We are interested in giving feedback to the {variable_desc} " + "for this conversation. Specifically, give feedback to the following span " + "of text:\n\n " + "{variable_short} \n\n" + "Given the above history, describe how the {variable_desc} " + "could be improved to improve the . Be very creative, critical, and intelligent.\n\n" +) + +SEARCH_QUERY_BACKWARD_INSTRUCTION = ( + "Here is a query and a response from searching with {engine_name}:\n" + " {query} \n" + " {results} \n\n" +) + + +GRADIENT_OF_RESULTS_INSTRUCTION = ( + "For the search results from {engine_name} we got the following feedback:\n\n" + "{results_gradient}\n\n" +) + +IN_CONTEXT_EXAMPLE_PROMPT_ADDITION = ( + "You must base on the following examples when give feedback and criticism to the variable:\n\n" + "{in_context_examples}\n\n" +) + +""" +Gradient accumulation: reduce / sum +""" + +REDUCE_MEAN_SYSTEM_PROMPT = ( + "You are part of an optimization system that improves a given text (i.e. the variable). " + "Your only responsibility is to critically aggregate and summarize the feedback from sources. " + "The variables may be solutions to problems, prompts to language models, code, or any other text-based variable. " + "The multiple sources of feedback will be given to you in tags. " + "When giving a response, only provide the core summary of the feedback. Do not recommend a new version for the variable -- only summarize the feedback critically. 
" +) + + +def construct_reduce_prompt(gradients: List[str]): + """ + Construct a prompt that reduces the gradients. + """ + gradient_texts = [] + for i, gradient in enumerate(gradients): + gradient_texts.append(f"{gradient}") + gradient_texts = "\n".join(gradient_texts) + + return gradient_texts + """ Implementation loosely adapted from https://github.com/zou-group/textgrad/blob/main/textgrad/optimizer/optimizer.py @@ -143,53 +249,8 @@ def construct_tgd_prompt(do_momentum: bool = False, Because Trace Graph is heterogeneous -- we do not treat LLM operations differently from other operations, we don't implement specialized backward operators for LLM operations. -TextGrad does treat LLM operations differently, and has specialized backward operators for LLM operations. -See: -https://github.com/zou-group/textgrad/blob/main/textgrad/autograd/llm_ops.py -https://github.com/zou-group/textgrad/blob/main/textgrad/autograd/llm_backward_prompts.py - -These two parts are not re-implemented here. """ -# def get_gradient_and_context_text(variable) -> Union[str, List[Union[str, bytes]]]: -# """For the variable, aggregates and returns -# i. the gradients -# ii. the context for which the gradients are computed. -# -# This is used by the optimizer. -# :return: A string containing the aggregated gradients and their corresponding context. -# :rtype: str -# """ -# -# gradient_content = [] -# for g in variable.gradients: -# if variable.gradients_context[g] is None: -# gradient_content.append(g.value) -# else: -# # If context is a list, we handle it differently. -# context = variable.gradients_context[g] -# if isinstance(context["context"], str): -# # The context could be all string. -# criticism_and_context = GRADIENT_TEMPLATE.format( -# feedback=g.value, **context) -# gradient_content.append(criticism_and_context) -# elif isinstance(context["context"], list): -# # The context may have a list of images / strings. In this case, we need to handle it differently. 
-# context_prompt = GRADIENT_MULTIPART_TEMPLATE.format(**context, feedback=g.value) -# criticism_and_context = context["context"] + [context_prompt] -# gradient_content.extend(criticism_and_context) -# else: -# raise ValueError("Context must be either a string or a list.") -# -# # Check if all instances are string -# if all(isinstance(i, str) for i in gradient_content): -# return "\n".join(gradient_content) -# else: -# return gradient_content - -def get_gradient_context(): - pass - class TextGrad(Optimizer): def __init__(self, parameters: List[ParameterNode], @@ -206,11 +267,59 @@ def __init__(self, parameters: List[ParameterNode], **kwargs, ): super().__init__(parameters, *args, **kwargs) + def _construct_backward_prompt(self, backward_info): + conversation = CONVERSATION_TEMPLATE.format(**backward_info) + backward_prompt = CONVERSATION_START_INSTRUCTION_BASE.format(conversation=conversation, **backward_info) + backward_prompt += OBJECTIVE_INSTRUCTION_BASE.format(**backward_info) + backward_prompt += EVALUATE_VARIABLE_INSTRUCTION.format(**backward_info) + return backward_prompt + + def _construct_chain_backward_prompt(self, backward_info) -> str: + conversation = CONVERSATION_TEMPLATE.format(**backward_info) + backward_prompt = CONVERSATION_START_INSTRUCTION_CHAIN.format(conversation=conversation, **backward_info) + backward_prompt += OBJECTIVE_INSTRUCTION_CHAIN.format(**backward_info) + backward_prompt += EVALUATE_VARIABLE_INSTRUCTION.format(**backward_info) + return backward_prompt + + def _grad(self, input_node: Node, parent_nodes, gradient_text): + """ + https://github.com/zou-group/textgrad/blob/main/textgrad/autograd/llm_ops.py#L174 + + input_node is the response node + parent_nodes are the children_variables (predecessors) + + :param gradient_text: previous feedback + """ + propagated_grads = [] + for var_node in parent_nodes: + backward_info = { + "response_desc": input_node.description, + "response_value": input_node.data, + "response_gradient": gradient_text, + "prompt": var_node.data, # prompt = input to the operation + "variable_desc": var_node.description, + "variable_short": self.get_label(var_node) + } + backward_prompt = self._construct_chain_backward_prompt(backward_info) + gradient_value = self.call_llm(user_prompt=backward_prompt, system_prompt=BACKWARD_SYSTEM_PROMPT) + # we need to do inline modification of the child's feedback + propagated_grads.append(gradient_value) + + return propagated_grads + + def _reduce_gradient_mean(self, gradients: List[str]): + if len(gradients) == 1: + return gradients[0] + else: + gradient_reduce_prompt = construct_reduce_prompt(gradients) + reduced_gradient = self.call_llm(user_prompt=gradient_reduce_prompt, system_prompt=REDUCE_MEAN_SYSTEM_PROMPT) + return reduced_gradient + def _step(self): trace_graph = self.trace_graph # aggregate the trace graphes into one. 
# this is the same as gradient memory - grads = defaultdict(str) # accumulated gradient (same as variable.get_gradient_text()) + grads = defaultdict(list) # accumulated gradient (same as variable.get_gradient_text()) # trace_graph.graph is a list of nodes sorted according to the topological order for i, (_, x) in enumerate(reversed(trace_graph.graph)): # back-propagation starts from the last node @@ -218,14 +327,19 @@ def _step(self): continue # we take the gradient step-by-step g = trace_graph.user_feedback if i == 0 else grads[x] + if len(g) != 0: + # TODO: reduce step + g = self._reduce_gradient_mean(g) + grads[x] = [g] # TODO: compute gradient # outputs, inputs, grad_outputs=None - propagated_grads = torch.autograd.grad(x.data, [p.data for p in x.parents], g) # propagate the gradient + # propagated_grads = torch.autograd.grad(x.data, [p.data for p in x.parents], g) # propagate the gradient + propagated_grads = self._grad(x, x.parents, g) for p, pg in zip(x.parents, propagated_grads): - # TODO: accumulate gradient - grads[p] += pg # accumulate gradient + # TODO: accumulate gradient (append to list) + grads[p].append(pg) # accumulate gradient # TODO: apply gradient return {p: p.data - self.stepsize * grads[p] for p in self.parameters} # propose new update From 44d24e25cd62d3c712f519861034c5ef130fc370 Mon Sep 17 00:00:00 2001 From: windweller Date: Wed, 11 Sep 2024 17:32:04 -0700 Subject: [PATCH 04/10] finished --- opto/optimizers/textgrad.py | 92 ++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 26 deletions(-) diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py index 7d11f4cb..90513947 100644 --- a/opto/optimizers/textgrad.py +++ b/opto/optimizers/textgrad.py @@ -1,4 +1,5 @@ -from typing import Any, List, Dict, Union, Tuple +from dataclasses import dataclass +from typing import Any, List, Dict, Union, Tuple, Optional from opto.optimizers.optimizer import Optimizer from opto.trace.nodes import ParameterNode, Node, MessageNode from opto.trace.propagators import TraceGraph, GraphPropagator, Propagator @@ -251,6 +252,11 @@ def construct_reduce_prompt(gradients: List[str]): """ +@dataclass +class GradientInfo: + gradient: str # feedback + gradient_context: Optional[Dict[str, str]] + class TextGrad(Optimizer): def __init__(self, parameters: List[ParameterNode], @@ -266,6 +272,9 @@ def __init__(self, parameters: List[ParameterNode], log=True, **kwargs, ): super().__init__(parameters, *args, **kwargs) + self.new_variable_tags = ["", ""] + self.optimizer_system_prompt = OPTIMIZER_SYSTEM_PROMPT.format(new_variable_start_tag=self.new_variable_tags[0], + new_variable_end_tag=self.new_variable_tags[1]) def _construct_backward_prompt(self, backward_info): conversation = CONVERSATION_TEMPLATE.format(**backward_info) @@ -281,7 +290,7 @@ def _construct_chain_backward_prompt(self, backward_info) -> str: backward_prompt += EVALUATE_VARIABLE_INSTRUCTION.format(**backward_info) return backward_prompt - def _grad(self, input_node: Node, parent_nodes, gradient_text): + def _grad(self, input_node: Node, parent_nodes, gradient_text) -> List[GradientInfo]: """ https://github.com/zou-group/textgrad/blob/main/textgrad/autograd/llm_ops.py#L174 @@ -302,12 +311,17 @@ def _grad(self, input_node: Node, parent_nodes, gradient_text): } backward_prompt = self._construct_chain_backward_prompt(backward_info) gradient_value = self.call_llm(user_prompt=backward_prompt, system_prompt=BACKWARD_SYSTEM_PROMPT) - # we need to do inline modification of the child's feedback - 
propagated_grads.append(gradient_value) + conversation = CONVERSATION_TEMPLATE.format(**backward_info) + gradients_context = { + "context": conversation, + "response_desc": input_node.description, + "variable_desc": var_node.description + } + propagated_grads.append(GradientInfo(gradient_value, gradients_context)) return propagated_grads - def _reduce_gradient_mean(self, gradients: List[str]): + def _reduce_gradient_mean(self, gradients: List[GradientInfo]): if len(gradients) == 1: return gradients[0] else: @@ -315,6 +329,38 @@ def _reduce_gradient_mean(self, gradients: List[str]): reduced_gradient = self.call_llm(user_prompt=gradient_reduce_prompt, system_prompt=REDUCE_MEAN_SYSTEM_PROMPT) return reduced_gradient + def _get_gradient_and_context_text(self, gradients: List[GradientInfo]): + gradient_content = [] + for g in gradients: + if g.gradient_context is None: + gradient_content.append(g.gradient) + else: + criticism_and_context = GRADIENT_TEMPLATE.format( + feedback=g.gradient, **g.gradient_context) + gradient_content.append(criticism_and_context) + return "\n".join(gradient_content) + + def _update_prompt(self, node: Node, gradients: List[GradientInfo]): + # gradient_memory: just accumulated gradient from the previous calculation + optimizer_information = { + "variable_desc": node.description, + "variable_value": node.data, + "variable_grad": self._get_gradient_and_context_text(gradients), + "variable_short": self.get_label(node), + "constraint_text": node._constraint, + "new_variable_start_tag": self.new_variable_tags[0], + "new_variable_end_tag": self.new_variable_tags[1], + # "in_context_examples": "\n".join(self.in_context_examples), + # "gradient_memory": gradient_memory + } + + prompt = construct_tgd_prompt(do_constrained=True, + do_in_context_examples=False, + do_gradient_memory=False, + **optimizer_information) + + return prompt + def _step(self): trace_graph = self.trace_graph # aggregate the trace graphes into one. @@ -326,7 +372,7 @@ def _step(self): if len(x.parents) == 0: continue # we take the gradient step-by-step - g = trace_graph.user_feedback if i == 0 else grads[x] + g = GradientInfo(trace_graph.user_feedback, None) if i == 0 else grads[x] if len(g) != 0: # TODO: reduce step g = self._reduce_gradient_mean(g) @@ -342,7 +388,20 @@ def _step(self): grads[p].append(pg) # accumulate gradient # TODO: apply gradient - return {p: p.data - self.stepsize * grads[p] for p in self.parameters} # propose new update + # {p: p.data - self.stepsize * grads[p] for p in self.parameters} + + update_dict = {} + for p in self.parameters: + gradients = grads[p] + prompt_update_parameter = self._update_prompt(p, gradients) + response = self.call_llm(user_prompt=prompt_update_parameter, system_prompt=OPTIMIZER_SYSTEM_PROMPT) + try: + new_value = response.split(self.new_variable_tags[0])[1].split(self.new_variable_tags[1])[0].strip() + update_dict[p] = type(p.data)(new_value) + except IndexError: + pass + + return update_dict # propose new update def call_llm( self, system_prompt: str, user_prompt: str, verbose: Union[bool, str] = False, max_tokens: int = 4096 @@ -395,22 +454,3 @@ def get_label(self, x): content = content[:self.print_limit] + "..." 
return text + content - def _update_prompt(self, node: Node, input_nodes, gradient_memory): - # gradient_memory: just accumulated gradient from the previous calculation - optimizer_information = { - "variable_desc": node.description, - "variable_value": node.data, - "variable_grad": get_gradient_and_context_text(variable), - "variable_short": node.py_name, - "constraint_text": self.constraint_text, - "new_variable_start_tag": self.new_variable_tags[0], - "new_variable_end_tag": self.new_variable_tags[1], - "in_context_examples": "\n".join(self.in_context_examples), - "gradient_memory": gradient_memory - } - - prompt = construct_tgd_prompt(do_constrained=self.do_constrained, - do_in_context_examples=( - self.do_in_context_examples and (len(self.in_context_examples) > 0)), - do_gradient_memory=(self.do_gradient_memory and (grad_memory != "")), - **optimizer_information) \ No newline at end of file From 5e59006077f3566939e49832e64eb56f7c0d85f2 Mon Sep 17 00:00:00 2001 From: windweller Date: Sat, 14 Sep 2024 17:12:05 -0700 Subject: [PATCH 05/10] textgrad is now working --- opto/optimizers/textgrad.py | 79 +++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py index 90513947..67a3c6c7 100644 --- a/opto/optimizers/textgrad.py +++ b/opto/optimizers/textgrad.py @@ -1,8 +1,11 @@ +import json from dataclasses import dataclass +import autogen from typing import Any, List, Dict, Union, Tuple, Optional from opto.optimizers.optimizer import Optimizer from opto.trace.nodes import ParameterNode, Node, MessageNode from opto.trace.propagators import TraceGraph, GraphPropagator, Propagator +from copy import copy from textwrap import dedent, indent from collections import defaultdict @@ -257,6 +260,20 @@ class GradientInfo: gradient: str # feedback gradient_context: Optional[Dict[str, str]] + def __len__(self): + if self.gradient_context is None: + return 1 + else: + return 2 + + def __getitem__(self, key): + if key == 0: + return self.gradient + elif key == 1: + return self.gradient_context + else: + raise IndexError + class TextGrad(Optimizer): def __init__(self, parameters: List[ParameterNode], @@ -272,6 +289,10 @@ def __init__(self, parameters: List[ParameterNode], log=True, **kwargs, ): super().__init__(parameters, *args, **kwargs) + if config_list is None: + config_list = autogen.config_list_from_json("OAI_CONFIG_LIST") + self.llm = autogen.OpenAIWrapper(config_list=config_list) + self.print_limit = 100 self.new_variable_tags = ["", ""] self.optimizer_system_prompt = OPTIMIZER_SYSTEM_PROMPT.format(new_variable_start_tag=self.new_variable_tags[0], new_variable_end_tag=self.new_variable_tags[1]) @@ -307,7 +328,7 @@ def _grad(self, input_node: Node, parent_nodes, gradient_text) -> List[GradientI "response_gradient": gradient_text, "prompt": var_node.data, # prompt = input to the operation "variable_desc": var_node.description, - "variable_short": self.get_label(var_node) + "variable_short": self.get_variable_short_desc(var_node) } backward_prompt = self._construct_chain_backward_prompt(backward_info) gradient_value = self.call_llm(user_prompt=backward_prompt, system_prompt=BACKWARD_SYSTEM_PROMPT) @@ -346,7 +367,7 @@ def _update_prompt(self, node: Node, gradients: List[GradientInfo]): "variable_desc": node.description, "variable_value": node.data, "variable_grad": self._get_gradient_and_context_text(gradients), - "variable_short": self.get_label(node), + "variable_short": 
self.get_variable_short_desc(node), "constraint_text": node._constraint, "new_variable_start_tag": self.new_variable_tags[0], "new_variable_end_tag": self.new_variable_tags[1], @@ -361,8 +382,9 @@ def _update_prompt(self, node: Node, gradients: List[GradientInfo]): return prompt - def _step(self): - trace_graph = self.trace_graph # aggregate the trace graphes into one. + def _step(self, verbose=False): + # aggregate the trace graphes into one. + trace_graph = copy(self.trace_graph) # this is the same as gradient memory grads = defaultdict(list) # accumulated gradient (same as variable.get_gradient_text()) @@ -373,33 +395,40 @@ def _step(self): continue # we take the gradient step-by-step g = GradientInfo(trace_graph.user_feedback, None) if i == 0 else grads[x] - if len(g) != 0: + if len(g) > 1: # TODO: reduce step g = self._reduce_gradient_mean(g) + if verbose: + print(f"Reduced gradient for {x.py_name}: {g}") grads[x] = [g] # TODO: compute gradient # outputs, inputs, grad_outputs=None # propagated_grads = torch.autograd.grad(x.data, [p.data for p in x.parents], g) # propagate the gradient propagated_grads = self._grad(x, x.parents, g) + if verbose: + print(f"Propagated gradients for {x.py_name}: {propagated_grads}") for p, pg in zip(x.parents, propagated_grads): # TODO: accumulate gradient (append to list) grads[p].append(pg) # accumulate gradient # TODO: apply gradient - # {p: p.data - self.stepsize * grads[p] for p in self.parameters} update_dict = {} for p in self.parameters: gradients = grads[p] prompt_update_parameter = self._update_prompt(p, gradients) - response = self.call_llm(user_prompt=prompt_update_parameter, system_prompt=OPTIMIZER_SYSTEM_PROMPT) + response = self.call_llm(user_prompt=prompt_update_parameter, system_prompt=self.optimizer_system_prompt) try: - new_value = response.split(self.new_variable_tags[0])[1].split(self.new_variable_tags[1])[0].strip() - update_dict[p] = type(p.data)(new_value) - except IndexError: - pass + var_json = response.split(self.new_variable_tags[0])[1].split(self.new_variable_tags[1])[0].strip() + new_proposal = json.loads(var_json) + update_dict[p] = type(p.data)(new_proposal['value']) + if verbose: + # old value to new value + print(f"Updated {p.py_name} from {p.data} to {update_dict[p]}") + except Exception as e: + print(f"Error in updating {p.py_name}: {e}, raw response: {response}") return update_dict # propose new update @@ -426,25 +455,15 @@ def call_llm( print("LLM response:\n", response) return response - def get_label(self, x): - """Construct a label for a node based on its name, description, and content. - - Parameters - ---------- - x: The node for which the label is to be constructed. - - Note - ---------- - Using a colon in the name can cause problems in graph visualization tools like Graphviz. - To avoid issues, the label is constructed by combining the node's Python name, truncated description, and content. - If the description or content exceeds the print limit, it is truncated and appended with an ellipsis. + def get_variable_short_desc(self, x): + """This is what TextGrad optimizer will use to optimize. + It's important to just include the name and the value + and wrap in JSON """ - # using colon in the name causes problems in graphviz - description = x.description - if len(x.description) > self.print_limit: - description = x.description[:self.print_limit] + "..." 
+ variable_json = { + 'name': x.py_name, + } - text = x.py_name + "\n" + description + "\n" content = str(x.data) if isinstance(x.data, dict): if "content" in x.data: @@ -452,5 +471,7 @@ def get_label(self, x): if len(content) > self.print_limit: content = content[:self.print_limit] + "..." - return text + content + + variable_json['value'] = content + return json.dumps(variable_json) From e160a842c61c51626c83bc91bb3f90a84c599db4 Mon Sep 17 00:00:00 2001 From: windweller Date: Sat, 14 Sep 2024 17:13:40 -0700 Subject: [PATCH 06/10] remove todos --- opto/optimizers/textgrad.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py index 67a3c6c7..cc8760f5 100644 --- a/opto/optimizers/textgrad.py +++ b/opto/optimizers/textgrad.py @@ -402,19 +402,16 @@ def _step(self, verbose=False): print(f"Reduced gradient for {x.py_name}: {g}") grads[x] = [g] - # TODO: compute gradient - # outputs, inputs, grad_outputs=None - # propagated_grads = torch.autograd.grad(x.data, [p.data for p in x.parents], g) # propagate the gradient + # compute gradient propagated_grads = self._grad(x, x.parents, g) if verbose: print(f"Propagated gradients for {x.py_name}: {propagated_grads}") for p, pg in zip(x.parents, propagated_grads): - # TODO: accumulate gradient (append to list) + # accumulate gradient (append to list) grads[p].append(pg) # accumulate gradient - # TODO: apply gradient - + # apply gradient update_dict = {} for p in self.parameters: gradients = grads[p] From 43bcc9eaf099dafea0b17b3a75945e0edf24aff6 Mon Sep 17 00:00:00 2001 From: windweller Date: Sat, 14 Sep 2024 17:18:18 -0700 Subject: [PATCH 07/10] remove todos --- opto/optimizers/textgrad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py index cc8760f5..8c36e948 100644 --- a/opto/optimizers/textgrad.py +++ b/opto/optimizers/textgrad.py @@ -396,7 +396,7 @@ def _step(self, verbose=False): # we take the gradient step-by-step g = GradientInfo(trace_graph.user_feedback, None) if i == 0 else grads[x] if len(g) > 1: - # TODO: reduce step + # reduce step g = self._reduce_gradient_mean(g) if verbose: print(f"Reduced gradient for {x.py_name}: {g}") From 7fda3730c0d6ae29d9ed46ed98ef34bce0419c77 Mon Sep 17 00:00:00 2001 From: chinganc Date: Tue, 1 Oct 2024 16:01:01 -0700 Subject: [PATCH 08/10] Make textgrad optimizer less verbose. Add prompt_symbols argument to OptoPrime. 
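For reference, a minimal use of the new argument, copied from the comparison example added later in this series (where the relabeling is used to avoid a name clash with TextGrad's own "variable" terminology):

    optimizer = OptoPrime([system_prompt], prompt_symbols={'variables': '#Parameters'})

Keys that are not overridden keep their default labels (e.g. "#Instruction", "#Feedback"), and the substitution is applied to both the system and user prompts before the LLM call.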
--- opto/optimizers/__init__.py | 3 ++- opto/optimizers/optoprime.py | 26 +++++++++++++++++++++++++- opto/optimizers/textgrad.py | 7 +++---- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/opto/optimizers/__init__.py b/opto/optimizers/__init__.py index 34f80257..362eff51 100644 --- a/opto/optimizers/__init__.py +++ b/opto/optimizers/__init__.py @@ -1,4 +1,5 @@ from opto.optimizers.optoprime import OptoPrime from opto.optimizers.opro import OPRO +from opto.optimizers.textgrad import TextGrad -__all__ = ["OPRO", "OptoPrime"] \ No newline at end of file +__all__ = ["OPRO", "OptoPrime", "TextGrad"] \ No newline at end of file diff --git a/opto/optimizers/optoprime.py b/opto/optimizers/optoprime.py index 23d1d066..b228f797 100644 --- a/opto/optimizers/optoprime.py +++ b/opto/optimizers/optoprime.py @@ -238,7 +238,7 @@ class OptoPrime(Optimizer): def __init__( self, parameters: List[ParameterNode], - config_list: List = None, + config_list: List = None, # autogen config_dict *args, propagator: Propagator = None, objective: Union[None, str] = None, @@ -247,12 +247,16 @@ def __init__( memory_size=0, # Memory size to store the past feedback max_tokens=4096, log=True, + prompt_symbols=None, + filter_dict : Dict = None, # autogen filter_dict **kwargs, ): super().__init__(parameters, *args, propagator=propagator, **kwargs) self.ignore_extraction_error = ignore_extraction_error if config_list is None: config_list = autogen.config_list_from_json("OAI_CONFIG_LIST") + if filter_dict is not None: + config_list = autogen.filter_config_list(config_list, filter_dict) self.llm = autogen.OpenAIWrapper(config_list=config_list) self.objective = objective or self.default_objective self.example_problem = ProblemInstance.problem_template.format( @@ -281,6 +285,18 @@ def __init__( self.log = [] if log else None self.summary_log = [] if log else None self.memory = FIFOBuffer(memory_size) + self.prompt_symbols = { + "instruction": "#Instruction", + "code": "#Code", + "documentation": "#Documentation", + "variables": "#Variables", + "constraints": "#Constraints", + "inputs": "#Inputs", + "others": "#Others", + "outputs": "#Outputs", + "feedback": "#Feedback", + } + self.prompt_symbols.update(prompt_symbols or {}) def default_propagator(self): """Return the default Propagator object of the optimizer.""" @@ -383,10 +399,18 @@ def construct_prompt(self, summary, mask=None, *args, **kwargs): return system_prompt, user_prompt + def replace_symbols(self, text: str, symbols: Dict[str, str]) -> str: + for k, v in symbols.items(): + text = text.replace(k, v) + return text + def _step(self, verbose=False, mask=None, *args, **kwargs) -> Dict[ParameterNode, Any]: assert isinstance(self.propagator, GraphPropagator) summary = self.summarize() system_prompt, user_prompt = self.construct_prompt(summary, mask=mask) + system_prompt = self.replace_symbols(system_prompt, self.prompt_symbols) + user_prompt = self.replace_symbols(user_prompt, self.prompt_symbols) + response = self.call_llm( system_prompt=system_prompt, user_prompt=user_prompt, verbose=verbose, max_tokens=self.max_tokens ) diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py index 8c36e948..865acf9a 100644 --- a/opto/optimizers/textgrad.py +++ b/opto/optimizers/textgrad.py @@ -398,13 +398,13 @@ def _step(self, verbose=False): if len(g) > 1: # reduce step g = self._reduce_gradient_mean(g) - if verbose: + if verbose not in (False, "output"): print(f"Reduced gradient for {x.py_name}: {g}") grads[x] = [g] # compute gradient propagated_grads = 
self._grad(x, x.parents, g) - if verbose: + if verbose not in (False, "output"): print(f"Propagated gradients for {x.py_name}: {propagated_grads}") for p, pg in zip(x.parents, propagated_grads): @@ -421,7 +421,7 @@ def _step(self, verbose=False): var_json = response.split(self.new_variable_tags[0])[1].split(self.new_variable_tags[1])[0].strip() new_proposal = json.loads(var_json) update_dict[p] = type(p.data)(new_proposal['value']) - if verbose: + if verbose not in (False, "output"): # old value to new value print(f"Updated {p.py_name} from {p.data} to {update_dict[p]}") except Exception as e: @@ -471,4 +471,3 @@ def get_variable_short_desc(self, x): variable_json['value'] = content return json.dumps(variable_json) - From 55f8289fdff497280483671362a33e5e54fb4d25 Mon Sep 17 00:00:00 2001 From: chinganc Date: Tue, 1 Oct 2024 16:14:26 -0700 Subject: [PATCH 09/10] Add TextGrad code examples --- examples/textgrad_examples/README.md | 1 + .../evals/textgrad_prompt_optimization.py | 193 ++ .../notebooks/textgrad_primitives.ipynb | 60 + .../textgrad_prompt_optimization.ipynb | 2334 +++++++++++++++++ .../textgrad_solution_optimization.ipynb | 90 + .../textgrad_test_time_loss_for_code.ipynb | 381 +++ 6 files changed, 3059 insertions(+) create mode 100644 examples/textgrad_examples/README.md create mode 100644 examples/textgrad_examples/evals/textgrad_prompt_optimization.py create mode 100644 examples/textgrad_examples/notebooks/textgrad_primitives.ipynb create mode 100644 examples/textgrad_examples/notebooks/textgrad_prompt_optimization.ipynb create mode 100644 examples/textgrad_examples/notebooks/textgrad_solution_optimization.ipynb create mode 100644 examples/textgrad_examples/notebooks/textgrad_test_time_loss_for_code.ipynb diff --git a/examples/textgrad_examples/README.md b/examples/textgrad_examples/README.md new file mode 100644 index 00000000..77f90f80 --- /dev/null +++ b/examples/textgrad_examples/README.md @@ -0,0 +1 @@ +These are scripts adopted from TextGrad's repo to show how Trace can be used to tune their example codes and to compare the performance of both. Running these scripts require installing TextGrad first. \ No newline at end of file diff --git a/examples/textgrad_examples/evals/textgrad_prompt_optimization.py b/examples/textgrad_examples/evals/textgrad_prompt_optimization.py new file mode 100644 index 00000000..68a4882f --- /dev/null +++ b/examples/textgrad_examples/evals/textgrad_prompt_optimization.py @@ -0,0 +1,193 @@ +# This script applies Trace to optimize the workflow in TextGrad's prompt_optimization.py. 
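For context, a typical invocation of this script is sketched below. The flag values are simply the argparse defaults defined a few lines down, and the command itself is an assumed example rather than part of the original script; it presumes TextGrad is installed (see the README above) and that API credentials are available via the .env file loaded by load_dotenv.

    python textgrad_prompt_optimization.py --algo textgrad --task BBH_object_counting --evaluation_engine gpt-4o --test_engine gpt-3.5-turbo-0125

Passing any --algo value other than "textgrad" selects the OptoPrime branch instead, and adding --run_validation enables the validate-and-revert step adapted from TextGrad's prompt_optimization.py.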
+ +from opto import trace +from opto.optimizers import OptoPrime, TextGrad + +import argparse +import concurrent +from dotenv import load_dotenv +load_dotenv(override=True) + +from tqdm import tqdm +import textgrad as tg +from textgrad.tasks import load_task + +import numpy as np +import random + +def set_seed(seed): + np.random.seed(seed) + random.seed(seed) + +def config(): + parser = argparse.ArgumentParser(description="Optimize a prompt for a task.") + parser.add_argument("--algo", type=str, default="textgrad", help="The algorithm to use for optimization.") + parser.add_argument("--task", type=str, default="BBH_object_counting", help="The task to evaluate the model on.") + parser.add_argument("--evaluation_engine", type=str, default="gpt-4o", help="The API to use for evaluation.") + parser.add_argument("--test_engine", type=str, default="gpt-3.5-turbo-0125", help="The API to use for evaluation.") + parser.add_argument("--batch_size", type=int, default=3, help="The batch size to use for training.") + parser.add_argument("--max_epochs", type=int, default=3, help="The maximum number of epochs to train for.") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--run_validation", action="store_true", help="Whether to run validation or not.") + parser.add_argument("--do_not_run_larger_model", action="store_true", help="Whether to run the larger model or not.") + parser.add_argument("--num_threads", type=int, default=32, help="The number of threads to use for evaluation.") + return parser.parse_args() + +args = config() + +def eval_sample(item, eval_fn, model): + x, y = item + x = tg.Variable(x, requires_grad=False, role_description="query to the language model") + if np.issubdtype(type(y), np.integer): + y = int(y) + y = tg.Variable(y, requires_grad=False, role_description="correct answer for the query") + response = model(x) + try: + eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=y)) + return int(eval_output_variable.value) + except: + eval_output_variable = eval_fn([x, y, response]) + eval_output_parsed = eval_fn.parse_output(eval_output_variable) + return int(eval_output_parsed) + +def eval_dataset(test_set, eval_fn, model, max_samples: int=None): + if max_samples is None: + max_samples = len(test_set) + accuracy_list = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + futures = [] + for _, sample in enumerate(test_set): + + future = executor.submit(eval_sample, sample, eval_fn, model) + futures.append(future) + if len(futures) >= max_samples: + break + tqdm_loader = tqdm(concurrent.futures.as_completed(futures), total=len(futures), position=0) + for future in tqdm_loader: + acc_item = future.result() + accuracy_list.append(acc_item) + tqdm_loader.set_description(f"Accuracy: {np.mean(accuracy_list)}") + return accuracy_list + +def run_validation_revert(system_prompt: tg.Variable, results, model, eval_fn, val_set): + val_performance = np.mean(eval_dataset(val_set, eval_fn, model)) + previous_performance = np.mean(results["validation_acc"][-1]) + print("val_performance: ", val_performance) + print("previous_performance: ", previous_performance) + previous_prompt = results["prompt"][-1] + + if val_performance < previous_performance: + print(f"rejected prompt: {system_prompt.value}") + system_prompt.set_value(previous_prompt) + val_performance = previous_performance + + results["validation_acc"].append(val_performance) + + +set_seed(args.seed) +llm_api_eval = 
tg.get_engine(engine_name=args.evaluation_engine) +llm_api_test = tg.get_engine(engine_name=args.test_engine) +# tg.set_backward_engine(llm_api_eval, override=True) + +# Load the data and the evaluation function +train_set, val_set, test_set, eval_fn = load_task(args.task, evaluation_api=llm_api_eval) +print("Train/Val/Test Set Lengths: ", len(train_set), len(val_set), len(test_set)) +STARTING_SYSTEM_PROMPT = train_set.get_task_description() + +train_loader = tg.tasks.DataLoader(train_set, batch_size=args.batch_size, shuffle=True) +print(STARTING_SYSTEM_PROMPT) + +# Testing the 0-shot performance of the evaluation engine +system_prompt = trace.node(STARTING_SYSTEM_PROMPT, + trainable=True, + constraint="structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the QA task") + +# model_evaluation = tg.BlackboxLLM(llm_api_eval, system_prompt) +def model_evaluation(x): + return tg.BlackboxLLM(llm_api_eval, system_prompt.data)(x) + +if not args.do_not_run_larger_model: + reference = np.mean(eval_dataset(test_set, eval_fn, model_evaluation)) + + +def model(x): + return tg.BlackboxLLM(llm_api_test, system_prompt.data)(x) + +if args.algo == "textgrad": + # This runs Trace's TextGrad optimizer + optimizer = TextGrad([system_prompt]) +else: # This runs Trace's OptoPrime optimizer + optimizer = OptoPrime([system_prompt], prompt_symbols={'variables':'#Parameters'}) + +results = {"test_acc": [], "prompt": [], "validation_acc": []} +results["test_acc"].append(eval_dataset(test_set, eval_fn, model)) +results["validation_acc"].append(eval_dataset(val_set, eval_fn, model)) +results["prompt"].append(system_prompt.data) + + +# We define Trace operations by wrapping the original TextGrad codes + +@trace.bundle() +def query(system_prompt, *inputs): + """ Query the language model with the system prompt and the input query """ + return tg.BlackboxLLM(llm_api_test, system_prompt)(*inputs) + +@trace.bundle() +def eval_response(response, ground_truth_answer): + """ Evaluate the response of the language model with respect to the ground truth answer. 1 means correct, 0 means incorrect """ + try: + eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=ground_truth_answer)) + except: + eval_output_variable = eval_fn([x, ground_truth_answer, response]) + return eval_output_variable + +@trace.bundle() +def concat(*items): + """ Concatenate the items into a single string """ + output = '' + for i, item in enumerate(items): + output += f'{[i]}: {item}\n\n' + return output + + +for epoch in range(args.max_epochs): + for steps, (batch_x, batch_y) in enumerate((pbar := tqdm(train_loader, position=0))): + pbar.set_description(f"Training step {steps}. 
Epoch {epoch}") + optimizer.zero_feedback() + feedbacks = [] + for (x, y) in zip(batch_x, batch_y): + x = tg.Variable(x, requires_grad=False, role_description="query to the language model") + if np.issubdtype(type(y), np.integer): + y = int(y) + y = tg.Variable(y, requires_grad=False, role_description="correct answer for the query") + # trace these operations + response = query(system_prompt, x) # node + eval_output_variable = eval_response(response, y) # node + feedbacks.append(eval_output_variable) # list of nodes + + target = concat(*feedbacks) # node + target.backward("Improve correctness.") + optimizer.step(verbose='output') + + if args.run_validation: + # to implement the run_validation_revert in TextGrad + tg_system_prompt =tg.Variable(system_prompt.data, + requires_grad=True, + role_description="structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the QA task") + run_validation_revert(tg_system_prompt, results, model, eval_fn, val_set) + system_prompt._data = tg_system_prompt.value + + print("sys prompt: ", system_prompt.data) + test_acc = eval_dataset(test_set, eval_fn, model) + results["test_acc"].append(test_acc) + results["prompt"].append(system_prompt.data) + if steps == 3: + break + +# Also dump the final results +import json +import os +os.makedirs("textgrad_figures", exist_ok=True) +with open(f"./textgrad_figures/results_{args.task}_{args.test_engine}_{args.algo}.json", "w") as f: + json.dump(results, f) \ No newline at end of file diff --git a/examples/textgrad_examples/notebooks/textgrad_primitives.ipynb b/examples/textgrad_examples/notebooks/textgrad_primitives.ipynb new file mode 100644 index 00000000..c03651bf --- /dev/null +++ b/examples/textgrad_examples/notebooks/textgrad_primitives.ipynb @@ -0,0 +1,60 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM response:\n", + " {\n", + "\"reasoning\": \"1. The #Instruction asks for adjustments to the given variables to improve the output based on the feedback provided. 2. The #Feedback states that we need to evaluate the correctness of the sentence, which suggests that there are errors in the sentence that need correction. 3. The variable `str0` has a typo in the word 'sentence' ('sntence'), which needs correction to fulfill the feedback's need to evaluate correctness. Also, the sentence is missing a period at the end to be grammatically correct.\",\n", + "\"answer\": \"The sentence has a typographical error in 'sntence' which should be 'sentence'. 
It also lacks punctuation at the end.\",\n", + "\"suggestion\": {\n", + " \"str0\": \"A sentence with a typo.\"\n", + "}\n", + "}\n", + "A sentence with a typo.\n" + ] + } + ], + "source": [ + "from opto import trace\n", + "from opto.optimizers import OptoPrime\n", + "\n", + "x = trace.node(\"A sntence with a typo\", description=\"The input sentence\", trainable=True)\n", + "opt = OptoPrime([x])\n", + "\n", + "opt.zero_feedback()\n", + "x.backward(\"Evaluate the correctness of this sentence\")\n", + "opt.step(verbose='output')\n", + "\n", + "print(x.data)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "trace-3.9", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/textgrad_examples/notebooks/textgrad_prompt_optimization.ipynb b/examples/textgrad_examples/notebooks/textgrad_prompt_optimization.ipynb new file mode 100644 index 00000000..f0bc96a5 --- /dev/null +++ b/examples/textgrad_examples/notebooks/textgrad_prompt_optimization.ipynb @@ -0,0 +1,2334 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from opto import trace\n", + "from opto.optimizers import OptoPrime\n", + "\n", + "import os\n", + "\n", + "import argparse\n", + "import concurrent\n", + "from dotenv import load_dotenv\n", + "from tqdm import tqdm\n", + "import textgrad as tg\n", + "from textgrad.tasks import load_task\n", + "import numpy as np\n", + "import random\n", + "load_dotenv(override=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def set_seed(seed):\n", + " np.random.seed(seed)\n", + " random.seed(seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def eval_sample(item, eval_fn, model):\n", + " \"\"\"\n", + " This function allows us to evaluate if an answer to a question in the prompt is a good answer.\n", + "\n", + " \"\"\"\n", + " x, y = item\n", + " x = tg.Variable(x, requires_grad=False, role_description=\"query to the language model\")\n", + "\n", + " if np.issubdtype(type(y), np.integer):\n", + " y = int(y)\n", + "\n", + " y = tg.Variable(y, requires_grad=False, role_description=\"correct answer for the query\")\n", + " response = model(x)\n", + " try:\n", + " eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=y))\n", + " return int(eval_output_variable.value)\n", + " except:\n", + " eval_output_variable = eval_fn([x, y, response])\n", + " eval_output_parsed = eval_fn.parse_output(eval_output_variable)\n", + " return int(eval_output_parsed)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def eval_dataset(test_set, eval_fn, model, max_samples: int=None):\n", + " if max_samples is None:\n", + " max_samples = len(test_set)\n", + " accuracy_list = []\n", + "\n", + " futures = []\n", + " for _, sample in enumerate(test_set):\n", + "\n", + " future = eval_sample(sample, eval_fn, model)\n", + " futures.append(future)\n", + " if len(futures) 
>= max_samples:\n", + " break\n", + " tqdm_loader = tqdm(futures, total=len(futures), position=0)\n", + " for future in tqdm_loader:\n", + " acc_item = future\n", + " accuracy_list.append(acc_item)\n", + " tqdm_loader.set_description(f\"Accuracy: {np.mean(accuracy_list)}\")\n", + " return accuracy_list\n", + " # with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:\n", + " # futures = []\n", + " # for _, sample in enumerate(test_set):\n", + "\n", + " # future = executor.submit(eval_sample, sample, eval_fn, model)\n", + " # futures.append(future)\n", + " # if len(futures) >= max_samples:\n", + " # break\n", + " # tqdm_loader = tqdm(concurrent.futures.as_completed(futures), total=len(futures), position=0)\n", + " # for future in tqdm_loader:\n", + " # acc_item = future.result()\n", + " # accuracy_list.append(acc_item)\n", + " # tqdm_loader.set_description(f\"Accuracy: {np.mean(accuracy_list)}\")\n", + " # return accuracy_list" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def run_validation_revert(system_prompt: tg.Variable, results, model, eval_fn, val_set):\n", + " val_performance = np.mean(eval_dataset(val_set, eval_fn, model))\n", + " previous_performance = np.mean(results[\"validation_acc\"][-1])\n", + " print(\"val_performance: \", val_performance)\n", + " print(\"previous_performance: \", previous_performance)\n", + " previous_prompt = results[\"prompt\"][-1]\n", + "\n", + " if val_performance < previous_performance:\n", + " print(f\"rejected prompt: {system_prompt.value}\")\n", + " system_prompt.set_value(previous_prompt)\n", + " val_performance = previous_performance\n", + "\n", + " results[\"validation_acc\"].append(val_performance)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train/Val/Test Set Lengths: 50 100 100\n" + ] + } + ], + "source": [ + "set_seed(12)\n", + "llm_api_eval = tg.get_engine(engine_name=\"gpt-4o\")\n", + "llm_api_test = tg.get_engine(engine_name=\"gpt-3.5-turbo-0125\")\n", + "# tg.set_backward_engine(llm_api_eval, override=True)\n", + "\n", + "# Load the data and the evaluation function\n", + "train_set, val_set, test_set, eval_fn = load_task(\"BBH_object_counting\", evaluation_api=llm_api_eval)\n", + "print(\"Train/Val/Test Set Lengths: \", len(train_set), len(val_set), len(test_set))\n", + "STARTING_SYSTEM_PROMPT = train_set.get_task_description()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You will answer a reasoning question. Think step by step. 
The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + } + ], + "source": [ + "print(STARTING_SYSTEM_PROMPT)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.83: 100%|██████████| 100/100 [00:00<00:00, 1449.87it/s] \n", + "Accuracy: 0.75: 100%|██████████| 100/100 [00:00<00:00, 1790.99it/s] \n" + ] + } + ], + "source": [ + "train_loader = tg.tasks.DataLoader(train_set, batch_size=3, shuffle=True)\n", + "\n", + "\n", + "# Testing the 0-shot performance of the evaluation engine\n", + "system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT,\n", + " requires_grad=True,\n", + " role_description=\"system prompt to the language model\")\n", + "model_evaluation = tg.BlackboxLLM(llm_api_eval, system_prompt)\n", + "\n", + "# system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT,\n", + "# requires_grad=True,\n", + "# role_description=\"structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the QA task\")\n", + "# model = tg.BlackboxLLM(llm_api_test, system_prompt)\n", + "\n", + "# optimizer = tg.TextualGradientDescent(engine=llm_api_eval, parameters=[system_prompt])\n", + "\n", + "\n", + "## Trace code\n", + "@trace.bundle()\n", + "def query(system_prompt, *inputs):\n", + " \"\"\" Query the language model with the system prompt and the input query \"\"\"\n", + " return tg.BlackboxLLM(llm_api_test, system_prompt)(*inputs)\n", + "\n", + "system_prompt = trace.node(STARTING_SYSTEM_PROMPT, trainable=True)\n", + "optimizer = OptoPrime([system_prompt], prompt_symbols={'variables':'#Parameters'}) # XXX: to avoid name clash with TextGrad\n", + "model = lambda x: query(system_prompt, x).data\n", + "\n", + "results = {\"test_acc\": [], \"prompt\": [], \"validation_acc\": []}\n", + "results[\"test_acc\"].append(eval_dataset(test_set, eval_fn, model))\n", + "results[\"validation_acc\"].append(eval_dataset(val_set, eval_fn, model))\n", + "results[\"prompt\"].append(system_prompt.data)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training step 0. Epoch 0: : 0it [00:00, ?it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction is to change parameters to improve output correctness. The code evaluates responses for correctness (1 for correct, 0 for incorrect) based on a string `str0`, which is a prompt used in a language model query. The feedback indicates a need for improvement in correctness, yet the current evaluation outputs suggest the model is already producing correct answers (since `eval_response0`, `eval_response1`, and `eval_response2` are all 1). 
The parameters do not need alteration because the output shows correct answers for given inputs, contradicting the feedback.\",\n", + "\"answer\": \"The correctness of the outputs is already satisfactory based on the given inputs.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.75: 100%|██████████| 100/100 [00:00<00:00, 1734.42it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.75\n", + "previous_performance: 0.75\n", + " suggested sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.83: 100%|██████████| 100/100 [00:00<00:00, 1429.94it/s] \n", + "Training step 1. Epoch 0: : 1it [00:03, 3.86s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '0', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task is to improve the accuracy of the outputs based on the feedback. From the code and feedback, we observe that eval_response3, eval_response4, and eval_response5 are the results of evaluating the correctness of each 'eval' output against its ground truth. The 'eval_response4' is incorrect, with a value of 0. The logic likely fails to correctly count the total number of objects in Variable408, where the correct answer should be 8, not 7, as there are four chairs. Therefore, the model's response is incorrect for this variable. Changing the ground truth answer, Variable409, from 9 to 8 will not solve the issue, as it will still result in an inconsistency. Instead, we should ensure that the system prompt encourages the model to produce the accurate count for objects. We can modify the prompt to emphasize more detail-oriented counting.\",\n", + "\"answer\": null,\n", + "\"suggestion\": {\n", + " \"str0\": \"You will answer a reasoning question. Count each item accurately. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.51: 100%|██████████| 100/100 [00:00<00:00, 1129.10it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.51\n", + "previous_performance: 0.75\n", + "rejected prompt: You will answer a reasoning question. Count each item accurately. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " suggested sys prompt: You will answer a reasoning question. Count each item accurately. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.83: 100%|██████████| 100/100 [00:00<00:00, 1559.94it/s] \n", + "Training step 2. 
Epoch 0: : 2it [01:18, 45.58s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"1. The instruction asks to change the parameters to improve the correctness of the output, according to the feedback. 2. The feedback indicates to improve correctness, but the responses (eval606, eval607, eval608) and their evaluations (eval_response6, eval_response7, eval_response8) are already correct. They all match the ground truth answers, assigning a score of 1 for correctness. 3. The current 'str0' is a prompt guiding a reasoning approach for the model, which is achieving the desired correct outcomes. Thus, no change is needed in the parameters as the feedback suggests general correctness improvement, but the available outputs are already correct.\",\n", + "\"answer\": \"The correctness is already optimal.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.75: 100%|██████████| 100/100 [00:00<00:00, 1541.33it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.75\n", + "previous_performance: 0.75\n", + " suggested sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.83: 100%|██████████| 100/100 [00:00<00:00, 1263.10it/s] \n", + "Training step 3. Epoch 0: : 3it [01:18, 24.92s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['0', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction asks us to improve the output by adjusting the parameters according to the given feedback. Based on the feedback, eval809 returns an incorrect result for Variable818, where the expected number was 9 but the response evaluated it as 6. The parameter str0 is likely influencing how the model is interpreting the system prompt, and it is essential to ensure clarity and completeness for the reasoning questions being asked. The mismatch shows that the prompt did not guide the model to the correct understanding. Therefore, adjusting str0 might influence the eval function to produce the correct reasoning and final answer.\",\n", + "\"answer\": \"Update the 'system_prompt' to ensure clarity and enhance correctness in model responses.\",\n", + "\"suggestion\": {\n", + "\"str0\": \"You will answer a reasoning question. List all items in the category asked and count them. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.55: 100%|██████████| 100/100 [00:00<00:00, 1040.23it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.55\n", + "previous_performance: 0.75\n", + "rejected prompt: You will answer a reasoning question. List all items in the category asked and count them. 
The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " suggested sys prompt: You will answer a reasoning question. List all items in the category asked and count them. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.83: 100%|██████████| 100/100 [00:00<00:00, 1289.22it/s] \n", + "Training step 4. Epoch 0: : 4it [02:54, 52.69s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '0']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task is to adjust the system prompt `str0` in the parameters to improve the output. The current `str0` prompts the system to consider 'step by step' reasoning to solve questions, which is adequate for two of the cases (Variable1024 and Variable1026), but not for Variable1028. For the third example, the evaluated result was 7, but the ground truth was 9, leading to an incorrect result. The mistake seems related to the interpretation of quantity. The prompt should encourage considering all listed items when counting, potentially solving the error. Thus, the reasoning instruction in str0 should be more explicit about counting all items described in the input.\",\n", + "\"answer\": null,\n", + "\"suggestion\": {\n", + "\"str0\": \"You will answer a reasoning question. Count all the items given in the input completely. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.55: 100%|██████████| 100/100 [00:00<00:00, 1640.52it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.55\n", + "previous_performance: 0.75\n", + "rejected prompt: You will answer a reasoning question. Count all the items given in the input completely. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " suggested sys prompt: You will answer a reasoning question. Count all the items given in the input completely. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.83: 100%|██████████| 100/100 [00:00<00:00, 1676.92it/s] \n", + "Training step 5. Epoch 0: : 5it [02:54, 33.84s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Instruction asks for improving the correctness of the output based on the #Feedback. The #Feedback suggests that there's room for improvement even though the #Outputs indicate correctness with no incorrect evaluations (all `eval_response` values are 1). The code prompts the system with reasoning tasks related to counting objects and instruments. 
The provided prompts are accurate and correctly formulated to get the desired responses from the language model. However, there is no explicit indication of incorrectness, implying that the request for improvement may be to ensure robustness or prevent potential edge cases. Reviewing the variables and their evaluations shows consistency and correctness in their outputs, matching the expected results (all 1s), aligning with the ground truth answers. Therefore, no changes need to be made in #Parameters as they are already producing correct results.\",\n", + "\"answer\": \"The system already produces correct results based on the current prompts and inputs.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.75: 100%|██████████| 100/100 [00:00<00:00, 1719.53it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.75\n", + "previous_performance: 0.75\n", + " suggested sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.83: 100%|██████████| 100/100 [00:00<00:00, 1386.08it/s] \n", + "Training step 6. Epoch 0: : 6it [03:02, 25.16s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '0', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task is to adjust the parameter `str0` to improve the correctness of the logic model's responses to the provided queries. The current outputs show that the first and third responses are correct, but the second is incorrect. The feedback suggests improving correctness, particularly for the second query about vegetables. The prompt provided in `str0` may need slight modification to ensure the model accurately counts items in list-based queries. Since the model failed to provide the correct count for vegetables, a more explicit instruction or example in the prompt could assist in enhancing precision.\",\n", + "\"answer\": null,\n", + "\"suggestion\": {\n", + " \"str0\": \"You will answer a counting question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.82: 100%|██████████| 100/100 [00:00<00:00, 1271.36it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.82\n", + "previous_performance: 0.75\n", + " suggested sys prompt: You will answer a counting question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a counting question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.83: 100%|██████████| 100/100 [00:00<00:00, 1797.48it/s] \n", + "Training step 7. 
Epoch 0: : 7it [03:05, 17.61s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['0', '0', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The problem involves improving the correctness of responses generated using a given prompt and inputs. The `eval` function is used to evaluate the model outputs against ground truth answers. The goal is to adjust the system prompt `str0` in order to enhance the accuracy of these responses, as currently they score 0 for two of the inputs. The prompt should encourage the model to correctly count and state the number of objects or vegetables. The current prompt 'Think step by step.' might not be guiding the model effectively. By modifying the prompt to add more explicit counting guidance, we may improve accuracy. The outputs show that the response calculation for the number of vegetables and objects are currently incorrect. Therefore, adjusting the prompt to be more explicit in asking for enumeration might help the model produce correct outputs.\",\n", + "\"answer\": null,\n", + "\"suggestion\": {\n", + " \"str0\": \"You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1280.60it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.82\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1209.15it/s] \n", + "Training step 8. Epoch 0: : 8it [07:33, 97.52s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Instruction indicates that we need to modify parameters to improve output correctness according to #Feedback. In this case, we observe that the outputs 'concat8' have the values [1, 1, 1], which are correct evaluations since each individual 'eval_response' is 1, indicating correctness. However, given the feedback 'Improve correctness,' the issue seems related to how this output is presented. The given 'concat8' format is unclear about the correctness of each response or perhaps how the final presentation should be formatted. Since the evaluations of 'eval_response24', 'eval_response25', and 'eval_response26' are already correct, the issue is likely with the presentation or unnecessary complexity in the 'concat' function, not with the logic requiring input value changes.\",\n", + "\"answer\": \"The responses are already correct; there is nothing to fix in terms of correctness for individual evaluations.\",\n", + "\"suggestion\": {\n", + " \"str0\": \"You will answer a counting question. List each item and its count explicitly. 
Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1031.90it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1110.39it/s] \n", + "Training step 9. Epoch 0: : 9it [12:36, 161.70s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Instruction asks for improvements to the output based on the #Feedback which is 'Improve correctness.' The current output already shows correctness for each evaluation, 'eval_response27', 'eval_response28', and 'eval_response29' all return 1, indicating all predictions match the ground truth answers. The #Outputs show a list of 1s, suggesting that all responses are indeed correct when compared to their ground truths. Thus, the correctness does not need improvement, leading to the output being already optimal in terms of correctness.\",\n", + "\"answer\": \"The output already indicates all responses are correct.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1731.87it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 2363.87it/s] \n", + "Training step 10. Epoch 0: : 10it [12:47, 115.03s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The code evaluates three different counting tasks using a provided system prompt (str0) and input variables (Variable2260, Variable2262, Variable2264). 
Each evaluation result matches its ground truth answer, leading to eval_response30, eval_response31, and eval_response32 all being 1, indicating correctness in each case. These responses are then concatenated into concat10. The feedback suggests improving correctness, but the evaluations already match the ground truths. Since all eval_response values are already 1 (indicating correctness), the likely intention is to verify the output is correctly structured. The output shows a list of '1's, which is correct given the evaluations are all correct.\",\n", + "\"answer\": \"The evaluations are correct as per the given feedback and documentation. No changes are required to improve correctness.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 2414.14it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1319.42it/s] \n", + "Training step 11. Epoch 0: : 11it [12:52, 81.40s/it] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction asks us to improve the correctness of the output response, which is currently a list of three values [1, 1, 1]. The values indicate whether each evaluated response matches the ground truth (1 for correct, 0 for incorrect). From the feedback and the values in 'eval_response33', 'eval_response34', and 'eval_response35', we see that the responses are all currently marked as correct (all are 1). Hence, the output 'concat11' is also reflecting this result. To improve or validate correctness, we should ensure that the 'system_prompt' in 'str0' is correctly prompting the evaluation such that it always leads to correct responses. The current prompt in 'str0' asks the model to list items, count them explicitly, and provide a final total. This matches what seems to be needed for correct evaluations. The feedback does not suggest any issues with the correctness of inputs or the way 'str0' is phrased. As the evaluations are reaching a correct conclusion, no change in 'str0' seems necessary.\",\n", + "\"answer\": \"The output is already correct as it consistently gives [1, 1, 1], indicating all responses are correct.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1389.31it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. 
The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1733.13it/s] \n", + "Training step 12. Epoch 0: : 12it [12:59, 59.02s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction asks us to modify the parameters to improve the correctness of the output in line with the feedback. The code executes three queries and evaluates their responses, which are integer values indicating correctness (1 for correct and 0 for incorrect). The issue lies with the third evaluation (eval_response38) associated with 'eval2638'. The feedback indicates that the output should be improved for correctness. The problem with 'eval2638' stems from the output having the format 'Answer: $2', where the '$' symbol likely causes evaluation issues. By adjusting the system prompt to ensure outputs follow the expected response format without the '$' symbol, we can address the issue.\",\n", + "\"answer\": null,\n", + "\"suggestion\": {\n", + "\"str0\": \"You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1278.75it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.92\n", + "previous_performance: 0.93\n", + "rejected prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1315.80it/s] \n", + "Training step 13. Epoch 0: : 13it [16:06, 97.61s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction requires us to change the variables in #Parameters to improve the output based on the feedback. 
The output indicates that all responses (eval_response39, eval_response40, eval_response41) are correct, as they are all '1', meaning they match the ground truth answers. The concat function correctly concatenates these results, resulting in the string '1 1 1'. The feedback 'Improve correctness' might be misleading because the eval_response values are correct based on the provided ground truth answers. There seems to be no issue with the current outputs and the provided instructions and feedback might not be aligned. Since everything appears to be functioning as expected, no change is necessary.\",\n", + "\"answer\": \"TERMINATE\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 2349.12it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1369.79it/s] \n", + "Training step 14. Epoch 0: : 14it [16:13, 70.30s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The problem involves evaluating responses to counting questions and concatenating the correctness evaluation of each response. The output is correct as the responses match the ground truth answers: eval_response42, eval_response43, and eval_response44 are all '1', meaning they are correct. The 'concat' function is then used to merge these responses, resulting in the expected '1 1 1' output. Despite the feedback requesting 'improvement in correctness', the outputs appear accurate with respect to given input and expected results. Therefore, no changes to 'str0' are necessary as it seems to guide the system correctly to produce the accurate answers.\",\n", + "\"answer\": \"The output reflects complete correctness with all evaluations returning '1'.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1212.37it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. 
Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1615.84it/s] \n", + "Training step 15. Epoch 0: : 15it [16:19, 50.83s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The code evaluates three different counting queries using a language model and checks if the responses are correct. The concatenated output of evaluations indicates that all evaluated responses, eval_response45, eval_response46, and eval_response47, are correct since they are all 1. The feedback 'Improve correctness' conflicts with the output information provided, which shows that all responses are correct (1 means correct). This inconsistency could arise from an error in the feedback, as the evaluations are correct according to the given ground truth answers.\",\n", + "\"answer\": \"All concatenated evaluations are correct.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1669.71it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1205.07it/s] \n", + "Training step 16. Epoch 0: : 16it [16:25, 37.39s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction requires adjusting the parameters to improve the output based on the feedback. The feedback indicates 'Improve correctness,' but both eval_response48 and eval_response49 are already correct, as they both equal 1, aligning with the ground truth answers Variable3497 and Variable3499. Therefore, the outputs are already correct, and the 'concat' function is combining these correct values without issue, producing the expected output. Since no #Parameters need correction, the task is already optimized and does not require change.\",\n", + "\"answer\": \"TERMINATE\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1695.25it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. 
List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1449.41it/s] \n", + "Training step 16. Epoch 0: : 17it [16:29, 58.23s/it]\n", + "Training step 0. Epoch 1: : 0it [00:00, ?it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Feedback indicates that we should improve the correctness of the output, but the eval_response variables (eval_response50, eval_response51, eval_response52) all show a value of 1, meaning the responses are already marked as correct against the ground truth. The concatenated result concat17 is '1\\n\\n1\\n\\n1', which correctly reflects these evaluation scores. Therefore, there is no need to make any changes to the #Parameters because the outputs are already indicated as correct, and the instruction to improve correctness is inconsistent with the provided evaluations.\",\n", + "\"answer\": \"TERMINATE\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1323.47it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1302.71it/s] \n", + "Training step 1. Epoch 1: : 1it [00:04, 4.72s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"In this problem, we have three evaluations (eval3853, eval3854, eval3855) that use a system prompt (str0) to solve counting problems and then compare the results to ground truth answers (Variable3907, Variable3909, Variable3911) using 'eval_response'. The outputs suggest that eval_response for each evaluation is correct because they all return 1, indicating correctness. The 'concat' function combines the results into a single string, outputting '1, 1, 1', which shows each evaluation is correct. However, the feedback asks to 'improve correctness,' which implies a misunderstanding, as the outputs already indicate correctness. The issue may not be with str0 itself, as it's already prompting for accurate counting. 
The suggestion could be to double-check the problem statement to ensure understanding or verify any discrepancies in outputs.\",\n", + "\"answer\": \"All responses are correct as per eval_response output.\",\n", + "\"suggestion\": {\n", + "\"suggestion\": \"Ensure accurate and clear problem statement or investigate discrepancies if any incorrectness is present despite correct eval_response outputs.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1193.46it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1652.97it/s] \n", + "Training step 2. Epoch 1: : 2it [00:09, 4.84s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + " \"reasoning\": \"The current system prompt defined in `str0` appears to be working as expected, as the evaluated responses (`eval_response56`, `eval_response57`, and `eval_response58`) are all `1`, indicating correctness when compared to the ground truth answers (`Variable4113`, `Variable4115`, `Variable4117`). However, the feedback suggests improving correctness, likely indicating the need for future-proofing or ensuring robustness in different scenarios. Despite the feedback, given the response correctness is already satisfactory, no changes to `str0` are needed at this moment.\",\n", + " \"answer\": \"The correctness of the existing evaluation indicates that `str0` is sufficient for the given inputs. No change is necessary based on the current results.\",\n", + " \"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1780.70it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1126.77it/s] \n", + "Training step 3. 
Epoch 1: : 3it [00:14, 4.71s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction indicates that we need to adjust the parameters to improve the output correctness. In this case, the current output is already showing `1` for each response comparison with the ground truth, which is correct as per the feedback function documentation. The function `eval_response` returns `1` when the response matches the ground truth answer, and since all values are already `1`, the output is correct. The concatenated result of eval responses is also as expected since all the eval responses return `1`.\",\n", + "\"answer\": \"The outputs indicate correctness as all eval responses match their respective ground truth answers.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1691.68it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 2015.36it/s] \n", + "Training step 4. Epoch 1: : 4it [00:18, 4.56s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The code evaluates the responses to three counting questions and then evaluates these against the ground truth answers, resulting in a score of 1 (correct) for each. The concatenated result of these evaluations is as expected ([0]: 1, [1]: 1, [2]: 1). However, the feedback indicates a need to improve correctness. This feedback seems incorrect as all evaluations are already scoring 1, which means all are deemed correct. Therefore, it appears there is no further improvement needed in the parameters or logic as it currently functions correctly. If the feedback remains, it may be due to a misunderstanding or incorrect setup elsewhere.\",\n", + "\"answer\": null,\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1870.75it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. 
The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 2089.19it/s] \n", + "Training step 5. Epoch 1: : 5it [00:23, 4.69s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction is asking us to modify parameters to improve an output. The code is evaluating responses using prompt instructions. The feedback states 'Improve correctness', but the evaluations are already correct, as seen by eval_response65, eval_response66, and eval_response67 all being 1 (correct). The concatenated output also reflects correct evaluations. Since the output is correct and aligns with feedback, no changes are necessary.\",\n", + "\"answer\": \"TERMINATE\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1853.47it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 2882.37it/s] \n", + "Training step 6. Epoch 1: : 6it [00:24, 3.52s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task involves evaluating the correctness of responses to given prompts using a language model and ensuring they match the ground truth. Each evaluated response receives a score of 1 for correct answers and 0 for incorrect ones. In this instance, all responses have been evaluated as correct, which is verified by the values of `eval_response68`, `eval_response69`, and `eval_response70` all being 1. The final output `concat23` correctly lists these scores. The instruction to improve correctness is ambiguous because the correctness is already optimal as per current execution.\",\n", + "\"answer\": \"Correctness is already at its maximum. No changes needed.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1407.06it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. 
List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1691.45it/s] \n", + "Training step 7. Epoch 1: : 7it [00:27, 3.31s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"1. The instruction requires adjusting parameters to improve output based on feedback. The feedback indicates a need to improve correctness, though all eval_response values show correctness (1). This implies that initial interpretation may be incorrect or unclear. 2. The parameters don't impact computation directly, as the system prompt str0 only guides evaluation for accurate counting. 3. No change in str0 appears necessary due to correct evaluations already present for all inputs, leading to correct answers aligning with ground truths.\",\n", + "\"answer\": \"Current outputs are correct according to provided ground truth values. No changes are necessary.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1171.59it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1233.88it/s] \n", + "Training step 8. Epoch 1: : 8it [00:31, 3.37s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The feedback asks to improve correctness, but based on the #Outputs and #Others, it appears the model has already provided correct answers for each input query in 'eval5274', 'eval5275', and 'eval5276'. Since 'eval_response74', 'eval_response75', and 'eval_response76' each evaluate to 1, it indicates the responses are already correct according to 'eval_response' function, which returns 1 for correct responses. The 'concat25' combines these correct evaluations into a single string correctly. 
Therefore, no changes are needed to improve correctness as the model is already performing correctly.\",\n", + "\"answer\": \"The model is already providing the correct responses.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1752.10it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1767.26it/s] \n", + "Training step 9. Epoch 1: : 9it [00:35, 3.63s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Instruction asks us to adjust the #Parameters to improve the output based on #Feedback. In the given code, the function 'eval' is used to evaluate responses based on the input and system prompt. The 'eval_response' function checks the correctness against the ground truth answers. The prompt 'str0' guides the language model to process the input accurately. The current #Feedback indicates that the correctness needs to be improved. Upon reviewing #Others, it shows that all evaluated responses (eval5477, eval5478, eval5479) numerically match their respective ground truth answers (Variable5555, Variable5557, Variable5559). However, the final concatenated output is simply listing out correctness checks (1 for correct). The #Feedback suggests an improvement is needed, although the evaluations themselves appear correct. It might be a broader indication that the format of the concatenated output or its further processing wasn't clear or satisfactory.\",\n", + "\"answer\": \"\",\n", + "\"suggestion\": {\n", + " \"str0\": \"Please answer the following counting questions accurately, listing each item and its count explicitly. Then calculate the total. Ensure your final response includes 'Answer: $VALUE' where VALUE is a numerical value.\"\n", + " }\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.91: 100%|██████████| 100/100 [00:00<00:00, 1075.96it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.91\n", + "previous_performance: 0.93\n", + "rejected prompt: Please answer the following counting questions accurately, listing each item and its count explicitly. Then calculate the total. Ensure your final response includes 'Answer: $VALUE' where VALUE is a numerical value.\n", + " suggested sys prompt: Please answer the following counting questions accurately, listing each item and its count explicitly. Then calculate the total. Ensure your final response includes 'Answer: $VALUE' where VALUE is a numerical value.\n", + " sys prompt: You will answer a counting question. 
List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1221.11it/s] \n", + "Training step 10. Epoch 1: : 10it [05:23, 91.57s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The problem involves evaluating responses to counting questions using a language model and checking those responses against ground truth answers. Based on the #Feedback, we need to improve correctness, which implies the output is already correct as eval_response values (1, 1, 1) indicate that each response matched the ground truth. However, the #Instruction seems to require further improvement, possibly suggesting better alignment with desired output formatting or presentation. Considering that the evaluations are correct, no changes in #Parameters are required, as the concatenated responses (concat27) reflect the evaluated correctness (all 1s), meeting the expected output.\",\n", + "\"answer\": \"TERMINATE\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1515.06it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1469.08it/s] \n", + "Training step 11. Epoch 1: : 11it [05:29, 65.18s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task involves evaluating code that queries a language model to answer counting questions. Each evaluation checks if the response matches a ground truth answer. The correctness is assessed with values from `eval_response`, where 1 indicates a correct answer. The parameter `str0` is a prompt guiding the language model's responses. The feedback asks for improving correctness, but all `eval_response` values are 1, meaning all answers are correct. The output `concat28` concatenates these correctness checks, and the correctness seems to be already optimal. 
Since there are no constraints to adjust for further correctness improvement, no changes are needed to the parameters.\",\n", + "\"answer\": \"No changes needed as all responses are correct.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1173.02it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1781.37it/s] \n", + "Training step 12. Epoch 1: : 12it [05:32, 46.31s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '0']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task involves adjusting the `str0` parameter to improve the output correctness as indicated by the feedback. The eval function uses `str0` as a system prompt for generating responses, which are then checked for correctness using `eval_response`. The current outputs show that eval6088 is incorrect (`eval_response88=0`), as it doesn't match the ground truth. The response (eval6088) for the fruit counting in the third case gives a total of 13 instead of 14. The problem lies in incorrect calculations based on the input. Since str0 acts as a guide for generating responses, we should ensure it clearly directs to get accurate counts. The current prompt already instructs listing and counting each item, so another possible enhancement is to verify accuracy further with more explicit instructions.\",\n", + "\"answer\": null,\n", + "\"suggestion\": {\n", + "\"str0\": \"For counting questions, list each item and its count explicitly using the correct pluralization. Ensure the numerical value in the sum matches the count, and calculate the total accurately. The last line of your response should be formatted as 'Answer: $VALUE', where VALUE is the correct numerical total. Double-check for common miscounting errors.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1108.17it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.9\n", + "previous_performance: 0.93\n", + "rejected prompt: For counting questions, list each item and its count explicitly using the correct pluralization. Ensure the numerical value in the sum matches the count, and calculate the total accurately. The last line of your response should be formatted as 'Answer: $VALUE', where VALUE is the correct numerical total. Double-check for common miscounting errors.\n", + " suggested sys prompt: For counting questions, list each item and its count explicitly using the correct pluralization. 
Ensure the numerical value in the sum matches the count, and calculate the total accurately. The last line of your response should be formatted as 'Answer: $VALUE', where VALUE is the correct numerical total. Double-check for common miscounting errors.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1547.00it/s] \n", + "Training step 13. Epoch 1: : 13it [07:23, 66.06s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The problem does not indicate an issue with the current parameters, as all evaluations match the ground truth answers and return 1, indicating correctness. Each eval_response is 1, showing the responses are correct, therefore suggesting that the outputs are satisfactory. The 'concat30' output correctly reflects this evaluation. The feedback asking for improvements in correctness might not be substantiated as each eval operation already gives the correct result.\",\n", + "\"answer\": \"All eval_response variables already equal 1, indicating all answers are correct.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1142.99it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1677.17it/s] \n", + "Training step 14. Epoch 1: : 14it [07:28, 47.43s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task involves evaluating whether the responses generated by a model match the ground truth answers. The feedback indicates improving correctness, but the evaluations already show all responses were correct (1 means correct). The 'concat' function output is multi-line due to the string's format. Given that eval_response92, eval_response93, and eval_response94 are all 1, the instructions and outputs suggest no parameters need changing. 
The evaluation and concatenation logic are working as expected.\",\n", + "\"answer\": \"All evaluations are correct as they match the ground truth.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 3427.25it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 3662.86it/s] \n", + "Training step 15. Epoch 1: : 15it [07:29, 33.56s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Instruction is asking for adjustments to the #Parameters to improve the #Outputs. The #Feedback indicates an issue with correctness, but upon inspecting the #Others, each eval_response (eval_response95, eval_response96, eval_response97) is 1, indicating the responses are already evaluated as correct with respect to the ground truth. The concatenation of these values into a single string is also correctly resulting in 1 1 1. Therefore, the output is correct and matches the expected output given the eval_response results. The #Feedback doesn't correctly reflect the result, as there are no errors in correctness based on the evaluation steps outlined.\",\n", + "\"answer\": \"The output is already correct; no change in parameters is necessary.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1512.46it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1651.75it/s] \n", + "Training step 16. Epoch 1: : 16it [07:33, 24.70s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Feedback indicates that the output needs improvement in correctness. 
The #Code evaluates two different cases using the 'eval' function and compares the results against ground truth answers using 'eval_response'. The results for both cases, 'eval_response98' and 'eval_response99', are correct (both are 1). However, the concatenated output 'concat33' is a string representation of the results, which may not be the expected format. The current 'str0' parameter is appropriately structured to guide the language model to achieve correct answers since both responses match the ground truth. Therefore, the issue may not lie with 'str0' or the correctness of the individual evaluations but potentially in how they are concatenated or displayed. However, without specific constraints on the format of 'concat33', no changes to 'str0' are suggested as it already ensures the clarity and correctness of individual results.\",\n", + "\"answer\": \"The output is correct in individual evaluations.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1381.24it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1500.86it/s] \n", + "Training step 16. Epoch 1: : 17it [07:35, 26.82s/it]\n", + "Training step 0. Epoch 2: : 0it [00:00, ?it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task is to ensure that the language model provides correct outputs for the count and listing of items based on the system prompt. The current evaluations show that all responses are marked as '1,' meaning they are correct when compared with the ground truth. However, the instruction to 'Improve correctness' suggests there might be an issue in clarity or with adherence to formatting, specifically with not explicitly following the prompt's output format. This is evident in the use of '$' in the total lines instead of following the provided format. Therefore, the suggestion should focus on adjusting the format in the string prompt to better guide responses towards clarity in expression and adherence to the given format.\",\n", + "\"answer\": \"The correctness of the evaluations is currently perfect, so no numerical improvements can be suggested directly. The suggestion will aim at further enhancing clarity and adherence to the format.\",\n", + "\"suggestion\": {\n", + " \"str0\": \"You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should strictly follow the format: 'Answer: VALUE' where VALUE is a numerical value WITHOUT any special characters. 
Please ensure maximum clarity in explanation.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.89: 100%|██████████| 100/100 [00:00<00:00, 1138.57it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.89\n", + "previous_performance: 0.93\n", + "rejected prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should strictly follow the format: 'Answer: VALUE' where VALUE is a numerical value WITHOUT any special characters. Please ensure maximum clarity in explanation.\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should strictly follow the format: 'Answer: VALUE' where VALUE is a numerical value WITHOUT any special characters. Please ensure maximum clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1444.64it/s] \n", + "Training step 1. Epoch 2: : 1it [02:08, 128.32s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Instruction indicates that we need to improve the correctness of the output, based on the feedback provided. The feedback suggests that the correctness should be improved, but all eval_response variables already yield 1, which indicates that the answers were correctly matched with the ground truth answers. The responses generated for the inputs seem to match the expected outputs for those questions based on the ground truths. Since the outputs already show correctness, there is no need for changes in #Parameters as the eval function is functioning correctly with the system prompt provided.\",\n", + "\"answer\": \"The output correctness is already optimal given the problem setup.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.93: 100%|██████████| 100/100 [00:00<00:00, 1389.97it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.93\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.9: 100%|██████████| 100/100 [00:00<00:00, 1371.09it/s] \n", + "Training step 2. 
Epoch 2: : 2it [02:11, 54.44s/it] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The problem requires evaluating responses to counting questions. The system uses a prompt (str0) and evaluates the queries compared to a ground truth answer. The feedback suggests improving the correctness, implying some prompts may lack clarity or detail, leading to incorrect output. However, all eval_responses are correct, indicating the outputs were evaluated correctly against the ground truths. Since eval7508 displays 'Answer: $3' instead of 'Answer: 3', there might be a formatting issue caused by the prompt that affects clarity. Adjusting the prompt could enforce more straightforward formatting, ensuring the model correctly outputs numerical answers without variables like '$3'.\",\n", + "\"answer\": \"The format issue with eval7508 might be due to the prompt.\",\n", + "\"suggestion\": {\n", + " \"str0\": \"You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 1121.34it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.93\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1043.75it/s] \n", + "Training step 3. Epoch 2: : 3it [06:11, 139.38s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The feedback suggests improving correctness. The code uses eval and eval_response functions to evaluate the responses against the ground truth. The eval function calls the query function with a system prompt, evaluating expressions based on Variables 7818, 7820, and 7822. The eval_response function correctly judges all these responses as correct, resulting in eval_response109, eval_response110, and eval_response111 all being 1. Each individual evaluation correctly matches its ground truth, as seen in eval7709, eval7710, and eval7711. The final output, concat37, concatenates these results into a string indicating that all responses are correct (each is '1'). The instruction indicates satisfaction with the current output, as the core evaluation components are correct.\",\n", + "\"answer\": \"The existing output indicates the correctness of all individual evaluations (they are all '1'). 
No changes to inputs seem required given the correct results.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 2380.62it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 3680.67it/s] \n", + "Training step 4. Epoch 2: : 4it [06:17, 86.72s/it] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '0', '1']\n", + "LLM response:\n", + " { \"reasoning\": \"The instruction requires us to adjust the parameter to improve the output accuracy as indicated by the feedback. The feedback suggests there is an issue with evaluating correctness. The evaluations for eval7912 and eval7914 are correct as their respective eval_response values match the ground truth answers provided in Variables8025 and Variable8029. However, eval7913 returns 13, which does not match the expected ground truth in Variable8027, which is 14. This discrepancy results in a correctness evaluation of 0 for eval_response113. Considering the list in eval7913, the respondent calculated the total fruits to be 13 instead of 14. To improve correctness, we need to ensure the sum of fruits counts to 14. As per the given input, the count should be 'Blackberry + Peach + Nectarine + Plum + Raspberry + Orange + Strawberry + Banana + 2 Apples + 4 Grapes', which totals 14. The issue likely arises from how the system prompt structures or interprets the input. Adjusting `str0` to emphasize 'total count should reflect all items listed' could guide accurate computation, ensuring each listed fruit is counted as 1 unless explicitly specified otherwise (as with 'two apples' or 'four grapes').\", \"answer\": null, \"suggestion\": { \"str0\": \"You will answer a counting question. List each item and its count explicitly. Ensure each item is separately listed unless otherwise specified. Then calculate the total count, explicitly reflecting each item. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\" } }\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1309.80it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.92\n", + "previous_performance: 0.94\n", + "rejected prompt: You will answer a counting question. List each item and its count explicitly. Ensure each item is separately listed unless otherwise specified. Then calculate the total count, explicitly reflecting each item. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. 
Please ensure clarity in explanation.\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Ensure each item is separately listed unless otherwise specified. Then calculate the total count, explicitly reflecting each item. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1241.55it/s] \n", + "Training step 5. Epoch 2: : 5it [08:25, 101.70s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"1. The #Instruction asks to change the values in #Parameters to improve the output based on #Feedback. 2. The #Feedback 'Improve correctness' suggests that the evaluation responses should all be correct. However, according to #Others, all eval_responses are already 1, indicating correctness. The outputs under #Others match the ground truth answers exactly, which means the correctness is not an issue with the responses themselves. 3. Since no specific issue with the correctness of responses is identified, the #Feedback might be incorrectly suggesting there's an improvement needed when, in fact, all responses are already correct. If changes were suggested in #Parameters, it could potentially be addressing another non-obvious aspect of 'correctness'. However, without further specific indication of an issue, the code appears to function correctly as is.\",\n", + "\"answer\": \"The output is already correct with all evaluation responses being 1, indicating correctness.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 1233.91it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1171.41it/s] \n", + "Training step 6. Epoch 2: : 6it [08:32, 69.40s/it] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task involved evaluating multiple questions using a language model with the provided 'str0' system prompt. Each response is compared to a ground-truth answer to check for correctness, and the final output aggregates the correctness of each response. 
The provided feedback indicates that the outputs are already correct as 'eval_response118', 'eval_response119', and 'eval_response120' all equal 1, which corresponds to correct evaluations for all the responses. Therefore, no change is required because the outputs already satisfy the requirement for correctness.\",\n", + "\"answer\": \"The outputs are already correct.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 1609.21it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 2073.92it/s] \n", + "Training step 7. Epoch 2: : 7it [08:37, 48.46s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The problem requires evaluating several queries regarding the count of items using a language model, and assessing their correctness against ground truth values. The issue at hand is ensuring the output corresponds to the actual scenario described in #Parameters and #Inputs. Reviewing the values in #Others shows that each eval response is correctly matching its ground truth, leading to a concat result of all being correct (1). Thus, there is no evident issue with correctness in the approach seen in the #Feedback as all eval_response values show the model is responding correctly. The current system prompt should already be yielding correct results since all evaluations (eval_response121, eval_response122, eval_response123) are 1. If there are underlying correctness concerns, they are not highlighted by the existing outputs.\",\n", + "\"answer\": \"The language model responses are all correct according to the eval_response results, reflecting accurate completion and verification.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 3368.76it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. 
Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1578.51it/s] \n", + "Training step 8. Epoch 2: : 8it [08:43, 34.79s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task requires modifying the `str0` parameter to improve the output correctness. The provided outputs are already correct (all `eval_response` variables are 1, indicating the responses match the ground truth correctly). The concatenated result `concat42` also shows each entry as correct. The feedback asks for improvement, but each part of the evaluation is already consistent with the ground truth answers. Without specific issues identified in correctness or clarity, there are no changes needed.\",\n", + "\"answer\": \"The output is already correct.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 1572.20it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 2173.59it/s] \n", + "Training step 9. Epoch 2: : 9it [08:47, 25.24s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task requires evaluating the responses generated by the code against the ground truth answers provided in the variables. The responses are evaluated using `eval_response`, which returns 1 for a correct match, implying all responses are currently correct. However, the feedback states 'Improve correctness,' indicating a need to refine the logic for counting. The prompt in `str0` instructs how to count and present the answer. Since all responses are correct, the `str0` itself likely needs adjustments to enforce an even stricter counting and clarity policy. Given no further constraints, a possible enhancement could focus on clarity or redundancy elimination in the counting instructions to precisely encapsulate the logic intended by 'Improve correctness.' However, looking at the results and logic in place, the instructions may already be sufficiently detailed.\",\n", + "\"answer\": \"All responses are correct; str0 may already reflect necessary clarity. Adjust only if further improvement is specified.\",\n", + "\"suggestion\": {\n", + " \"str0\": \"You will answer a counting question. For each item, state \\\"Item: COUNT.\\\" Add counts to get a total. The last line of your response must be 'Answer: VALUE' with the numerical value. 
Clarity is paramount.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.76: 100%|██████████| 100/100 [00:00<00:00, 1262.49it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.76\n", + "previous_performance: 0.94\n", + "rejected prompt: You will answer a counting question. For each item, state \"Item: COUNT.\" Add counts to get a total. The last line of your response must be 'Answer: VALUE' with the numerical value. Clarity is paramount.\n", + " suggested sys prompt: You will answer a counting question. For each item, state \"Item: COUNT.\" Add counts to get a total. The last line of your response must be 'Answer: VALUE' with the numerical value. Clarity is paramount.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1537.15it/s] \n", + "Training step 10. Epoch 2: : 10it [10:23, 47.18s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The feedback 'Improve correctness' suggests that the current implementation might not be providing accurate results according to some expectations, although the current outputs seem correct based on the provided #Others values, where eval_response130, eval_response131, and eval_response132 all equal 1, indicating correctness according to eval_response function. The code uses a consistent #Parameters system_prompt for each query, which ensures that each prompt explicitly states a requirement to list items and count them before giving a final count. This prompt is consistent with the expected answers verified in #Others, leading to correct evaluations. Thus, no change in str0 is necessary as the accuracy requirement is already met.\",\n", + "\"answer\": \"The current outputs are correct, so no modifications are needed.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 1044.75it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1784.34it/s] \n", + "Training step 11. 
Epoch 2: : 11it [10:31, 35.21s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The #Feedback indicates that correctness should be improved, but examining the #Others section shows that each of the eval responses (eval9333, eval9334, eval9335) is evaluated as correct (1) when compared to their respective ground truth answers. Therefore, the outputs are already correct, and the concatenated result is just a representation of the correctness of each response, which is correct as `[0]: 1\\n\\n[1]: 1\\n\\n[2]: 1`. As such, there is no incorrectness to improve within the context provided. Since the task doesn't give further specific guidance, no changes in #Parameters are required.\",\n", + "\"answer\": \"TERMINATE\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 1881.39it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 3488.80it/s] \n", + "Training step 12. Epoch 2: : 12it [10:37, 26.04s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The task is to adjust the input prompt, `str0`, to enhance the correctness of the answers generated by the eval function. However, the `eval_response` functions have already confirmed that each response matches the ground truth with a value of 1, indicating full correctness for each query. The concatenated output contains `1` for each evaluation, implying the model's responses were deemed correct according to the given ground truths. Since all eval_response outputs are correct and the task requires improving correctness, there appears to be no current issue with the correctness of the responses based on the feedback provided. No changes to `str0` are necessary.\",\n", + "\"answer\": \"Responses are already correct as verified by eval_response outputs.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 2401.66it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. 
The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1576.23it/s] \n", + "Training step 13. Epoch 2: : 13it [10:42, 19.67s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction is asking us to adjust the parameters to improve the performance with respect to the provided feedback. The current output indicates that all evaluations (eval_response139, eval_response140, eval_response141) are correct, as they are all 1. However, there is feedback suggesting that correctness could be improved further. This may be a misinterpretation, as all outputs are correct based on the ground truth answers and the current implementation seems to handle it correctly. Therefore, the system prompt (str0) is effectively guiding the language model to produce the desired correct responses. Since the evaluations are already correct, based on ground truth values, and no specific issue is highlighted, no change is necessary.\",\n", + "\"answer\": \"The correctness seems already achieved as per the given outputs.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 2269.37it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1687.52it/s] \n", + "Training step 14. Epoch 2: : 14it [10:47, 15.23s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction asks to improve the correctness of the response outputs. However, the feedback is not specific regarding which part needs improvement since all eval_response values are 1, indicating correctness. Therefore, the concatenated output 'concat48' is as expected. There is no indication of specific errors or issues in the output that would imply incorrectness, and the current outputs fully align with their respective ground truth answers.\",\n", + "\"answer\": \"The outputs seem correct, and there's no action needed.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 1489.22it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. 
The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1342.57it/s] \n", + "Training step 15. Epoch 2: : 15it [10:52, 12.17s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['0', '1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"1. The instruction asks us to improve the output by modifying the parameters. The code evaluates different prompts against given data and compares them to the correct answers. 2. The feedback suggests that not all evaluated responses were correct, according to the correctness evaluation (`eval_response`), which returned 0 for one response. Specifically, the response for `Variable10290` had an incorrect answer ('Total: 8 + 4 = 12' instead of the expected 11). 3. The solution is to adjust `str0` to ensure it prompts the code to sum the values correctly. The prompt may require clarification so that non-integer items are not mistakenly summed or missed.\",\n", + "\"answer\": \"Modify `str0` to ensure clearer counting and summing of items.\",\n", + "\"suggestion\": {\n", + " \"str0\": \"You will answer a counting question. List each object and its count explicitly without repetition. Calculate the total number of distinct items. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Ensure no miscounting.\"\n", + "}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.35: 100%|██████████| 100/100 [00:00<00:00, 1064.91it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.35\n", + "previous_performance: 0.94\n", + "rejected prompt: You will answer a counting question. List each object and its count explicitly without repetition. Calculate the total number of distinct items. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Ensure no miscounting.\n", + " suggested sys prompt: You will answer a counting question. List each object and its count explicitly without repetition. Calculate the total number of distinct items. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Ensure no miscounting.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 1243.03it/s] \n", + "Training step 16. Epoch 2: : 16it [12:38, 40.52s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feedback ['1', '1']\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The feedback indicates a need for improvement in correctness. The current outputs demonstrate that the responses produced by the eval are correct as both eval_response148 and eval_response149 are 1, meaning the answers match the ground truth. 
The concatenated output concat50 is already '1 1', showing both responses are correct. The given system prompt, found in str0, specifies how to enumerate items and calculate a total. It appears that the outputs are correct as intended, thus the 'correctness' mentioned might refer to some other aspect not apparent from the current context. Without specific incorrect output or contradictions in the feedback, there is no evident change necessary to the parameters.\",\n", + "\"answer\": \"No apparent changes needed since both responses are already correct.\",\n", + "\"suggestion\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.94: 100%|██████████| 100/100 [00:00<00:00, 2614.82it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "val_performance: 0.94\n", + "previous_performance: 0.94\n", + " suggested sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n", + " sys prompt: You will answer a counting question. List each item and its count explicitly. Then calculate the total. The last line of your response should be: 'Answer: VALUE' where VALUE is a numerical count. Please ensure clarity in explanation.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Accuracy: 0.92: 100%|██████████| 100/100 [00:00<00:00, 3597.73it/s] \n", + "Training step 16. Epoch 2: : 17it [12:43, 44.92s/it]\n" + ] + } + ], + "source": [ + "@trace.bundle()\n", + "def eval_response(response, ground_truth_answer):\n", + " \"\"\" Evaluate the response of the language model with respect to the ground truth answer. 1 means correct, 0 means incorrect \"\"\"\n", + " try:\n", + " eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=ground_truth_answer))\n", + " except:\n", + " eval_output_variable = eval_fn([x, ground_truth_answer, response])\n", + " return eval_output_variable\n", + "\n", + "@trace.bundle()\n", + "def concat(*items):\n", + " \"\"\" Concatenate the items into a single string \"\"\"\n", + " output = ''\n", + " for i, item in enumerate(items):\n", + " output += f'{[i]}: {item}\\n\\n'\n", + " return output\n", + "\n", + "for epoch in range(3):\n", + " for steps, (batch_x, batch_y) in enumerate((pbar := tqdm(train_loader, position=0))):\n", + " pbar.set_description(f\"Training step {steps}. 
Epoch {epoch}\")\n", + " optimizer.zero_feedback()\n", + " feedbacks = []\n", + " for (x, y) in zip(batch_x, batch_y):\n", + " x = tg.Variable(x, requires_grad=False, role_description=\"query to the language model\")\n", + "\n", + " if np.issubdtype(type(y), np.integer):\n", + " y = int(y)\n", + "\n", + " y = tg.Variable(y, requires_grad=False, role_description=\"correct answer for the query\")\n", + "\n", + " response = query(system_prompt, x) # node\n", + " eval_output_variable = eval_response(response, y) # node\n", + " feedbacks.append(eval_output_variable)\n", + "\n", + " target = concat(*feedbacks)\n", + " print('feedback', [f.data.value for f in feedbacks])\n", + " target.backward(\"Improve correctness.\")\n", + " optimizer.step(verbose='output')\n", + "\n", + "\n", + " tg_system_prompt =tg.Variable(system_prompt.data,\n", + " requires_grad=True,\n", + " role_description=\"structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the QA task\")\n", + " run_validation_revert(tg_system_prompt, results, model, eval_fn, val_set)\n", + " print(\" suggested sys prompt: \", system_prompt.data)\n", + " system_prompt._data = tg_system_prompt.value # to implement the revert\n", + "\n", + " print(\" sys prompt: \", system_prompt.data)\n", + " test_acc = eval_dataset(test_set, eval_fn, model)\n", + " results[\"test_acc\"].append(test_acc)\n", + " results[\"prompt\"].append(system_prompt.data)\n", + " # if steps == 3:\n", + " # break" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.83, 0.83, 0.83, 0.83, 0.83, 0.83, 0.83, 0.83, 0.92, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92, 0.92]\n", + "Best accuracy: 0.92\n" + ] + } + ], + "source": [ + "accuracies = [np.array(x).mean() for x in results[\"test_acc\"]]\n", + "print(accuracies)\n", + "print(\"Best accuracy: \", max(accuracies))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "trace-3.9", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/textgrad_examples/notebooks/textgrad_solution_optimization.ipynb b/examples/textgrad_examples/notebooks/textgrad_solution_optimization.ipynb new file mode 100644 index 00000000..c2e98431 --- /dev/null +++ b/examples/textgrad_examples/notebooks/textgrad_solution_optimization.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 09-16 16:17:15] {315} WARNING - Model gpt-4o-2024-08-06 is not found. The cost will be 0. In your config_list, add field {\"price\" : [prompt_price_per_1k, completion_token_price_per_1k]} for customized pricing.\n", + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction asks us to evaluate the solution to the quadratic equation and correct any mistakes in the variable 'str0'. The equation provided is 3x^2 - 7x + 2 = 0. 
According to the quadratic formula, x = (-b ± √(b^2 - 4ac)) / 2a, the correct interpretation should consider 'b^2 - 4ac' with proper calculation. In 'str0', the terms 'b^2' and '4ac' are shown incorrectly in the quantity under the square root (as '√73' instead of '√(49 - 24)'). Thus, the solutions x1 and x2 are also incorrect in the way they are presented. The proper solutions should be x1 = (7 + √25) / 6, x2 = (7 - √25) / 6. Hence, we need to modify the string in 'str0' accordingly.\",\n", + "\"answer\": \"\",\n", + "\"suggestion\": {\n", + " \"str0\": \"To solve the equation 3x^2 - 7x + 2 = 0, we use the quadratic formula:\\nx = (-b ± √(b^2 - 4ac)) / 2a\\na = 3, b = -7, c = 2\\nx = (7 ± √(49 - 24)) / 6\\nx = (7 ± √25) / 6\\nThe solutions are:\\nx1 = (7 + √25) / 6\\nx1 = (12) / 6\\nx1 = 2\\nx2 = (7 - √25) / 6\\nx2 = (2) / 6\\nx2 = 1/3\"\n", + "}\n", + "}\n", + "To solve the equation 3x^2 - 7x + 2 = 0, we use the quadratic formula:\n", + "x = (-b ± √(b^2 - 4ac)) / 2a\n", + "a = 3, b = -7, c = 2\n", + "x = (7 ± √(49 - 24)) / 6\n", + "x = (7 ± √25) / 6\n", + "The solutions are:\n", + "x1 = (7 + √25) / 6\n", + "x1 = (12) / 6\n", + "x1 = 2\n", + "x2 = (7 - √25) / 6\n", + "x2 = (2) / 6\n", + "x2 = 1/3\n" + ] + } + ], + "source": [ + "from opto import trace\n", + "from opto.optimizers import OptoPrime\n", + "\n", + "initial_solution = \"\"\"To solve the equation 3x^2 - 7x + 2 = 0, we use the quadratic formula:\n", + "x = (-b ± √(b^2 - 4ac)) / 2a\n", + "a = 3, b = -7, c = 2\n", + "x = (7 ± √((-7)^2 + 4(3)(2))) / 6\n", + "x = (7 ± √73) / 6\n", + "The solutions are:\n", + "x1 = (7 + √73)\n", + "x2 = (7 - √73)\"\"\"\n", + "\n", + "solution = trace.node(initial_solution,\n", + " trainable=True,\n", + " description=\"solution to the math question\")\n", + "\n", + "# feedback = \"\"\"1. The discriminant calculation is incorrect: it should be b^2 - 4ac, not b^2 + 4ac.\n", + "# 2. The final solutions are missing division by 6.\n", + "# 3. 
The solutions should be written as fractions., role=response from the language model, grads=set())\n", + "# \"\"\"\n", + "\n", + "feedback = \"Evaluate the solution to a math question and solve it.\"\n", + "\n", + "opt = OptoPrime([solution])\n", + "\n", + "opt.zero_feedback()\n", + "solution.backward(feedback)\n", + "opt.step(verbose='output')\n", + "print(solution.data)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "trace-3.9", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/textgrad_examples/notebooks/textgrad_test_time_loss_for_code.ipynb b/examples/textgrad_examples/notebooks/textgrad_test_time_loss_for_code.ipynb new file mode 100644 index 00000000..4bac30a2 --- /dev/null +++ b/examples/textgrad_examples/notebooks/textgrad_test_time_loss_for_code.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from opto import trace\n", + "from opto.optimizers import OptoPrime\n", + "\n", + "import random\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# We'll use below utilities to run a python function.\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "\n", + "def run_function_in_interpreter(func_code):\n", + " # raise Exception(\"This function will run the code returned by GPT-4o. Remove this if you'd like to run the code!\")\n", + " interpreter = InteractiveShell.instance()\n", + "\n", + " interpreter.run_cell(func_code, store_history=False, silent=True)\n", + "\n", + " func_name = func_code.split(\"def \")[1].split(\"(\")[0].strip()\n", + " func = interpreter.user_ns[func_name]\n", + "\n", + " return func\n", + "\n", + "\n", + "\n", + "def test_longest_increasing_subsequence(fn):\n", + " nums = [10, 22, 9, 33, 21, 50, 41, 60]\n", + " assert fn(nums) == 5\n", + "\n", + " nums = [7, 2, 1, 3, 8, 4, 9, 6, 5]\n", + " assert fn(nums) == 4\n", + "\n", + " nums = [5, 4, 3, 2, 1]\n", + " assert fn(nums) == 1\n", + "\n", + " nums = [1, 2, 3, 4, 5]\n", + " assert fn(nums) == 5\n", + "\n", + " nums = [3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5]\n", + " assert fn(nums) == 4\n", + "\n", + " nums = [10, 9, 2, 5, 3, 7, 101, 18]\n", + " assert fn(nums) == 4\n", + "\n", + " nums = [0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15]\n", + " assert fn(nums) == 6\n", + "\n", + " nums = [7, 7, 7, 7, 7, 7, 7]\n", + " assert fn(nums) == 1\n", + "\n", + " nums = [20, 25, 47, 35, 56, 68, 98, 101, 212, 301, 415, 500]\n", + " assert fn(nums) == 11\n", + "\n", + " nums = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]\n", + " assert fn(nums) == 1\n", + "\n", + " print(\"All test cases passed!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "problem_text = \"\"\"Longest Increasing Subsequence (LIS)\n", + "\n", + "Problem Statement:\n", + "Given a sequence of integers, find the length of the longest subsequence that is strictly increasing. 
A subsequence is a sequence that can be derived from another sequence by deleting some or no elements without changing the order of the remaining elements.\n", + "\n", + "Input:\n", + "The input consists of a list of integers representing the sequence.\n", + "\n", + "Output:\n", + "The output should be an integer representing the length of the longest increasing subsequence.\"\"\"\n", + "\n", + "initial_solution = \"\"\"\n", + "def longest_increasing_subsequence(nums):\n", + " n = len(nums)\n", + " dp = [1] * n\n", + "\n", + " for i in range(1, n):\n", + " for j in range(i):\n", + " if nums[i] > nums[j]:\n", + " dp[i] = max(dp[i], dp[j] + 1)\n", + "\n", + " max_length = max(dp)\n", + " lis = []\n", + "\n", + " for i in range(n - 1, -1, -1):\n", + " if dp[i] == max_length:\n", + " lis.append(nums[i])\n", + " max_length -= 1\n", + "\n", + " return len(lis[::-1])\n", + "\"\"\"\n", + "\n", + "# Generate a random test case\n", + "def generate_random_test_case(size, min_value, max_value):\n", + " return [random.randint(min_value, max_value) for _ in range(size)]\n", + "\n", + "# Test the function with a random test case\n", + "size = 10000 # Adjust the size as needed\n", + "min_value = 1\n", + "max_value = 1000\n", + "\n", + "nums = generate_random_test_case(size, min_value, max_value)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Case Size: 10000\n", + "Longest Increasing Subsequence Length: 180\n", + "Runtime: 5.81503 seconds\n", + "All test cases passed!\n" + ] + } + ], + "source": [ + "longest_increasing_subsequence = run_function_in_interpreter(initial_solution)\n", + "\n", + "start_time = time.time()\n", + "lis = longest_increasing_subsequence(nums)\n", + "end_time = time.time()\n", + "\n", + "print(f\"Test Case Size: {size}\")\n", + "print(f\"Longest Increasing Subsequence Length: {lis}\")\n", + "print(f\"Runtime: {end_time - start_time:.5f} seconds\")\n", + "\n", + "# Test for all test cases\n", + "test_longest_increasing_subsequence(longest_increasing_subsequence)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trace code" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "\n", + "str1\n", + "\n", + "str1\n", + "[Node] This is a node in a computational graph.\n", + "Problem: {problem_text}\n", + "Current Code: {solution}\n", + "\n", + "\n", + "\n", + "format0\n", + "\n", + "format0\n", + "[format] Fills in a string template with content, str.format(). 
.\n", + "Problem: Longest Increasing Subsequence (LIS)\n", + "Problem Statement:\n", + "Given a sequence of integers, find...\n", + "\n", + "\n", + "\n", + "str1->format0\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "str2\n", + "\n", + "str2\n", + "[Node] This is a node in a computational graph.\n", + "Longest Increasing Subsequence (LIS)\n", + "Problem Statement:\n", + "Given a sequence of integers, find the leng...\n", + "\n", + "\n", + "\n", + "str2->format0\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "str0\n", + "\n", + "str0\n", + "[ParameterNode] This is a ParameterNode in a computational graph.\n", + "def longest_increasing_subsequence(nums):\n", + "    n = len(nums)\n", + "    dp = [1] * n\n", + "    for i in range(1,...\n", + "\n", + "\n", + "\n", + "str0->format0\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "code = trace.node(initial_solution, trainable=True)\n", + "opt = OptoPrime([code])\n", + "\n", + "feedback = \"Think about the problem and the code snippet. Does the code solve the problem? What is the runtime complexity? Improve the runtime complexity of the code.\"\n", + "format_string = \"Problem: {problem_text}\\nCurrent Code: {solution}\"\n", + "\n", + "from opto.trace import operators as ops\n", + "problem = ops.format(format_string, problem_text=problem_text, solution=code)\n", + "opt.zero_feedback()\n", + "\n", + "# Let's visualize our computation graph.\n", + "problem.backward(feedback, visualize=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM response:\n", + " {\n", + "\"reasoning\": \"The instruction requires improving the output based on the feedback, which suggests analyzing and optimizing the code snippet for solving the Longest Increasing Subsequence (LIS) problem. The current code employs a dynamic programming approach with a time complexity of O(n^2), where 'n' is the length of the input list. To enhance this, the code can be optimized using a combination of dynamic programming and binary search, which reduces the time complexity to O(n log n). 
This involves maintaining a list to track the smallest possible tail for increasing subsequences of varying lengths and utilizing binary search to find the appropriate position to update the list.\",\n", + "\"answer\": null,\n", + "\"suggestion\": {\n", + " \"str0\": \"def longest_increasing_subsequence(nums):\\n if not nums:\\n return 0\\n tails = []\\n for num in nums:\\n left, right = 0, len(tails)\\n while left < right:\\n mid = (left + right) // 2\\n if tails[mid] < num:\\n left = mid + 1\\n else:\\n right = mid\\n if left == len(tails):\\n tails.append(num)\\n else:\\n tails[left] = num\\n return len(tails)\"\n", + "}\n", + "}\n" + ] + } + ], + "source": [ + "# Let's update the code\n", + "opt.step(verbose='output')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Longest Increasing Subsequence Length: 180\n", + "Runtime: 0.01259 seconds\n", + "All test cases passed!\n" + ] + } + ], + "source": [ + "# Hopefully, we should get much better runtime!\n", + "longest_increasing_subsequence = run_function_in_interpreter(code.data)\n", + "\n", + "start_time = time.time()\n", + "lis = longest_increasing_subsequence(nums)\n", + "end_time = time.time()\n", + "\n", + "print(f\"Longest Increasing Subsequence Length: {lis}\")\n", + "print(f\"Runtime: {end_time - start_time:.5f} seconds\")\n", + "\n", + "test_longest_increasing_subsequence(longest_increasing_subsequence)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this point, OptoPrime in Trace solves the problem. There's no need to further iterate. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "def longest_increasing_subsequence(nums):\n", + " if not nums:\n", + " return 0\n", + " tails = []\n", + " for num in nums:\n", + " left, right = 0, len(tails)\n", + " while left < right:\n", + " mid = (left + right) // 2\n", + " if tails[mid] < num:\n", + " left = mid + 1\n", + " else:\n", + " right = mid\n", + " if left == len(tails):\n", + " tails.append(num)\n", + " else:\n", + " tails[left] = num\n", + " return len(tails)\n" + ] + } + ], + "source": [ + "print(code.data)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "trace-3.9", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e055030839b67d6448fc94ce5ada48b077ee8eb0 Mon Sep 17 00:00:00 2001 From: windweller Date: Wed, 9 Oct 2024 17:06:18 -0700 Subject: [PATCH 10/10] add logging and cleaned up implementation --- opto/optimizers/textgrad.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/opto/optimizers/textgrad.py b/opto/optimizers/textgrad.py index 8c36e948..972edac9 100644 --- a/opto/optimizers/textgrad.py +++ b/opto/optimizers/textgrad.py @@ -13,6 +13,9 @@ """ Prompts are taken verbatim from: https://github.com/zou-group/textgrad/blob/main/textgrad/optimizer/optimizer_prompts.py + +Optimizer implementation loosely adapted from +https://github.com/zou-group/textgrad/blob/main/textgrad/optimizer/optimizer.py """ GLOSSARY_TEXT = """ @@ -246,15 +249,6 @@ def 
construct_reduce_prompt(gradients: List[str]): return gradient_texts -""" -Implementation loosely adapted from -https://github.com/zou-group/textgrad/blob/main/textgrad/optimizer/optimizer.py - -Because Trace Graph is heterogeneous -- we do not treat LLM operations differently from other operations, -we don't implement specialized backward operators for LLM operations. - -""" - @dataclass class GradientInfo: gradient: str # feedback @@ -281,21 +275,19 @@ def __init__(self, parameters: List[ParameterNode], *args, propagator: Propagator = None, objective: Union[None, str] = None, - ignore_extraction_error: bool = True, - # ignore the type conversion error when extracting updated values from LLM's suggestion - include_example=False, - memory_size=0, # Memory size to store the past feedback max_tokens=4096, - log=True, + log=False, **kwargs, ): super().__init__(parameters, *args, **kwargs) if config_list is None: config_list = autogen.config_list_from_json("OAI_CONFIG_LIST") self.llm = autogen.OpenAIWrapper(config_list=config_list) self.print_limit = 100 + self.max_tokens = max_tokens self.new_variable_tags = ["", ""] self.optimizer_system_prompt = OPTIMIZER_SYSTEM_PROMPT.format(new_variable_start_tag=self.new_variable_tags[0], new_variable_end_tag=self.new_variable_tags[1]) + self.log = [] if log else None def _construct_backward_prompt(self, backward_info): conversation = CONVERSATION_TEMPLATE.format(**backward_info) @@ -427,10 +419,13 @@ def _step(self, verbose=False): except Exception as e: print(f"Error in updating {p.py_name}: {e}, raw response: {response}") + if self.log is not None: + self.log.append({"user_prompt": prompt_update_parameter, "response": response}) + return update_dict # propose new update def call_llm( - self, system_prompt: str, user_prompt: str, verbose: Union[bool, str] = False, max_tokens: int = 4096 + self, system_prompt: str, user_prompt: str, verbose: Union[bool, str] = False ): """Call the LLM with a prompt and return the response.""" if verbose not in (False, "output"): @@ -442,10 +437,10 @@ def call_llm( response = self.llm.create( messages=messages, response_format={"type": "json_object"}, - max_tokens=max_tokens, + max_tokens=self.max_tokens, ) except Exception: - response = self.llm.create(messages=messages, max_tokens=max_tokens) + response = self.llm.create(messages=messages, max_tokens=self.max_tokens) response = response.choices[0].message.content if verbose:
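# ----------------------------------------------------------------------------
# Editor's note: the snippet below is an illustrative usage sketch, not part of
# the patch above. It assumes the TextGrad optimizer added in this patch follows
# the same Optimizer interface exercised by the OptoPrime notebooks
# (zero_feedback / backward / step), that `TextGrad` is importable from
# `opto.optimizers.textgrad`, and that an autogen OAI_CONFIG_LIST is configured.
# The node contents and the feedback string are made up for illustration only.

from opto import trace
from opto.optimizers.textgrad import TextGrad

# A trainable text variable, mirroring how the notebooks wrap solutions/prompts.
solution = trace.node("initial draft of an answer", trainable=True)
optimizer = TextGrad([solution], log=True)

optimizer.zero_feedback()
# Textual "gradient": free-form feedback propagated through the trace graph.
solution.backward("The draft does not handle the empty-input case.")
optimizer.step(verbose="output")  # asks the LLM to propose an improved `solution`
print(solution.data)
# ----------------------------------------------------------------------------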