From 95d8d0b21fe6d4f914cb4aaffcd8bf245752ef08 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 15 May 2024 12:30:51 -0400
Subject: [PATCH] Automated File Generation from Docs Notebook Changes (#1137)

Co-authored-by: joshreini1
Co-authored-by: Josh Reini <60949774+joshreini1@users.noreply.github.com>
---
 README.md                                  |  4 +-
 docs/trulens_eval/all_tools.ipynb          | 49 +++++++---------
 trulens_eval/generated_files/all_tools.py  | 58 +++++++++----------
 trulens_eval/trulens_eval/app.py           | 15 ++---
 .../trulens_eval/feedback/feedback.py      | 13 +++--
 .../trulens_eval/feedback/provider/base.py | 31 +++++-----
 .../trulens_eval/feedback/provider/hugs.py | 16 ++---
 trulens_eval/trulens_eval/instruments.py   |  8 ++-
 trulens_eval/trulens_eval/schema/types.py  |  2 +
 trulens_eval/trulens_eval/tru_rails.py     |  4 +-
 trulens_eval/trulens_eval/utils/serial.py  |  5 +-
 11 files changed, 99 insertions(+), 106 deletions(-)

diff --git a/README.md b/README.md
index c8701cbc4..1abcfef2a 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ community](https://communityinviter.com/apps/aiqualityforum/josh)!
 
 **Don't just vibe-check your llm app!** Systematically evaluate and track your
 LLM experiments with TruLens. As you develop your app including prompts, models,
-retreivers, knowledge sources and more, *TruLens-Eval* is the tool you need to
+retrievers, knowledge sources and more, *TruLens-Eval* is the tool you need to
 understand its performance.
 
 Fine-grained, stack-agnostic instrumentation and comprehensive evaluations help
@@ -38,7 +38,7 @@ you to identify failure modes & systematically iterate to improve your
 application.
 
 Read more about the core concepts behind TruLens including [Feedback
-Functions](https://www.trulens.org/trulens_eval/getting_started/core_concepts/
+Functions](https://www.trulens.org/trulens_eval/getting_started/core_concepts/feedback_functions/),
 [The RAG Triad](https://www.trulens.org/trulens_eval/getting_started/core_concepts/rag_triad/),
 and [Honest, Harmless and Helpful Evals](https://www.trulens.org/trulens_eval/getting_started/core_concepts/honest_harmless_helpful_evals/).
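The notebook and generated-file hunks below replace the deprecated `Groundedness` helper class with groundedness methods exposed directly on the feedback provider. A minimal sketch of the new-style definition, assuming `OPENAI_API_KEY` is set and that `rag_chain` is the LangChain RAG app built earlier in the notebook:

```python
from trulens_eval import Feedback
from trulens_eval.app import App
from trulens_eval.feedback.provider.openai import OpenAI

provider = OpenAI()

# `rag_chain` is assumed to be the LangChain RAG app constructed earlier in the notebook.
context = App.select_context(rag_chain)

# Groundedness is now a provider method; no separate Groundedness class or
# explicit grounded_statements_aggregator is required.
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(context.collect())  # collect retrieved context chunks into a single list
    .on_output()
)
```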
diff --git a/docs/trulens_eval/all_tools.ipynb b/docs/trulens_eval/all_tools.ipynb index 2f076badf..6fa74a30b 100644 --- a/docs/trulens_eval/all_tools.ipynb +++ b/docs/trulens_eval/all_tools.ipynb @@ -194,14 +194,11 @@ "from trulens_eval.app import App\n", "context = App.select_context(rag_chain)\n", "\n", - "from trulens_eval.feedback import Groundedness\n", - "grounded = Groundedness(groundedness_provider=OpenAI())\n", "# Define a groundedness feedback function\n", "f_groundedness = (\n", - " Feedback(grounded.groundedness_measure_with_cot_reasons)\n", + " Feedback(provider.groundedness_measure_with_cot_reasons)\n", " .on(context.collect()) # collect context chunks into a list\n", " .on_output()\n", - " .aggregate(grounded.grounded_statements_aggregator)\n", ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", @@ -582,14 +579,12 @@ "from trulens_eval.app import App\n", "context = App.select_context(query_engine)\n", "\n", - "from trulens_eval.feedback import Groundedness\n", - "grounded = Groundedness(groundedness_provider=OpenAI())\n", "# Define a groundedness feedback function\n", "f_groundedness = (\n", - " Feedback(grounded.groundedness_measure_with_cot_reasons)\n", + " Feedback(provider.groundedness_measure_with_cot_reasons)\n", " .on(context.collect()) # collect context chunks into a list\n", " .on_output()\n", - " .aggregate(grounded.grounded_statements_aggregator)\n", + " .aggregate(provider.grounded_statements_aggregator)\n", ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", @@ -762,7 +757,8 @@ "outputs": [], "source": [ "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" + "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", + "os.environ[\"HUGGINGFACE_API_KEY\"] = \"hf_...\"" ] }, { @@ -862,6 +858,9 @@ "metadata": {}, "outputs": [], "source": [ + "from openai import OpenAI\n", + "oai_client = OpenAI()\n", + "\n", "class RAG_from_scratch:\n", " @instrument\n", " def retrieve(self, query: str) -> list:\n", @@ -872,7 +871,7 @@ " query_texts=query,\n", " n_results=2\n", " )\n", - " return results['documents'][0]\n", + " return results['documents']\n", "\n", " @instrument\n", " def generate_completion(self, query: str, context_str: list) -> str:\n", @@ -921,23 +920,18 @@ "outputs": [], "source": [ "from trulens_eval import Feedback, Select\n", - "from trulens_eval.feedback import Groundedness\n", "from trulens_eval.feedback.provider.openai import OpenAI\n", "\n", "import numpy as np\n", "\n", "provider = OpenAI()\n", "\n", - "grounded = Groundedness(groundedness_provider=provider)\n", - "\n", "# Define a groundedness feedback function\n", "f_groundedness = (\n", - " Feedback(grounded.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n", + " Feedback(provider.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n", " .on(Select.RecordCalls.retrieve.rets.collect())\n", " .on_output()\n", - " .aggregate(grounded.grounded_statements_aggregator)\n", ")\n", - "\n", "# Question/answer relevance between overall question and answer.\n", "f_answer_relevance = (\n", " Feedback(provider.relevance_with_cot_reasons, name = \"Answer Relevance\")\n", @@ -945,12 +939,12 @@ " .on_output()\n", ")\n", "\n", - "# Question/statement relevance between question and each context chunk.\n", + "# Context relevance between question and each context chunk.\n", "f_context_relevance = (\n", " Feedback(provider.context_relevance_with_cot_reasons, name = \"Context Relevance\")\n", " 
.on(Select.RecordCalls.retrieve.args.query)\n", - " .on(Select.RecordCalls.retrieve.rets.collect())\n", - " .aggregate(np.mean)\n", + " .on(Select.RecordCalls.retrieve.rets)\n", + " .aggregate(np.mean) # choose a different aggregation method if you wish\n", ")" ] }, @@ -2016,32 +2010,31 @@ "from trulens_eval.feedback import prompts\n", "\n", "class Custom_AzureOpenAI(AzureOpenAI):\n", - " def qs_relevance_with_cot_reasons_extreme(self, question: str, statement: str) -> Tuple[float, Dict]:\n", + " def context_relevance_with_cot_reasons_extreme(self, question: str, context: str) -> Tuple[float, Dict]:\n", " \"\"\"\n", - " Tweaked version of question statement relevance, extending AzureOpenAI provider.\n", + " Tweaked version of context relevance, extending AzureOpenAI provider.\n", " A function that completes a template to check the relevance of the statement to the question.\n", " Scoring guidelines for scores 5-8 are removed to push the LLM to more extreme scores.\n", " Also uses chain of thought methodology and emits the reasons.\n", "\n", " Args:\n", " question (str): A question being asked. \n", - " statement (str): A statement to the question.\n", + " context (str): A statement to the question.\n", "\n", " Returns:\n", " float: A value between 0 and 1. 0 being \"not relevant\" and 1 being \"relevant\".\n", " \"\"\"\n", "\n", - " system_prompt = str.format(prompts.QS_RELEVANCE, question = question, statement = statement)\n", - "\n", " # remove scoring guidelines around middle scores\n", - " system_prompt = system_prompt.replace(\n", + " system_prompt = prompts.CONTEXT_RELEVANCE_SYSTEM.replace(\n", " \"- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\\n\\n\", \"\")\n", " \n", - " system_prompt = system_prompt.replace(\n", + " user_prompt = str.format(prompts.CONTEXT_RELEVANCE_USER, question = question, context = context)\n", + " user_prompt = user_prompt.replace(\n", " \"RELEVANCE:\", prompts.COT_REASONS_TEMPLATE\n", " )\n", "\n", - " return self.generate_score_and_reasons(system_prompt)" + " return self.generate_score_and_reasons(system_prompt, user_prompt)" ] }, { @@ -2125,7 +2118,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.14" }, "vscode": { "interpreter": { diff --git a/trulens_eval/generated_files/all_tools.py b/trulens_eval/generated_files/all_tools.py index 7b95d5ea9..d6061e751 100644 --- a/trulens_eval/generated_files/all_tools.py +++ b/trulens_eval/generated_files/all_tools.py @@ -115,14 +115,11 @@ def format_docs(docs): context = App.select_context(rag_chain) -from trulens_eval.feedback import Groundedness - -grounded = Groundedness(groundedness_provider=OpenAI()) # Define a groundedness feedback function f_groundedness = ( - Feedback(grounded.groundedness_measure_with_cot_reasons + Feedback(provider.groundedness_measure_with_cot_reasons ).on(context.collect()) # collect context chunks into a list - .on_output().aggregate(grounded.grounded_statements_aggregator) + .on_output() ) # Question/answer relevance between overall question and answer. 
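In the from-scratch RAG example above, context relevance is now evaluated against the instrumented `retrieve` method's raw return value rather than a collected list, so each retrieved chunk is scored separately and the per-chunk scores are aggregated. A short sketch of that wiring, assuming a `RAG_from_scratch` instance whose `retrieve` method is decorated with `@instrument` as in the notebook:

```python
import numpy as np

from trulens_eval import Feedback, Select
from trulens_eval.feedback.provider.openai import OpenAI

provider = OpenAI()

# Context relevance between the input query and each retrieved chunk.
# Selecting `retrieve.rets` (instead of `retrieve.rets.collect()`) scores every
# returned chunk separately before aggregation.
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons, name="Context Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on(Select.RecordCalls.retrieve.rets)
    .aggregate(np.mean)  # any other aggregation over per-chunk scores also works
)
```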
@@ -345,14 +342,11 @@ def display_call_stack(data): context = App.select_context(query_engine) -from trulens_eval.feedback import Groundedness - -grounded = Groundedness(groundedness_provider=OpenAI()) # Define a groundedness feedback function f_groundedness = ( - Feedback(grounded.groundedness_measure_with_cot_reasons + Feedback(provider.groundedness_measure_with_cot_reasons ).on(context.collect()) # collect context chunks into a list - .on_output().aggregate(grounded.grounded_statements_aggregator) + .on_output().aggregate(provider.grounded_statements_aggregator) ) # Question/answer relevance between overall question and answer. @@ -447,6 +441,7 @@ def display_call_stack(data): import os os.environ["OPENAI_API_KEY"] = "sk-..." +os.environ["HUGGINGFACE_API_KEY"] = "hf_..." # ## Get Data # @@ -500,6 +495,10 @@ def display_call_stack(data): # In[ ]: +from openai import OpenAI + +oai_client = OpenAI() + class RAG_from_scratch: @@ -509,7 +508,7 @@ def retrieve(self, query: str) -> list: Retrieve relevant text from vector store. """ results = vector_store.query(query_texts=query, n_results=2) - return results['documents'][0] + return results['documents'] @instrument def generate_completion(self, query: str, context_str: list) -> str: @@ -552,21 +551,16 @@ def query(self, query: str) -> str: from trulens_eval import Feedback from trulens_eval import Select -from trulens_eval.feedback import Groundedness from trulens_eval.feedback.provider.openai import OpenAI provider = OpenAI() -grounded = Groundedness(groundedness_provider=provider) - # Define a groundedness feedback function f_groundedness = ( Feedback( - grounded.groundedness_measure_with_cot_reasons, name="Groundedness" - ).on(Select.RecordCalls.retrieve.rets.collect() - ).on_output().aggregate(grounded.grounded_statements_aggregator) + provider.groundedness_measure_with_cot_reasons, name="Groundedness" + ).on(Select.RecordCalls.retrieve.rets.collect()).on_output() ) - # Question/answer relevance between overall question and answer. f_answer_relevance = ( Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance").on( @@ -574,12 +568,13 @@ def query(self, query: str) -> str: ).on_output() ) -# Question/statement relevance between question and each context chunk. +# Context relevance between question and each context chunk. f_context_relevance = ( Feedback( provider.context_relevance_with_cot_reasons, name="Context Relevance" - ).on(Select.RecordCalls.retrieve.args.query - ).on(Select.RecordCalls.retrieve.rets.collect()).aggregate(np.mean) + ).on(Select.RecordCalls.retrieve.args.query).on( + Select.RecordCalls.retrieve.rets + ).aggregate(np.mean) # choose a different aggregation method if you wish ) # ## Construct the app @@ -1193,38 +1188,37 @@ def style_check_professional(self, response: str) -> float: class Custom_AzureOpenAI(AzureOpenAI): - def qs_relevance_with_cot_reasons_extreme( - self, question: str, statement: str + def context_relevance_with_cot_reasons_extreme( + self, question: str, context: str ) -> Tuple[float, Dict]: """ - Tweaked version of question statement relevance, extending AzureOpenAI provider. + Tweaked version of context relevance, extending AzureOpenAI provider. A function that completes a template to check the relevance of the statement to the question. Scoring guidelines for scores 5-8 are removed to push the LLM to more extreme scores. Also uses chain of thought methodology and emits the reasons. Args: question (str): A question being asked. - statement (str): A statement to the question. 
+ context (str): A statement to the question. Returns: float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant". """ - system_prompt = str.format( - prompts.QS_RELEVANCE, question=question, statement=statement - ) - # remove scoring guidelines around middle scores - system_prompt = system_prompt.replace( + system_prompt = prompts.CONTEXT_RELEVANCE_SYSTEM.replace( "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n", "" ) - system_prompt = system_prompt.replace( + user_prompt = str.format( + prompts.CONTEXT_RELEVANCE_USER, question=question, context=context + ) + user_prompt = user_prompt.replace( "RELEVANCE:", prompts.COT_REASONS_TEMPLATE ) - return self.generate_score_and_reasons(system_prompt) + return self.generate_score_and_reasons(system_prompt, user_prompt) # ## Multi-Output Feedback functions diff --git a/trulens_eval/trulens_eval/app.py b/trulens_eval/trulens_eval/app.py index fc6c8d6bc..48bdd3666 100644 --- a/trulens_eval/trulens_eval/app.py +++ b/trulens_eval/trulens_eval/app.py @@ -349,7 +349,8 @@ class RecordingContext(): """ def __init__(self, app: mod_app.App, record_metadata: JSON = None): - self.calls: Dict[mod_types_schema.CallID, mod_record_schema.RecordAppCall] = {} + self.calls: Dict[mod_types_schema.CallID, + mod_record_schema.RecordAppCall] = {} """A record (in terms of its RecordAppCall) in process of being created. Storing as a map as we want to override calls with the same id which may @@ -418,11 +419,9 @@ def add_call(self, call: mod_record_schema.RecordAppCall): def finish_record( self, calls_to_record: Callable[[ - List[mod_record_schema.RecordAppCall], - mod_types_schema.Metadata, - Optional[mod_record_schema.Record] - ], mod_record_schema.Record - ], + List[mod_record_schema.RecordAppCall], mod_types_schema. 
+ Metadata, Optional[mod_record_schema.Record] + ], mod_record_schema.Record], existing_record: Optional[mod_record_schema.Record] = None ): """ @@ -432,9 +431,7 @@ def finish_record( with self.lock: record = calls_to_record( - list(self.calls.values()), - self.record_metadata, - existing_record + list(self.calls.values()), self.record_metadata, existing_record ) self.calls = {} diff --git a/trulens_eval/trulens_eval/feedback/feedback.py b/trulens_eval/trulens_eval/feedback/feedback.py index 639a86d49..916c64713 100644 --- a/trulens_eval/trulens_eval/feedback/feedback.py +++ b/trulens_eval/trulens_eval/feedback/feedback.py @@ -109,13 +109,14 @@ def rag_triad( ret = {} for f_imp, f_agg, arg1name, arg1lens, arg2name, arg2lens, f_name in [ - (provider.groundedness_measure_with_cot_reasons, np.mean, "source", context.collect(), - "statement", answer, "Groundedness"), - (provider.relevance_with_cot_reasons, np.mean, "prompt", question, "response", answer, "Answer Relevance"), - (provider.context_relevance_with_cot_reasons, np.mean, "question", question, "context", - context, "Context Relevance") + (provider.groundedness_measure_with_cot_reasons, np.mean, "source", + context.collect(), "statement", answer, "Groundedness"), + (provider.relevance_with_cot_reasons, np.mean, "prompt", question, + "response", answer, "Answer Relevance"), + (provider.context_relevance_with_cot_reasons, np.mean, "question", + question, "context", context, "Context Relevance") ]: - f = Feedback(f_imp, if_exists=context, name = f_name).aggregate(f_agg) + f = Feedback(f_imp, if_exists=context, name=f_name).aggregate(f_agg) if arg1lens is not None: f = f.on(**{arg1name: arg1lens}) else: diff --git a/trulens_eval/trulens_eval/feedback/provider/base.py b/trulens_eval/trulens_eval/feedback/provider/base.py index 610d3bbb6..51620289b 100644 --- a/trulens_eval/trulens_eval/feedback/provider/base.py +++ b/trulens_eval/trulens_eval/feedback/provider/base.py @@ -1,18 +1,18 @@ import logging -from typing import ClassVar, Dict, Optional, Sequence, Tuple, List +from typing import ClassVar, Dict, List, Optional, Sequence, Tuple import warnings +import nltk +from nltk.tokenize import sent_tokenize +import numpy as np +from tqdm.auto import tqdm + from trulens_eval.feedback import prompts from trulens_eval.feedback.provider.endpoint import base as mod_endpoint from trulens_eval.utils import generated as mod_generated_utils +from trulens_eval.utils.generated import re_0_10_rating from trulens_eval.utils.pyschema import WithClassInfo from trulens_eval.utils.serial import SerialModel -from trulens_eval.utils.generated import re_0_10_rating - -import nltk -from nltk.tokenize import sent_tokenize -import numpy as np -from tqdm.auto import tqdm logger = logging.getLogger(__name__) @@ -1119,7 +1119,7 @@ def stereotypes_with_cot_reasons(self, prompt: str, ) return self.generate_score_and_reasons(system_prompt, user_prompt) - + def groundedness_measure_with_cot_reasons( self, source: str, statement: str ) -> Tuple[float, dict]: @@ -1153,19 +1153,22 @@ def groundedness_measure_with_cot_reasons( nltk.download('punkt') groundedness_scores = {} reasons_str = "" - + hypotheses = sent_tokenize(statement) system_prompt = prompts.LLM_GROUNDEDNESS_SYSTEM for i, hypothesis in enumerate(tqdm( - hypotheses, desc="Groundedness per statement in source")): + hypotheses, desc="Groundedness per statement in source")): user_prompt = prompts.LLM_GROUNDEDNESS_USER.format( - premise=f"{source}", - hypothesis=f"{hypothesis}" + premise=f"{source}", 
hypothesis=f"{hypothesis}" + ) + score, reason = self.generate_score_and_reasons( + system_prompt, user_prompt ) - score, reason = self.generate_score_and_reasons(system_prompt, user_prompt) groundedness_scores[f"statement_{i}"] = score reasons_str += f"STATEMENT {i}:\n{reason['reason']}\n" # Calculate the average groundedness score from the scores dictionary - average_groundedness_score = float(np.mean(list(groundedness_scores.values()))) + average_groundedness_score = float( + np.mean(list(groundedness_scores.values())) + ) return average_groundedness_score, {"reasons": reasons_str} diff --git a/trulens_eval/trulens_eval/feedback/provider/hugs.py b/trulens_eval/trulens_eval/feedback/provider/hugs.py index 1af02e891..7c597b5bb 100644 --- a/trulens_eval/trulens_eval/feedback/provider/hugs.py +++ b/trulens_eval/trulens_eval/feedback/provider/hugs.py @@ -2,9 +2,13 @@ import logging from typing import Dict, get_args, get_origin, Optional, Tuple, Union +import nltk +from nltk.tokenize import sent_tokenize import numpy as np import requests +from tqdm.auto import tqdm +from trulens_eval.feedback import prompts from trulens_eval.feedback.provider.base import Provider from trulens_eval.feedback.provider.endpoint import HuggingfaceEndpoint from trulens_eval.feedback.provider.endpoint.base import DummyEndpoint @@ -12,12 +16,6 @@ from trulens_eval.utils.python import Future from trulens_eval.utils.python import locals_except from trulens_eval.utils.threading import ThreadPoolExecutor -from trulens_eval.feedback import prompts - -import nltk -from nltk.tokenize import sent_tokenize -import numpy as np -from tqdm.auto import tqdm logger = logging.getLogger(__name__) @@ -193,7 +191,7 @@ def get_scores(text): l1: float = float(1.0 - (np.linalg.norm(diff, ord=1)) / 2.0) return l1, dict(text1_scores=scores1, text2_scores=scores2) - + def groundedness_measure_with_nli(self, source: str, statement: str) -> Tuple[float, dict]: """ @@ -242,7 +240,9 @@ def groundedness_measure_with_nli(self, source: str, score=score * 10, ) groundedness_scores[f"statement_{i}"] = score - average_groundedness_score = float(np.mean(list(groundedness_scores.values()))) + average_groundedness_score = float( + np.mean(list(groundedness_scores.values())) + ) return average_groundedness_score, {"reasons": reasons_str} @_tci diff --git a/trulens_eval/trulens_eval/instruments.py b/trulens_eval/trulens_eval/instruments.py index e23a4357b..ed278c6ad 100644 --- a/trulens_eval/trulens_eval/instruments.py +++ b/trulens_eval/trulens_eval/instruments.py @@ -588,9 +588,11 @@ def handle_done(rets): if isinstance(rets, Awaitable): # If method produced an awaitable - logger.info(f"""This app produced an asynchronous response of type `{class_name(type(rets))}`. - This record will be updated once the response is available""") - + logger.info( + f"""This app produced an asynchronous response of type `{class_name(type(rets))}`. + This record will be updated once the response is available""" + ) + # TODO(piotrm): need to track costs of awaiting the ret in the # below. diff --git a/trulens_eval/trulens_eval/schema/types.py b/trulens_eval/trulens_eval/schema/types.py index 54c99bb3a..502e960a8 100644 --- a/trulens_eval/trulens_eval/schema/types.py +++ b/trulens_eval/trulens_eval/schema/types.py @@ -18,10 +18,12 @@ See [RecordAppCall.call_id][trulens_eval.schema.record.RecordAppCall.call_id]. 
""" + def new_call_id() -> CallID: """Generate a new call id.""" return str(uuid.uuid4()) + AppID: typing_extensions.TypeAlias = str """Unique identifier for an app. diff --git a/trulens_eval/trulens_eval/tru_rails.py b/trulens_eval/trulens_eval/tru_rails.py index 85c82767e..2ccbb89b7 100644 --- a/trulens_eval/trulens_eval/tru_rails.py +++ b/trulens_eval/trulens_eval/tru_rails.py @@ -442,7 +442,9 @@ def __getattr__(self, name): if name == "__name__": return self.__class__.__name__ # Return the class name of TruRails elif safe_hasattr(self.app, name): - return getattr(self.app, name) # Delegate to the wrapped app if it has the attribute + return getattr( + self.app, name + ) # Delegate to the wrapped app if it has the attribute else: raise AttributeError(f"TruRails has no attribute named {name}") diff --git a/trulens_eval/trulens_eval/utils/serial.py b/trulens_eval/trulens_eval/utils/serial.py index 935fa8e6b..39a2d1925 100644 --- a/trulens_eval/trulens_eval/utils/serial.py +++ b/trulens_eval/trulens_eval/utils/serial.py @@ -428,9 +428,8 @@ def get(self, obj: Dict[str, T]) -> Iterable[T]: logger.warning( "Object (of type %s is a sequence containing more than one dictionary. " "Lookup by item or attribute `%s` is ambiguous. " - "Use a lookup by index(es) or slice first to disambiguate.", - type(obj).__name__, - self.item_or_attribute + "Use a lookup by index(es) or slice first to disambiguate.", + type(obj).__name__, self.item_or_attribute ) for sub_obj in obj: try: