Automated File Generation from Docs Notebook Changes (#1137)
Co-authored-by: joshreini1 <[email protected]>
Co-authored-by: Josh Reini <[email protected]>
3 people authored May 15, 2024
1 parent 78dbb12 commit 95d8d0b
Showing 11 changed files with 99 additions and 106 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -30,15 +30,15 @@ community](https://communityinviter.com/apps/aiqualityforum/josh)!

**Don't just vibe-check your llm app!** Systematically evaluate and track your
LLM experiments with TruLens. As you develop your app including prompts, models,
retrievers, knowledge sources and more, *TruLens-Eval* is the tool you need to
retreivers, knowledge sources and more, *TruLens-Eval* is the tool you need to
understand its performance.

Fine-grained, stack-agnostic instrumentation and comprehensive evaluations help
you to identify failure modes & systematically iterate to improve your
application.

Read more about the core concepts behind TruLens including [Feedback
Functions](https://www.trulens.org/trulens_eval/getting_started/core_concepts/feedback_functions/),
Functions](https://www.trulens.org/trulens_eval/getting_started/core_concepts/
[The RAG Triad](https://www.trulens.org/trulens_eval/getting_started/core_concepts/rag_triad/),
and [Honest, Harmless and Helpful
Evals](https://www.trulens.org/trulens_eval/getting_started/core_concepts/honest_harmless_helpful_evals/).
49 changes: 21 additions & 28 deletions docs/trulens_eval/all_tools.ipynb
@@ -194,14 +194,11 @@
"from trulens_eval.app import App\n",
"context = App.select_context(rag_chain)\n",
"\n",
"from trulens_eval.feedback import Groundedness\n",
"grounded = Groundedness(groundedness_provider=OpenAI())\n",
"# Define a groundedness feedback function\n",
"f_groundedness = (\n",
" Feedback(grounded.groundedness_measure_with_cot_reasons)\n",
" Feedback(provider.groundedness_measure_with_cot_reasons)\n",
" .on(context.collect()) # collect context chunks into a list\n",
" .on_output()\n",
" .aggregate(grounded.grounded_statements_aggregator)\n",
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
@@ -582,14 +579,12 @@
"from trulens_eval.app import App\n",
"context = App.select_context(query_engine)\n",
"\n",
"from trulens_eval.feedback import Groundedness\n",
"grounded = Groundedness(groundedness_provider=OpenAI())\n",
"# Define a groundedness feedback function\n",
"f_groundedness = (\n",
" Feedback(grounded.groundedness_measure_with_cot_reasons)\n",
" Feedback(provider.groundedness_measure_with_cot_reasons)\n",
" .on(context.collect()) # collect context chunks into a list\n",
" .on_output()\n",
" .aggregate(grounded.grounded_statements_aggregator)\n",
" .aggregate(provider.grounded_statements_aggregator)\n",
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
@@ -762,7 +757,8 @@
"outputs": [],
"source": [
"import os\n",
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
"os.environ[\"HUGGINGFACE_API_KEY\"] = \"hf_...\""
]
},
{
@@ -862,6 +858,9 @@
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"oai_client = OpenAI()\n",
"\n",
"class RAG_from_scratch:\n",
" @instrument\n",
" def retrieve(self, query: str) -> list:\n",
@@ -872,7 +871,7 @@
" query_texts=query,\n",
" n_results=2\n",
" )\n",
" return results['documents'][0]\n",
" return results['documents']\n",
"\n",
" @instrument\n",
" def generate_completion(self, query: str, context_str: list) -> str:\n",
@@ -921,36 +920,31 @@
"outputs": [],
"source": [
"from trulens_eval import Feedback, Select\n",
"from trulens_eval.feedback import Groundedness\n",
"from trulens_eval.feedback.provider.openai import OpenAI\n",
"\n",
"import numpy as np\n",
"\n",
"provider = OpenAI()\n",
"\n",
"grounded = Groundedness(groundedness_provider=provider)\n",
"\n",
"# Define a groundedness feedback function\n",
"f_groundedness = (\n",
" Feedback(grounded.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n",
" Feedback(provider.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n",
" .on(Select.RecordCalls.retrieve.rets.collect())\n",
" .on_output()\n",
" .aggregate(grounded.grounded_statements_aggregator)\n",
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
"f_answer_relevance = (\n",
" Feedback(provider.relevance_with_cot_reasons, name = \"Answer Relevance\")\n",
" .on(Select.RecordCalls.retrieve.args.query)\n",
" .on_output()\n",
")\n",
"\n",
"# Question/statement relevance between question and each context chunk.\n",
"# Context relevance between question and each context chunk.\n",
"f_context_relevance = (\n",
" Feedback(provider.context_relevance_with_cot_reasons, name = \"Context Relevance\")\n",
" .on(Select.RecordCalls.retrieve.args.query)\n",
" .on(Select.RecordCalls.retrieve.rets.collect())\n",
" .aggregate(np.mean)\n",
" .on(Select.RecordCalls.retrieve.rets)\n",
" .aggregate(np.mean) # choose a different aggregation method if you wish\n",
")"
]
},
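With the three feedback functions defined, the next (unchanged) step in the quickstart is to wrap the app for recording. A short usage sketch, assuming the class instance is named `rag` as in the sketch above; the app_id string and the example question are arbitrary:

from trulens_eval import TruCustomApp

tru_rag = TruCustomApp(
    rag,
    app_id="RAG v1",  # any identifier for this app version
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
)

# Record a call and let the feedback functions run on it.
with tru_rag as recording:
    rag.query("When was the University of Washington founded?")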
@@ -2016,32 +2010,31 @@
"from trulens_eval.feedback import prompts\n",
"\n",
"class Custom_AzureOpenAI(AzureOpenAI):\n",
" def qs_relevance_with_cot_reasons_extreme(self, question: str, statement: str) -> Tuple[float, Dict]:\n",
" def context_relevance_with_cot_reasons_extreme(self, question: str, context: str) -> Tuple[float, Dict]:\n",
" \"\"\"\n",
" Tweaked version of question statement relevance, extending AzureOpenAI provider.\n",
" Tweaked version of context relevance, extending AzureOpenAI provider.\n",
" A function that completes a template to check the relevance of the statement to the question.\n",
" Scoring guidelines for scores 5-8 are removed to push the LLM to more extreme scores.\n",
" Also uses chain of thought methodology and emits the reasons.\n",
"\n",
" Args:\n",
" question (str): A question being asked. \n",
" statement (str): A statement to the question.\n",
" context (str): A statement to the question.\n",
"\n",
" Returns:\n",
" float: A value between 0 and 1. 0 being \"not relevant\" and 1 being \"relevant\".\n",
" \"\"\"\n",
"\n",
" system_prompt = str.format(prompts.QS_RELEVANCE, question = question, statement = statement)\n",
"\n",
" # remove scoring guidelines around middle scores\n",
" system_prompt = system_prompt.replace(\n",
" system_prompt = prompts.CONTEXT_RELEVANCE_SYSTEM.replace(\n",
" \"- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\\n\\n\", \"\")\n",
" \n",
" system_prompt = system_prompt.replace(\n",
" user_prompt = str.format(prompts.CONTEXT_RELEVANCE_USER, question = question, context = context)\n",
" user_prompt = user_prompt.replace(\n",
" \"RELEVANCE:\", prompts.COT_REASONS_TEMPLATE\n",
" )\n",
"\n",
" return self.generate_score_and_reasons(system_prompt)"
" return self.generate_score_and_reasons(system_prompt, user_prompt)"
]
},
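Once a provider subclass like this is defined, the custom method is used the same way as any built-in feedback. A brief sketch; the constructor arguments are placeholders since they depend on your Azure OpenAI deployment (an assumption, not shown in the diff), and `context` is assumed to be a Select lens over the retrieved chunks:

from trulens_eval import Feedback, Select

# Placeholder arguments: supply your own Azure deployment/config here.
custom_provider = Custom_AzureOpenAI(deployment_name="...")

f_context_relevance_extreme = (
    Feedback(custom_provider.context_relevance_with_cot_reasons_extreme,
             name="Context Relevance (extreme)")
    .on_input()
    .on(context)  # lens selecting the retrieved context chunks (assumed)
)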
{
@@ -2125,7 +2118,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
58 changes: 26 additions & 32 deletions trulens_eval/generated_files/all_tools.py
@@ -115,14 +115,11 @@ def format_docs(docs):

context = App.select_context(rag_chain)

from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=OpenAI())
# Define a groundedness feedback function
f_groundedness = (
Feedback(grounded.groundedness_measure_with_cot_reasons
Feedback(provider.groundedness_measure_with_cot_reasons
).on(context.collect()) # collect context chunks into a list
.on_output().aggregate(grounded.grounded_statements_aggregator)
.on_output()
)

# Question/answer relevance between overall question and answer.
@@ -345,14 +342,11 @@ def display_call_stack(data):

context = App.select_context(query_engine)

from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=OpenAI())
# Define a groundedness feedback function
f_groundedness = (
Feedback(grounded.groundedness_measure_with_cot_reasons
Feedback(provider.groundedness_measure_with_cot_reasons
).on(context.collect()) # collect context chunks into a list
.on_output().aggregate(grounded.grounded_statements_aggregator)
.on_output().aggregate(provider.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
@@ -447,6 +441,7 @@ def display_call_stack(data):
import os

os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["HUGGINGFACE_API_KEY"] = "hf_..."

# ## Get Data
#
@@ -500,6 +495,10 @@ def display_call_stack(data):

# In[ ]:

from openai import OpenAI

oai_client = OpenAI()


class RAG_from_scratch:

@@ -509,7 +508,7 @@ def retrieve(self, query: str) -> list:
Retrieve relevant text from vector store.
"""
results = vector_store.query(query_texts=query, n_results=2)
return results['documents'][0]
return results['documents']

@instrument
def generate_completion(self, query: str, context_str: list) -> str:
@@ -552,34 +551,30 @@ def query(self, query: str) -> str:

from trulens_eval import Feedback
from trulens_eval import Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI

provider = OpenAI()

grounded = Groundedness(groundedness_provider=provider)

# Define a groundedness feedback function
f_groundedness = (
Feedback(
grounded.groundedness_measure_with_cot_reasons, name="Groundedness"
).on(Select.RecordCalls.retrieve.rets.collect()
).on_output().aggregate(grounded.grounded_statements_aggregator)
provider.groundedness_measure_with_cot_reasons, name="Groundedness"
).on(Select.RecordCalls.retrieve.rets.collect()).on_output()
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = (
Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance").on(
Select.RecordCalls.retrieve.args.query
).on_output()
)

# Question/statement relevance between question and each context chunk.
# Context relevance between question and each context chunk.
f_context_relevance = (
Feedback(
provider.context_relevance_with_cot_reasons, name="Context Relevance"
).on(Select.RecordCalls.retrieve.args.query
).on(Select.RecordCalls.retrieve.rets.collect()).aggregate(np.mean)
).on(Select.RecordCalls.retrieve.args.query).on(
Select.RecordCalls.retrieve.rets
).aggregate(np.mean) # choose a different aggregation method if you wish
)

# ## Construct the app
@@ -1193,38 +1188,37 @@ def style_check_professional(self, response: str) -> float:

class Custom_AzureOpenAI(AzureOpenAI):

def qs_relevance_with_cot_reasons_extreme(
self, question: str, statement: str
def context_relevance_with_cot_reasons_extreme(
self, question: str, context: str
) -> Tuple[float, Dict]:
"""
Tweaked version of question statement relevance, extending AzureOpenAI provider.
Tweaked version of context relevance, extending AzureOpenAI provider.
A function that completes a template to check the relevance of the statement to the question.
Scoring guidelines for scores 5-8 are removed to push the LLM to more extreme scores.
Also uses chain of thought methodology and emits the reasons.
Args:
question (str): A question being asked.
statement (str): A statement to the question.
context (str): A statement to the question.
Returns:
float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant".
"""

system_prompt = str.format(
prompts.QS_RELEVANCE, question=question, statement=statement
)

# remove scoring guidelines around middle scores
system_prompt = system_prompt.replace(
system_prompt = prompts.CONTEXT_RELEVANCE_SYSTEM.replace(
"- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n",
""
)

system_prompt = system_prompt.replace(
user_prompt = str.format(
prompts.CONTEXT_RELEVANCE_USER, question=question, context=context
)
user_prompt = user_prompt.replace(
"RELEVANCE:", prompts.COT_REASONS_TEMPLATE
)

return self.generate_score_and_reasons(system_prompt)
return self.generate_score_and_reasons(system_prompt, user_prompt)


# ## Multi-Output Feedback functions
15 changes: 6 additions & 9 deletions trulens_eval/trulens_eval/app.py
@@ -349,7 +349,8 @@ class RecordingContext():
"""

def __init__(self, app: mod_app.App, record_metadata: JSON = None):
self.calls: Dict[mod_types_schema.CallID, mod_record_schema.RecordAppCall] = {}
self.calls: Dict[mod_types_schema.CallID,
mod_record_schema.RecordAppCall] = {}
"""A record (in terms of its RecordAppCall) in process of being created.
Storing as a map as we want to override calls with the same id which may
@@ -418,11 +419,9 @@ def add_call(self, call: mod_record_schema.RecordAppCall):
def finish_record(
self,
calls_to_record: Callable[[
List[mod_record_schema.RecordAppCall],
mod_types_schema.Metadata,
Optional[mod_record_schema.Record]
], mod_record_schema.Record
],
List[mod_record_schema.RecordAppCall], mod_types_schema.
Metadata, Optional[mod_record_schema.Record]
], mod_record_schema.Record],
existing_record: Optional[mod_record_schema.Record] = None
):
"""
@@ -432,9 +431,7 @@

with self.lock:
record = calls_to_record(
list(self.calls.values()),
self.record_metadata,
existing_record
list(self.calls.values()), self.record_metadata, existing_record
)
self.calls = {}

13 changes: 7 additions & 6 deletions trulens_eval/trulens_eval/feedback/feedback.py
@@ -109,13 +109,14 @@ def rag_triad(
ret = {}

for f_imp, f_agg, arg1name, arg1lens, arg2name, arg2lens, f_name in [
(provider.groundedness_measure_with_cot_reasons, np.mean, "source", context.collect(),
"statement", answer, "Groundedness"),
(provider.relevance_with_cot_reasons, np.mean, "prompt", question, "response", answer, "Answer Relevance"),
(provider.context_relevance_with_cot_reasons, np.mean, "question", question, "context",
context, "Context Relevance")
(provider.groundedness_measure_with_cot_reasons, np.mean, "source",
context.collect(), "statement", answer, "Groundedness"),
(provider.relevance_with_cot_reasons, np.mean, "prompt", question,
"response", answer, "Answer Relevance"),
(provider.context_relevance_with_cot_reasons, np.mean, "question",
question, "context", context, "Context Relevance")
]:
f = Feedback(f_imp, if_exists=context, name = f_name).aggregate(f_agg)
f = Feedback(f_imp, if_exists=context, name=f_name).aggregate(f_agg)
if arg1lens is not None:
f = f.on(**{arg1name: arg1lens})
else:
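For completeness, the `rag_triad` helper being reformatted here bundles exactly these three feedbacks. A hedged usage sketch, assuming the helper takes a provider plus question/answer/context lenses as the loop above suggests (the import path follows this file's location, and `rag_chain` is an already-built app):

from trulens_eval import Select
from trulens_eval.app import App
from trulens_eval.feedback.feedback import rag_triad
from trulens_eval.feedback.provider.openai import OpenAI

provider = OpenAI()
context = App.select_context(rag_chain)  # lens over the app's retrieved context

# Builds the Groundedness, Answer Relevance and Context Relevance feedbacks.
feedbacks = rag_triad(
    provider=provider,
    question=Select.RecordInput,   # assumption: lens for the user question
    answer=Select.RecordOutput,    # assumption: lens for the app's answer
    context=context,
)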