From 1c2a6462dba17fd1328b12472ba5c69e7eb5181c Mon Sep 17 00:00:00 2001 From: Josh Reini <60949774+joshreini1@users.noreply.github.com> Date: Fri, 8 Mar 2024 18:01:49 -0500 Subject: [PATCH] QS Relevance -> Context Relevance (#977) * qs relevance -> context relevance * update quickstart notebooks * update langchain quickstart * update missed prompt change --- .../quickstart/langchain_quickstart.ipynb | 16 ++- .../quickstart/llama_index_quickstart.ipynb | 24 ++-- .../examples/quickstart/quickstart.ipynb | 17 ++- .../quickstart/text2text_quickstart.ipynb | 6 +- trulens_eval/trulens_eval/feedback/prompts.py | 2 +- .../trulens_eval/feedback/provider/base.py | 126 +++++++++++++++--- .../trulens_eval/feedback/v2/feedback.py | 56 ++------ 7 files changed, 150 insertions(+), 97 deletions(-) diff --git a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb index 79358d025..5b99175a2 100644 --- a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb @@ -56,7 +56,7 @@ "outputs": [], "source": [ "# Imports main tools:\n", - "from trulens_eval import TruChain, Feedback, Tru\n", + "from trulens_eval import TruChain, Tru\n", "tru = Tru()\n", "tru.reset_database()\n", "\n", @@ -184,10 +184,11 @@ "outputs": [], "source": [ "from trulens_eval.feedback.provider import OpenAI\n", + "from trulens_eval import Feedback\n", "import numpy as np\n", "\n", "# Initialize provider class\n", - "openai = OpenAI()\n", + "provider = OpenAI()\n", "\n", "# select context to be used in feedback. the location of context is app specific.\n", "from trulens_eval.app import App\n", @@ -204,10 +205,13 @@ ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = Feedback(openai.relevance).on_input_output()\n", + "f_answer_relevance = (\n", + " Feedback(provider.relevance)\n", + " .on_input_output()\n", + ")\n", "# Question/statement relevance between question and each context chunk.\n", "f_context_relevance = (\n", - " Feedback(openai.qs_relevance)\n", + " Feedback(provider.context_relevance_with_cot_reasons)\n", " .on_input()\n", " .on(context)\n", " .aggregate(np.mean)\n", @@ -230,7 +234,7 @@ "source": [ "tru_recorder = TruChain(rag_chain,\n", " app_id='Chain1_ChatApplication',\n", - " feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness])" + " feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])" ] }, { @@ -437,7 +441,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb b/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb index 04ddbdde0..8c45cc553 100644 --- a/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb @@ -147,22 +147,20 @@ "metadata": {}, "outputs": [], "source": [ + "from trulens_eval.feedback.provider import OpenAI\n", + "from trulens_eval import Feedback\n", "import numpy as np\n", "\n", "# Initialize provider class\n", - "from trulens_eval.feedback.provider.openai import OpenAI\n", - "openai = OpenAI()\n", + "provider = OpenAI()\n", "\n", "# select context to be used in feedback. 
the location of context is app specific.\n", "from trulens_eval.app import App\n", "context = App.select_context(query_engine)\n", "\n", - "# imports for feedback\n", - "from trulens_eval import Feedback\n", - "\n", - "# Define a groundedness feedback function\n", "from trulens_eval.feedback import Groundedness\n", "grounded = Groundedness(groundedness_provider=OpenAI())\n", + "# Define a groundedness feedback function\n", "f_groundedness = (\n", " Feedback(grounded.groundedness_measure_with_cot_reasons)\n", " .on(context.collect()) # collect context chunks into a list\n", @@ -171,11 +169,13 @@ ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = Feedback(openai.relevance).on_input_output()\n", - "\n", + "f_answer_relevance = (\n", + " Feedback(provider.relevance)\n", + " .on_input_output()\n", + ")\n", "# Question/statement relevance between question and each context chunk.\n", - "f_qs_relevance = (\n", - " Feedback(openai.qs_relevance)\n", + "f_context_relevance = (\n", + " Feedback(provider.context_relevance_with_cot_reasons)\n", " .on_input()\n", " .on(context)\n", " .aggregate(np.mean)\n", @@ -199,7 +199,7 @@ "from trulens_eval import TruLlama\n", "tru_query_engine_recorder = TruLlama(query_engine,\n", " app_id='LlamaIndex_App1',\n", - " feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance])" + " feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance])" ] }, { @@ -326,7 +326,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.11.5" }, "vscode": { "interpreter": { diff --git a/trulens_eval/examples/quickstart/quickstart.ipynb b/trulens_eval/examples/quickstart/quickstart.ipynb index dbe2014a1..7d6dff3b4 100644 --- a/trulens_eval/examples/quickstart/quickstart.ipynb +++ b/trulens_eval/examples/quickstart/quickstart.ipynb @@ -204,14 +204,13 @@ "source": [ "from trulens_eval import Feedback, Select\n", "from trulens_eval.feedback import Groundedness\n", - "from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI\n", + "from trulens_eval.feedback.provider.openai import OpenAI\n", "\n", "import numpy as np\n", "\n", - "# Initialize provider class\n", - "fopenai = fOpenAI()\n", + "provider = OpenAI()\n", "\n", - "grounded = Groundedness(groundedness_provider=fopenai)\n", + "grounded = Groundedness(groundedness_provider=provider)\n", "\n", "# Define a groundedness feedback function\n", "f_groundedness = (\n", @@ -222,15 +221,15 @@ ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = (\n", - " Feedback(fopenai.relevance_with_cot_reasons, name = \"Answer Relevance\")\n", + "f_answer_relevance = (\n", + " Feedback(provider.relevance_with_cot_reasons, name = \"Answer Relevance\")\n", " .on(Select.RecordCalls.retrieve.args.query)\n", " .on_output()\n", ")\n", "\n", "# Question/statement relevance between question and each context chunk.\n", "f_context_relevance = (\n", - " Feedback(fopenai.qs_relevance_with_cot_reasons, name = \"Context Relevance\")\n", + " Feedback(provider.context_relevance_with_cot_reasons, name = \"Context Relevance\")\n", " .on(Select.RecordCalls.retrieve.args.query)\n", " .on(Select.RecordCalls.retrieve.rets.collect())\n", " .aggregate(np.mean)\n", @@ -254,7 +253,7 @@ "from trulens_eval import TruCustomApp\n", "tru_rag = TruCustomApp(rag,\n", " app_id = 'RAG v1',\n", - " feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance])" + " feedbacks = [f_groundedness, 
f_answer_relevance, f_context_relevance])" ] }, { @@ -310,7 +309,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/trulens_eval/examples/quickstart/text2text_quickstart.ipynb b/trulens_eval/examples/quickstart/text2text_quickstart.ipynb index 19ddadc9c..82b77fd85 100644 --- a/trulens_eval/examples/quickstart/text2text_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/text2text_quickstart.ipynb @@ -128,7 +128,7 @@ "fopenai = fOpenAI()\n", "\n", "# Define a relevance function from openai\n", - "f_relevance = Feedback(fopenai.relevance).on_input_output()" + "f_answer_relevance = Feedback(fopenai.relevance).on_input_output()" ] }, { @@ -146,7 +146,7 @@ "outputs": [], "source": [ "from trulens_eval import TruBasicApp\n", - "tru_llm_standalone_recorder = TruBasicApp(llm_standalone, app_id=\"Happy Bot\", feedbacks=[f_relevance])" + "tru_llm_standalone_recorder = TruBasicApp(llm_standalone, app_id=\"Happy Bot\", feedbacks=[f_answer_relevance])" ] }, { @@ -220,7 +220,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/trulens_eval/trulens_eval/feedback/prompts.py b/trulens_eval/trulens_eval/feedback/prompts.py index 5ca4c24c5..b679ef64e 100644 --- a/trulens_eval/trulens_eval/feedback/prompts.py +++ b/trulens_eval/trulens_eval/feedback/prompts.py @@ -46,7 +46,7 @@ STATEMENT: {hypothesis} """ -QS_RELEVANCE = v2.QuestionStatementRelevance.prompt.template +CONTEXT_RELEVANCE = v2.ContextRelevance.prompt.template PR_RELEVANCE = v2.PromptResponseRelevance.prompt.template SYSTEM_FIND_SUPPORTING = """ diff --git a/trulens_eval/trulens_eval/feedback/provider/base.py b/trulens_eval/trulens_eval/feedback/provider/base.py index af82a71d5..520f634a9 100644 --- a/trulens_eval/trulens_eval/feedback/provider/base.py +++ b/trulens_eval/trulens_eval/feedback/provider/base.py @@ -239,23 +239,56 @@ def generate_score_and_reasons( ) return score, {} - def qs_relevance(self, question: str, statement: str) -> float: + def context_relevance(self, question: str, context: str) -> float: """ Uses chat completion model. A function that completes a template to - check the relevance of the statement to the question. + check the relevance of the context to the question. + + Usage on RAG Contexts: ```python - feedback = Feedback(provider.qs_relevance).on_input_output() + from trulens_eval.app import App + context = App.select_context(rag_app) + feedback = ( + Feedback(provider.context_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) + ) ``` - The `on_input_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) + + The `on(...)` selector can be changed. See [Feedback Function Guide : + Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details) + + Args: + question (str): A question being asked. + context (str): Context related to the question. + + Returns: + float: A value between 0.0 (not relevant) and 1.0 (relevant). + """ + + return self.generate_score( + system_prompt=str.format( + prompts.CONTEXT_RELEVANCE, question=question, context=context + ) + ) + def qs_relevance(self, question: str, context: str) -> float: + """ + Uses chat completion model. A function that completes a template to + check the relevance of the statement to the question. 
Usage on RAG Contexts: ```python - feedback = Feedback(provider.qs_relevance).on_input().on( - TruLlama.select_source_nodes().node.text # See note below - ).aggregate(np.mean) + from trulens_eval.app import App + context = App.select_context(rag_app) + feedback = ( + Feedback(provider.context_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) + ) ``` The `on(...)` selector can be changed. See [Feedback Function Guide : @@ -263,53 +296,102 @@ def qs_relevance(self, question: str, statement: str) -> float: Args: question (str): A question being asked. - statement (str): A statement to the question. + context (str): A context to the question. Returns: float: A value between 0.0 (not relevant) and 1.0 (relevant). """ + + warnings.warn( + "The method 'qs_relevance' is deprecated and will be removed in future versions. " + "Please use 'context_relevance' instead.", DeprecationWarning + ) + return self.generate_score( system_prompt=str.format( - prompts.QS_RELEVANCE, question=question, statement=statement + prompts.CONTEXT_RELEVANCE, question=question, context=context ) ) - def qs_relevance_with_cot_reasons(self, question: str, - statement: str) -> Tuple[float, Dict]: + def context_relevance_with_cot_reasons(self, question: str, + context: str) -> Tuple[float, Dict]: """ Uses chat completion model. A function that completes a - template to check the relevance of the statement to the question. + template to check the relevance of the context to the question. Also uses chain of thought methodology and emits the reasons. **Usage:** + Usage on RAG Contexts: + ``` - feedback = Feedback(provider.qs_relevance_with_cot_reasons).on_input_output() + from trulens_eval.app import App + context = App.select_context(rag_app) + feedback = ( + Feedback(provider.context_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) + ) ``` - The `on_input_output()` selector can be changed. See [Feedback Function Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - + The `on(...)` selector can be changed. See [Feedback Function Guide : Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details) + + + Args: + question (str): A question being asked. + context (str): Context related to the question. + + Returns: + float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant". + """ + system_prompt = str.format( + prompts.CONTEXT_RELEVANCE, question=question, context=context + ) + system_prompt = system_prompt.replace( + "RELEVANCE:", prompts.COT_REASONS_TEMPLATE + ) + return self.generate_score_and_reasons(system_prompt) + + def qs_relevance_with_cot_reasons(self, question: str, + context: str) -> Tuple[float, Dict]: + """ + Uses chat completion model. A function that completes a + template to check the relevance of the context to the question. + Also uses chain of thought methodology and emits the reasons. + + **Usage:** Usage on RAG Contexts: ``` - feedback = Feedback(provider.qs_relevance_with_cot_reasons).on_input().on( - TruLlama.select_source_nodes().node.text # See note below - ).aggregate(np.mean) - + from trulens_eval.app import App + context = App.select_context(rag_app) + feedback = ( + Feedback(provider.qs_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) + ) ``` The `on(...)` selector can be changed. See [Feedback Function Guide : Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details) Args: question (str): A question being asked. 
- statement (str): A statement to the question. + context (str): Context related to the question. Returns: float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant". """ system_prompt = str.format( - prompts.QS_RELEVANCE, question=question, statement=statement + prompts.CONTEXT_RELEVANCE, question=question, context=context ) system_prompt = system_prompt.replace( "RELEVANCE:", prompts.COT_REASONS_TEMPLATE ) + + warnings.warn( + "The method 'qs_relevance_with_cot_reasons' is deprecated and will be removed in future versions. " + "Please use 'context_relevance_with_cot_reasons' instead.", DeprecationWarning + ) + return self.generate_score_and_reasons(system_prompt) def relevance(self, prompt: str, response: str) -> float: diff --git a/trulens_eval/trulens_eval/feedback/v2/feedback.py b/trulens_eval/trulens_eval/feedback/v2/feedback.py index fb8093d7d..b774ed633 100644 --- a/trulens_eval/trulens_eval/feedback/v2/feedback.py +++ b/trulens_eval/trulens_eval/feedback/v2/feedback.py @@ -139,38 +139,8 @@ class Relevance(Semantics): This evaluates the *relevance* of the LLM response to the given text by LLM prompting. -Relevance is currently only available with OpenAI ChatCompletion API. - -TruLens offers two particular flavors of relevance: 1. *Prompt response -relevance* is best for measuring the relationship of the final answer to the -user inputed question. This flavor of relevance is particularly optimized for -the following features: - - * Relevance requires adherence to the entire prompt. - * Responses that don't provide a definitive answer can still be relevant - * Admitting lack of knowledge and refusals are still relevant. - * Feedback mechanism should differentiate between seeming and actual - relevance. - * Relevant but inconclusive statements should get increasingly high scores - as they are more helpful for answering the query. - - You can read more information about the performance of prompt response - relevance by viewing its [smoke test results](../pr_relevance_smoke_tests/). - -2. *Question statement relevance*, sometimes known as context relevance, is best - for measuring the relationship of a provided context to the user inputed - question. This flavor of relevance is optimized for a slightly different set - of features: - * Relevance requires adherence to the entire query. - * Long context with small relevant chunks are relevant. - * Context that provides no answer can still be relevant. - * Feedback mechanism should differentiate between seeming and actual - relevance. - * Relevant but inconclusive statements should get increasingly high scores - as they are more helpful for answering the query. - - You can read more information about the performance of question statement - relevance by viewing its [smoke test results](../qs_relevance_smoke_tests/). +Relevance is available for any LLM provider. + """ # openai.relevance # openai.relevance_with_cot_reasons @@ -194,37 +164,35 @@ class Groundedness(Semantics, WithPrompt): ) -class QuestionStatementRelevance(Relevance, WithPrompt): +class ContextRelevance(Relevance, WithPrompt): # openai.qs_relevance # openai.qs_relevance_with_cot_reasons prompt: ClassVar[PromptTemplate] = PromptTemplate.from_template( - """You are a RELEVANCE grader; providing the relevance of the given STATEMENT to the given QUESTION. + """You are a RELEVANCE grader; providing the relevance of the given CONTEXT to the given QUESTION. Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant. 
A few additional scoring guidelines: -- Long STATEMENTS should score equally well as short STATEMENTS. - -- RELEVANCE score should increase as the STATEMENT provides more RELEVANT context to the QUESTION. +- Long CONTEXTS should score equally well as short CONTEXTS. -- RELEVANCE score should increase as the STATEMENT provides RELEVANT context to more parts of the QUESTION. +- RELEVANCE score should increase as the CONTEXTS provides more RELEVANT context to the QUESTION. -- STATEMENT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE. +- RELEVANCE score should increase as the CONTEXTS provides RELEVANT context to more parts of the QUESTION. -- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE. +- CONTEXT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE. -- STATEMENT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE. +- CONTEXT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE. -- STATEMENT must be relevant and helpful for answering the entire QUESTION to get a score of 10. +- CONTEXT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE. -- Answers that intentionally do not answer the question, such as 'I don't know', should also be counted as the most relevant. +- CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 10. - Never elaborate. QUESTION: {question} -STATEMENT: {statement} +CONTEXT: {context} RELEVANCE: """ )
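
For reference, below is a minimal usage sketch of the renamed feedback functions, assembled from the examples in the updated docstrings and quickstarts above. It is an illustration, not part of the patch: `rag_app` stands in for an already-instrumented RAG application (e.g., a LangChain or LlamaIndex app) and is assumed, as is a trulens_eval build that includes these methods.

```python
# Minimal sketch of the renamed feedback functions, following the updated docstrings.
# Assumes `rag_app` is an already-built, instrumentable RAG app and that an OpenAI
# key is configured for the provider.
import numpy as np

from trulens_eval import Feedback
from trulens_eval.app import App
from trulens_eval.feedback.provider import OpenAI

provider = OpenAI()

# Select the retrieved context; the exact location of context is app specific.
context = App.select_context(rag_app)

# Context relevance between the question and each retrieved context chunk.
# (The old qs_relevance / qs_relevance_with_cot_reasons now emit a DeprecationWarning.)
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

# Answer relevance between the overall question and the final answer.
f_answer_relevance = Feedback(provider.relevance).on_input_output()
```

The deprecated `qs_relevance` methods still run in this patch but warn on use, so updating call sites as above avoids the `DeprecationWarning`.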