From 1c2a6462dba17fd1328b12472ba5c69e7eb5181c Mon Sep 17 00:00:00 2001 From: Josh Reini <60949774+joshreini1@users.noreply.github.com> Date: Fri, 8 Mar 2024 18:01:49 -0500 Subject: [PATCH] QS Relevance -> Context Relevance (#977) * qs relevance -> context relevance * update quickstart notebooks * update langchain quickstart * update missed prompt change --- .../quickstart/langchain_quickstart.ipynb | 16 ++- .../quickstart/llama_index_quickstart.ipynb | 24 ++-- .../examples/quickstart/quickstart.ipynb | 17 ++- .../quickstart/text2text_quickstart.ipynb | 6 +- trulens_eval/trulens_eval/feedback/prompts.py | 2 +- .../trulens_eval/feedback/provider/base.py | 126 +++++++++++++++--- .../trulens_eval/feedback/v2/feedback.py | 56 ++------ 7 files changed, 150 insertions(+), 97 deletions(-) diff --git a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb index 79358d025..5b99175a2 100644 --- a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb @@ -56,7 +56,7 @@ "outputs": [], "source": [ "# Imports main tools:\n", - "from trulens_eval import TruChain, Feedback, Tru\n", + "from trulens_eval import TruChain, Tru\n", "tru = Tru()\n", "tru.reset_database()\n", "\n", @@ -184,10 +184,11 @@ "outputs": [], "source": [ "from trulens_eval.feedback.provider import OpenAI\n", + "from trulens_eval import Feedback\n", "import numpy as np\n", "\n", "# Initialize provider class\n", - "openai = OpenAI()\n", + "provider = OpenAI()\n", "\n", "# select context to be used in feedback. the location of context is app specific.\n", "from trulens_eval.app import App\n", @@ -204,10 +205,13 @@ ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = Feedback(openai.relevance).on_input_output()\n", + "f_answer_relevance = (\n", + " Feedback(provider.relevance)\n", + " .on_input_output()\n", + ")\n", "# Question/statement relevance between question and each context chunk.\n", "f_context_relevance = (\n", - " Feedback(openai.qs_relevance)\n", + " Feedback(provider.context_relevance_with_cot_reasons)\n", " .on_input()\n", " .on(context)\n", " .aggregate(np.mean)\n", @@ -230,7 +234,7 @@ "source": [ "tru_recorder = TruChain(rag_chain,\n", " app_id='Chain1_ChatApplication',\n", - " feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness])" + " feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])" ] }, { @@ -437,7 +441,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb b/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb index 04ddbdde0..8c45cc553 100644 --- a/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb @@ -147,22 +147,20 @@ "metadata": {}, "outputs": [], "source": [ + "from trulens_eval.feedback.provider import OpenAI\n", + "from trulens_eval import Feedback\n", "import numpy as np\n", "\n", "# Initialize provider class\n", - "from trulens_eval.feedback.provider.openai import OpenAI\n", - "openai = OpenAI()\n", + "provider = OpenAI()\n", "\n", "# select context to be used in feedback. 
the location of context is app specific.\n", "from trulens_eval.app import App\n", "context = App.select_context(query_engine)\n", "\n", - "# imports for feedback\n", - "from trulens_eval import Feedback\n", - "\n", - "# Define a groundedness feedback function\n", "from trulens_eval.feedback import Groundedness\n", "grounded = Groundedness(groundedness_provider=OpenAI())\n", + "# Define a groundedness feedback function\n", "f_groundedness = (\n", " Feedback(grounded.groundedness_measure_with_cot_reasons)\n", " .on(context.collect()) # collect context chunks into a list\n", @@ -171,11 +169,13 @@ ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = Feedback(openai.relevance).on_input_output()\n", - "\n", + "f_answer_relevance = (\n", + " Feedback(provider.relevance)\n", + " .on_input_output()\n", + ")\n", "# Question/statement relevance between question and each context chunk.\n", - "f_qs_relevance = (\n", - " Feedback(openai.qs_relevance)\n", + "f_context_relevance = (\n", + " Feedback(provider.context_relevance_with_cot_reasons)\n", " .on_input()\n", " .on(context)\n", " .aggregate(np.mean)\n", @@ -199,7 +199,7 @@ "from trulens_eval import TruLlama\n", "tru_query_engine_recorder = TruLlama(query_engine,\n", " app_id='LlamaIndex_App1',\n", - " feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance])" + " feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance])" ] }, { @@ -326,7 +326,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.11.5" }, "vscode": { "interpreter": { diff --git a/trulens_eval/examples/quickstart/quickstart.ipynb b/trulens_eval/examples/quickstart/quickstart.ipynb index dbe2014a1..7d6dff3b4 100644 --- a/trulens_eval/examples/quickstart/quickstart.ipynb +++ b/trulens_eval/examples/quickstart/quickstart.ipynb @@ -204,14 +204,13 @@ "source": [ "from trulens_eval import Feedback, Select\n", "from trulens_eval.feedback import Groundedness\n", - "from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI\n", + "from trulens_eval.feedback.provider.openai import OpenAI\n", "\n", "import numpy as np\n", "\n", - "# Initialize provider class\n", - "fopenai = fOpenAI()\n", + "provider = OpenAI()\n", "\n", - "grounded = Groundedness(groundedness_provider=fopenai)\n", + "grounded = Groundedness(groundedness_provider=provider)\n", "\n", "# Define a groundedness feedback function\n", "f_groundedness = (\n", @@ -222,15 +221,15 @@ ")\n", "\n", "# Question/answer relevance between overall question and answer.\n", - "f_qa_relevance = (\n", - " Feedback(fopenai.relevance_with_cot_reasons, name = \"Answer Relevance\")\n", + "f_answer_relevance = (\n", + " Feedback(provider.relevance_with_cot_reasons, name = \"Answer Relevance\")\n", " .on(Select.RecordCalls.retrieve.args.query)\n", " .on_output()\n", ")\n", "\n", "# Question/statement relevance between question and each context chunk.\n", "f_context_relevance = (\n", - " Feedback(fopenai.qs_relevance_with_cot_reasons, name = \"Context Relevance\")\n", + " Feedback(provider.context_relevance_with_cot_reasons, name = \"Context Relevance\")\n", " .on(Select.RecordCalls.retrieve.args.query)\n", " .on(Select.RecordCalls.retrieve.rets.collect())\n", " .aggregate(np.mean)\n", @@ -254,7 +253,7 @@ "from trulens_eval import TruCustomApp\n", "tru_rag = TruCustomApp(rag,\n", " app_id = 'RAG v1',\n", - " feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance])" + " feedbacks = [f_groundedness, 
f_answer_relevance, f_context_relevance])" ] }, { @@ -310,7 +309,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/trulens_eval/examples/quickstart/text2text_quickstart.ipynb b/trulens_eval/examples/quickstart/text2text_quickstart.ipynb index 19ddadc9c..82b77fd85 100644 --- a/trulens_eval/examples/quickstart/text2text_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/text2text_quickstart.ipynb @@ -128,7 +128,7 @@ "fopenai = fOpenAI()\n", "\n", "# Define a relevance function from openai\n", - "f_relevance = Feedback(fopenai.relevance).on_input_output()" + "f_answer_relevance = Feedback(fopenai.relevance).on_input_output()" ] }, { @@ -146,7 +146,7 @@ "outputs": [], "source": [ "from trulens_eval import TruBasicApp\n", - "tru_llm_standalone_recorder = TruBasicApp(llm_standalone, app_id=\"Happy Bot\", feedbacks=[f_relevance])" + "tru_llm_standalone_recorder = TruBasicApp(llm_standalone, app_id=\"Happy Bot\", feedbacks=[f_answer_relevance])" ] }, { @@ -220,7 +220,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/trulens_eval/trulens_eval/feedback/prompts.py b/trulens_eval/trulens_eval/feedback/prompts.py index 5ca4c24c5..b679ef64e 100644 --- a/trulens_eval/trulens_eval/feedback/prompts.py +++ b/trulens_eval/trulens_eval/feedback/prompts.py @@ -46,7 +46,7 @@ STATEMENT: {hypothesis} """ -QS_RELEVANCE = v2.QuestionStatementRelevance.prompt.template +CONTEXT_RELEVANCE = v2.ContextRelevance.prompt.template PR_RELEVANCE = v2.PromptResponseRelevance.prompt.template SYSTEM_FIND_SUPPORTING = """ diff --git a/trulens_eval/trulens_eval/feedback/provider/base.py b/trulens_eval/trulens_eval/feedback/provider/base.py index af82a71d5..520f634a9 100644 --- a/trulens_eval/trulens_eval/feedback/provider/base.py +++ b/trulens_eval/trulens_eval/feedback/provider/base.py @@ -239,23 +239,56 @@ def generate_score_and_reasons( ) return score, {} - def qs_relevance(self, question: str, statement: str) -> float: + def context_relevance(self, question: str, context: str) -> float: """ Uses chat completion model. A function that completes a template to - check the relevance of the statement to the question. + check the relevance of the context to the question. + + Usage on RAG Contexts: ```python - feedback = Feedback(provider.qs_relevance).on_input_output() + from trulens_eval.app import App + context = App.select_context(rag_app) + feedback = ( + Feedback(provider.context_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) + ) ``` - The `on_input_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) + + The `on(...)` selector can be changed. See [Feedback Function Guide : + Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details) + + Args: + question (str): A question being asked. + context (str): Context related to the question. + + Returns: + float: A value between 0.0 (not relevant) and 1.0 (relevant). + """ + + return self.generate_score( + system_prompt=str.format( + prompts.CONTEXT_RELEVANCE, question=question, context=context + ) + ) + def qs_relevance(self, question: str, context: str) -> float: + """ + Uses chat completion model. A function that completes a template to + check the relevance of the statement to the question. 
Usage on RAG Contexts: ```python - feedback = Feedback(provider.qs_relevance).on_input().on( - TruLlama.select_source_nodes().node.text # See note below - ).aggregate(np.mean) + from trulens_eval.app import App + context = App.select_context(rag_app) + feedback = ( + Feedback(provider.context_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) + ) ``` The `on(...)` selector can be changed. See [Feedback Function Guide : @@ -263,53 +296,102 @@ def qs_relevance(self, question: str, statement: str) -> float: Args: question (str): A question being asked. - statement (str): A statement to the question. + context (str): A context to the question. Returns: float: A value between 0.0 (not relevant) and 1.0 (relevant). """ + + warnings.warn( + "The method 'qs_relevance' is deprecated and will be removed in future versions. " + "Please use 'context_relevance' instead.", DeprecationWarning + ) + return self.generate_score( system_prompt=str.format( - prompts.QS_RELEVANCE, question=question, statement=statement + prompts.CONTEXT_RELEVANCE, question=question, context=context ) ) - def qs_relevance_with_cot_reasons(self, question: str, - statement: str) -> Tuple[float, Dict]: + def context_relevance_with_cot_reasons(self, question: str, + context: str) -> Tuple[float, Dict]: """ Uses chat completion model. A function that completes a - template to check the relevance of the statement to the question. + template to check the relevance of the context to the question. Also uses chain of thought methodology and emits the reasons. **Usage:** + Usage on RAG Contexts: + ``` - feedback = Feedback(provider.qs_relevance_with_cot_reasons).on_input_output() + from trulens_eval.app import App + context = App.select_context(rag_app) + feedback = ( + Feedback(provider.context_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) + ) ``` - The `on_input_output()` selector can be changed. See [Feedback Function Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - + The `on(...)` selector can be changed. See [Feedback Function Guide : Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details) + + + Args: + question (str): A question being asked. + context (str): Context related to the question. + + Returns: + float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant". + """ + system_prompt = str.format( + prompts.CONTEXT_RELEVANCE, question=question, context=context + ) + system_prompt = system_prompt.replace( + "RELEVANCE:", prompts.COT_REASONS_TEMPLATE + ) + return self.generate_score_and_reasons(system_prompt) + + def qs_relevance_with_cot_reasons(self, question: str, + context: str) -> Tuple[float, Dict]: + """ + Uses chat completion model. A function that completes a + template to check the relevance of the context to the question. + Also uses chain of thought methodology and emits the reasons. + + **Usage:** Usage on RAG Contexts: ``` - feedback = Feedback(provider.qs_relevance_with_cot_reasons).on_input().on( - TruLlama.select_source_nodes().node.text # See note below - ).aggregate(np.mean) - + from trulens_eval.app import App + context = App.select_context(rag_app) + feedback = ( + Feedback(provider.qs_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) + ) ``` The `on(...)` selector can be changed. See [Feedback Function Guide : Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details) Args: question (str): A question being asked. 
- statement (str): A statement to the question. + context (str): Context related to the question. Returns: float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant". """ system_prompt = str.format( - prompts.QS_RELEVANCE, question=question, statement=statement + prompts.CONTEXT_RELEVANCE, question=question, context=context ) system_prompt = system_prompt.replace( "RELEVANCE:", prompts.COT_REASONS_TEMPLATE ) + + warnings.warn( + "The method 'qs_relevance_with_cot_reasons' is deprecated and will be removed in future versions. " + "Please use 'context_relevance_with_cot_reasons' instead.", DeprecationWarning + ) + return self.generate_score_and_reasons(system_prompt) def relevance(self, prompt: str, response: str) -> float: diff --git a/trulens_eval/trulens_eval/feedback/v2/feedback.py b/trulens_eval/trulens_eval/feedback/v2/feedback.py index fb8093d7d..b774ed633 100644 --- a/trulens_eval/trulens_eval/feedback/v2/feedback.py +++ b/trulens_eval/trulens_eval/feedback/v2/feedback.py @@ -139,38 +139,8 @@ class Relevance(Semantics): This evaluates the *relevance* of the LLM response to the given text by LLM prompting. -Relevance is currently only available with OpenAI ChatCompletion API. - -TruLens offers two particular flavors of relevance: 1. *Prompt response -relevance* is best for measuring the relationship of the final answer to the -user inputed question. This flavor of relevance is particularly optimized for -the following features: - - * Relevance requires adherence to the entire prompt. - * Responses that don't provide a definitive answer can still be relevant - * Admitting lack of knowledge and refusals are still relevant. - * Feedback mechanism should differentiate between seeming and actual - relevance. - * Relevant but inconclusive statements should get increasingly high scores - as they are more helpful for answering the query. - - You can read more information about the performance of prompt response - relevance by viewing its [smoke test results](../pr_relevance_smoke_tests/). - -2. *Question statement relevance*, sometimes known as context relevance, is best - for measuring the relationship of a provided context to the user inputed - question. This flavor of relevance is optimized for a slightly different set - of features: - * Relevance requires adherence to the entire query. - * Long context with small relevant chunks are relevant. - * Context that provides no answer can still be relevant. - * Feedback mechanism should differentiate between seeming and actual - relevance. - * Relevant but inconclusive statements should get increasingly high scores - as they are more helpful for answering the query. - - You can read more information about the performance of question statement - relevance by viewing its [smoke test results](../qs_relevance_smoke_tests/). +Relevance is available for any LLM provider. + """ # openai.relevance # openai.relevance_with_cot_reasons @@ -194,37 +164,35 @@ class Groundedness(Semantics, WithPrompt): ) -class QuestionStatementRelevance(Relevance, WithPrompt): +class ContextRelevance(Relevance, WithPrompt): # openai.qs_relevance # openai.qs_relevance_with_cot_reasons prompt: ClassVar[PromptTemplate] = PromptTemplate.from_template( - """You are a RELEVANCE grader; providing the relevance of the given STATEMENT to the given QUESTION. + """You are a RELEVANCE grader; providing the relevance of the given CONTEXT to the given QUESTION. Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant. 
A few additional scoring guidelines: -- Long STATEMENTS should score equally well as short STATEMENTS. - -- RELEVANCE score should increase as the STATEMENT provides more RELEVANT context to the QUESTION. +- Long CONTEXTS should score equally well as short CONTEXTS. -- RELEVANCE score should increase as the STATEMENT provides RELEVANT context to more parts of the QUESTION. +- RELEVANCE score should increase as the CONTEXTS provides more RELEVANT context to the QUESTION. -- STATEMENT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE. +- RELEVANCE score should increase as the CONTEXTS provides RELEVANT context to more parts of the QUESTION. -- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE. +- CONTEXT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE. -- STATEMENT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE. +- CONTEXT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE. -- STATEMENT must be relevant and helpful for answering the entire QUESTION to get a score of 10. +- CONTEXT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE. -- Answers that intentionally do not answer the question, such as 'I don't know', should also be counted as the most relevant. +- CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 10. - Never elaborate. QUESTION: {question} -STATEMENT: {statement} +CONTEXT: {context} RELEVANCE: """ )
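
For reference, below is a minimal usage sketch of the renamed feedback functions, assembled from the examples in the updated docstrings and quickstarts above. It is an illustration, not part of the patch: `rag_app` stands in for an already-instrumented RAG application (e.g., a LangChain or LlamaIndex app) and is assumed, as is a trulens_eval build that includes these methods.

```python
# Minimal sketch of the renamed feedback functions, following the updated docstrings.
# Assumes `rag_app` is an already-built, instrumentable RAG app and that an OpenAI
# key is configured for the provider.
import numpy as np

from trulens_eval import Feedback
from trulens_eval.app import App
from trulens_eval.feedback.provider import OpenAI

provider = OpenAI()

# Select the retrieved context; the exact location of context is app specific.
context = App.select_context(rag_app)

# Context relevance between the question and each retrieved context chunk.
# (The old qs_relevance / qs_relevance_with_cot_reasons now emit a DeprecationWarning.)
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

# Answer relevance between the overall question and the final answer.
f_answer_relevance = Feedback(provider.relevance).on_input_output()
```

The deprecated `qs_relevance` methods still run in this patch but warn on use, so updating call sites as above avoids the `DeprecationWarning`.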