QS Relevance -> Context Relevance (#977)
* qs relevance -> context relevance

* update quickstart notebooks

* update langchain quickstart

* update missed prompt change
joshreini1 authored Mar 8, 2024
1 parent f77e7ee commit 1c2a646
Showing 7 changed files with 150 additions and 97 deletions.
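
Before the per-file diffs, here is a minimal sketch of the rename this commit applies throughout the quickstarts. It is an illustration only: `rag_chain` is a placeholder for whatever instrumented app a quickstart builds, and an OpenAI API key is assumed, as in the notebooks.

```python
import numpy as np

from trulens_eval import Feedback
from trulens_eval.app import App
from trulens_eval.feedback.provider import OpenAI

provider = OpenAI()

# `rag_chain` stands in for the instrumented RAG app built in a quickstart.
context = App.select_context(rag_chain)

# Before this commit (now deprecated):
# f_context_relevance = (
#     Feedback(provider.qs_relevance).on_input().on(context).aggregate(np.mean)
# )

# After this commit:
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)
```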
16 changes: 10 additions & 6 deletions trulens_eval/examples/quickstart/langchain_quickstart.ipynb
@@ -56,7 +56,7 @@
"outputs": [],
"source": [
"# Imports main tools:\n",
"from trulens_eval import TruChain, Feedback, Tru\n",
"from trulens_eval import TruChain, Tru\n",
"tru = Tru()\n",
"tru.reset_database()\n",
"\n",
@@ -184,10 +184,11 @@
"outputs": [],
"source": [
"from trulens_eval.feedback.provider import OpenAI\n",
"from trulens_eval import Feedback\n",
"import numpy as np\n",
"\n",
"# Initialize provider class\n",
"openai = OpenAI()\n",
"provider = OpenAI()\n",
"\n",
"# select context to be used in feedback. the location of context is app specific.\n",
"from trulens_eval.app import App\n",
@@ -204,10 +205,13 @@
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
"f_qa_relevance = Feedback(openai.relevance).on_input_output()\n",
"f_answer_relevance = (\n",
" Feedback(provider.relevance)\n",
" .on_input_output()\n",
")\n",
"# Question/statement relevance between question and each context chunk.\n",
"f_context_relevance = (\n",
" Feedback(openai.qs_relevance)\n",
" Feedback(provider.context_relevance_with_cot_reasons)\n",
" .on_input()\n",
" .on(context)\n",
" .aggregate(np.mean)\n",
@@ -230,7 +234,7 @@
"source": [
"tru_recorder = TruChain(rag_chain,\n",
" app_id='Chain1_ChatApplication',\n",
" feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness])"
" feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])"
]
},
{
@@ -437,7 +441,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
"version": "3.9.18"
},
"vscode": {
"interpreter": {
24 changes: 12 additions & 12 deletions trulens_eval/examples/quickstart/llama_index_quickstart.ipynb
@@ -147,22 +147,20 @@
"metadata": {},
"outputs": [],
"source": [
"from trulens_eval.feedback.provider import OpenAI\n",
"from trulens_eval import Feedback\n",
"import numpy as np\n",
"\n",
"# Initialize provider class\n",
"from trulens_eval.feedback.provider.openai import OpenAI\n",
"openai = OpenAI()\n",
"provider = OpenAI()\n",
"\n",
"# select context to be used in feedback. the location of context is app specific.\n",
"from trulens_eval.app import App\n",
"context = App.select_context(query_engine)\n",
"\n",
"# imports for feedback\n",
"from trulens_eval import Feedback\n",
"\n",
"# Define a groundedness feedback function\n",
"from trulens_eval.feedback import Groundedness\n",
"grounded = Groundedness(groundedness_provider=OpenAI())\n",
"# Define a groundedness feedback function\n",
"f_groundedness = (\n",
" Feedback(grounded.groundedness_measure_with_cot_reasons)\n",
" .on(context.collect()) # collect context chunks into a list\n",
@@ -171,11 +169,13 @@
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
"f_qa_relevance = Feedback(openai.relevance).on_input_output()\n",
"\n",
"f_answer_relevance = (\n",
" Feedback(provider.relevance)\n",
" .on_input_output()\n",
")\n",
"# Question/statement relevance between question and each context chunk.\n",
"f_qs_relevance = (\n",
" Feedback(openai.qs_relevance)\n",
"f_context_relevance = (\n",
" Feedback(provider.context_relevance_with_cot_reasons)\n",
" .on_input()\n",
" .on(context)\n",
" .aggregate(np.mean)\n",
@@ -199,7 +199,7 @@
"from trulens_eval import TruLlama\n",
"tru_query_engine_recorder = TruLlama(query_engine,\n",
" app_id='LlamaIndex_App1',\n",
" feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance])"
" feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance])"
]
},
{
@@ -326,7 +326,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.5"
},
"vscode": {
"interpreter": {
17 changes: 8 additions & 9 deletions trulens_eval/examples/quickstart/quickstart.ipynb
@@ -204,14 +204,13 @@
"source": [
"from trulens_eval import Feedback, Select\n",
"from trulens_eval.feedback import Groundedness\n",
"from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI\n",
"from trulens_eval.feedback.provider.openai import OpenAI\n",
"\n",
"import numpy as np\n",
"\n",
"# Initialize provider class\n",
"fopenai = fOpenAI()\n",
"provider = OpenAI()\n",
"\n",
"grounded = Groundedness(groundedness_provider=fopenai)\n",
"grounded = Groundedness(groundedness_provider=provider)\n",
"\n",
"# Define a groundedness feedback function\n",
"f_groundedness = (\n",
@@ -222,15 +221,15 @@
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
"f_qa_relevance = (\n",
" Feedback(fopenai.relevance_with_cot_reasons, name = \"Answer Relevance\")\n",
"f_answer_relevance = (\n",
" Feedback(provider.relevance_with_cot_reasons, name = \"Answer Relevance\")\n",
" .on(Select.RecordCalls.retrieve.args.query)\n",
" .on_output()\n",
")\n",
"\n",
"# Question/statement relevance between question and each context chunk.\n",
"f_context_relevance = (\n",
" Feedback(fopenai.qs_relevance_with_cot_reasons, name = \"Context Relevance\")\n",
" Feedback(provider.context_relevance_with_cot_reasons, name = \"Context Relevance\")\n",
" .on(Select.RecordCalls.retrieve.args.query)\n",
" .on(Select.RecordCalls.retrieve.rets.collect())\n",
" .aggregate(np.mean)\n",
@@ -254,7 +253,7 @@
"from trulens_eval import TruCustomApp\n",
"tru_rag = TruCustomApp(rag,\n",
" app_id = 'RAG v1',\n",
" feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance])"
" feedbacks = [f_groundedness, f_answer_relevance, f_context_relevance])"
]
},
{
@@ -310,7 +309,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
"version": "3.9.18"
}
},
"nbformat": 4,
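
Once these renamed feedbacks produce records, they surface in the dashboard and leaderboard under the display names set above. A brief sketch, assuming the quickstart's `tru = Tru()` instance and the `RAG v1` app id recorded above have already logged some calls:

```python
# Assumes the quickstart's `tru` instance and the recorded "RAG v1" app.
tru.get_leaderboard(app_ids=["RAG v1"])

# The dashboard reports the columns named above, e.g.
# "Answer Relevance" and "Context Relevance".
tru.run_dashboard()
```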
6 changes: 3 additions & 3 deletions trulens_eval/examples/quickstart/text2text_quickstart.ipynb
@@ -128,7 +128,7 @@
"fopenai = fOpenAI()\n",
"\n",
"# Define a relevance function from openai\n",
"f_relevance = Feedback(fopenai.relevance).on_input_output()"
"f_answer_relevance = Feedback(fopenai.relevance).on_input_output()"
]
},
{
@@ -146,7 +146,7 @@
"outputs": [],
"source": [
"from trulens_eval import TruBasicApp\n",
"tru_llm_standalone_recorder = TruBasicApp(llm_standalone, app_id=\"Happy Bot\", feedbacks=[f_relevance])"
"tru_llm_standalone_recorder = TruBasicApp(llm_standalone, app_id=\"Happy Bot\", feedbacks=[f_answer_relevance])"
]
},
{
@@ -220,7 +220,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.9.18"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion trulens_eval/trulens_eval/feedback/prompts.py
@@ -46,7 +46,7 @@
STATEMENT: {hypothesis}
"""

QS_RELEVANCE = v2.QuestionStatementRelevance.prompt.template
CONTEXT_RELEVANCE = v2.ContextRelevance.prompt.template
PR_RELEVANCE = v2.PromptResponseRelevance.prompt.template

SYSTEM_FIND_SUPPORTING = """
126 changes: 104 additions & 22 deletions trulens_eval/trulens_eval/feedback/provider/base.py
@@ -239,77 +239,159 @@ def generate_score_and_reasons(
)
return score, {}

def qs_relevance(self, question: str, statement: str) -> float:
def context_relevance(self, question: str, context: str) -> float:
"""
Uses chat completion model. A function that completes a template to
check the relevance of the statement to the question.
check the relevance of the context to the question.
Usage on RAG Contexts:
```python
feedback = Feedback(provider.qs_relevance).on_input_output()
from trulens_eval.app import App
context = App.select_context(rag_app)
feedback = (
Feedback(provider.context_relevance_with_cot_reasons)
.on_input()
.on(context)
.aggregate(np.mean)
)
```
The `on_input_output()` selector can be changed. See [Feedback Function
Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/)
The `on(...)` selector can be changed. See [Feedback Function Guide :
Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details)
Args:
question (str): A question being asked.
context (str): Context related to the question.
Returns:
float: A value between 0.0 (not relevant) and 1.0 (relevant).
"""

return self.generate_score(
system_prompt=str.format(
prompts.CONTEXT_RELEVANCE, question=question, context=context
)
)
def qs_relevance(self, question: str, context: str) -> float:
"""
Uses chat completion model. A function that completes a template to
check the relevance of the statement to the question.
Usage on RAG Contexts:
```python
feedback = Feedback(provider.qs_relevance).on_input().on(
TruLlama.select_source_nodes().node.text # See note below
).aggregate(np.mean)
from trulens_eval.app import App
context = App.select_context(rag_app)
feedback = (
Feedback(provider.context_relevance_with_cot_reasons)
.on_input()
.on(context)
.aggregate(np.mean)
)
```
The `on(...)` selector can be changed. See [Feedback Function Guide :
Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details)
Args:
question (str): A question being asked.
statement (str): A statement to the question.
context (str): A context to the question.
Returns:
float: A value between 0.0 (not relevant) and 1.0 (relevant).
"""

warnings.warn(
"The method 'qs_relevance' is deprecated and will be removed in future versions. "
"Please use 'context_relevance' instead.", DeprecationWarning
)

return self.generate_score(
system_prompt=str.format(
prompts.QS_RELEVANCE, question=question, statement=statement
prompts.CONTEXT_RELEVANCE, question=question, context=context
)
)

def qs_relevance_with_cot_reasons(self, question: str,
statement: str) -> Tuple[float, Dict]:
def context_relevance_with_cot_reasons(self, question: str,
context: str) -> Tuple[float, Dict]:
"""
Uses chat completion model. A function that completes a
template to check the relevance of the statement to the question.
template to check the relevance of the context to the question.
Also uses chain of thought methodology and emits the reasons.
**Usage:**
Usage on RAG Contexts:
```
feedback = Feedback(provider.qs_relevance_with_cot_reasons).on_input_output()
from trulens_eval.app import App
context = App.select_context(rag_app)
feedback = (
Feedback(provider.context_relevance_with_cot_reasons)
.on_input()
.on(context)
.aggregate(np.mean)
)
```
The `on_input_output()` selector can be changed. See [Feedback Function Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/)
The `on(...)` selector can be changed. See [Feedback Function Guide : Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details)
Args:
question (str): A question being asked.
context (str): Context related to the question.
Returns:
float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant".
"""
system_prompt = str.format(
prompts.CONTEXT_RELEVANCE, question=question, context=context
)
system_prompt = system_prompt.replace(
"RELEVANCE:", prompts.COT_REASONS_TEMPLATE
)
return self.generate_score_and_reasons(system_prompt)

def qs_relevance_with_cot_reasons(self, question: str,
context: str) -> Tuple[float, Dict]:
"""
Uses chat completion model. A function that completes a
template to check the relevance of the context to the question.
Also uses chain of thought methodology and emits the reasons.
**Usage:**
Usage on RAG Contexts:
```
feedback = Feedback(provider.qs_relevance_with_cot_reasons).on_input().on(
TruLlama.select_source_nodes().node.text # See note below
).aggregate(np.mean)
from trulens_eval.app import App
context = App.select_context(rag_app)
feedback = (
Feedback(provider.qs_relevance_with_cot_reasons)
.on_input()
.on(context)
.aggregate(np.mean)
)
```
The `on(...)` selector can be changed. See [Feedback Function Guide : Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details)
Args:
question (str): A question being asked.
statement (str): A statement to the question.
context (str): Context related to the question.
Returns:
float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant".
"""
system_prompt = str.format(
prompts.QS_RELEVANCE, question=question, statement=statement
prompts.CONTEXT_RELEVANCE, question=question, context=context
)
system_prompt = system_prompt.replace(
"RELEVANCE:", prompts.COT_REASONS_TEMPLATE
)

warnings.warn(
"The method 'qs_relevance_with_cot_reasons' is deprecated and will be removed in future versions. "
"Please use 'context_relevance_with_cot_reasons' instead.", DeprecationWarning
)

return self.generate_score_and_reasons(system_prompt)

def relevance(self, prompt: str, response: str) -> float:
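
Finally, the old provider method names are kept as deprecated aliases rather than removed, so existing code keeps working while warning. A small sketch of the behavior introduced in `base.py` above (assumes `OPENAI_API_KEY` is set, as in the quickstarts):

```python
from trulens_eval.feedback.provider import OpenAI

provider = OpenAI()

question = "What does TruLens evaluate?"
chunk = "TruLens provides feedback functions for evaluating LLM applications."

# New name introduced by this commit; returns a score in [0.0, 1.0].
score = provider.context_relevance(question, chunk)

# The old name still works, but emits a DeprecationWarning and scores
# with the same CONTEXT_RELEVANCE prompt.
legacy_score = provider.qs_relevance(question, chunk)
```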