Automated File Generation from Docs Notebook Changes (#1137)
Co-authored-by: joshreini1 <[email protected]>
Co-authored-by: Josh Reini <[email protected]>
3 people authored May 15, 2024
1 parent 78dbb12 commit 95d8d0b
Showing 11 changed files with 99 additions and 106 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -30,15 +30,15 @@ community](https://communityinviter.com/apps/aiqualityforum/josh)!

**Don't just vibe-check your llm app!** Systematically evaluate and track your
LLM experiments with TruLens. As you develop your app including prompts, models,
retrievers, knowledge sources and more, *TruLens-Eval* is the tool you need to
retreivers, knowledge sources and more, *TruLens-Eval* is the tool you need to
understand its performance.

Fine-grained, stack-agnostic instrumentation and comprehensive evaluations help
you to identify failure modes & systematically iterate to improve your
application.

Read more about the core concepts behind TruLens including [Feedback
Functions](https://www.trulens.org/trulens_eval/getting_started/core_concepts/feedback_functions/),
Functions](https://www.trulens.org/trulens_eval/getting_started/core_concepts/
[The RAG Triad](https://www.trulens.org/trulens_eval/getting_started/core_concepts/rag_triad/),
and [Honest, Harmless and Helpful
Evals](https://www.trulens.org/trulens_eval/getting_started/core_concepts/honest_harmless_helpful_evals/).
49 changes: 21 additions & 28 deletions docs/trulens_eval/all_tools.ipynb
@@ -194,14 +194,11 @@
"from trulens_eval.app import App\n",
"context = App.select_context(rag_chain)\n",
"\n",
"from trulens_eval.feedback import Groundedness\n",
"grounded = Groundedness(groundedness_provider=OpenAI())\n",
"# Define a groundedness feedback function\n",
"f_groundedness = (\n",
" Feedback(grounded.groundedness_measure_with_cot_reasons)\n",
" Feedback(provider.groundedness_measure_with_cot_reasons)\n",
" .on(context.collect()) # collect context chunks into a list\n",
" .on_output()\n",
" .aggregate(grounded.grounded_statements_aggregator)\n",
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
@@ -582,14 +579,12 @@
"from trulens_eval.app import App\n",
"context = App.select_context(query_engine)\n",
"\n",
"from trulens_eval.feedback import Groundedness\n",
"grounded = Groundedness(groundedness_provider=OpenAI())\n",
"# Define a groundedness feedback function\n",
"f_groundedness = (\n",
" Feedback(grounded.groundedness_measure_with_cot_reasons)\n",
" Feedback(provider.groundedness_measure_with_cot_reasons)\n",
" .on(context.collect()) # collect context chunks into a list\n",
" .on_output()\n",
" .aggregate(grounded.grounded_statements_aggregator)\n",
" .aggregate(provider.grounded_statements_aggregator)\n",
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
@@ -762,7 +757,8 @@
"outputs": [],
"source": [
"import os\n",
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
"os.environ[\"HUGGINGFACE_API_KEY\"] = \"hf_...\""
]
},
{
@@ -862,6 +858,9 @@
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"oai_client = OpenAI()\n",
"\n",
"class RAG_from_scratch:\n",
" @instrument\n",
" def retrieve(self, query: str) -> list:\n",
@@ -872,7 +871,7 @@
" query_texts=query,\n",
" n_results=2\n",
" )\n",
" return results['documents'][0]\n",
" return results['documents']\n",
"\n",
" @instrument\n",
" def generate_completion(self, query: str, context_str: list) -> str:\n",
@@ -921,36 +920,31 @@
"outputs": [],
"source": [
"from trulens_eval import Feedback, Select\n",
"from trulens_eval.feedback import Groundedness\n",
"from trulens_eval.feedback.provider.openai import OpenAI\n",
"\n",
"import numpy as np\n",
"\n",
"provider = OpenAI()\n",
"\n",
"grounded = Groundedness(groundedness_provider=provider)\n",
"\n",
"# Define a groundedness feedback function\n",
"f_groundedness = (\n",
" Feedback(grounded.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n",
" Feedback(provider.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n",
" .on(Select.RecordCalls.retrieve.rets.collect())\n",
" .on_output()\n",
" .aggregate(grounded.grounded_statements_aggregator)\n",
")\n",
"\n",
"# Question/answer relevance between overall question and answer.\n",
"f_answer_relevance = (\n",
" Feedback(provider.relevance_with_cot_reasons, name = \"Answer Relevance\")\n",
" .on(Select.RecordCalls.retrieve.args.query)\n",
" .on_output()\n",
")\n",
"\n",
"# Question/statement relevance between question and each context chunk.\n",
"# Context relevance between question and each context chunk.\n",
"f_context_relevance = (\n",
" Feedback(provider.context_relevance_with_cot_reasons, name = \"Context Relevance\")\n",
" .on(Select.RecordCalls.retrieve.args.query)\n",
" .on(Select.RecordCalls.retrieve.rets.collect())\n",
" .aggregate(np.mean)\n",
" .on(Select.RecordCalls.retrieve.rets)\n",
" .aggregate(np.mean) # choose a different aggregation method if you wish\n",
")"
]
},
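With the three feedback functions defined, the next (unchanged) step in the quickstart is to wrap the app for recording. A short usage sketch, assuming the class instance is named `rag` as in the sketch above; the app_id string and the example question are arbitrary:

from trulens_eval import TruCustomApp

tru_rag = TruCustomApp(
    rag,
    app_id="RAG v1",  # any identifier for this app version
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
)

# Record a call and let the feedback functions run on it.
with tru_rag as recording:
    rag.query("When was the University of Washington founded?")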
@@ -2016,32 +2010,31 @@
"from trulens_eval.feedback import prompts\n",
"\n",
"class Custom_AzureOpenAI(AzureOpenAI):\n",
" def qs_relevance_with_cot_reasons_extreme(self, question: str, statement: str) -> Tuple[float, Dict]:\n",
" def context_relevance_with_cot_reasons_extreme(self, question: str, context: str) -> Tuple[float, Dict]:\n",
" \"\"\"\n",
" Tweaked version of question statement relevance, extending AzureOpenAI provider.\n",
" Tweaked version of context relevance, extending AzureOpenAI provider.\n",
" A function that completes a template to check the relevance of the statement to the question.\n",
" Scoring guidelines for scores 5-8 are removed to push the LLM to more extreme scores.\n",
" Also uses chain of thought methodology and emits the reasons.\n",
"\n",
" Args:\n",
" question (str): A question being asked. \n",
" statement (str): A statement to the question.\n",
" context (str): A statement to the question.\n",
"\n",
" Returns:\n",
" float: A value between 0 and 1. 0 being \"not relevant\" and 1 being \"relevant\".\n",
" \"\"\"\n",
"\n",
" system_prompt = str.format(prompts.QS_RELEVANCE, question = question, statement = statement)\n",
"\n",
" # remove scoring guidelines around middle scores\n",
" system_prompt = system_prompt.replace(\n",
" system_prompt = prompts.CONTEXT_RELEVANCE_SYSTEM.replace(\n",
" \"- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\\n\\n\", \"\")\n",
" \n",
" system_prompt = system_prompt.replace(\n",
" user_prompt = str.format(prompts.CONTEXT_RELEVANCE_USER, question = question, context = context)\n",
" user_prompt = user_prompt.replace(\n",
" \"RELEVANCE:\", prompts.COT_REASONS_TEMPLATE\n",
" )\n",
"\n",
" return self.generate_score_and_reasons(system_prompt)"
" return self.generate_score_and_reasons(system_prompt, user_prompt)"
]
},
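Once a provider subclass like this is defined, the custom method is used the same way as any built-in feedback. A brief sketch; the constructor arguments are placeholders since they depend on your Azure OpenAI deployment (an assumption, not shown in the diff), and `context` is assumed to be a Select lens over the retrieved chunks:

from trulens_eval import Feedback, Select

# Placeholder arguments: supply your own Azure deployment/config here.
custom_provider = Custom_AzureOpenAI(deployment_name="...")

f_context_relevance_extreme = (
    Feedback(custom_provider.context_relevance_with_cot_reasons_extreme,
             name="Context Relevance (extreme)")
    .on_input()
    .on(context)  # lens selecting the retrieved context chunks (assumed)
)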
{
@@ -2125,7 +2118,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
58 changes: 26 additions & 32 deletions trulens_eval/generated_files/all_tools.py
@@ -115,14 +115,11 @@ def format_docs(docs):

context = App.select_context(rag_chain)

from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=OpenAI())
# Define a groundedness feedback function
f_groundedness = (
Feedback(grounded.groundedness_measure_with_cot_reasons
Feedback(provider.groundedness_measure_with_cot_reasons
).on(context.collect()) # collect context chunks into a list
.on_output().aggregate(grounded.grounded_statements_aggregator)
.on_output()
)

# Question/answer relevance between overall question and answer.
@@ -345,14 +342,11 @@ def display_call_stack(data):

context = App.select_context(query_engine)

from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=OpenAI())
# Define a groundedness feedback function
f_groundedness = (
Feedback(grounded.groundedness_measure_with_cot_reasons
Feedback(provider.groundedness_measure_with_cot_reasons
).on(context.collect()) # collect context chunks into a list
.on_output().aggregate(grounded.grounded_statements_aggregator)
.on_output().aggregate(provider.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
@@ -447,6 +441,7 @@ def display_call_stack(data):
import os

os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["HUGGINGFACE_API_KEY"] = "hf_..."

# ## Get Data
#
@@ -500,6 +495,10 @@ def display_call_stack(data):

# In[ ]:

from openai import OpenAI

oai_client = OpenAI()


class RAG_from_scratch:

@@ -509,7 +508,7 @@ def retrieve(self, query: str) -> list:
Retrieve relevant text from vector store.
"""
results = vector_store.query(query_texts=query, n_results=2)
return results['documents'][0]
return results['documents']

@instrument
def generate_completion(self, query: str, context_str: list) -> str:
@@ -552,34 +551,30 @@ def query(self, query: str) -> str:

from trulens_eval import Feedback
from trulens_eval import Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI

provider = OpenAI()

grounded = Groundedness(groundedness_provider=provider)

# Define a groundedness feedback function
f_groundedness = (
Feedback(
grounded.groundedness_measure_with_cot_reasons, name="Groundedness"
).on(Select.RecordCalls.retrieve.rets.collect()
).on_output().aggregate(grounded.grounded_statements_aggregator)
provider.groundedness_measure_with_cot_reasons, name="Groundedness"
).on(Select.RecordCalls.retrieve.rets.collect()).on_output()
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = (
Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance").on(
Select.RecordCalls.retrieve.args.query
).on_output()
)

# Question/statement relevance between question and each context chunk.
# Context relevance between question and each context chunk.
f_context_relevance = (
Feedback(
provider.context_relevance_with_cot_reasons, name="Context Relevance"
).on(Select.RecordCalls.retrieve.args.query
).on(Select.RecordCalls.retrieve.rets.collect()).aggregate(np.mean)
).on(Select.RecordCalls.retrieve.args.query).on(
Select.RecordCalls.retrieve.rets
).aggregate(np.mean) # choose a different aggregation method if you wish
)

# ## Construct the app
@@ -1193,38 +1188,37 @@ def style_check_professional(self, response: str) -> float:

class Custom_AzureOpenAI(AzureOpenAI):

def qs_relevance_with_cot_reasons_extreme(
self, question: str, statement: str
def context_relevance_with_cot_reasons_extreme(
self, question: str, context: str
) -> Tuple[float, Dict]:
"""
Tweaked version of question statement relevance, extending AzureOpenAI provider.
Tweaked version of context relevance, extending AzureOpenAI provider.
A function that completes a template to check the relevance of the statement to the question.
Scoring guidelines for scores 5-8 are removed to push the LLM to more extreme scores.
Also uses chain of thought methodology and emits the reasons.
Args:
question (str): A question being asked.
statement (str): A statement to the question.
context (str): A statement to the question.
Returns:
float: A value between 0 and 1. 0 being "not relevant" and 1 being "relevant".
"""

system_prompt = str.format(
prompts.QS_RELEVANCE, question=question, statement=statement
)

# remove scoring guidelines around middle scores
system_prompt = system_prompt.replace(
system_prompt = prompts.CONTEXT_RELEVANCE_SYSTEM.replace(
"- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n",
""
)

system_prompt = system_prompt.replace(
user_prompt = str.format(
prompts.CONTEXT_RELEVANCE_USER, question=question, context=context
)
user_prompt = user_prompt.replace(
"RELEVANCE:", prompts.COT_REASONS_TEMPLATE
)

return self.generate_score_and_reasons(system_prompt)
return self.generate_score_and_reasons(system_prompt, user_prompt)


# ## Multi-Output Feedback functions
15 changes: 6 additions & 9 deletions trulens_eval/trulens_eval/app.py
@@ -349,7 +349,8 @@ class RecordingContext():
"""

def __init__(self, app: mod_app.App, record_metadata: JSON = None):
self.calls: Dict[mod_types_schema.CallID, mod_record_schema.RecordAppCall] = {}
self.calls: Dict[mod_types_schema.CallID,
mod_record_schema.RecordAppCall] = {}
"""A record (in terms of its RecordAppCall) in process of being created.
Storing as a map as we want to override calls with the same id which may
@@ -418,11 +419,9 @@ def add_call(self, call: mod_record_schema.RecordAppCall):
def finish_record(
self,
calls_to_record: Callable[[
List[mod_record_schema.RecordAppCall],
mod_types_schema.Metadata,
Optional[mod_record_schema.Record]
], mod_record_schema.Record
],
List[mod_record_schema.RecordAppCall], mod_types_schema.
Metadata, Optional[mod_record_schema.Record]
], mod_record_schema.Record],
existing_record: Optional[mod_record_schema.Record] = None
):
"""
@@ -432,9 +431,7 @@

with self.lock:
record = calls_to_record(
list(self.calls.values()),
self.record_metadata,
existing_record
list(self.calls.values()), self.record_metadata, existing_record
)
self.calls = {}

13 changes: 7 additions & 6 deletions trulens_eval/trulens_eval/feedback/feedback.py
@@ -109,13 +109,14 @@ def rag_triad(
ret = {}

for f_imp, f_agg, arg1name, arg1lens, arg2name, arg2lens, f_name in [
(provider.groundedness_measure_with_cot_reasons, np.mean, "source", context.collect(),
"statement", answer, "Groundedness"),
(provider.relevance_with_cot_reasons, np.mean, "prompt", question, "response", answer, "Answer Relevance"),
(provider.context_relevance_with_cot_reasons, np.mean, "question", question, "context",
context, "Context Relevance")
(provider.groundedness_measure_with_cot_reasons, np.mean, "source",
context.collect(), "statement", answer, "Groundedness"),
(provider.relevance_with_cot_reasons, np.mean, "prompt", question,
"response", answer, "Answer Relevance"),
(provider.context_relevance_with_cot_reasons, np.mean, "question",
question, "context", context, "Context Relevance")
]:
f = Feedback(f_imp, if_exists=context, name = f_name).aggregate(f_agg)
f = Feedback(f_imp, if_exists=context, name=f_name).aggregate(f_agg)
if arg1lens is not None:
f = f.on(**{arg1name: arg1lens})
else:
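For completeness, the `rag_triad` helper being reformatted here bundles exactly these three feedbacks. A hedged usage sketch, assuming the helper takes a provider plus question/answer/context lenses as the loop above suggests (the import path follows this file's location, and `rag_chain` is an already-built app):

from trulens_eval import Select
from trulens_eval.app import App
from trulens_eval.feedback.feedback import rag_triad
from trulens_eval.feedback.provider.openai import OpenAI

provider = OpenAI()
context = App.select_context(rag_chain)  # lens over the app's retrieved context

# Builds the Groundedness, Answer Relevance and Context Relevance feedbacks.
feedbacks = rag_triad(
    provider=provider,
    question=Select.RecordInput,   # assumption: lens for the user question
    answer=Select.RecordOutput,    # assumption: lens for the app's answer
    context=context,
)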