From 7a29b5329b59a8200c7b0f4420cd2d21709a4351 Mon Sep 17 00:00:00 2001 From: Josh Reini <60949774+joshreini1@users.noreply.github.com> Date: Fri, 15 Dec 2023 10:11:20 -0800 Subject: [PATCH] Releases/rc trulens eval 0.19.0 (#680) * update nb tagged versions * bump py script quickstarts --- .../llama_index/llama_index_agents.ipynb | 2 +- .../llama_index/llama_index_async.ipynb | 2 +- .../llama_index_complex_evals.ipynb | 2 +- .../llama_index/llama_index_groundtruth.ipynb | 2 +- .../llama_index/llama_index_multimodal.ipynb | 2 +- .../llama_index_queryplanning.ipynb | 2 +- .../llama_index_retrievalquality.ipynb | 2 +- .../quickstart/groundtruth_evals.ipynb | 2 +- .../examples/quickstart/human_feedback.ipynb | 2 +- .../quickstart/langchain_quickstart.ipynb | 2 +- .../quickstart/llama_index_quickstart.ipynb | 2 +- .../examples/quickstart/prototype_evals.ipynb | 2 +- .../py_script_quickstarts/all_tools.py | 622 +++++++++++------- .../langchain_quickstart.py | 61 +- .../llama_index_quickstart.py | 59 +- .../py_script_quickstarts/quickstart.py | 129 ++-- .../text2text_quickstart.py | 59 +- .../examples/quickstart/quickstart.ipynb | 2 +- .../quickstart/text2text_quickstart.ipynb | 2 +- trulens_eval/trulens_eval/__init__.py | 2 +- 20 files changed, 580 insertions(+), 380 deletions(-) diff --git a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_agents.ipynb b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_agents.ipynb index 9ddd74181..586e75d84 100644 --- a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_agents.ipynb +++ b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_agents.ipynb @@ -48,7 +48,7 @@ }, "outputs": [], "source": [ - "#! pip install trulens_eval==0.18.2 llama_index==0.9.11.post1 llama_hub==0.0.52 yelpapi==2.5.1 openai==1.3.7" + "#! pip install trulens_eval==0.19.0 llama_index==0.9.15.post2 llama_hub==0.0.52 yelpapi==2.5.1 openai==1.3.7" ] }, { diff --git a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_async.ipynb b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_async.ipynb index 43047b722..6687483ce 100644 --- a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_async.ipynb +++ b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_async.ipynb @@ -24,7 +24,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval==0.17.0 llama_index>=0.8.29post1 html2text>=2020.1.16 openai<=0.28.1" + "# ! pip install trulens_eval==0.19.0 llama_index>=0.9.15.post2 html2text>=2020.1.16 openai<=0.28.1" ] }, { diff --git a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_complex_evals.ipynb b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_complex_evals.ipynb index a02cc8433..c17ff6518 100644 --- a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_complex_evals.ipynb +++ b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_complex_evals.ipynb @@ -28,7 +28,7 @@ }, "outputs": [], "source": [ - "#!pip install trulens-eval==0.12.0 llama-index==0.8.29post1 sentence-transformers transformers pypdf gdown" + "#!pip install trulens-eval==0.19.0 llama-index==0.9.15.post2 sentence-transformers transformers pypdf gdown" ] }, { diff --git a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_groundtruth.ipynb b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_groundtruth.ipynb index a6fd88d71..5809ccace 100644 --- a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_groundtruth.ipynb +++ b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_groundtruth.ipynb @@ -28,7 +28,7 @@ "metadata": {}, "outputs": [], "source": [ - "#! pip install trulens_eval==0.11.0 llama_index==0.8.21" + "#! pip install trulens_eval==0.19.0 llama_index==0.9.15.post2" ] }, { diff --git a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_multimodal.ipynb b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_multimodal.ipynb index cc0fefc18..e4e03dbbe 100644 --- a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_multimodal.ipynb +++ b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_multimodal.ipynb @@ -17,7 +17,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval llama_index ftfy regex tqdm git+https://github.com/openai/CLIP.git torch torchvision matplotlib scikit-image qdrant_client" + "# ! pip install trulens_eval==0.19.0 llama_index==0.9.15.post2 ftfy regex tqdm git+https://github.com/openai/CLIP.git torch torchvision matplotlib scikit-image qdrant_client" ] }, { diff --git a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_queryplanning.ipynb b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_queryplanning.ipynb index 234a889c9..cb6c5d5f4 100644 --- a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_queryplanning.ipynb +++ b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_queryplanning.ipynb @@ -30,7 +30,7 @@ "metadata": {}, "outputs": [], "source": [ - "#! pip install trulens_eval==0.11.0 llama_index==0.8.21" + "#! pip install trulens_eval==0.19.0 llama_index==0.9.15.post2" ] }, { diff --git a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_retrievalquality.ipynb b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_retrievalquality.ipynb index 43fdccf79..9e7582747 100644 --- a/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_retrievalquality.ipynb +++ b/trulens_eval/examples/expositional/frameworks/llama_index/llama_index_retrievalquality.ipynb @@ -29,7 +29,7 @@ "metadata": {}, "outputs": [], "source": [ - "#! pip install trulens-eval==0.14.0 llama_index>=0.8.29post1 html2text>=2020.1.16" + "#! pip install trulens-eval==0.19.0 llama_index>=0.9.15.post2 html2text>=2020.1.16" ] }, { diff --git a/trulens_eval/examples/quickstart/groundtruth_evals.ipynb b/trulens_eval/examples/quickstart/groundtruth_evals.ipynb index fa488be34..3f664d184 100644 --- a/trulens_eval/examples/quickstart/groundtruth_evals.ipynb +++ b/trulens_eval/examples/quickstart/groundtruth_evals.ipynb @@ -29,7 +29,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval==0.18.3 openai==1.3.7" + "# ! pip install trulens_eval==0.19.0 openai==1.3.7" ] }, { diff --git a/trulens_eval/examples/quickstart/human_feedback.ipynb b/trulens_eval/examples/quickstart/human_feedback.ipynb index 4e550c793..571403ade 100644 --- a/trulens_eval/examples/quickstart/human_feedback.ipynb +++ b/trulens_eval/examples/quickstart/human_feedback.ipynb @@ -17,7 +17,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval==0.18.3 openai==1.3.7" + "# ! pip install trulens_eval==0.19.0 openai==1.3.7" ] }, { diff --git a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb index ae2aa6ad9..30ee264af 100644 --- a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb @@ -28,7 +28,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval==0.18.3 openai==1.3.7" + "# ! pip install trulens_eval==0.19.0 openai==1.3.7" ] }, { diff --git a/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb b/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb index 0be445ca9..48fa95874 100644 --- a/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb @@ -31,7 +31,7 @@ "metadata": {}, "outputs": [], "source": [ - "# pip install trulens_eval==0.18.3 llama_index>=0.8.69 html2text>=2020.1.16 " + "# pip install trulens_eval==0.19.0 llama_index>=0.9.15post2 html2text>=2020.1.16 " ] }, { diff --git a/trulens_eval/examples/quickstart/prototype_evals.ipynb b/trulens_eval/examples/quickstart/prototype_evals.ipynb index 9715d5167..8e3485b39 100644 --- a/trulens_eval/examples/quickstart/prototype_evals.ipynb +++ b/trulens_eval/examples/quickstart/prototype_evals.ipynb @@ -28,7 +28,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval==0.18.3" + "# ! pip install trulens_eval==0.19.0" ] }, { diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/all_tools.py b/trulens_eval/examples/quickstart/py_script_quickstarts/all_tools.py index 31980f432..c10568196 100644 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/all_tools.py +++ b/trulens_eval/examples/quickstart/py_script_quickstarts/all_tools.py @@ -2,9 +2,9 @@ # coding: utf-8 # # Langchain Quickstart -# +# # In this quickstart you will create a simple LLM Chain and learn how to log it and get feedback on an LLM response. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/langchain_quickstart.ipynb) # ## Setup @@ -13,28 +13,28 @@ # In[ ]: -# ! pip install trulens_eval==0.18.3 openai==1.3.7 + +# ! pip install trulens_eval==0.19.0 openai==1.3.7 + # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "..." os.environ["HUGGINGFACE_API_KEY"] = "..." + # ### Import from LangChain and TruLens # In[ ]: + from IPython.display import JSON # Imports main tools: -from trulens_eval import Feedback -from trulens_eval import Huggingface -from trulens_eval import Tru -from trulens_eval import TruChain +from trulens_eval import TruChain, Feedback, Huggingface, Tru from trulens_eval.schema import FeedbackResult - tru = Tru() # Imports from langchain to build app. You may need to install langchain first @@ -42,16 +42,17 @@ # ! pip install langchain>=0.0.170 from langchain.chains import LLMChain from langchain.llms import OpenAI -from langchain.prompts import ChatPromptTemplate +from langchain.prompts import ChatPromptTemplate, PromptTemplate from langchain.prompts import HumanMessagePromptTemplate -from langchain.prompts import PromptTemplate + # ### Create Simple LLM Application -# +# # This example uses a LangChain framework and OpenAI LLM # In[ ]: + full_prompt = HumanMessagePromptTemplate( prompt=PromptTemplate( template= @@ -66,22 +67,28 @@ chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True) + # ### Send your first request # In[ ]: + prompt_input = '¿que hora es?' + # In[ ]: + llm_response = chain(prompt_input) display(llm_response) + # ## Initialize Feedback Function(s) # In[ ]: + # Initialize Huggingface-based feedback function collection class: hugs = Huggingface() @@ -90,56 +97,67 @@ # By default this will check language match on the main app input and main app # output. + # ## Instrument chain for logging with TruLens # In[ ]: -tru_recorder = TruChain( - chain, app_id='Chain1_ChatApplication', feedbacks=[f_lang_match] -) + +tru_recorder = TruChain(chain, + app_id='Chain1_ChatApplication', + feedbacks=[f_lang_match]) + # In[ ]: + with tru_recorder as recording: llm_response = chain(prompt_input) display(llm_response) + # ## Retrieve records and feedback # In[ ]: + # The record of the ap invocation can be retrieved from the `recording`: -rec = recording.get() # use .get if only one record +rec = recording.get() # use .get if only one record # recs = recording.records # use .records if multiple display(rec) + # In[ ]: + # The results of the feedback functions can be rertireved from the record. These # are `Future` instances (see `concurrent.futures`). You can use `as_completed` # to wait until they have finished evaluating. from concurrent.futures import as_completed -for feedback_future in as_completed(rec.feedback_results): +for feedback_future in as_completed(rec.feedback_results): feedback, feedback_result = feedback_future.result() - + feedback: Feedback feedbac_result: FeedbackResult display(feedback.name, feedback_result.result) + # ## Explore in a Dashboard # In[ ]: -tru.run_dashboard() # open a local streamlit app to explore + +tru.run_dashboard() # open a local streamlit app to explore # tru.stop_dashboard() # stop if needed + # Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. # Note: Feedback functions evaluated in the deferred manner can be seen in the "Progress" page of the TruLens dashboard. @@ -148,74 +166,83 @@ # In[ ]: -tru.get_records_and_feedback(app_ids=[] - )[0] # pass an empty list of app_ids to get all + +tru.get_records_and_feedback(app_ids=[])[0] # pass an empty list of app_ids to get all + # # Llama-Index Quickstart -# +# # In this quickstart you will create a simple Llama Index App and learn how to log it and get feedback on an LLM response. -# +# # For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb) # ## Setup -# +# # ### Install dependencies # Let's install some of the dependencies for this notebook if we don't have them already # In[ ]: -# pip install trulens_eval==0.18.3 llama_index>=0.8.69 html2text>=2020.1.16 + +# pip install trulens_eval==0.19.0 llama_index>=0.9.15post2 html2text>=2020.1.16 + # ### Add API keys # For this quickstart, you will need Open AI and Huggingface keys. The OpenAI key is used for embeddings and GPT, and the Huggingface key is used for evaluation. # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "..." + # ### Import from LlamaIndex and TruLens # In[ ]: -from trulens_eval import Feedback -from trulens_eval import Tru -from trulens_eval import TruLlama + +from trulens_eval import Feedback, Tru, TruLlama from trulens_eval.feedback import Groundedness from trulens_eval.feedback.provider.openai import OpenAI tru = Tru() + # ### Create Simple LLM Application -# +# # This example uses LlamaIndex which internally uses an OpenAI LLM. # In[ ]: + from llama_index import VectorStoreIndex from llama_index.readers.web import SimpleWebPageReader -documents = SimpleWebPageReader(html_to_text=True).load_data( - ["http://paulgraham.com/worked.html"] -) +documents = SimpleWebPageReader( + html_to_text=True +).load_data(["http://paulgraham.com/worked.html"]) index = VectorStoreIndex.from_documents(documents) query_engine = index.as_query_engine() + # ### Send your first request # In[ ]: + response = query_engine.query("What did the author do growing up?") print(response) + # ## Initialize Feedback Function(s) # In[ ]: + import numpy as np # Initialize provider class @@ -226,7 +253,8 @@ # Define a groundedness feedback function f_groundedness = Feedback(grounded.groundedness_measure_with_cot_reasons).on( TruLlama.select_source_nodes().node.text.collect() -).on_output().aggregate(grounded.grounded_statements_aggregator) + ).on_output( + ).aggregate(grounded.grounded_statements_aggregator) # Question/answer relevance between overall question and answer. f_qa_relevance = Feedback(openai.relevance).on_input_output() @@ -234,32 +262,37 @@ # Question/statement relevance between question and each context chunk. f_qs_relevance = Feedback(openai.qs_relevance).on_input().on( TruLlama.select_source_nodes().node.text -).aggregate(np.mean) + ).aggregate(np.mean) + # ## Instrument app for logging with TruLens # In[ ]: -tru_query_engine_recorder = TruLlama( - query_engine, + +tru_query_engine_recorder = TruLlama(query_engine, app_id='LlamaIndex_App1', - feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance] -) + feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance]) + # In[ ]: + # or as context manager with tru_query_engine_recorder as recording: query_engine.query("What did the author do growing up?") + # ## Explore in a Dashboard # In[ ]: -tru.run_dashboard() # open a local streamlit app to explore + +tru.run_dashboard() # open a local streamlit app to explore # tru.stop_dashboard() # stop if needed + # Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. # Note: Feedback functions evaluated in the deferred manner can be seen in the "Progress" page of the TruLens dashboard. @@ -268,33 +301,38 @@ # In[ ]: -tru.get_records_and_feedback(app_ids=[] - )[0] # pass an empty list of app_ids to get all + +tru.get_records_and_feedback(app_ids=[])[0] # pass an empty list of app_ids to get all + # # TruLens Quickstart -# +# # In this quickstart you will create a RAG from scratch and learn how to log it and get feedback on an LLM response. -# +# # For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/quickstart.ipynb) # In[ ]: -# ! pip install trulens_eval==0.18.3 chromadb==0.4.18 openai==1.3.7 + +# ! pip install trulens_eval==0.19.0 chromadb==0.4.18 openai==1.3.7 + # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "sk-..." + # ## Get Data -# +# # In this case, we'll just initialize some simple text in the notebook. # In[ ]: + university_info = """ The University of Washington, founded in 1861 in Seattle, is a public research university with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell. @@ -303,63 +341,71 @@ including one of the largest library systems in the world. """ + # ## Create Vector Store -# +# # Create a chromadb vector store in memory. # In[ ]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() oai_client.embeddings.create( - model="text-embedding-ada-002", input=university_info -) + model="text-embedding-ada-002", + input=university_info + ) + # In[ ]: + import chromadb from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction -embedding_function = OpenAIEmbeddingFunction( - api_key=os.environ.get('OPENAI_API_KEY'), - model_name="text-embedding-ada-002" -) +embedding_function = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'), + model_name="text-embedding-ada-002") + chroma_client = chromadb.Client() -vector_store = chroma_client.get_or_create_collection( - name="Universities", embedding_function=embedding_function -) +vector_store = chroma_client.get_or_create_collection(name="Universities", + embedding_function=embedding_function) + # Add the university_info to the embedding database. # In[ ]: + vector_store.add("uni_info", documents=university_info) + # ## Build RAG from scratch -# +# # Build a custom RAG from scratch, and add TruLens custom instrumentation. # In[ ]: + from trulens_eval import Tru from trulens_eval.tru_custom_app import instrument - tru = Tru() + # In[ ]: class RAG_from_scratch: - @instrument def retrieve(self, query: str) -> list: """ Retrieve relevant text from vector store. """ - results = vector_store.query(query_texts=query, n_results=2) + results = vector_store.query( + query_texts=query, + n_results=2 + ) return results['documents'][0] @instrument @@ -368,19 +414,19 @@ def generate_completion(self, query: str, context_str: list) -> str: Generate answer from context. """ completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": - f"We have provided context information below. \n" - f"---------------------\n" - f"{context_str}" - f"\n---------------------\n" - f"Given this information, please answer the question: {query}" - } - ] + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"We have provided context information below. \n" + f"---------------------\n" + f"{context_str}" + f"\n---------------------\n" + f"Given this information, please answer the question: {query}" + } + ] ).choices[0].message.content return completion @@ -390,22 +436,22 @@ def query(self, query: str) -> str: completion = self.generate_completion(query, context_str) return completion - rag = RAG_from_scratch() + # ## Set up feedback functions. -# +# # Here we'll use groundedness, answer relevance and context relevance to detect hallucination. # In[ ]: -import numpy as np -from trulens_eval import Feedback -from trulens_eval import Select +from trulens_eval import Feedback, Select from trulens_eval.feedback import Groundedness from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI +import numpy as np + # Initialize provider class fopenai = fOpenAI() @@ -413,74 +459,82 @@ def query(self, query: str) -> str: # Define a groundedness feedback function f_groundedness = ( - Feedback( - grounded.groundedness_measure_with_cot_reasons, name="Groundedness" - ).on(Select.RecordCalls.retrieve.rets.collect() - ).on_output().aggregate(grounded.grounded_statements_aggregator) + Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness") + .on(Select.RecordCalls.retrieve.rets.collect()) + .on_output() + .aggregate(grounded.grounded_statements_aggregator) ) # Question/answer relevance between overall question and answer. f_qa_relevance = ( - Feedback(fopenai.relevance_with_cot_reasons, name="Answer Relevance").on( - Select.RecordCalls.retrieve.args.query - ).on_output() + Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance") + .on(Select.RecordCalls.retrieve.args.query) + .on_output() ) # Question/statement relevance between question and each context chunk. f_context_relevance = ( - Feedback(fopenai.qs_relevance_with_cot_reasons, - name="Context Relevance").on( - Select.RecordCalls.retrieve.args.query - ).on(Select.RecordCalls.retrieve.rets.collect() - ).aggregate(np.mean) + Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance") + .on(Select.RecordCalls.retrieve.args.query) + .on(Select.RecordCalls.retrieve.rets.collect()) + .aggregate(np.mean) ) + # ## Construct the app # Wrap the custom RAG with TruCustomApp, add list of feedbacks for eval # In[ ]: + from trulens_eval import TruCustomApp +tru_rag = TruCustomApp(rag, + app_id = 'RAG v1', + feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance]) -tru_rag = TruCustomApp( - rag, - app_id='RAG v1', - feedbacks=[f_groundedness, f_qa_relevance, f_context_relevance] -) # ## Run the app # Use `tru_rag` as a context manager for the custom RAG-from-scratch app. # In[ ]: + with tru_rag as recording: rag.query("When was the University of Washington founded?") + # In[ ]: + tru.get_leaderboard(app_ids=["RAG v1"]) + # In[ ]: + tru.run_dashboard() + # # Prototype Evals # This notebook shows the use of the dummy feedback function provider which # behaves like the huggingface provider except it does not actually perform any # network calls and just produces constant results. It can be used to prototype # feedback function wiring for your apps before invoking potentially slow (to # run/to load) feedback functions. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/prototype_evals.ipynb) # ## Import libraries # In[ ]: -# ! pip install trulens_eval==0.18.3 + +# ! pip install trulens_eval==0.19.0 + # In[ ]: + from trulens_eval import Feedback from trulens_eval import Tru @@ -488,50 +542,52 @@ def query(self, query: str) -> str: tru.run_dashboard() + # ## Set keys # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "..." + # ## Build the app # In[ ]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() from trulens_eval.tru_custom_app import instrument - class APP: - @instrument def completion(self, prompt): completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"Please answer the question: {prompt}" + } + ] + ).choices[0].message.content return completion - - + llm_app = APP() + # ## Create dummy feedback -# +# # By setting the provider as `Dummy()`, you can erect your evaluation suite and then easily substitute in a real model provider (e.g. OpenAI) later. # In[ ]: + from trulens_eval.feedback.provider.hugs import Dummy # hugs = Huggingface() @@ -539,40 +595,49 @@ def completion(self, prompt): f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output() + # ## Create the app # In[ ]: + # add trulens as a context manager for llm_app with dummy feedback from trulens_eval import TruCustomApp +tru_app = TruCustomApp(llm_app, + app_id = 'LLM App v1', + feedbacks = [f_positive_sentiment]) -tru_app = TruCustomApp( - llm_app, app_id='LLM App v1', feedbacks=[f_positive_sentiment] -) # ## Run the app # In[ ]: + with tru_app as recording: llm_app.completion('give me a good name for a colorful sock company') + # In[ ]: + tru.get_leaderboard(app_ids=[tru_app.app_id]) + # ## Logging Human Feedback -# +# # In many situations, it can be useful to log human feedback from your users about your LLM app's performance. Combining human feedback along with automated feedback can help you drill down on subsets of your app that underperform, and uncover new failure modes. This example will walk you through a simple example of recording human feedback with TruLens. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/human_feedback.ipynb) # In[ ]: -# ! pip install trulens_eval==0.18.3 openai==1.3.7 + +# ! pip install trulens_eval==0.19.0 openai==1.3.7 + # In[ ]: + import os from pathlib import Path import sys @@ -582,118 +647,124 @@ def completion(self, prompt): tru = Tru() + # ## Set Keys -# +# # For this example, you need an OpenAI key. # In[ ]: + os.environ["OPENAI_API_KEY"] = "..." + # ## Set up your app -# +# # Here we set up a custom application using just an OpenAI chat completion. The process for logging human feedback is the same however you choose to set up your app. # In[ ]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() from trulens_eval.tru_custom_app import instrument - class APP: - @instrument def completion(self, prompt): completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"Please answer the question: {prompt}" + } + ] + ).choices[0].message.content return completion - - + llm_app = APP() # add trulens as a context manager for llm_app from trulens_eval import TruCustomApp +tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1') -tru_app = TruCustomApp(llm_app, app_id='LLM App v1') # ## Run the app # In[ ]: + with tru_app as recording: llm_app.completion("Give me 10 names for a colorful sock company") + # ## Get the `record_id` that you will log human feedback to. # In[ ]: + records, feedback = tru.get_records_and_feedback(app_ids=["LLM App v1"]) record_id = records.record_id[0] + # ## Create a mechamism for recording human feedback. -# +# # Be sure to click an emoji in the record to record `human_feedback` to log. # In[ ]: -from ipywidgets import Button -from ipywidgets import HBox -from ipywidgets import VBox + +from ipywidgets import Button, HBox, VBox thumbs_up_button = Button(description='👍') thumbs_down_button = Button(description='👎') human_feedback = None - def on_thumbs_up_button_clicked(b): global human_feedback human_feedback = 1 - def on_thumbs_down_button_clicked(b): global human_feedback human_feedback = 0 - thumbs_up_button.on_click(on_thumbs_up_button_clicked) thumbs_down_button.on_click(on_thumbs_down_button_clicked) HBox([thumbs_up_button, thumbs_down_button]) + # In[ ]: + # add the human feedback to a particular app and record tru.add_feedback( - name="Human Feedack", - record_id=record_id, - app_id=tru_app.app_id, - result=human_feedback -) + name="Human Feedack", + record_id=record_id, + app_id=tru_app.app_id, + result=human_feedback + ) + # ## See the result logged with your app. # In[ ]: + tru.get_leaderboard(app_ids=[tru_app.app_id]) + # # Ground Truth Evaluations -# +# # In this quickstart you will create a evaluate a LangChain app using ground truth. Ground truth evaluation can be especially useful during early LLM experiments when you have a small set of example queries that are critical to get right. -# +# # Ground truth evaluation works by comparing the similarity of an LLM response compared to its matching verified response. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/groundtruth_evals.ipynb) # ### Add API keys @@ -701,188 +772,264 @@ def on_thumbs_down_button_clicked(b): # In[ ]: -# ! pip install trulens_eval==0.18.3 openai==1.3.7 + +# ! pip install trulens_eval==0.19.0 openai==1.3.7 + # In[2]: -import os +import os os.environ["OPENAI_API_KEY"] = "..." + # In[3]: + from trulens_eval import Tru tru = Tru() + # ### Create Simple LLM Application # In[4]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() from trulens_eval.tru_custom_app import instrument - class APP: - @instrument def completion(self, prompt): completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"Please answer the question: {prompt}" + } + ] + ).choices[0].message.content return completion - - + llm_app = APP() + # ## Initialize Feedback Function(s) # In[5]: + from trulens_eval import Feedback from trulens_eval.feedback import GroundTruthAgreement golden_set = [ - { - "query": "who invented the lightbulb?", - "response": "Thomas Edison" - }, { - "query": "¿quien invento la bombilla?", - "response": "Thomas Edison" - } + {"query": "who invented the lightbulb?", "response": "Thomas Edison"}, + {"query": "¿quien invento la bombilla?", "response": "Thomas Edison"} ] -f_groundtruth = Feedback( - GroundTruthAgreement(golden_set).agreement_measure, name="Ground Truth" -).on_input_output() +f_groundtruth = Feedback(GroundTruthAgreement(golden_set).agreement_measure, name = "Ground Truth").on_input_output() + # ## Instrument chain for logging with TruLens # In[6]: + # add trulens as a context manager for llm_app from trulens_eval import TruCustomApp +tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1', feedbacks = [f_groundtruth]) -tru_app = TruCustomApp(llm_app, app_id='LLM App v1', feedbacks=[f_groundtruth]) # In[7]: + # Instrumented query engine can operate as a context manager: with tru_app as recording: llm_app.completion("¿quien invento la bombilla?") llm_app.completion("who invented the lightbulb?") + # ## See results # In[8]: + tru.get_leaderboard(app_ids=[tru_app.app_id]) + # # Logging Methods -# +# # ## Automatic Logging -# +# # The simplest method for logging with TruLens is by wrapping with TruChain and including the tru argument, as shown in the quickstart. -# +# # This is done like so: # In[ ]: -truchain = TruChain(chain, app_id='Chain1_ChatApplication', tru=tru) + +from IPython.display import JSON + +# Imports main tools: +from trulens_eval import Feedback +from trulens_eval import Huggingface +from trulens_eval import Tru +from trulens_eval import TruChain +from trulens_eval.schema import FeedbackResult + +tru = Tru() + +Tru().migrate_database() + +from langchain.chains import LLMChain +from langchain.llms import OpenAI +from langchain.prompts import ChatPromptTemplate +from langchain.prompts import HumanMessagePromptTemplate +from langchain.prompts import PromptTemplate + +full_prompt = HumanMessagePromptTemplate( + prompt=PromptTemplate( + template= + "Provide a helpful response with relevant background information for the following: {prompt}", + input_variables=["prompt"], + ) +) + +chat_prompt_template = ChatPromptTemplate.from_messages([full_prompt]) + +llm = OpenAI(temperature=0.9, max_tokens=128) + +chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True) + +truchain = TruChain( + chain, + app_id='Chain1_ChatApplication', + tru=tru +) truchain("This will be automatically logged.") + # Feedback functions can also be logged automatically by providing them in a list to the feedbacks arg. # In[ ]: + +# Initialize Huggingface-based feedback function collection class: +hugs = Huggingface() + +# Define a language match feedback function using HuggingFace. +f_lang_match = Feedback(hugs.language_match).on_input_output() +# By default this will check language match on the main app input and main app +# output. + + +# In[ ]: + + truchain = TruChain( chain, app_id='Chain1_ChatApplication', - feedbacks=[f_lang_match], # feedback functions + feedbacks=[f_lang_match], # feedback functions tru=tru ) truchain("This will be automatically logged.") + # ## Manual Logging -# +# # ### Wrap with TruChain to instrument your chain # In[ ]: + tc = TruChain(chain, app_id='Chain1_ChatApplication') + # ### Set up logging and instrumentation -# +# # Making the first call to your wrapped LLM Application will now also produce a log or "record" of the chain execution. -# +# # In[ ]: + prompt_input = 'que hora es?' gpt3_response, record = tc.call_with_record(prompt_input) + # We can log the records but first we need to log the chain itself. # In[ ]: + tru.add_app(app=truchain) + # Then we can log the record: # In[ ]: + tru.add_record(record) + # ### Log App Feedback # Capturing app feedback such as user feedback of the responses can be added with one call. # In[ ]: + thumb_result = True tru.add_feedback( - name="👍 (1) or 👎 (0)", record_id=record.record_id, result=thumb_result + name="👍 (1) or 👎 (0)", + record_id=record.record_id, + result=thumb_result ) + # ### Evaluate Quality -# +# # Following the request to your app, you can then evaluate LLM quality using feedback functions. This is completed in a sequential call to minimize latency for your application, and evaluations will also be logged to your local machine. -# +# # To get feedback on the quality of your LLM, you can use any of the provided feedback functions or add your own. -# +# # To assess your LLM quality, you can provide the feedback functions to `tru.run_feedback()` in a list provided to `feedback_functions`. -# +# # In[ ]: + feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[f_lang_match] + record=record, + feedback_functions=[f_lang_match] ) -display(feedback_results) +for result in feedback_results: + display(result) + # After capturing feedback, you can then log it to your local database. # In[ ]: + tru.add_feedbacks(feedback_results) + # ### Out-of-band Feedback evaluation -# +# # In the above example, the feedback function evaluation is done in the same process as the chain evaluation. The alternative approach is the use the provided persistent evaluator started via `tru.start_deferred_feedback_evaluator`. Then specify the `feedback_mode` for `TruChain` as `deferred` to let the evaluator handle the feedback functions. -# +# # For demonstration purposes, we start the evaluator here but it can be started in another process. # In[ ]: + truchain: TruChain = TruChain( chain, app_id='Chain1_ChatApplication', @@ -891,29 +1038,28 @@ def completion(self, prompt): feedback_mode="deferred" ) +with truchain: + chain("This will be logged by deferred evaluator.") + tru.start_evaluator() -truchain("This will be logged by deferred evaluator.") -tru.stop_evaluator() +# tru.stop_evaluator() + # # Custom Functions -# +# # Feedback functions are an extensible framework for evaluating LLMs. You can add your own feedback functions to evaluate the qualities required by your application by updating `trulens_eval/feedback.py`, or simply creating a new provider class and feedback function in youre notebook. If your contributions would be useful for others, we encourage you to contribute to TruLens! -# +# # Feedback functions are organized by model provider into Provider classes. -# +# # The process for adding new feedback functions is: # 1. Create a new Provider class or locate an existing one that applies to your feedback function. If your feedback function does not rely on a model provider, you can create a standalone class. Add the new feedback function method to your selected class. Your new method can either take a single text (str) as a parameter or both prompt (str) and response (str). It should return a float between 0 (worst) and 1 (best). # In[ ]: -from trulens_eval import Feedback -from trulens_eval import Provider -from trulens_eval import Select -from trulens_eval import Tru +from trulens_eval import Provider, Feedback, Select, Tru class StandAlone(Provider): - def custom_feedback(self, my_text_field: str) -> float: """ A dummy function of text inputs to float outputs. @@ -931,53 +1077,57 @@ def custom_feedback(self, my_text_field: str) -> float: # In[ ]: + standalone = StandAlone() -f_custom_function = Feedback(standalone.custom_feedback - ).on(my_text_field=Select.RecordOutput) +f_custom_function = Feedback(standalone.custom_feedback).on( + my_text_field=Select.RecordOutput +) + # 3. Your feedback function is now ready to use just like the out of the box feedback functions. Below is an example of it being used. # In[ ]: + tru = Tru() feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[f_custom_function] + record=record, + feedback_functions=[f_custom_function] ) tru.add_feedbacks(feedback_results) + # ## Multi-Output Feedback functions # Trulens also supports multi-output feedback functions. As a typical feedback function will output a float between 0 and 1, multi-output should output a dictionary of `output_key` to a float between 0 and 1. The feedbacks table will display the feedback with column `feedback_name:::outputkey` # In[ ]: -multi_output_feedback = Feedback( - lambda input_param: { - 'output_key1': 0.1, - 'output_key2': 0.9 - }, name="multi" -).on(input_param=Select.RecordOutput) + +multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi").on( + input_param=Select.RecordOutput +) feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[multi_output_feedback] + record=record, + feedback_functions=[multi_output_feedback] ) tru.add_feedbacks(feedback_results) + # In[ ]: + # Aggregators will run on the same dict keys. import numpy as np - -multi_output_feedback = Feedback( - lambda input_param: { - 'output_key1': 0.1, - 'output_key2': 0.9 - }, - name="multi-agg" -).on(input_param=Select.RecordOutput).aggregate(np.mean) +multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi-agg").on( + input_param=Select.RecordOutput +).aggregate(np.mean) feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[multi_output_feedback] + record=record, + feedback_functions=[multi_output_feedback] ) tru.add_feedbacks(feedback_results) + # In[ ]: @@ -987,16 +1137,12 @@ def dict_aggregator(list_dict_input): for dict_input in list_dict_input: agg += dict_input['output_key1'] return agg - - -multi_output_feedback = Feedback( - lambda input_param: { - 'output_key1': 0.1, - 'output_key2': 0.9 - }, - name="multi-agg-dict" -).on(input_param=Select.RecordOutput).aggregate(dict_aggregator) +multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi-agg-dict").on( + input_param=Select.RecordOutput +).aggregate(dict_aggregator) feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[multi_output_feedback] + record=record, + feedback_functions=[multi_output_feedback] ) tru.add_feedbacks(feedback_results) + diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/langchain_quickstart.py b/trulens_eval/examples/quickstart/py_script_quickstarts/langchain_quickstart.py index 7c352108e..c334a0f51 100644 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/langchain_quickstart.py +++ b/trulens_eval/examples/quickstart/py_script_quickstarts/langchain_quickstart.py @@ -2,9 +2,9 @@ # coding: utf-8 # # Langchain Quickstart -# +# # In this quickstart you will create a simple LLM Chain and learn how to log it and get feedback on an LLM response. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/langchain_quickstart.ipynb) # ## Setup @@ -13,28 +13,28 @@ # In[ ]: -# ! pip install trulens_eval==0.18.3 openai==1.3.7 + +# ! pip install trulens_eval==0.19.0 openai==1.3.7 + # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "..." os.environ["HUGGINGFACE_API_KEY"] = "..." + # ### Import from LangChain and TruLens # In[ ]: + from IPython.display import JSON # Imports main tools: -from trulens_eval import Feedback -from trulens_eval import Huggingface -from trulens_eval import Tru -from trulens_eval import TruChain +from trulens_eval import TruChain, Feedback, Huggingface, Tru from trulens_eval.schema import FeedbackResult - tru = Tru() # Imports from langchain to build app. You may need to install langchain first @@ -42,16 +42,17 @@ # ! pip install langchain>=0.0.170 from langchain.chains import LLMChain from langchain.llms import OpenAI -from langchain.prompts import ChatPromptTemplate +from langchain.prompts import ChatPromptTemplate, PromptTemplate from langchain.prompts import HumanMessagePromptTemplate -from langchain.prompts import PromptTemplate + # ### Create Simple LLM Application -# +# # This example uses a LangChain framework and OpenAI LLM # In[ ]: + full_prompt = HumanMessagePromptTemplate( prompt=PromptTemplate( template= @@ -66,22 +67,28 @@ chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True) + # ### Send your first request # In[ ]: + prompt_input = '¿que hora es?' + # In[ ]: + llm_response = chain(prompt_input) display(llm_response) + # ## Initialize Feedback Function(s) # In[ ]: + # Initialize Huggingface-based feedback function collection class: hugs = Huggingface() @@ -90,56 +97,67 @@ # By default this will check language match on the main app input and main app # output. + # ## Instrument chain for logging with TruLens # In[ ]: -tru_recorder = TruChain( - chain, app_id='Chain1_ChatApplication', feedbacks=[f_lang_match] -) + +tru_recorder = TruChain(chain, + app_id='Chain1_ChatApplication', + feedbacks=[f_lang_match]) + # In[ ]: + with tru_recorder as recording: llm_response = chain(prompt_input) display(llm_response) + # ## Retrieve records and feedback # In[ ]: + # The record of the ap invocation can be retrieved from the `recording`: -rec = recording.get() # use .get if only one record +rec = recording.get() # use .get if only one record # recs = recording.records # use .records if multiple display(rec) + # In[ ]: + # The results of the feedback functions can be rertireved from the record. These # are `Future` instances (see `concurrent.futures`). You can use `as_completed` # to wait until they have finished evaluating. from concurrent.futures import as_completed -for feedback_future in as_completed(rec.feedback_results): +for feedback_future in as_completed(rec.feedback_results): feedback, feedback_result = feedback_future.result() - + feedback: Feedback feedbac_result: FeedbackResult display(feedback.name, feedback_result.result) + # ## Explore in a Dashboard # In[ ]: -tru.run_dashboard() # open a local streamlit app to explore + +tru.run_dashboard() # open a local streamlit app to explore # tru.stop_dashboard() # stop if needed + # Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. # Note: Feedback functions evaluated in the deferred manner can be seen in the "Progress" page of the TruLens dashboard. @@ -148,5 +166,6 @@ # In[ ]: -tru.get_records_and_feedback(app_ids=[] - )[0] # pass an empty list of app_ids to get all + +tru.get_records_and_feedback(app_ids=[])[0] # pass an empty list of app_ids to get all + diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/llama_index_quickstart.py b/trulens_eval/examples/quickstart/py_script_quickstarts/llama_index_quickstart.py index 8799338a6..d505ffb4f 100644 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/llama_index_quickstart.py +++ b/trulens_eval/examples/quickstart/py_script_quickstarts/llama_index_quickstart.py @@ -2,70 +2,78 @@ # coding: utf-8 # # Llama-Index Quickstart -# +# # In this quickstart you will create a simple Llama Index App and learn how to log it and get feedback on an LLM response. -# +# # For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb) # ## Setup -# +# # ### Install dependencies # Let's install some of the dependencies for this notebook if we don't have them already # In[ ]: -# pip install trulens_eval==0.18.3 llama_index>=0.8.69 html2text>=2020.1.16 + +# pip install trulens_eval==0.19.0 llama_index>=0.9.15post2 html2text>=2020.1.16 + # ### Add API keys # For this quickstart, you will need Open AI and Huggingface keys. The OpenAI key is used for embeddings and GPT, and the Huggingface key is used for evaluation. # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "..." + # ### Import from LlamaIndex and TruLens # In[ ]: -from trulens_eval import Feedback -from trulens_eval import Tru -from trulens_eval import TruLlama + +from trulens_eval import Feedback, Tru, TruLlama from trulens_eval.feedback import Groundedness from trulens_eval.feedback.provider.openai import OpenAI tru = Tru() + # ### Create Simple LLM Application -# +# # This example uses LlamaIndex which internally uses an OpenAI LLM. # In[ ]: + from llama_index import VectorStoreIndex from llama_index.readers.web import SimpleWebPageReader -documents = SimpleWebPageReader(html_to_text=True).load_data( - ["http://paulgraham.com/worked.html"] -) +documents = SimpleWebPageReader( + html_to_text=True +).load_data(["http://paulgraham.com/worked.html"]) index = VectorStoreIndex.from_documents(documents) query_engine = index.as_query_engine() + # ### Send your first request # In[ ]: + response = query_engine.query("What did the author do growing up?") print(response) + # ## Initialize Feedback Function(s) # In[ ]: + import numpy as np # Initialize provider class @@ -76,7 +84,8 @@ # Define a groundedness feedback function f_groundedness = Feedback(grounded.groundedness_measure_with_cot_reasons).on( TruLlama.select_source_nodes().node.text.collect() -).on_output().aggregate(grounded.grounded_statements_aggregator) + ).on_output( + ).aggregate(grounded.grounded_statements_aggregator) # Question/answer relevance between overall question and answer. f_qa_relevance = Feedback(openai.relevance).on_input_output() @@ -84,32 +93,37 @@ # Question/statement relevance between question and each context chunk. f_qs_relevance = Feedback(openai.qs_relevance).on_input().on( TruLlama.select_source_nodes().node.text -).aggregate(np.mean) + ).aggregate(np.mean) + # ## Instrument app for logging with TruLens # In[ ]: -tru_query_engine_recorder = TruLlama( - query_engine, + +tru_query_engine_recorder = TruLlama(query_engine, app_id='LlamaIndex_App1', - feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance] -) + feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance]) + # In[ ]: + # or as context manager with tru_query_engine_recorder as recording: query_engine.query("What did the author do growing up?") + # ## Explore in a Dashboard # In[ ]: -tru.run_dashboard() # open a local streamlit app to explore + +tru.run_dashboard() # open a local streamlit app to explore # tru.stop_dashboard() # stop if needed + # Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. # Note: Feedback functions evaluated in the deferred manner can be seen in the "Progress" page of the TruLens dashboard. @@ -118,5 +132,6 @@ # In[ ]: -tru.get_records_and_feedback(app_ids=[] - )[0] # pass an empty list of app_ids to get all + +tru.get_records_and_feedback(app_ids=[])[0] # pass an empty list of app_ids to get all + diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/quickstart.py b/trulens_eval/examples/quickstart/py_script_quickstarts/quickstart.py index 3f98ec0c4..5f030c241 100644 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/quickstart.py +++ b/trulens_eval/examples/quickstart/py_script_quickstarts/quickstart.py @@ -2,29 +2,33 @@ # coding: utf-8 # # TruLens Quickstart -# +# # In this quickstart you will create a RAG from scratch and learn how to log it and get feedback on an LLM response. -# +# # For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/quickstart.ipynb) # In[ ]: -# ! pip install trulens_eval==0.18.3 chromadb==0.4.18 openai==1.3.7 + +# ! pip install trulens_eval==0.19.0 chromadb==0.4.18 openai==1.3.7 + # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "sk-..." + # ## Get Data -# +# # In this case, we'll just initialize some simple text in the notebook. # In[ ]: + university_info = """ The University of Washington, founded in 1861 in Seattle, is a public research university with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell. @@ -33,63 +37,71 @@ including one of the largest library systems in the world. """ + # ## Create Vector Store -# +# # Create a chromadb vector store in memory. # In[ ]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() oai_client.embeddings.create( - model="text-embedding-ada-002", input=university_info -) + model="text-embedding-ada-002", + input=university_info + ) + # In[ ]: + import chromadb from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction -embedding_function = OpenAIEmbeddingFunction( - api_key=os.environ.get('OPENAI_API_KEY'), - model_name="text-embedding-ada-002" -) +embedding_function = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'), + model_name="text-embedding-ada-002") + chroma_client = chromadb.Client() -vector_store = chroma_client.get_or_create_collection( - name="Universities", embedding_function=embedding_function -) +vector_store = chroma_client.get_or_create_collection(name="Universities", + embedding_function=embedding_function) + # Add the university_info to the embedding database. # In[ ]: + vector_store.add("uni_info", documents=university_info) + # ## Build RAG from scratch -# +# # Build a custom RAG from scratch, and add TruLens custom instrumentation. # In[ ]: + from trulens_eval import Tru from trulens_eval.tru_custom_app import instrument - tru = Tru() + # In[ ]: class RAG_from_scratch: - @instrument def retrieve(self, query: str) -> list: """ Retrieve relevant text from vector store. """ - results = vector_store.query(query_texts=query, n_results=2) + results = vector_store.query( + query_texts=query, + n_results=2 + ) return results['documents'][0] @instrument @@ -98,19 +110,19 @@ def generate_completion(self, query: str, context_str: list) -> str: Generate answer from context. """ completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": - f"We have provided context information below. \n" - f"---------------------\n" - f"{context_str}" - f"\n---------------------\n" - f"Given this information, please answer the question: {query}" - } - ] + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"We have provided context information below. \n" + f"---------------------\n" + f"{context_str}" + f"\n---------------------\n" + f"Given this information, please answer the question: {query}" + } + ] ).choices[0].message.content return completion @@ -120,22 +132,22 @@ def query(self, query: str) -> str: completion = self.generate_completion(query, context_str) return completion - rag = RAG_from_scratch() + # ## Set up feedback functions. -# +# # Here we'll use groundedness, answer relevance and context relevance to detect hallucination. # In[ ]: -import numpy as np -from trulens_eval import Feedback -from trulens_eval import Select +from trulens_eval import Feedback, Select from trulens_eval.feedback import Groundedness from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI +import numpy as np + # Initialize provider class fopenai = fOpenAI() @@ -143,53 +155,58 @@ def query(self, query: str) -> str: # Define a groundedness feedback function f_groundedness = ( - Feedback( - grounded.groundedness_measure_with_cot_reasons, name="Groundedness" - ).on(Select.RecordCalls.retrieve.rets.collect() - ).on_output().aggregate(grounded.grounded_statements_aggregator) + Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness") + .on(Select.RecordCalls.retrieve.rets.collect()) + .on_output() + .aggregate(grounded.grounded_statements_aggregator) ) # Question/answer relevance between overall question and answer. f_qa_relevance = ( - Feedback(fopenai.relevance_with_cot_reasons, name="Answer Relevance").on( - Select.RecordCalls.retrieve.args.query - ).on_output() + Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance") + .on(Select.RecordCalls.retrieve.args.query) + .on_output() ) # Question/statement relevance between question and each context chunk. f_context_relevance = ( - Feedback(fopenai.qs_relevance_with_cot_reasons, - name="Context Relevance").on( - Select.RecordCalls.retrieve.args.query - ).on(Select.RecordCalls.retrieve.rets.collect() - ).aggregate(np.mean) + Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance") + .on(Select.RecordCalls.retrieve.args.query) + .on(Select.RecordCalls.retrieve.rets.collect()) + .aggregate(np.mean) ) + # ## Construct the app # Wrap the custom RAG with TruCustomApp, add list of feedbacks for eval # In[ ]: + from trulens_eval import TruCustomApp +tru_rag = TruCustomApp(rag, + app_id = 'RAG v1', + feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance]) -tru_rag = TruCustomApp( - rag, - app_id='RAG v1', - feedbacks=[f_groundedness, f_qa_relevance, f_context_relevance] -) # ## Run the app # Use `tru_rag` as a context manager for the custom RAG-from-scratch app. # In[ ]: + with tru_rag as recording: rag.query("When was the University of Washington founded?") + # In[ ]: + tru.get_leaderboard(app_ids=["RAG v1"]) + # In[ ]: + tru.run_dashboard() + diff --git a/trulens_eval/examples/quickstart/py_script_quickstarts/text2text_quickstart.py b/trulens_eval/examples/quickstart/py_script_quickstarts/text2text_quickstart.py index 332e80089..a8e918b69 100644 --- a/trulens_eval/examples/quickstart/py_script_quickstarts/text2text_quickstart.py +++ b/trulens_eval/examples/quickstart/py_script_quickstarts/text2text_quickstart.py @@ -2,9 +2,9 @@ # coding: utf-8 # # Text to Text Quickstart -# +# # In this quickstart you will create a simple text to text application and learn how to log it and get feedback. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/text2text_quickstart.ipynb) # ## Setup @@ -13,34 +13,36 @@ # In[ ]: -# ! pip install trulens_eval==0.18.3 openai==1.3.1 + +# ! pip install trulens_eval==0.19.0 openai==1.3.1 + # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "..." + # ### Import from TruLens # In[ ]: + from IPython.display import JSON + # Create openai client from openai import OpenAI - client = OpenAI() # Imports main tools: -from trulens_eval import Feedback -from trulens_eval import OpenAI as fOpenAI -from trulens_eval import Tru - +from trulens_eval import Feedback, OpenAI as fOpenAI, Tru tru = Tru() tru.reset_database() + # ### Create Simple Text to Text Application -# +# # This example uses a bare bones OpenAI LLM, and a non-LLM just for demonstration purposes. # In[ ]: @@ -48,17 +50,10 @@ def llm_standalone(prompt): return client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": - "system", - "content": - "You are a question and answer bot, and you answer super upbeat." - }, { - "role": "user", - "content": prompt - } + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "You are a question and answer bot, and you answer super upbeat."}, + {"role": "user", "content": prompt} ] ).choices[0].message.content @@ -67,48 +62,56 @@ def llm_standalone(prompt): # In[ ]: -prompt_input = "How good is language AI?" + +prompt_input="How good is language AI?" prompt_output = llm_standalone(prompt_input) prompt_output + # ## Initialize Feedback Function(s) # In[ ]: + # Initialize OpenAI-based feedback function collection class: fopenai = fOpenAI() # Define a relevance function from openai f_relevance = Feedback(fopenai.relevance).on_input_output() + # ## Instrument the callable for logging with TruLens # In[ ]: + from trulens_eval import TruBasicApp +tru_llm_standalone_recorder = TruBasicApp(llm_standalone, app_id="Happy Bot", feedbacks=[f_relevance]) -tru_llm_standalone_recorder = TruBasicApp( - llm_standalone, app_id="Happy Bot", feedbacks=[f_relevance] -) # In[ ]: + with tru_llm_standalone_recorder as recording: tru_llm_standalone_recorder.app(prompt_input) + # ## Explore in a Dashboard # In[ ]: -tru.run_dashboard() # open a local streamlit app to explore + +tru.run_dashboard() # open a local streamlit app to explore # tru.stop_dashboard() # stop if needed + # Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. # ## Or view results directly in your notebook # In[ ]: -tru.get_records_and_feedback(app_ids=[] - )[0] # pass an empty list of app_ids to get all + +tru.get_records_and_feedback(app_ids=[])[0] # pass an empty list of app_ids to get all + diff --git a/trulens_eval/examples/quickstart/quickstart.ipynb b/trulens_eval/examples/quickstart/quickstart.ipynb index ff3ee7548..690bea3e4 100644 --- a/trulens_eval/examples/quickstart/quickstart.ipynb +++ b/trulens_eval/examples/quickstart/quickstart.ipynb @@ -19,7 +19,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval==0.18.3 chromadb==0.4.18 openai==1.3.7" + "# ! pip install trulens_eval==0.19.0 chromadb==0.4.18 openai==1.3.7" ] }, { diff --git a/trulens_eval/examples/quickstart/text2text_quickstart.ipynb b/trulens_eval/examples/quickstart/text2text_quickstart.ipynb index 6eed6c48c..7d4e2335e 100644 --- a/trulens_eval/examples/quickstart/text2text_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/text2text_quickstart.ipynb @@ -28,7 +28,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval==0.18.3 openai==1.3.1" + "# ! pip install trulens_eval==0.19.0 openai==1.3.1" ] }, { diff --git a/trulens_eval/trulens_eval/__init__.py b/trulens_eval/trulens_eval/__init__.py index eaddd980b..86705eb43 100644 --- a/trulens_eval/trulens_eval/__init__.py +++ b/trulens_eval/trulens_eval/__init__.py @@ -78,7 +78,7 @@ """ -__version__ = "0.18.3" +__version__ = "0.19.0" from trulens_eval.feedback import Bedrock from trulens_eval.feedback import Feedback