diff --git a/docs/trulens_eval/api/providers.md b/docs/trulens_eval/api/providers.md index ae584385d..4e0624a26 100644 --- a/docs/trulens_eval/api/providers.md +++ b/docs/trulens_eval/api/providers.md @@ -9,8 +9,6 @@ ::: trulens_eval.feedback.provider.base.LLMProvider -::: trulens_eval.feedback.groundedness - ::: trulens_eval.feedback.groundtruth ::: trulens_eval.feedback.embeddings diff --git a/docs/trulens_eval/evaluation/feedback_implementations/stock.md b/docs/trulens_eval/evaluation/feedback_implementations/stock.md index ee330836d..0d6792f23 100644 --- a/docs/trulens_eval/evaluation/feedback_implementations/stock.md +++ b/docs/trulens_eval/evaluation/feedback_implementations/stock.md @@ -83,7 +83,6 @@ API Reference: [LLMProvider][trulens_eval.feedback.provider.base.LLMProvider]. filters: - "!^_" - ## Embedding-based API Reference: [Embeddings][trulens_eval.feedback.embeddings.Embeddings]. @@ -111,35 +110,7 @@ API Reference: [Embeddings][trulens_eval.feedback.embeddings.Embeddings]. filters: - "!^_" -## Combinators - -### Groundedness - -API Reference: [Groundedness][trulens_eval.feedback.groundedness.Groundedness] - -::: trulens_eval.feedback.groundedness.Groundedness - options: - heading_level: 4 - show_bases: false - show_root_heading: false - show_root_toc_entry: false - show_source: false - show_docstring_classes: false - show_docstring_modules: false - show_docstring_parameters: false - show_docstring_returns: false - show_docstring_description: true - show_docstring_examples: false - show_docstring_other_parameters: false - show_docstring_attributes: false - show_signature: false - separate_signature: false - summary: false - group_by_category: false - members_order: alphabetical - filters: - - "!^_" - +## Combinations ### Ground Truth Agreement @@ -167,4 +138,3 @@ API Reference: [GroundTruthAgreement][trulens_eval.feedback.groundtruth.GroundTr members_order: alphabetical filters: - "!^_" - diff --git a/docs/trulens_eval/tracking/instrumentation/langchain.ipynb b/docs/trulens_eval/tracking/instrumentation/langchain.ipynb index effd4da10..7136c5940 100644 --- a/docs/trulens_eval/tracking/instrumentation/langchain.ipynb +++ b/docs/trulens_eval/tracking/instrumentation/langchain.ipynb @@ -25,89 +25,92 @@ "source": [ "## Example Usage\n", "\n", - "Below is a quick example of usage. First, we'll create a standard LLMChain." + "To demonstrate usage, we'll create a standard RAG defined with LCEL.\n", + "\n", + "First, this requires loading data into a vector store." 
] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# required imports\n", - "from langchain_openai import OpenAI\n", - "from langchain.chains import LLMChain\n", - "from langchain.prompts import PromptTemplate\n", - "from langchain.prompts.chat import HumanMessagePromptTemplate, ChatPromptTemplate\n", - "from trulens_eval import TruChain\n", + "import bs4\n", + "from langchain.document_loaders import WebBaseLoader\n", "\n", - "# typical LangChain rag setup\n", - "full_prompt = HumanMessagePromptTemplate(\n", - " prompt=PromptTemplate(\n", - " template=\n", - " \"Provide a helpful response with relevant background information for the following: {prompt}\",\n", - " input_variables=[\"prompt\"],\n", - " )\n", + "loader = WebBaseLoader(\n", + " web_paths=(\"https://lilianweng.github.io/posts/2023-06-23-agent/\",),\n", + " bs_kwargs=dict(\n", + " parse_only=bs4.SoupStrainer(\n", + " class_=(\"post-content\", \"post-title\", \"post-header\")\n", + " )\n", + " ),\n", ")\n", - "chat_prompt_template = ChatPromptTemplate.from_messages([full_prompt])\n", + "docs = loader.load()\n", + "\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "from langchain_community.vectorstores import FAISS\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "\n", - "llm = OpenAI(temperature=0.9, max_tokens=128)\n", - "chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True)" + "text_splitter = RecursiveCharacterTextSplitter()\n", + "documents = text_splitter.split_documents(docs)\n", + "vectorstore = FAISS.from_documents(documents, embeddings)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To instrument an LLM chain, all that's required is to wrap it using TruChain." + "Then we can define the retriever chain using LCEL." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿฆ‘ Tru initialized with db url sqlite:///default.sqlite .\n", - "๐Ÿ›‘ Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.\n" - ] - } - ], + "outputs": [], "source": [ - "# instrument with TruChain\n", - "tru_recorder = TruChain(chain)" + "from langchain.schema import StrOutputParser\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain import hub\n", + "\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "prompt = hub.pull(\"rlm/rag-prompt\")\n", + "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n", + "\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + "rag_chain = (\n", + " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Similarly, LangChain apps defined with LangChain Expression Language (LCEL) are also supported." + "To instrument an LLM chain, all that's required is to wrap it using TruChain." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from langchain.prompts import ChatPromptTemplate\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "prompt = ChatPromptTemplate.from_template(\"tell me a short joke about {topic}\")\n", - "model = ChatOpenAI()\n", - "output_parser = StrOutputParser()\n", - "\n", - "chain = prompt | model | output_parser\n", - "\n", - "tru_recorder = TruChain(\n", - " chain,\n", - " app_id='Chain1_ChatApplication'\n", - ")" + "from trulens_eval import TruChain\n", + "# instrument with TruChain\n", + "tru_recorder = TruChain(rag_chain)" ] }, { @@ -134,10 +137,10 @@ "\n", "provider = OpenAI()\n", "\n", - "context = TruChain.select_context(chain)\n", + "context = TruChain.select_context(rag_chain)\n", "\n", "f_context_relevance = (\n", - " Feedback(provider.qs_relevance)\n", + " Feedback(provider.context_relevance)\n", " .on_input()\n", " .on(context)\n", " .aggregate(np.mean)\n", @@ -160,7 +163,7 @@ "outputs": [], "source": [ "from trulens_eval.app import App\n", - "context = App.select_context(chain)" + "context = App.select_context(rag_chain)" ] }, { @@ -183,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -217,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -246,63 +249,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Module langchain*\n", - " Class langchain.agents.agent.BaseMultiActionAgent\n", - " Method plan: (self, intermediate_steps: 'List[Tuple[AgentAction, str]]', callbacks: 'Callbacks' = None, **kwargs: 'Any') -> 'Union[List[AgentAction], AgentFinish]'\n", - " Method aplan: (self, intermediate_steps: 'List[Tuple[AgentAction, str]]', callbacks: 'Callbacks' = None, **kwargs: 'Any') -> 'Union[List[AgentAction], AgentFinish]'\n", - " Class langchain.agents.agent.BaseSingleActionAgent\n", - " Method plan: (self, intermediate_steps: 'List[Tuple[AgentAction, str]]', callbacks: 'Callbacks' = None, **kwargs: 'Any') -> 'Union[AgentAction, AgentFinish]'\n", - " Method aplan: (self, intermediate_steps: 'List[Tuple[AgentAction, str]]', callbacks: 'Callbacks' = None, **kwargs: 'Any') -> 'Union[AgentAction, AgentFinish]'\n", - " Class langchain.chains.base.Chain\n", - " Method __call__: (self, inputs: Union[Dict[str, Any], Any], return_only_outputs: bool = False, callbacks: Union[List[langchain_core.callbacks.base.BaseCallbackHandler], langchain_core.callbacks.base.BaseCallbackManager, NoneType] = None, *, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None, run_name: Optional[str] = None, include_run_info: bool = False) -> Dict[str, Any]\n", - " Method invoke: (self, input: Dict[str, Any], config: Optional[langchain_core.runnables.config.RunnableConfig] = None, **kwargs: Any) -> Dict[str, Any]\n", - " Method ainvoke: (self, input: Dict[str, Any], config: Optional[langchain_core.runnables.config.RunnableConfig] = None, **kwargs: Any) -> Dict[str, Any]\n", - " Method run: (self, *args: Any, callbacks: Union[List[langchain_core.callbacks.base.BaseCallbackHandler], langchain_core.callbacks.base.BaseCallbackManager, NoneType] = None, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, 
Any]] = None, **kwargs: Any) -> Any\n", - " Method arun: (self, *args: Any, callbacks: Union[List[langchain_core.callbacks.base.BaseCallbackHandler], langchain_core.callbacks.base.BaseCallbackManager, NoneType] = None, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Any\n", - " Method _call: (self, inputs: Dict[str, Any], run_manager: Optional[langchain_core.callbacks.manager.CallbackManagerForChainRun] = None) -> Dict[str, Any]\n", - " Method _acall: (self, inputs: Dict[str, Any], run_manager: Optional[langchain_core.callbacks.manager.AsyncCallbackManagerForChainRun] = None) -> Dict[str, Any]\n", - " Method acall: (self, inputs: Union[Dict[str, Any], Any], return_only_outputs: bool = False, callbacks: Union[List[langchain_core.callbacks.base.BaseCallbackHandler], langchain_core.callbacks.base.BaseCallbackManager, NoneType] = None, *, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None, run_name: Optional[str] = None, include_run_info: bool = False) -> Dict[str, Any]\n", - " Class langchain.memory.chat_memory.BaseChatMemory\n", - " Method save_context: (self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None\n", - " Method clear: (self) -> None\n", - " Class langchain_core.chat_history.BaseChatMessageHistory\n", - " Class langchain_core.documents.base.Document\n", - " Class langchain_core.language_models.base.BaseLanguageModel\n", - " Class langchain_core.language_models.llms.BaseLLM\n", - " Class langchain_core.load.serializable.Serializable\n", - " Class langchain_core.memory.BaseMemory\n", - " Method save_context: (self, inputs: 'Dict[str, Any]', outputs: 'Dict[str, str]') -> 'None'\n", - " Method clear: (self) -> 'None'\n", - " Class langchain_core.prompts.base.BasePromptTemplate\n", - " Class langchain_core.retrievers.BaseRetriever\n", - " Method _get_relevant_documents: (self, query: 'str', *, run_manager: 'CallbackManagerForRetrieverRun') -> 'List[Document]'\n", - " Method get_relevant_documents: (self, query: 'str', *, callbacks: 'Callbacks' = None, tags: 'Optional[List[str]]' = None, metadata: 'Optional[Dict[str, Any]]' = None, run_name: 'Optional[str]' = None, **kwargs: 'Any') -> 'List[Document]'\n", - " Method aget_relevant_documents: (self, query: 'str', *, callbacks: 'Callbacks' = None, tags: 'Optional[List[str]]' = None, metadata: 'Optional[Dict[str, Any]]' = None, run_name: 'Optional[str]' = None, **kwargs: 'Any') -> 'List[Document]'\n", - " Method _aget_relevant_documents: (self, query: 'str', *, run_manager: 'AsyncCallbackManagerForRetrieverRun') -> 'List[Document]'\n", - " Class langchain_core.runnables.base.RunnableSerializable\n", - " Class langchain_core.tools.BaseTool\n", - " Method _arun: (self, *args: 'Any', **kwargs: 'Any') -> 'Any'\n", - " Method _run: (self, *args: 'Any', **kwargs: 'Any') -> 'Any'\n", - "\n", - "Module trulens_eval.*\n", - " Class trulens_eval.feedback.feedback.Feedback\n", - " Method __call__: (self, *args, **kwargs) -> 'Any'\n", - " Class trulens_eval.utils.langchain.WithFeedbackFilterDocuments\n", - " Method _get_relevant_documents: (self, query: str, *, run_manager) -> List[langchain_core.documents.base.Document]\n", - " Method get_relevant_documents: (self, query: 'str', *, callbacks: 'Callbacks' = None, tags: 'Optional[List[str]]' = None, metadata: 'Optional[Dict[str, Any]]' = None, run_name: 'Optional[str]' = None, **kwargs: 'Any') -> 'List[Document]'\n", - " Method aget_relevant_documents: (self, query: 'str', *, callbacks: 'Callbacks' = None, tags: 
'Optional[List[str]]' = None, metadata: 'Optional[Dict[str, Any]]' = None, run_name: 'Optional[str]' = None, **kwargs: 'Any') -> 'List[Document]'\n", - " Method _aget_relevant_documents: (self, query: 'str', *, run_manager: 'AsyncCallbackManagerForRetrieverRun') -> 'List[Document]'\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "from trulens_eval.tru_chain import LangChainInstrument\n", "LangChainInstrument().print_instrumentation()" @@ -330,37 +279,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Components:\n", - "\tTruChain (Other) at 0x2b60a3660 with path __app__\n", - "\tLLMChain (Other) at 0x2b5cdb3e0 with path __app__.app\n", - "\tPromptTemplate (Custom) at 0x2b605e580 with path __app__.app.prompt\n", - "\tChatOpenAI (Custom) at 0x2b5cdb4d0 with path __app__.app.llm\n", - "\tStrOutputParser (Custom) at 0x2b60a3750 with path __app__.app.output_parser\n", - "\n", - "Methods:\n", - "Object at 0x2b5cdb3e0:\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n", - "\t with path __app__.app\n" - ] - } - ], + "outputs": [], "source": [ "async_tc_recorder.print_instrumented()" ] diff --git a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb index 7314f2b66..58cafe9a4 100644 --- a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb +++ b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb @@ -28,7 +28,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install trulens_eval openai langchain chromadb langchainhub bs4 tiktoken" + "# ! 
pip install trulens_eval openai langchain langchain-openai faiss-cpu bs4 tiktoken" ] }, { @@ -58,17 +58,13 @@ "# Imports main tools:\n", "from trulens_eval import TruChain, Tru\n", "tru = Tru()\n", - "tru.reset_database()\n", "\n", "# Imports from LangChain to build app\n", "import bs4\n", "from langchain import hub\n", "from langchain.chat_models import ChatOpenAI\n", "from langchain.document_loaders import WebBaseLoader\n", - "from langchain.embeddings import OpenAIEmbeddings\n", "from langchain.schema import StrOutputParser\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "from langchain.vectorstores import Chroma\n", "from langchain_core.runnables import RunnablePassthrough" ] }, @@ -110,17 +106,17 @@ "metadata": {}, "outputs": [], "source": [ - "text_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size=1000,\n", - " chunk_overlap=200\n", - ")\n", + "from langchain_openai import OpenAIEmbeddings\n", "\n", - "splits = text_splitter.split_documents(docs)\n", + "embeddings = OpenAIEmbeddings()\n", "\n", - "vectorstore = Chroma.from_documents(\n", - " documents=splits,\n", - " embedding=OpenAIEmbeddings()\n", - ")" + "from langchain_community.vectorstores import FAISS\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "\n", + "text_splitter = RecursiveCharacterTextSplitter()\n", + "documents = text_splitter.split_documents(docs)\n", + "vectorstore = FAISS.from_documents(documents, embeddings)" ] }, { diff --git a/trulens_eval/generated_files/all_tools.py b/trulens_eval/generated_files/all_tools.py index 5126f26da..8a68ac395 100644 --- a/trulens_eval/generated_files/all_tools.py +++ b/trulens_eval/generated_files/all_tools.py @@ -2,9 +2,9 @@ # coding: utf-8 # # ๐Ÿ““ _LangChain_ Quickstart -# +# # In this quickstart you will create a simple LLM Chain and learn how to log it and get feedback on an LLM response. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/langchain_quickstart.ipynb) # ## Setup @@ -13,22 +13,24 @@ # In[ ]: + # ! pip install trulens_eval openai langchain chromadb langchainhub bs4 tiktoken + # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "sk-..." 
+ # ### Import from LangChain and TruLens # In[ ]: -# Imports main tools: -from trulens_eval import Tru -from trulens_eval import TruChain +# Imports main tools: +from trulens_eval import TruChain, Tru tru = Tru() tru.reset_database() @@ -43,10 +45,12 @@ from langchain.vectorstores import Chroma from langchain_core.runnables import RunnablePassthrough + # ### Load documents # In[ ]: + loader = WebBaseLoader( web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",), bs_kwargs=dict( @@ -57,108 +61,123 @@ ) docs = loader.load() + # ### Create Vector Store # In[ ]: + text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, chunk_overlap=200 + chunk_size=1000, + chunk_overlap=200 ) splits = text_splitter.split_documents(docs) vectorstore = Chroma.from_documents( - documents=splits, embedding=OpenAIEmbeddings() + documents=splits, + embedding=OpenAIEmbeddings() ) + # ### Create RAG # In[ ]: + retriever = vectorstore.as_retriever() prompt = hub.pull("rlm/rag-prompt") llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) - def format_docs(docs): return "\n\n".join(doc.page_content for doc in docs) - rag_chain = ( - { - "context": retriever | format_docs, - "question": RunnablePassthrough() - } | prompt | llm | StrOutputParser() + {"context": retriever | format_docs, "question": RunnablePassthrough()} + | prompt + | llm + | StrOutputParser() ) + # ### Send your first request # In[ ]: + rag_chain.invoke("What is Task Decomposition?") + # ## Initialize Feedback Function(s) # In[ ]: -import numpy as np -from trulens_eval import Feedback from trulens_eval.feedback.provider import OpenAI +from trulens_eval import Feedback +import numpy as np # Initialize provider class provider = OpenAI() # select context to be used in feedback. the location of context is app specific. from trulens_eval.app import App - context = App.select_context(rag_chain) # Define a groundedness feedback function f_groundedness = ( - Feedback(provider.groundedness_measure_with_cot_reasons - ).on(context.collect()) # collect context chunks into a list + Feedback(provider.groundedness_measure_with_cot_reasons) + .on(context.collect()) # collect context chunks into a list .on_output() ) # Question/answer relevance between overall question and answer. -f_answer_relevance = (Feedback(provider.relevance).on_input_output()) +f_answer_relevance = ( + Feedback(provider.relevance) + .on_input_output() +) # Question/statement relevance between question and each context chunk. f_context_relevance = ( - Feedback(provider.context_relevance_with_cot_reasons - ).on_input().on(context).aggregate(np.mean) + Feedback(provider.context_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) ) + # ## Instrument chain for logging with TruLens # In[ ]: -tru_recorder = TruChain( - rag_chain, + +tru_recorder = TruChain(rag_chain, app_id='Chain1_ChatApplication', - feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness] -) + feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness]) + # In[ ]: -response, tru_record = tru_recorder.with_record( - rag_chain.invoke, "What is Task Decomposition?" 
-) + +response, tru_record = tru_recorder.with_record(rag_chain.invoke, "What is Task Decomposition?") + # In[ ]: + json_like = tru_record.layout_calls_as_app() + # In[ ]: + json_like + # In[ ]: -from ipytree import Node -from ipytree import Tree +from ipytree import Tree, Node def display_call_stack(data): tree = Tree() @@ -171,14 +190,14 @@ def display_call_stack(data): tree.add_node(Node('Main Input: {}'.format(data['main_input']))) tree.add_node(Node('Main Output: {}'.format(data['main_output']))) tree.add_node(Node('Main Error: {}'.format(data['main_error']))) - + calls_node = Node('Calls') tree.add_node(calls_node) - + for call in data['calls']: call_node = Node('Call') calls_node.add_node(call_node) - + for step in call['stack']: step_node = Node('Step: {}'.format(step['path'])) call_node.add_node(step_node) @@ -186,42 +205,47 @@ def display_call_stack(data): expanded_node = Node('Expanded') step_node.add_node(expanded_node) for expanded_step in step['expanded']: - expanded_step_node = Node( - 'Step: {}'.format(expanded_step['path']) - ) + expanded_step_node = Node('Step: {}'.format(expanded_step['path'])) expanded_node.add_node(expanded_step_node) - + return tree - # Usage tree = display_call_stack(json_like) tree + # In[ ]: + tree + # In[ ]: + with tru_recorder as recording: llm_response = rag_chain.invoke("What is Task Decomposition?") display(llm_response) + # ## Retrieve records and feedback # In[ ]: + # The record of the app invocation can be retrieved from the `recording`: -rec = recording.get() # use .get if only one record +rec = recording.get() # use .get if only one record # recs = recording.records # use .records if multiple display(rec) + # In[ ]: + # The results of the feedback functions can be rertireved from # `Record.feedback_results` or using the `wait_for_feedback_result` method. The # results if retrieved directly are `Future` instances (see @@ -234,164 +258,188 @@ def display_call_stack(data): # See more about wait_for_feedback_results: # help(rec.wait_for_feedback_results) + # In[ ]: -records, feedback = tru.get_records_and_feedback( - app_ids=["Chain1_ChatApplication"] -) + +records, feedback = tru.get_records_and_feedback(app_ids=["Chain1_ChatApplication"]) records.head() + # In[ ]: + tru.get_leaderboard(app_ids=["Chain1_ChatApplication"]) + # ## Explore in a Dashboard # In[ ]: -tru.run_dashboard() # open a local streamlit app to explore + +tru.run_dashboard() # open a local streamlit app to explore # tru.stop_dashboard() # stop if needed + # Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. # Note: Feedback functions evaluated in the deferred manner can be seen in the "Progress" page of the TruLens dashboard. # # ๐Ÿ““ LlamaIndex Quickstart -# +# # In this quickstart you will create a simple Llama Index app and learn how to log it and get feedback on an LLM response. -# +# # For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb) # ## Setup -# +# # ### Install dependencies # Let's install some of the dependencies for this notebook if we don't have them already # In[ ]: + # pip install trulens_eval llama_index openai + # ### Add API keys # For this quickstart, you will need Open AI and Huggingface keys. 
The OpenAI key is used for embeddings and GPT, and the Huggingface key is used for evaluation. # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "sk-..." + # ### Import from TruLens # In[ ]: -from trulens_eval import Tru +from trulens_eval import Tru tru = Tru() + # ### Download data -# +# # This example uses the text of Paul Grahamโ€™s essay, [โ€œWhat I Worked Onโ€](https://paulgraham.com/worked.html), and is the canonical llama-index example. -# +# # The easiest way to get it is to [download it via this link](https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt) and save it in a folder called data. You can do so with the following command: # In[ ]: -get_ipython().system( - 'wget https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt -P data/' -) + +get_ipython().system('wget https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt -P data/') + # ### Create Simple LLM Application -# +# # This example uses LlamaIndex which internally uses an OpenAI LLM. # In[ ]: -from llama_index.core import SimpleDirectoryReader -from llama_index.core import VectorStoreIndex + +from llama_index.core import VectorStoreIndex, SimpleDirectoryReader documents = SimpleDirectoryReader("data").load_data() index = VectorStoreIndex.from_documents(documents) query_engine = index.as_query_engine() + # ### Send your first request # In[ ]: + response = query_engine.query("What did the author do growing up?") print(response) + # ## Initialize Feedback Function(s) # In[ ]: -import numpy as np -from trulens_eval import Feedback from trulens_eval.feedback.provider import OpenAI +from trulens_eval import Feedback +import numpy as np # Initialize provider class provider = OpenAI() # select context to be used in feedback. the location of context is app specific. from trulens_eval.app import App - context = App.select_context(query_engine) # Define a groundedness feedback function f_groundedness = ( - Feedback(provider.groundedness_measure_with_cot_reasons - ).on(context.collect()) # collect context chunks into a list + Feedback(provider.groundedness_measure_with_cot_reasons) + .on(context.collect()) # collect context chunks into a list .on_output() + .aggregate(provider.grounded_statements_aggregator) ) # Question/answer relevance between overall question and answer. -f_answer_relevance = (Feedback(provider.relevance).on_input_output()) +f_answer_relevance = ( + Feedback(provider.relevance) + .on_input_output() +) # Question/statement relevance between question and each context chunk. 
f_context_relevance = ( - Feedback(provider.context_relevance_with_cot_reasons - ).on_input().on(context).aggregate(np.mean) + Feedback(provider.context_relevance_with_cot_reasons) + .on_input() + .on(context) + .aggregate(np.mean) ) + # ## Instrument app for logging with TruLens # In[ ]: -from trulens_eval import TruLlama -tru_query_engine_recorder = TruLlama( - query_engine, +from trulens_eval import TruLlama +tru_query_engine_recorder = TruLlama(query_engine, app_id='LlamaIndex_App1', - feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance] -) + feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance]) + # In[ ]: + # or as context manager with tru_query_engine_recorder as recording: query_engine.query("What did the author do growing up?") + # ## Retrieve records and feedback # In[ ]: + # The record of the app invocation can be retrieved from the `recording`: -rec = recording.get() # use .get if only one record +rec = recording.get() # use .get if only one record # recs = recording.records # use .records if multiple display(rec) + # In[ ]: + tru.run_dashboard() + # In[ ]: + # The results of the feedback functions can be rertireved from # `Record.feedback_results` or using the `wait_for_feedback_result` method. The # results if retrieved directly are `Future` instances (see @@ -404,51 +452,62 @@ def display_call_stack(data): # See more about wait_for_feedback_results: # help(rec.wait_for_feedback_results) + # In[ ]: + records, feedback = tru.get_records_and_feedback(app_ids=["LlamaIndex_App1"]) records.head() + # In[ ]: + tru.get_leaderboard(app_ids=["LlamaIndex_App1"]) + # ## Explore in a Dashboard # In[ ]: -tru.run_dashboard() # open a local streamlit app to explore + +tru.run_dashboard() # open a local streamlit app to explore # tru.stop_dashboard() # stop if needed + # Alternatively, you can run `trulens-eval` from a command line in the same folder to start the dashboard. # # ๐Ÿ““ TruLens Quickstart -# +# # In this quickstart you will create a RAG from scratch and learn how to log it and get feedback on an LLM response. -# +# # For evaluation, we will leverage the "hallucination triad" of groundedness, context relevance and answer relevance. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/quickstart.ipynb) # In[ ]: + # ! pip install trulens_eval chromadb openai + # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "sk-..." os.environ["HUGGINGFACE_API_KEY"] = "hf_..." + # ## Get Data -# +# # In this case, we'll just initialize some simple text in the notebook. # In[ ]: + university_info = """ The University of Washington, founded in 1861 in Seattle, is a public research university with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell. @@ -457,57 +516,62 @@ def display_call_stack(data): including one of the largest library systems in the world. """ + # ## Create Vector Store -# +# # Create a chromadb vector store in memory. 
# In[ ]: + import chromadb from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction -embedding_function = OpenAIEmbeddingFunction( - api_key=os.environ.get('OPENAI_API_KEY'), - model_name="text-embedding-ada-002" -) +embedding_function = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'), + model_name="text-embedding-ada-002") + chroma_client = chromadb.Client() -vector_store = chroma_client.get_or_create_collection( - name="Universities", embedding_function=embedding_function -) +vector_store = chroma_client.get_or_create_collection(name="Universities", + embedding_function=embedding_function) + # Add the university_info to the embedding database. # In[ ]: + vector_store.add("uni_info", documents=university_info) + # ## Build RAG from scratch -# +# # Build a custom RAG from scratch, and add TruLens custom instrumentation. # In[ ]: + from trulens_eval import Tru from trulens_eval.tru_custom_app import instrument - tru = Tru() + # In[ ]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() - class RAG_from_scratch: - @instrument def retrieve(self, query: str) -> list: """ Retrieve relevant text from vector store. """ - results = vector_store.query(query_texts=query, n_results=2) + results = vector_store.query( + query_texts=query, + n_results=2 + ) return results['documents'] @instrument @@ -516,19 +580,19 @@ def generate_completion(self, query: str, context_str: list) -> str: Generate answer from context. """ completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": - f"We have provided context information below. \n" - f"---------------------\n" - f"{context_str}" - f"\n---------------------\n" - f"Given this information, please answer the question: {query}" - } - ] + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"We have provided context information below. \n" + f"---------------------\n" + f"{context_str}" + f"\n---------------------\n" + f"Given this information, please answer the question: {query}" + } + ] ).choices[0].message.content return completion @@ -538,91 +602,99 @@ def query(self, query: str) -> str: completion = self.generate_completion(query, context_str) return completion - rag = RAG_from_scratch() + # ## Set up feedback functions. -# +# # Here we'll use groundedness, answer relevance and context relevance to detect hallucination. # In[ ]: -import numpy as np -from trulens_eval import Feedback -from trulens_eval import Select +from trulens_eval import Feedback, Select from trulens_eval.feedback.provider.openai import OpenAI +import numpy as np + provider = OpenAI() # Define a groundedness feedback function f_groundedness = ( - Feedback( - provider.groundedness_measure_with_cot_reasons, name="Groundedness" - ).on(Select.RecordCalls.retrieve.rets.collect()).on_output() + Feedback(provider.groundedness_measure_with_cot_reasons, name = "Groundedness") + .on(Select.RecordCalls.retrieve.rets.collect()) + .on_output() ) # Question/answer relevance between overall question and answer. f_answer_relevance = ( - Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance").on( - Select.RecordCalls.retrieve.args.query - ).on_output() + Feedback(provider.relevance_with_cot_reasons, name = "Answer Relevance") + .on(Select.RecordCalls.retrieve.args.query) + .on_output() ) # Context relevance between question and each context chunk. 
f_context_relevance = ( - Feedback( - provider.context_relevance_with_cot_reasons, name="Context Relevance" - ).on(Select.RecordCalls.retrieve.args.query).on( - Select.RecordCalls.retrieve.rets - ).aggregate(np.mean) # choose a different aggregation method if you wish + Feedback(provider.context_relevance_with_cot_reasons, name = "Context Relevance") + .on(Select.RecordCalls.retrieve.args.query) + .on(Select.RecordCalls.retrieve.rets) + .aggregate(np.mean) # choose a different aggregation method if you wish ) + # ## Construct the app # Wrap the custom RAG with TruCustomApp, add list of feedbacks for eval # In[ ]: + from trulens_eval import TruCustomApp +tru_rag = TruCustomApp(rag, + app_id = 'RAG v1', + feedbacks = [f_groundedness, f_answer_relevance, f_context_relevance]) -tru_rag = TruCustomApp( - rag, - app_id='RAG v1', - feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance] -) # ## Run the app # Use `tru_rag` as a context manager for the custom RAG-from-scratch app. # In[ ]: + with tru_rag as recording: rag.query("When was the University of Washington founded?") + # In[ ]: + tru.get_leaderboard(app_ids=["RAG v1"]) + # In[ ]: + tru.run_dashboard() + # # Prototype Evals # This notebook shows the use of the dummy feedback function provider which # behaves like the huggingface provider except it does not actually perform any # network calls and just produces constant results. It can be used to prototype # feedback function wiring for your apps before invoking potentially slow (to # run/to load) feedback functions. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/prototype_evals.ipynb) # ## Import libraries # In[ ]: + # ! pip install trulens_eval + # In[ ]: + from trulens_eval import Feedback from trulens_eval import Tru @@ -630,50 +702,52 @@ def query(self, query: str) -> str: tru.run_dashboard() + # ## Set keys # In[ ]: -import os +import os os.environ["OPENAI_API_KEY"] = "sk-..." + # ## Build the app # In[ ]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() from trulens_eval.tru_custom_app import instrument - class APP: - @instrument def completion(self, prompt): completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"Please answer the question: {prompt}" + } + ] + ).choices[0].message.content return completion - - + llm_app = APP() + # ## Create dummy feedback -# +# # By setting the provider as `Dummy()`, you can erect your evaluation suite and then easily substitute in a real model provider (e.g. OpenAI) later. 
# In[ ]: + from trulens_eval.feedback.provider.hugs import Dummy # hugs = Huggingface() @@ -681,40 +755,49 @@ def completion(self, prompt): f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output() + # ## Create the app # In[ ]: + # add trulens as a context manager for llm_app with dummy feedback from trulens_eval import TruCustomApp +tru_app = TruCustomApp(llm_app, + app_id = 'LLM App v1', + feedbacks = [f_positive_sentiment]) -tru_app = TruCustomApp( - llm_app, app_id='LLM App v1', feedbacks=[f_positive_sentiment] -) # ## Run the app # In[ ]: + with tru_app as recording: llm_app.completion('give me a good name for a colorful sock company') + # In[ ]: + tru.get_leaderboard(app_ids=[tru_app.app_id]) + # # ๐Ÿ““ Logging Human Feedback -# +# # In many situations, it can be useful to log human feedback from your users about your LLM app's performance. Combining human feedback along with automated feedback can help you drill down on subsets of your app that underperform, and uncover new failure modes. This example will walk you through a simple example of recording human feedback with TruLens. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/human_feedback.ipynb) # In[ ]: + # ! pip install trulens_eval openai + # In[ ]: + import os from trulens_eval import Tru @@ -722,94 +805,98 @@ def completion(self, prompt): tru = Tru() + # ## Set Keys -# +# # For this example, you need an OpenAI key. # In[ ]: + os.environ["OPENAI_API_KEY"] = "sk-..." + # ## Set up your app -# +# # Here we set up a custom application using just an OpenAI chat completion. The process for logging human feedback is the same however you choose to set up your app. # In[ ]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() from trulens_eval.tru_custom_app import instrument - class APP: - @instrument def completion(self, prompt): completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"Please answer the question: {prompt}" + } + ] + ).choices[0].message.content return completion - - + llm_app = APP() # add trulens as a context manager for llm_app -tru_app = TruCustomApp(llm_app, app_id='LLM App v1') +tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1') + # ## Run the app # In[ ]: + with tru_app as recording: llm_app.completion("Give me 10 names for a colorful sock company") + # In[ ]: + # Get the record to add the feedback to. record = recording.get() + # ## Create a mechamism for recording human feedback. -# +# # Be sure to click an emoji in the record to record `human_feedback` to log. 
# In[ ]: -from ipywidgets import Button -from ipywidgets import HBox -from ipywidgets import VBox + +from ipywidgets import Button, HBox, VBox thumbs_up_button = Button(description='๐Ÿ‘') thumbs_down_button = Button(description='๐Ÿ‘Ž') human_feedback = None - def on_thumbs_up_button_clicked(b): global human_feedback human_feedback = 1 - def on_thumbs_down_button_clicked(b): global human_feedback human_feedback = 0 - thumbs_up_button.on_click(on_thumbs_up_button_clicked) thumbs_down_button.on_click(on_thumbs_down_button_clicked) HBox([thumbs_up_button, thumbs_down_button]) + # In[ ]: + # add the human feedback to a particular app and record tru.add_feedback( name="Human Feedack", @@ -818,18 +905,21 @@ def on_thumbs_down_button_clicked(b): result=human_feedback ) + # ## See the result logged with your app. # In[ ]: + tru.get_leaderboard(app_ids=[tru_app.app_id]) + # # ๐Ÿ““ Ground Truth Evaluations -# +# # In this quickstart you will create a evaluate a _LangChain_ app using ground truth. Ground truth evaluation can be especially useful during early LLM experiments when you have a small set of example queries that are critical to get right. -# +# # Ground truth evaluation works by comparing the similarity of an LLM response compared to its matching verified response. -# +# # [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/quickstart/groundtruth_evals.ipynb) # ### Add API keys @@ -837,104 +927,109 @@ def on_thumbs_down_button_clicked(b): # In[ ]: + # ! pip install trulens_eval openai + # In[2]: -import os +import os os.environ["OPENAI_API_KEY"] = "sk-..." + # In[3]: + from trulens_eval import Tru tru = Tru() + # ### Create Simple LLM Application # In[4]: -from openai import OpenAI +from openai import OpenAI oai_client = OpenAI() from trulens_eval.tru_custom_app import instrument - class APP: - @instrument def completion(self, prompt): completion = oai_client.chat.completions.create( - model="gpt-3.5-turbo", - temperature=0, - messages=[ - { - "role": "user", - "content": f"Please answer the question: {prompt}" - } - ] - ).choices[0].message.content + model="gpt-3.5-turbo", + temperature=0, + messages= + [ + {"role": "user", + "content": + f"Please answer the question: {prompt}" + } + ] + ).choices[0].message.content return completion - - + llm_app = APP() + # ## Initialize Feedback Function(s) # In[5]: + from trulens_eval import Feedback from trulens_eval.feedback import GroundTruthAgreement golden_set = [ - { - "query": "who invented the lightbulb?", - "response": "Thomas Edison" - }, { - "query": "ยฟquien invento la bombilla?", - "response": "Thomas Edison" - } + {"query": "who invented the lightbulb?", "response": "Thomas Edison"}, + {"query": "ยฟquien invento la bombilla?", "response": "Thomas Edison"} ] -f_groundtruth = Feedback( - GroundTruthAgreement(golden_set).agreement_measure, name="Ground Truth" -).on_input_output() +f_groundtruth = Feedback(GroundTruthAgreement(golden_set).agreement_measure, name = "Ground Truth").on_input_output() + # ## Instrument chain for logging with TruLens # In[6]: + # add trulens as a context manager for llm_app from trulens_eval import TruCustomApp +tru_app = TruCustomApp(llm_app, app_id = 'LLM App v1', feedbacks = [f_groundtruth]) -tru_app = TruCustomApp(llm_app, app_id='LLM App v1', feedbacks=[f_groundtruth]) # In[7]: + # Instrumented query engine can operate as a context manager: with tru_app as recording: llm_app.completion("ยฟquien 
invento la bombilla?") llm_app.completion("who invented the lightbulb?") + # ## See results # In[8]: + tru.get_leaderboard(app_ids=[tru_app.app_id]) + # # Logging Methods -# +# # ## Automatic Logging -# +# # The simplest method for logging with TruLens is by wrapping with TruChain and # including the tru argument, as shown in the quickstart. -# +# # This is done like so: # In[ ]: + # Imports main tools: from trulens_eval import Feedback from trulens_eval import Huggingface @@ -946,10 +1041,10 @@ def completion(self, prompt): Tru().migrate_database() from langchain.chains import LLMChain +from langchain_community.llms import OpenAI from langchain.prompts import ChatPromptTemplate from langchain.prompts import HumanMessagePromptTemplate from langchain.prompts import PromptTemplate -from langchain_community.llms import OpenAI full_prompt = HumanMessagePromptTemplate( prompt=PromptTemplate( @@ -965,15 +1060,21 @@ def completion(self, prompt): chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True) -truchain = TruChain(chain, app_id='Chain1_ChatApplication', tru=tru) +truchain = TruChain( + chain, + app_id='Chain1_ChatApplication', + tru=tru +) with truchain: chain("This will be automatically logged.") + # Feedback functions can also be logged automatically by providing them in a list # to the feedbacks arg. # In[ ]: + # Initialize Huggingface-based feedback function collection class: hugs = Huggingface() @@ -982,98 +1083,119 @@ def completion(self, prompt): # By default this will check language match on the main app input and main app # output. + # In[ ]: + truchain = TruChain( chain, app_id='Chain1_ChatApplication', - feedbacks=[f_lang_match], # feedback functions + feedbacks=[f_lang_match], # feedback functions tru=tru ) with truchain: chain("This will be automatically logged.") + # ## Manual Logging -# +# # ### Wrap with TruChain to instrument your chain # In[ ]: + tc = TruChain(chain, app_id='Chain1_ChatApplication') + # ### Set up logging and instrumentation -# +# # Making the first call to your wrapped LLM Application will now also produce a log or "record" of the chain execution. -# +# # In[ ]: + prompt_input = 'que hora es?' gpt3_response, record = tc.with_record(chain.__call__, prompt_input) + # We can log the records but first we need to log the chain itself. # In[ ]: + tru.add_app(app=truchain) + # Then we can log the record: # In[ ]: + tru.add_record(record) + # ### Log App Feedback # Capturing app feedback such as user feedback of the responses can be added with # one call. # In[ ]: + thumb_result = True tru.add_feedback( - name="๐Ÿ‘ (1) or ๐Ÿ‘Ž (0)", record_id=record.record_id, result=thumb_result + name="๐Ÿ‘ (1) or ๐Ÿ‘Ž (0)", + record_id=record.record_id, + result=thumb_result ) + # ### Evaluate Quality -# +# # Following the request to your app, you can then evaluate LLM quality using # feedback functions. This is completed in a sequential call to minimize latency # for your application, and evaluations will also be logged to your local machine. -# +# # To get feedback on the quality of your LLM, you can use any of the provided # feedback functions or add your own. -# +# # To assess your LLM quality, you can provide the feedback functions to # `tru.run_feedback()` in a list provided to `feedback_functions`. 
-# +# # In[ ]: + feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[f_lang_match] + record=record, + feedback_functions=[f_lang_match] ) for result in feedback_results: display(result) + # After capturing feedback, you can then log it to your local database. # In[ ]: + tru.add_feedbacks(feedback_results) + # ### Out-of-band Feedback evaluation -# +# # In the above example, the feedback function evaluation is done in the same # process as the chain evaluation. The alternative approach is the use the # provided persistent evaluator started via # `tru.start_deferred_feedback_evaluator`. Then specify the `feedback_mode` for # `TruChain` as `deferred` to let the evaluator handle the feedback functions. -# +# # For demonstration purposes, we start the evaluator here but it can be started in # another process. # In[ ]: + truchain: TruChain = TruChain( chain, app_id='Chain1_ChatApplication', @@ -1088,25 +1210,22 @@ def completion(self, prompt): tru.start_evaluator() # tru.stop_evaluator() + # # ๐Ÿ““ Custom Feedback Functions -# +# # Feedback functions are an extensible framework for evaluating LLMs. You can add your own feedback functions to evaluate the qualities required by your application by updating `trulens_eval/feedback.py`, or simply creating a new provider class and feedback function in youre notebook. If your contributions would be useful for others, we encourage you to contribute to TruLens! -# +# # Feedback functions are organized by model provider into Provider classes. -# +# # The process for adding new feedback functions is: # 1. Create a new Provider class or locate an existing one that applies to your feedback function. If your feedback function does not rely on a model provider, you can create a standalone class. Add the new feedback function method to your selected class. Your new method can either take a single text (str) as a parameter or both prompt (str) and response (str). It should return a float between 0 (worst) and 1 (best). # In[ ]: -from trulens_eval import Feedback -from trulens_eval import Provider -from trulens_eval import Select -from trulens_eval import Tru +from trulens_eval import Provider, Feedback, Select, Tru class StandAlone(Provider): - def custom_feedback(self, my_text_field: str) -> float: """ A dummy function of text inputs to float outputs. @@ -1124,36 +1243,41 @@ def custom_feedback(self, my_text_field: str) -> float: # In[ ]: + standalone = StandAlone() -f_custom_function = Feedback(standalone.custom_feedback - ).on(my_text_field=Select.RecordOutput) +f_custom_function = Feedback(standalone.custom_feedback).on( + my_text_field=Select.RecordOutput +) + # 3. Your feedback function is now ready to use just like the out of the box feedback functions. Below is an example of it being used. # In[ ]: + tru = Tru() feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[f_custom_function] + record=record, + feedback_functions=[f_custom_function] ) tru.add_feedbacks(feedback_results) + # ## Extending existing providers. -# +# # In addition to calling your own methods, you can also extend stock feedback providers (such as `OpenAI`, `AzureOpenAI`, `Bedrock`) to custom feedback implementations. This can be especially useful for tweaking stock feedback functions, or running custom feedback function prompts while letting TruLens handle the backend LLM provider. 
-# +# # This is done by subclassing the provider you wish to extend, and using the `generate_score` method that runs the provided prompt with your specified provider, and extracts a float score from 0-1. Your prompt should request the LLM respond on the scale from 0 to 10, then the `generate_score` method will normalize to 0-1. -# +# # See below for example usage: # In[ ]: + from trulens_eval.feedback.provider import AzureOpenAI from trulens_eval.utils.generated import re_0_10_rating - class Custom_AzureOpenAI(AzureOpenAI): - def style_check_professional(self, response: str) -> float: """ Custom feedback function to grade the professional style of the resposne, extending AzureOpenAI provider. @@ -1164,33 +1288,26 @@ def style_check_professional(self, response: str) -> float: Returns: float: A value between 0 and 1. 0 being "not professional" and 1 being "professional". """ - professional_prompt = str.format( - "Please rate the professionalism of the following text on a scale from 0 to 10, where 0 is not at all professional and 10 is extremely professional: \n\n{}", - response - ) + professional_prompt = str.format("Please rate the professionalism of the following text on a scale from 0 to 10, where 0 is not at all professional and 10 is extremely professional: \n\n{}", response) return self.generate_score(system_prompt=professional_prompt) # Running "chain of thought evaluations" is another use case for extending providers. Doing so follows a similar process as above, where the base provider (such as `AzureOpenAI`) is subclassed. -# +# # For this case, the method `generate_score_and_reasons` can be used to extract both the score and chain of thought reasons from the LLM response. -# +# # To use this method, the prompt used should include the `COT_REASONS_TEMPLATE` available from the TruLens prompts library (`trulens_eval.feedback.prompts`). -# +# # See below for example usage: # In[ ]: -from typing import Dict, Tuple +from typing import Tuple, Dict from trulens_eval.feedback import prompts - class Custom_AzureOpenAI(AzureOpenAI): - - def context_relevance_with_cot_reasons_extreme( - self, question: str, context: str - ) -> Tuple[float, Dict]: + def context_relevance_with_cot_reasons_extreme(self, question: str, context: str) -> Tuple[float, Dict]: """ Tweaked version of context relevance, extending AzureOpenAI provider. A function that completes a template to check the relevance of the statement to the question. @@ -1207,13 +1324,9 @@ def context_relevance_with_cot_reasons_extreme( # remove scoring guidelines around middle scores system_prompt = prompts.CONTEXT_RELEVANCE_SYSTEM.replace( - "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n", - "" - ) - - user_prompt = str.format( - prompts.CONTEXT_RELEVANCE_USER, question=question, context=context - ) + "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. 
Higher score indicates more RELEVANCE.\n\n", "") + + user_prompt = str.format(prompts.CONTEXT_RELEVANCE_USER, question = question, context = context) user_prompt = user_prompt.replace( "RELEVANCE:", prompts.COT_REASONS_TEMPLATE ) @@ -1226,34 +1339,32 @@ def context_relevance_with_cot_reasons_extreme( # In[ ]: -multi_output_feedback = Feedback( - lambda input_param: { - 'output_key1': 0.1, - 'output_key2': 0.9 - }, name="multi" -).on(input_param=Select.RecordOutput) + +multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi").on( + input_param=Select.RecordOutput +) feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[multi_output_feedback] + record=record, + feedback_functions=[multi_output_feedback] ) tru.add_feedbacks(feedback_results) + # In[ ]: + # Aggregators will run on the same dict keys. import numpy as np - -multi_output_feedback = Feedback( - lambda input_param: { - 'output_key1': 0.1, - 'output_key2': 0.9 - }, - name="multi-agg" -).on(input_param=Select.RecordOutput).aggregate(np.mean) +multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi-agg").on( + input_param=Select.RecordOutput +).aggregate(np.mean) feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[multi_output_feedback] + record=record, + feedback_functions=[multi_output_feedback] ) tru.add_feedbacks(feedback_results) + # In[ ]: @@ -1263,16 +1374,12 @@ def dict_aggregator(list_dict_input): for dict_input in list_dict_input: agg += dict_input['output_key1'] return agg - - -multi_output_feedback = Feedback( - lambda input_param: { - 'output_key1': 0.1, - 'output_key2': 0.9 - }, - name="multi-agg-dict" -).on(input_param=Select.RecordOutput).aggregate(dict_aggregator) +multi_output_feedback = Feedback(lambda input_param: {'output_key1': 0.1, 'output_key2': 0.9}, name="multi-agg-dict").on( + input_param=Select.RecordOutput +).aggregate(dict_aggregator) feedback_results = tru.run_feedback_functions( - record=record, feedback_functions=[multi_output_feedback] + record=record, + feedback_functions=[multi_output_feedback] ) tru.add_feedbacks(feedback_results) + diff --git a/trulens_eval/trulens_eval/__init__.py b/trulens_eval/trulens_eval/__init__.py index ae9287d01..eb89d6526 100644 --- a/trulens_eval/trulens_eval/__init__.py +++ b/trulens_eval/trulens_eval/__init__.py @@ -4,7 +4,7 @@ This top-level import includes everything to get started. """ -__version_info__ = (0, 28, 1) +__version_info__ = (0, 29, 0) """Version number components for major, minor, patch.""" __version__ = '.'.join(map(str, __version_info__)) diff --git a/trulens_eval/trulens_eval/feedback/provider/base.py b/trulens_eval/trulens_eval/feedback/provider/base.py index 2d09b5147..59d133d89 100644 --- a/trulens_eval/trulens_eval/feedback/provider/base.py +++ b/trulens_eval/trulens_eval/feedback/provider/base.py @@ -547,8 +547,7 @@ def _langchain_evaluate_with_cot_reasons( criteria (str): The specific criteria for evaluation. Returns: - Tuple[float, str]: A tuple containing a value between 0.0 and 1.0, representing the specified - evaluation, and a string containing the reasons for the evaluation. + Tuple[float, str]: A tuple containing a value between 0.0 and 1.0, representing the specified evaluation, and a string containing the reasons for the evaluation. 
""" system_prompt = str.format( @@ -596,9 +595,7 @@ def conciseness_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text: The text to evaluate the conciseness of. Returns: - A value between 0.0 (not concise) and 1.0 (concise) - - A dictionary containing the reasons for the evaluation. + Tuple[float, str]: A tuple containing a value between 0.0 (not concise) and 1.0 (concise) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, criteria=prompts.LANGCHAIN_CONCISENESS_SYSTEM_PROMPT @@ -641,7 +638,7 @@ def correctness_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text (str): Text to evaluate. Returns: - float: A value between 0.0 (not correct) and 1.0 (correct). + Tuple[float, str]: A tuple containing a value between 0 (not correct) and 1.0 (correct) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, criteria=prompts.LANGCHAIN_CORRECTNESS_SYSTEM_PROMPT @@ -684,7 +681,7 @@ def coherence_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text (str): The text to evaluate. Returns: - float: A value between 0.0 (not coherent) and 1.0 (coherent). + Tuple[float, str]: A tuple containing a value between 0 (not coherent) and 1.0 (coherent) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, criteria=prompts.LANGCHAIN_COHERENCE_SYSTEM_PROMPT @@ -727,7 +724,7 @@ def harmfulness_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text (str): The text to evaluate. Returns: - float: A value between 0.0 (not harmful) and 1.0 (harmful). + Tuple[float, str]: A tuple containing a value between 0 (not harmful) and 1.0 (harmful) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( @@ -772,7 +769,7 @@ def maliciousness_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text (str): The text to evaluate. Returns: - float: A value between 0.0 (not malicious) and 1.0 (malicious). + Tuple[float, str]: A tuple containing a value between 0 (not malicious) and 1.0 (malicious) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, criteria=prompts.LANGCHAIN_MALICIOUSNESS_SYSTEM_PROMPT @@ -815,7 +812,7 @@ def helpfulness_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text (str): The text to evaluate. Returns: - float: A value between 0.0 (not helpful) and 1.0 (helpful). + Tuple[float, str]: A tuple containing a value between 0 (not helpful) and 1.0 (helpful) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, criteria=prompts.LANGCHAIN_HELPFULNESS_SYSTEM_PROMPT @@ -862,7 +859,7 @@ def controversiality_with_cot_reasons(self, text (str): The text to evaluate. Returns: - float: A value between 0.0 (not controversial) and 1.0 (controversial). + Tuple[float, str]: A tuple containing a value between 0 (not controversial) and 1.0 (controversial) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, @@ -906,7 +903,7 @@ def misogyny_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text (str): The text to evaluate. Returns: - float: A value between 0.0 (not misogynistic) and 1.0 (misogynistic). 
+ Tuple[float, str]: A tuple containing a value between 0.0 (not misogynistic) and 1.0 (misogynistic) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, criteria=prompts.LANGCHAIN_MISOGYNY_SYSTEM_PROMPT @@ -950,7 +947,7 @@ def criminality_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text (str): The text to evaluate. Returns: - float: A value between 0.0 (not criminal) and 1.0 (criminal). + Tuple[float, str]: A tuple containing a value between 0.0 (not criminal) and 1.0 (criminal) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, criteria=prompts.LANGCHAIN_CRIMINALITY_SYSTEM_PROMPT @@ -993,7 +990,7 @@ def insensitivity_with_cot_reasons(self, text: str) -> Tuple[float, Dict]: text (str): The text to evaluate. Returns: - float: A value between 0.0 (not insensitive) and 1.0 (insensitive). + Tuple[float, str]: A tuple containing a value between 0.0 (not insensitive) and 1.0 (insensitive) and a string containing the reasons for the evaluation. """ return self._langchain_evaluate_with_cot_reasons( text=text, criteria=prompts.LANGCHAIN_INSENSITIVITY_SYSTEM_PROMPT @@ -1086,8 +1083,7 @@ def comprehensiveness_with_cot_reasons(self, source: str, summary (str): Text corresponding to a summary. Returns: - A value between 0.0 (main points missed) and 1.0 (no main - points missed). + Tuple[float, str]: A tuple containing a value between 0.0 (not comprehensive) and 1.0 (comprehensive) and a string containing the reasons for the evaluation. """ key_points = self._generate_key_points(source) @@ -1131,8 +1127,7 @@ def stereotypes(self, prompt: str, response: str) -> float: response (str): The agent's response to the prompt. Returns: - A value between 0.0 (no stereotypes assumed) and 1.0 - (stereotypes assumed). + A value between 0.0 (no stereotypes assumed) and 1.0 (stereotypes assumed). """ system_prompt = prompts.STEREOTYPES_SYSTEM_PROMPT user_prompt = str.format( @@ -1159,8 +1154,7 @@ def stereotypes_with_cot_reasons(self, prompt: str, response (str): The agent's response to the prompt. Returns: - A value between 0.0 (no stereotypes assumed) and 1.0 - (stereotypes assumed). + Tuple[float, str]: A tuple containing a value between 0.0 (no stereotypes assumed) and 1.0 (stereotypes assumed) and a string containing the reasons for the evaluation. """ system_prompt = prompts.STEREOTYPES_SYSTEM_PROMPT + prompts.COT_REASONS_TEMPLATE user_prompt = str.format( @@ -1197,7 +1191,7 @@ def groundedness_measure_with_cot_reasons( statement: The statement to check groundedness. Returns: - A measure between 0 and 1, where 1 means each sentence is grounded in the source. + Tuple[float, str]: A tuple containing a value between 0.0 (not grounded) and 1.0 (grounded) and a string containing the reasons for the evaluation. """ nltk.download('punkt') groundedness_scores = {} diff --git a/trulens_eval/trulens_eval/feedback/provider/hugs.py b/trulens_eval/trulens_eval/feedback/provider/hugs.py index 7c597b5bb..6cd4835c7 100644 --- a/trulens_eval/trulens_eval/feedback/provider/hugs.py +++ b/trulens_eval/trulens_eval/feedback/provider/hugs.py @@ -157,9 +157,7 @@ def language_match(self, text1: str, text2: str) -> Tuple[float, Dict]: text2 (str): Comparative text to evaluate. Returns: - - float: A value between 0 and 1. 0 being "different languages" and 1 - being "same languages". + float: A value between 0 and 1. 0 being "different languages" and 1 being "same languages". 
""" def get_scores(text): @@ -205,10 +203,10 @@ def groundedness_measure_with_nli(self, source: str, from trulens_eval.feedback import Feedback from trulens_eval.feedback.provider.hugs = Huggingface - provider = Huggingface() + huggingface_provider = Huggingface() f_groundedness = ( - Feedback(provider.groundedness_measure_with_nli) + Feedback(huggingface_provider.groundedness_measure_with_nli) .on(context) .on_output() ``` @@ -218,8 +216,7 @@ def groundedness_measure_with_nli(self, source: str, statement (str): The statement to check groundedness Returns: - float: A measure between 0 and 1, where 1 means each sentence is grounded in the source. - str: + Tuple[float, str]: A tuple containing a value between 0.0 (not grounded) and 1.0 (grounded) and a string containing the reasons for the evaluation. """ nltk.download('punkt') groundedness_scores = {} @@ -251,24 +248,28 @@ def context_relevance(self, prompt: str, context: str) -> float: Uses Huggingface's truera/context_relevance model, a model that uses computes the relevance of a given context to the prompt. The model can be found at https://huggingface.co/truera/context_relevance. - **Usage:** - ```python - from trulens_eval import Feedback - from trulens_eval.feedback.provider.hugs import Huggingface - huggingface_provider = Huggingface() + + !!! example - feedback = Feedback(huggingface_provider.context_relevance).on_input_output() - ``` - The `on_input_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) + ```python + from trulens_eval import Feedback + from trulens_eval.feedback.provider.hugs import Huggingface + huggingface_provider = Huggingface() + + feedback = ( + Feedback(huggingface_provider.context_relevance) + .on_input() + .on(context) + .aggregate(np.mean) + ) + ``` Args: prompt (str): The given prompt. context (str): Comparative contextual information. Returns: - float: A value between 0 and 1. 0 being irrelevant and 1 - being a relevant context for addressing the prompt. + float: A value between 0 and 1. 0 being irrelevant and 1 being a relevant context for addressing the prompt. """ if prompt[len(prompt) - 1] != '.': @@ -304,15 +305,11 @@ def positive_sentiment(self, text: str) -> float: feedback = Feedback(huggingface_provider.positive_sentiment).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. Returns: - float: A value between 0 and 1. 0 being "negative sentiment" and 1 - being "positive sentiment". + float: A value between 0 (negative sentiment) and 1 (positive sentiment). """ max_length = 500 @@ -343,19 +340,14 @@ def toxic(self, text: str) -> float: from trulens_eval.feedback.provider.hugs import Huggingface huggingface_provider = Huggingface() - feedback = Feedback(huggingface_provider.not_toxic).on_output() + feedback = Feedback(huggingface_provider.toxic).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - - Args: text (str): Text to evaluate. Returns: - float: A value between 0 and 1. 1 being "toxic" and 0 being "not - toxic". + float: A value between 0 (not toxic) and 1 (toxic). """ assert len(text) > 0, "Input cannot be blank." 
@@ -441,10 +433,10 @@ def pii_detection(self, text: str) -> float: Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details) Args: - text: A text prompt that may contain a name. + text: A text prompt that may contain PII. Returns: - The likelihood that a name is contained in the input text. + float: The likelihood that PII is contained in the input text. """ # Initialize a list to store scores for "NAME" entities @@ -501,6 +493,12 @@ def pii_detection_with_cot_reasons(self, text: str): The `on(...)` selector can be changed. See [Feedback Function Guide : Selectors](https://www.trulens.org/trulens_eval/feedback_function_guide/#selector-details) + + Args: + text: A text prompt that may contain PII. + + Returns: + Tuple[float, str]: A tuple containing the likelihood that PII is contained in the input text and a string describing what PII was detected (if any). """ # Initialize a dictionary to store reasons @@ -570,14 +568,14 @@ def hallucination_evaluator( true/false boolean. if the return is greater than 0.5 the statement is evaluated as true. if the return is less than 0.5 the statement is evaluated as a hallucination. - **!!! example - ** - ```python - from trulens_eval.feedback.provider.hugs import Huggingface - huggingface_provider = Huggingface() + !!! example + + ```python + from trulens_eval.feedback.provider.hugs import Huggingface + huggingface_provider = Huggingface() - score = huggingface_provider.hallucination_evaluator("The sky is blue. [SEP] Apples are red , the grass is green.") - ``` + score = huggingface_provider.hallucination_evaluator("The sky is blue. [SEP] Apples are red , the grass is green.") + ``` Args: model_output (str): This is what an LLM returns based on the text chunks retrieved during RAG diff --git a/trulens_eval/trulens_eval/feedback/provider/openai.py b/trulens_eval/trulens_eval/feedback/provider/openai.py index 275565669..ac99fb09e 100644 --- a/trulens_eval/trulens_eval/feedback/provider/openai.py +++ b/trulens_eval/trulens_eval/feedback/provider/openai.py @@ -140,9 +140,6 @@ def moderation_hate(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. @@ -170,9 +167,6 @@ def moderation_hatethreatening(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. @@ -201,9 +195,6 @@ def moderation_selfharm(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. @@ -232,9 +223,6 @@ def moderation_sexual(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. @@ -263,15 +251,11 @@ def moderation_sexualminors(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. Returns: - float: A value between 0.0 (not sexual minors) and 1.0 (sexual - minors).
+ float: A value between 0.0 (not sexual minors) and 1.0 (sexual minors). """ openai_response = self._moderation(text) @@ -296,9 +280,6 @@ def moderation_violence(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. @@ -327,15 +308,11 @@ def moderation_violencegraphic(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. Returns: - float: A value between 0.0 (not graphic violence) and 1.0 (graphic - violence). + float: A value between 0.0 (not graphic violence) and 1.0 (graphic violence). """ openai_response = self._moderation(text) @@ -359,9 +336,6 @@ def moderation_harassment(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate. @@ -389,9 +363,6 @@ def moderation_harassment_threatening(self, text: str) -> float: ).on_output() ``` - The `on_output()` selector can be changed. See [Feedback Function - Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/) - Args: text (str): Text to evaluate.
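To close out the provider changes above, here is a hedged sketch of how the PII and moderation checks are typically attached to an app's input and output. The import paths follow the modules edited in this diff; `OPENAI_API_KEY` is assumed to be set in the environment.

```python
from trulens_eval import Feedback
from trulens_eval.feedback.provider.hugs import Huggingface
from trulens_eval.feedback.provider.openai import OpenAI

huggingface_provider = Huggingface()
openai_provider = OpenAI()  # assumes OPENAI_API_KEY is set

# Flag PII in the user input; the CoT variant also reports what was detected.
f_pii = Feedback(huggingface_provider.pii_detection_with_cot_reasons).on_input()

# OpenAI moderation scores on the app output; for these, lower is better.
f_hate = Feedback(openai_provider.moderation_hate).on_output()
f_violence = Feedback(openai_provider.moderation_violence).on_output()
```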