diff --git a/README.md b/README.md index f71dbb6ff..3ed646784 100644 --- a/README.md +++ b/README.md @@ -88,8 +88,8 @@ Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab. ### πŸ’‘ Contributing -Interested in contributing? See our [contribution -guide](https://www.trulens.org/trulens_eval/CONTRIBUTING/) for more details. +Interested in contributing? See our [contributing +guide](https://www.trulens.org/trulens_eval/contributing/) for more details. \ No newline at end of file diff --git a/docs/trulens_explain/gh_top_intro.md b/docs/trulens_explain/gh_top_intro.md index c3413c4cb..083b4fe7f 100644 --- a/docs/trulens_explain/gh_top_intro.md +++ b/docs/trulens_explain/gh_top_intro.md @@ -61,7 +61,7 @@ notebooks: Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1f-ETsdlppODJGQCdMXG-jmGmfyWyW2VD?usp=sharing) For more information, see [TruLens-Explain -Documentation](https://www.trulens.org/trulens_explain/quickstart/). +Documentation](https://www.trulens.org/trulens_explain/getting_started/quickstart/). \ No newline at end of file diff --git a/trulens_eval/examples/experimental/dev_notebook.ipynb b/trulens_eval/examples/experimental/dev_notebook.ipynb index 9fec58a49..7923da68e 100644 --- a/trulens_eval/examples/experimental/dev_notebook.ipynb +++ b/trulens_eval/examples/experimental/dev_notebook.ipynb @@ -11,56 +11,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/Volumes/dev_new/trulens/trulens_eval\n", - "βœ… Key OPENAI_API_KEY set from environment (same value found in .env file at /Volumes/dev_new/.env).\n", - "βœ… Key HUGGINGFACE_API_KEY set from environment (same value found in .env file at /Volumes/dev_new/.env).\n", - "πŸ¦‘ Tru initialized with db url sqlite:///default.sqlite .\n", - "πŸ›‘ Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.\n", - "Force stopping dashboard ...\n", - "Starting dashboard ...\n", - "Config file already exists. Skipping writing process.\n", - "Credentials file already exists. 
Skipping writing process.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6428d162dbbe4bab834fab3f6c507fa1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dashboard started at http://192.168.1.60:8501 .\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# pip uninstall -y trulens_eval\n", "# pip install git+https://github.com/truera/trulens@piotrm/azure_bugfixes#subdirectory=trulens_eval\n", @@ -110,18 +63,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "βœ… Key OPENAI_API_KEY set from environment (same value found in .env file at /Volumes/dev_new/.env).\n", - "βœ… Key HUGGINGFACE_API_KEY set from environment (same value found in .env file at /Volumes/dev_new/.env).\n" - ] - } - ], + "outputs": [], "source": [ "from trulens_eval.tru_llama import TruLlama\n", "\n", @@ -130,9 +74,10 @@ "\n", "from llama_index.core import SimpleDirectoryReader\n", "from llama_index.core import VectorStoreIndex\n", - "#os.system(\n", - "# 'wget https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt -P data/'\n", - "#)\n", + "if not os.path.exists(\"data/paul_graham_essay.txt\"):\n", + " os.system(\n", + " 'wget https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt -P data/'\n", + " )\n", "\n", "documents = SimpleDirectoryReader(\"data\").load_data()\n", "index = VectorStoreIndex.from_documents(documents)\n", @@ -145,35 +90,30 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from trulens_eval.utils.asynchro import sync\n", + "from trulens_eval.feedback.provider.hugs import Dummy\n", + "from trulens_eval import Select\n", + "from trulens_eval.feedback.feedback import Feedback\n", "\n", - "tru_query_engine_recorder = TruLlama(query_engine)\n", - "llm_response_async, record_async = sync(tru_query_engine_recorder.awith_record,\n", - " query_engine.aquery, \"What did the author do growing up?\"\n", - ")" + "f = Feedback(Dummy().language_match).on(Select.RecordCalls._retriever.retrieve.rets[42])\n", + "\n", + "tru_query_engine_recorder = TruLlama(query_engine, feedbacks=[f])\n" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Record(record_id='record_hash_b36cc0050b7a66dfe92164777c1cc1a1', app_id='app_hash_1c131ab7ca5b8ed0cee364cd581a6e73', cost=Cost(n_requests=0, n_successful_requests=0, n_classes=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, cost=0.0), perf=Perf(start_time=datetime.datetime(2024, 2, 22, 20, 26, 44, 198151), end_time=datetime.datetime(2024, 2, 22, 20, 26, 44, 342939)), ts=datetime.datetime(2024, 2, 22, 20, 26, 44, 343091), tags='-', meta=None, main_input='What did the author do growing up?', main_output='The author worked on writing short stories and programming, particularly experimenting with early programming languages like 
Fortran on the IBM 1401 in 9th grade. Later, the author transitioned to working with microcomputers, building simple games and a word processor on a TRS-80 in the early 1980s.', main_error=None, calls=[RecordAppCall(stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine, id=11581362896, init_bindings=None), name='aquery'))], args={'str_or_query_bundle': 'What did the author do growing up?'}, rets={'response': 'The author worked on writing short stories and programming, particularly experimenting with early programming languages like Fortran on the IBM 1401 in 9th grade. Later, the author transitioned to working with microcomputers, building simple games and a word processor on a TRS-80 in the early 1980s.', 'source_nodes': [{'node': {'id_': '0b30b3b1-8d60-4c4f-93b1-214162f4f0b9', 'embedding': None, 'metadata': {'file_path': 'data/paul_graham_essay.txt.2', 'file_name': 'paul_graham_essay.txt.2', 'file_type': None, 'file_size': 75042, 'creation_date': '2024-02-22', 'last_modified_date': '2024-02-22', 'last_accessed_date': '2024-02-22'}, 'excluded_embed_metadata_keys': ['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], 'excluded_llm_metadata_keys': ['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], 'relationships': {: {'node_id': '767c42d7-a4cf-4292-bc32-f25dd5002f44', 'node_type': , 'metadata': {'file_path': 'data/paul_graham_essay.txt.2', 'file_name': 'paul_graham_essay.txt.2', 'file_type': None, 'file_size': 75042, 'creation_date': '2024-02-22', 'last_modified_date': '2024-02-22', 'last_accessed_date': '2024-02-22'}, 'hash': 'e70e6caa9d89f9905df17c6ab06f93bf0241825a54fa20c1312e3a72c44368a2'}, : {'node_id': '217c4584-77c9-4824-b025-d14f2d227e43', 'node_type': , 'metadata': {'file_path': 'data/paul_graham_essay.txt.1', 'file_name': 'paul_graham_essay.txt.1', 'file_type': None, 'file_size': 75042, 'creation_date': '2024-02-22', 'last_modified_date': '2024-02-22', 'last_accessed_date': '2024-02-22'}, 'hash': '67e784cc6d90c89a7d78a76d2e7b3fe06ac842cc5e6cbf0566523eaca5271a23'}, : {'node_id': 'cf5c07e0-2237-423e-81b4-3fe455d73250', 'node_type': , 'metadata': {}, 'hash': '692d74391b6ad961556d19669374a543e4752bab10e415d8eeeda9ccf682a597'}}, 'text': 'What I Worked On\\n\\nFebruary 2021\\n\\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\\n\\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called \"data processing.\" This was in 9th grade, so I was 13 or 14. The school district\\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\\'s lair down there, with all these alien-looking machines β€” CPU, disk drives, printer, card reader β€” sitting up on a raised floor under bright fluorescent lights.\\n\\nThe language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it. 
The result would ordinarily be to print something on the spectacularly loud printer.\\n\\nI was puzzled by the 1401. I couldn\\'t figure out what to do with it. And in retrospect there\\'s not much I could have done with it. The only form of input to programs was data stored on punched cards, and I didn\\'t have any data stored on punched cards. The only other option was to do things that didn\\'t rely on any input, like calculate approximations of pi, but I didn\\'t know enough math to do anything interesting of that type. So I\\'m not surprised I can\\'t remember any programs I wrote, because they can\\'t have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn\\'t. On a machine without time-sharing, this was a social as well as a technical error, as the data center manager\\'s expression made clear.\\n\\nWith microcomputers, everything changed. Now you could have a computer sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punch cards and then stopping. [1]\\n\\nThe first of my friends to get a microcomputer built it himself. It was sold as a kit by Heathkit. I remember vividly how impressed and envious I felt watching him sitting in front of it, typing programs right into the computer.\\n\\nComputers were expensive in those days and it took me years of nagging before I convinced my father to buy one, a TRS-80, in about 1980. The gold standard then was the Apple II, but a TRS-80 was good enough. This was when I really started programming. I wrote simple games, a program to predict how high my model rockets would fly, and a word processor that my father used to write at least one book. There was only room in memory for about 2 pages of text, so he\\'d write 2 pages at a time and then print them out, but it was a lot better than a typewriter.\\n\\nThough I liked programming, I didn\\'t plan to study it in college. In college I was going to study philosophy, which sounded much more powerful. It seemed, to my naive high school self, to be the study of the ultimate truths, compared to which the things studied in other fields would be mere domain knowledge. What I discovered when I got to college was that the other fields took up so much of the space of ideas that there wasn\\'t much left for these supposed ultimate truths. All that seemed left for philosophy were edge cases that people in other fields felt could safely be ignored.\\n\\nI couldn\\'t have put this into words when I was 18. All I knew at the time was that I kept taking philosophy courses and they kept being boring. So I decided to switch to AI.\\n\\nAI was in the air in the mid 1980s, but there were two things especially that made me want to work on it: a novel by Heinlein called The Moon is a Harsh Mistress, which featured an intelligent computer called Mike, and a PBS documentary that showed Terry Winograd using SHRDLU. I haven\\'t tried rereading The Moon is a Harsh Mistress, so I don\\'t know how well it has aged, but when I read it I was drawn entirely into its world. 
It seemed only a matter of time before we\\'d have Mike, and when I saw Winograd using SHRDLU, it seemed like that time would be a few years at most.', 'start_char_idx': 2, 'end_char_idx': 4320, 'text_template': '{metadata_str}\\n\\n{content}', 'metadata_template': '{key}: {value}', 'metadata_seperator': '\\n'}, 'score': 0.8184159536776312}, {'node': {'id_': '5419a652-ed4d-4f9b-a69b-0e1eb394ac0f', 'embedding': None, 'metadata': {'file_path': 'data/paul_graham_essay.txt.1', 'file_name': 'paul_graham_essay.txt.1', 'file_type': None, 'file_size': 75042, 'creation_date': '2024-02-22', 'last_modified_date': '2024-02-22', 'last_accessed_date': '2024-02-22'}, 'excluded_embed_metadata_keys': ['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], 'excluded_llm_metadata_keys': ['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], 'relationships': {: {'node_id': 'f24e7079-9d85-442b-874c-2660d6bed98b', 'node_type': , 'metadata': {'file_path': 'data/paul_graham_essay.txt.1', 'file_name': 'paul_graham_essay.txt.1', 'file_type': None, 'file_size': 75042, 'creation_date': '2024-02-22', 'last_modified_date': '2024-02-22', 'last_accessed_date': '2024-02-22'}, 'hash': '58ab0c40edb1df9fdff9b69444e42659b29d0470b3cad9765cf2db5424537b93'}, : {'node_id': '3fb8a58d-41d1-4c6e-9cd1-6495d19121c2', 'node_type': , 'metadata': {'file_path': 'data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-02-22', 'last_modified_date': '2024-02-22', 'last_accessed_date': '2024-02-22'}, 'hash': '1dd608c90fd9c27f5a75d7a2c512070454c8fe49f71c7bf58e4444219dfb02a7'}, : {'node_id': '394d7fb4-e040-4471-930f-7430feffc843', 'node_type': , 'metadata': {}, 'hash': '692d74391b6ad961556d19669374a543e4752bab10e415d8eeeda9ccf682a597'}}, 'text': 'What I Worked On\\n\\nFebruary 2021\\n\\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\\n\\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called \"data processing.\" This was in 9th grade, so I was 13 or 14. The school district\\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\\'s lair down there, with all these alien-looking machines β€” CPU, disk drives, printer, card reader β€” sitting up on a raised floor under bright fluorescent lights.\\n\\nThe language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it. The result would ordinarily be to print something on the spectacularly loud printer.\\n\\nI was puzzled by the 1401. I couldn\\'t figure out what to do with it. And in retrospect there\\'s not much I could have done with it. The only form of input to programs was data stored on punched cards, and I didn\\'t have any data stored on punched cards. The only other option was to do things that didn\\'t rely on any input, like calculate approximations of pi, but I didn\\'t know enough math to do anything interesting of that type. 
So I\\'m not surprised I can\\'t remember any programs I wrote, because they can\\'t have done much. My clearest memory is of the moment I learned it was possible for programs not to terminate, when one of mine didn\\'t. On a machine without time-sharing, this was a social as well as a technical error, as the data center manager\\'s expression made clear.\\n\\nWith microcomputers, everything changed. Now you could have a computer sitting right in front of you, on a desk, that could respond to your keystrokes as it was running instead of just churning through a stack of punch cards and then stopping. [1]\\n\\nThe first of my friends to get a microcomputer built it himself. It was sold as a kit by Heathkit. I remember vividly how impressed and envious I felt watching him sitting in front of it, typing programs right into the computer.\\n\\nComputers were expensive in those days and it took me years of nagging before I convinced my father to buy one, a TRS-80, in about 1980. The gold standard then was the Apple II, but a TRS-80 was good enough. This was when I really started programming. I wrote simple games, a program to predict how high my model rockets would fly, and a word processor that my father used to write at least one book. There was only room in memory for about 2 pages of text, so he\\'d write 2 pages at a time and then print them out, but it was a lot better than a typewriter.\\n\\nThough I liked programming, I didn\\'t plan to study it in college. In college I was going to study philosophy, which sounded much more powerful. It seemed, to my naive high school self, to be the study of the ultimate truths, compared to which the things studied in other fields would be mere domain knowledge. What I discovered when I got to college was that the other fields took up so much of the space of ideas that there wasn\\'t much left for these supposed ultimate truths. All that seemed left for philosophy were edge cases that people in other fields felt could safely be ignored.\\n\\nI couldn\\'t have put this into words when I was 18. All I knew at the time was that I kept taking philosophy courses and they kept being boring. So I decided to switch to AI.\\n\\nAI was in the air in the mid 1980s, but there were two things especially that made me want to work on it: a novel by Heinlein called The Moon is a Harsh Mistress, which featured an intelligent computer called Mike, and a PBS documentary that showed Terry Winograd using SHRDLU. I haven\\'t tried rereading The Moon is a Harsh Mistress, so I don\\'t know how well it has aged, but when I read it I was drawn entirely into its world. 
It seemed only a matter of time before we\\'d have Mike, and when I saw Winograd using SHRDLU, it seemed like that time would be a few years at most.', 'start_char_idx': 2, 'end_char_idx': 4320, 'text_template': '{metadata_str}\\n\\n{content}', 'metadata_template': '{key}: {value}', 'metadata_seperator': '\\n'}, 'score': 0.8177867173106038}], 'metadata': {'0b30b3b1-8d60-4c4f-93b1-214162f4f0b9': {'file_path': 'data/paul_graham_essay.txt.2', 'file_name': 'paul_graham_essay.txt.2', 'file_type': None, 'file_size': 75042, 'creation_date': '2024-02-22', 'last_modified_date': '2024-02-22', 'last_accessed_date': '2024-02-22'}, '5419a652-ed4d-4f9b-a69b-0e1eb394ac0f': {'file_path': 'data/paul_graham_essay.txt.1', 'file_name': 'paul_graham_essay.txt.1', 'file_type': None, 'file_size': 75042, 'creation_date': '2024-02-22', 'last_modified_date': '2024-02-22', 'last_accessed_date': '2024-02-22'}}}, error=None, perf=Perf(start_time=datetime.datetime(2024, 2, 22, 20, 26, 44, 198151), end_time=datetime.datetime(2024, 2, 22, 20, 26, 44, 342939)), pid=74117, tid=35148333)], feedback_and_future_results=[], feedback_results=[])" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ + "from trulens_eval.utils.asynchro import sync\n", + "\n", + "llm_response_async, record_async = sync(tru_query_engine_recorder.awith_record,\n", + " query_engine.aquery, \"What did the author do growing up?\"\n", + ")\n", "record_async" ] }, diff --git a/trulens_eval/trulens_eval/app.py b/trulens_eval/trulens_eval/app.py index 13faf8afa..9139f074c 100644 --- a/trulens_eval/trulens_eval/app.py +++ b/trulens_eval/trulens_eval/app.py @@ -13,13 +13,14 @@ import queue import threading from threading import Lock -from typing import ( - Any, Awaitable, Callable, ClassVar, Dict, Hashable, Iterable, List, - Optional, Sequence, Set, Tuple, Type, TypeVar -) +import datetime +from typing import (Any, Awaitable, Callable, ClassVar, Dict, Hashable, + Iterable, List, Optional, Sequence, Set, Tuple, Type, + TypeVar) import pydantic +from trulens_eval import schema from trulens_eval.db import DB from trulens_eval.feedback import Feedback from trulens_eval.instruments import Instrument @@ -33,6 +34,7 @@ from trulens_eval.schema import RecordAppCall from trulens_eval.schema import Select from trulens_eval.tru import Tru +from trulens_eval.utils import pyschema from trulens_eval.utils.asynchro import CallableMaybeAwaitable from trulens_eval.utils.asynchro import desync from trulens_eval.utils.asynchro import sync @@ -480,8 +482,23 @@ class App(AppDefinition, WithInstrumentCallbacks, Hashable): manage_pending_feedback_results_thread: Optional[threading.Thread] = \ pydantic.Field(exclude=True, default=None) - """Thread for manager of pending feedback results queue. See - _manage_pending_feedback_results.""" + """Thread for manager of pending feedback results queue. + + See _manage_pending_feedback_results.""" + + selector_check_warning: bool = False + """Issue warnings when selectors are not found in the app with a placeholder + record. + + If False, constructor will raise an error instead. + """ + + selector_nocheck: bool = False + """Ignore selector checks entirely. + + This may be necessary if the expected record content cannot be determined + before it is produced. 
+ """ def __init__( self, @@ -514,12 +531,6 @@ def __init__( pass if self.feedback_mode == FeedbackMode.WITH_APP_THREAD: - # EXPERIMENTAL: Start the thread that manages the queue of records - # with pending feedback results. This is meant to be run - # permentantly in a separate thread. It will remove records from the - # queue `records_with_pending_feedback_results` as their feedback - # results are computed and makes sure the queue does not keep - # growing. self._start_manage_pending_feedback_results() self._tru_post_init() @@ -529,12 +540,13 @@ def __del__(self): pass def _start_manage_pending_feedback_results(self) -> None: - """ - EXPERIMENTAL: Start the thread that manages the queue of records with - pending feedback results. This is meant to be run permentantly in a - separate thread. It will remove records from the queue - `records_with_pending_feedback_results` as their feedback results are - computed and makes sure the queue does not keep growing. + """Start the thread that manages the queue of records with + pending feedback results. + + This is meant to be run permentantly in a separate thread. It will + remove records from the queue `records_with_pending_feedback_results` as + their feedback results are computed and makes sure the queue does not + keep growing. """ if self.manage_pending_feedback_results_thread is not None: @@ -547,8 +559,8 @@ def _start_manage_pending_feedback_results(self) -> None: self.manage_pending_feedback_results_thread.start() def _manage_pending_feedback_results(self) -> None: - """ - EXPERIMENTAL: Manage the queue of records with pending feedback results. + """Manage the queue of records with pending feedback results. + This is meant to be run permentantly in a separate thread. It will remove records from the queue records_with_pending_feedback_results as their feedback results are computed and makes sure the queue does not @@ -608,7 +620,17 @@ def __hash__(self): def _tru_post_init(self): """ - Database-related initialization. + Database-related initialization and additional data checks. + + DB: + - Insert the app into the database. + - Insert feedback function definitions into the database. + + Checks: + - In deferred mode, try to serialize and deserialize feedback functions. + - Check that feedback function selectors are likely to refer to expected + app or record components. + """ if self.tru is None: @@ -650,7 +672,19 @@ def _tru_post_init(self): except Exception as e: raise Exception( f"Feedback function {f} is not loadable. Cannot use DEFERRED feedback mode. {e}" - ) + ) from e + + if not self.selector_nocheck: + + dummy = self.dummy_record() + + for feedback in self.feedbacks: + feedback.check_selectors( + app=self, + # Don't have a record yet, but use an empty one for the non-call related fields. 
+ record=dummy, + warning=self.selector_check_warning + ) def main_call(self, human: str) -> str: """If available, a single text to a single text invocation of this app.""" @@ -751,7 +785,7 @@ def main_input( focus = self._extract_content(focus) if not isinstance(focus, Sequence): - logger.warning(f"focus {focus} is not a sequence") + logger.warning("Focus %s is not a sequence.", focus) break if isinstance(focus, JSON_BASES): @@ -771,7 +805,7 @@ def main_input( focus = self._extract_content(focus) if not isinstance(focus, Sequence): - logger.warning(f"focus {focus} is not a sequence") + logger.warning("Focus %s is not a sequence.", focus) break if isinstance(focus, JSON_BASES): @@ -834,8 +868,8 @@ def on_method_instrumented(self, obj: object, func: Callable, path: Lens): if path != old_path: logger.warning( - f"Method {func} was already instrumented on path {old_path}. " - f"Calls at {path} may not be recorded." + "Method %s was already instrumented on path %s. " + "Calls at %s may not be recorded.", func, old_path, path ) return @@ -1293,9 +1327,78 @@ def __getattr__(self, __name: str) -> Any: f"'{type(self).__name__}' object has no attribute '{__name}'" ) + def dummy_record( + self, + cost: Cost = schema.Cost(), + perf: Perf = schema.Perf.now(), + ts: datetime.datetime = datetime.datetime.now(), + main_input: str ="main_input are strings.", + main_output: str ="main_output are strings.", + main_error: str = "main_error are strings.", + meta: Dict ={'metakey': 'meta are dicts'}, + tags: str ='tags are strings' + ) -> Record: + """Create a dummy record with some of the expected structure without + actually invoking the app. + + The record is a guess of what an actual record might look like but will + be missing information that can only be determined after a call is made. + + All args are [Record][trulens_eval.schema.Record] fields except these: + + - `record_id` is generated using the default id naming schema. + - `app_id` is taken from this recorder. + - `calls` field is constructed based on instrumented methods. + """ + + calls = [] + + for methods in self.instrumented_methods.values(): + for func, lens in methods.items(): + + component = lens.get_sole_item(self) + + if not hasattr(component, func.__name__): + continue + method = getattr(component, func.__name__) + + sig = inspect.signature(method) + + method_serial = pyschema.FunctionOrMethod.of_callable(method) + + sample_args = {} + for p in sig.parameters.values(): + if p.default == inspect.Parameter.empty: + sample_args[p.name] = None + else: + sample_args[p.name] = p.default + + sample_call = RecordAppCall( + stack=[schema.RecordAppCallMethod(path=lens, method=method_serial)], + args=sample_args, + rets=None, + pid=0, + tid=0 + ) + + calls.append(sample_call) + + return Record( + app_id=self.app_id, + calls=calls, + cost=cost, + perf=perf, + ts=ts, + main_input=main_input, + main_output=main_output, + main_error=main_error, + meta=meta, + tags=tags + ) + def instrumented(self) -> Iterable[Tuple[Lens, ComponentView]]: """ - Enumerate instrumented components and their categories. + Iteration over instrumented components and their categories. """ for q, c in instrumented_component_views(self.model_dump()): @@ -1325,9 +1428,7 @@ def format_instrumented_methods(self) -> str: ) def print_instrumented_methods(self) -> None: - """ - Print instrumented methods. 
- """ + """Print instrumented methods.""" print(self.format_instrumented_methods()) diff --git a/trulens_eval/trulens_eval/feedback/feedback.py b/trulens_eval/trulens_eval/feedback/feedback.py index 2f035fa12..65219ed1b 100644 --- a/trulens_eval/trulens_eval/feedback/feedback.py +++ b/trulens_eval/trulens_eval/feedback/feedback.py @@ -7,17 +7,21 @@ import itertools import json import logging -import pprint import traceback from typing import ( Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union ) import warnings +from rich.pretty import pretty_repr +from rich import print as rprint +from rich.markdown import Markdown + import numpy as np import pandas import pydantic +from pprint import pformat from trulens_eval.feedback.provider.base import LLMProvider from trulens_eval.feedback.provider.endpoint.base import Endpoint from trulens_eval.schema import AppDefinition @@ -31,17 +35,15 @@ from trulens_eval.schema import Select from trulens_eval.utils.json import jsonify from trulens_eval.utils.pyschema import FunctionOrMethod -from trulens_eval.utils.python import callable_name +from trulens_eval.utils.python import callable_name, class_name from trulens_eval.utils.python import Future -from trulens_eval.utils.serial import JSON +from trulens_eval.utils.serial import JSON, GetItemOrAttribute from trulens_eval.utils.serial import Lens -from trulens_eval.utils.text import UNICODE_CHECK +from trulens_eval.utils.text import UNICODE_CHECK, retab from trulens_eval.utils.threading import TP logger = logging.getLogger(__name__) -pp = pprint.PrettyPrinter() - A = TypeVar("A") ImpCallable = Callable[[A], Union[float, Tuple[float, Dict[str, Any]]]] @@ -489,6 +491,7 @@ def on_prompt(self, arg: Optional[str] = None) -> Feedback: return ret + # alias on_input = on_prompt def on_response(self, arg: Optional[str] = None) -> Feedback: @@ -511,6 +514,7 @@ def on_response(self, arg: Optional[str] = None) -> Feedback: return ret + # alias on_output = on_response def on(self, *args, **kwargs) -> Feedback: @@ -522,9 +526,22 @@ def on(self, *args, **kwargs) -> Feedback: """ new_selectors = self.selectors.copy() + + for k, v in kwargs.items(): + if not isinstance(v, Lens): + raise ValueError( + f"Expected a Lens but got `{v}` of type `{class_name(type(v))}`." + ) + new_selectors[k] = v + new_selectors.update(kwargs) for path in args: + if not isinstance(path, Lens): + raise ValueError( + f"Expected a Lens but got `{path}` of type `{class_name(type(path))}`." + ) + argname = self._next_unselected_arg_name() new_selectors[argname] = path self._print_guessed_selector(argname, path) @@ -546,6 +563,135 @@ def sig(self) -> inspect.Signature: return signature(self.imp) + def check_selectors( + self, + app: Union[AppDefinition, JSON], + record: Record, + source_data: Optional[Dict[str, Any]] = None, + warning: bool = False + ) -> bool: + """Check that the selectors are valid for the given app and record. + + Args: + app: The app that produced the record. + + record: The record that the feedback will run on. This can be a + mostly empty record for checking ahead of producing one. The + utility method + [App.dummy_record][trulens_eval.app.App.dummy_record] is built + for this prupose. + + source_data: Additional data to select from when extracting feedback + function arguments. + + warning: Issue a warning instead of raising an error if a selector is + invalid. As some parts of a Record cannot be known ahead of + producing it, it may be necessary to not raise exception here + and only issue a warning. 
+ + Returns: + True if the selectors are valid. False if not (if warning is set). + + Raises: + ValueError: If a selector is invalid and warning is not set. + """ + + from trulens_eval.app import App + + if source_data is None: + source_data = {} + + app_type: str = "trulens recorder (`TruChain`, `TruLlama`, etc)" + + if isinstance(app, App): + app_type = f"`{type(app).__name__}`" + app = jsonify(app, instrument=app.instrument, skip_specials=True, redact_keys=True) + + elif isinstance(app, AppDefinition): + app = jsonify(app, skip_specials=True, redact_keys=True) + + source_data = self._construct_source_data( + app=app, record=record, source_data=source_data + ) + + # Build the hint message here. + msg = "" + + # Keep track whether any selectors failed to validate. + check_good: bool = True + + # with c.capture() as cap: + for k, q in self.selectors.items(): + if q.exists(source_data): + continue + + msg += f""" +# Selector check failed + +Source of argument `{k}` to `{self.name}` does not exist in app or expected +record: + +```python +{q} +``` + +The data used to make this check may be incomplete. If you expect records +produced by your app to contain the selected content, you can ignore this error +by setting `selectors_nocheck` in the {app_type} constructor. Alternatively, +setting `selectors_check_warning` will print out this message but will not raise +an error. + +## Additional information: + +Feedback function signature: +```python +{self.sig} +``` + +""" + prefix = q.existing_prefix(source_data) + + if prefix is None: + continue + + if len(prefix.path) >= 2 and isinstance(prefix.path[-1], GetItemOrAttribute) and prefix.path[-1].get_item_or_attribute() == "rets": + # If the selector check failed because the selector was pointing + # to something beyond the rets of a record call, we have to + # ignore it as we cannot tell what will be in the rets ahead of + # invoking app. + continue + + if len(prefix.path) >= 3 and isinstance(prefix.path[-2], GetItemOrAttribute) and prefix.path[-2].get_item_or_attribute() == "args": + # Likewise if failure was because the selector was pointing to + # method args beyond their parameter names, we also cannot tell + # their contents so skip. + continue + + check_good = False + + msg += f"The prefix `{prefix}` selects this data that exists in your app or typical records:\n\n" + + try: + for prefix_obj in prefix.get(source_data): + msg += f"- Object of type `{class_name(type(prefix_obj))}` starting with:\n" + msg += "```python\n" + retab(tab="\t ", s=pretty_repr(prefix_obj, max_depth=2, indent_size=2)) + "\n```\n" + + except Exception as e: + msg += f"Some non-existant object because: {pretty_repr(e)}" + + if check_good: + return True + + # Output using rich text. + rprint(Markdown(msg)) + + if warning: + return False + + else: + raise ValueError("Some selectors do not exist in the app or record.") + + def run( self, app: Optional[Union[AppDefinition, JSON]] = None, @@ -631,7 +777,7 @@ def run( cost += part_cost except Exception as e: raise RuntimeError( - f"Evaluation of {self.name} failed on inputs: \n{pp.pformat(ins)[0:128]}." + f"Evaluation of {self.name} failed on inputs: \n{pformat(ins)[0:128]}." 
) from e if isinstance(result_and_meta, Tuple): @@ -837,7 +983,7 @@ def _construct_source_data( """ if source_data is None: - source_data = dict() + source_data = {} else: source_data = dict(source_data) # copy diff --git a/trulens_eval/trulens_eval/requirements.txt b/trulens_eval/trulens_eval/requirements.txt index b952d0158..210706139 100644 --- a/trulens_eval/trulens_eval/requirements.txt +++ b/trulens_eval/trulens_eval/requirements.txt @@ -28,6 +28,7 @@ humanize >= 4.6.0 streamlit >= 1.31.1 streamlit-aggrid >= 0.3.4.post3 streamlit-extras >= 0.2.7 +rich >= 13.6.0 # DB and migration sqlalchemy >= 2.0.19 diff --git a/trulens_eval/trulens_eval/schema.py b/trulens_eval/trulens_eval/schema.py index 26418c44d..5afac6c36 100644 --- a/trulens_eval/trulens_eval/schema.py +++ b/trulens_eval/trulens_eval/schema.py @@ -141,6 +141,23 @@ class Perf(serial.SerialModel, pydantic.BaseModel): end_time: datetime.datetime """Datetime after the recorded call.""" + @staticmethod + def now(latency: Optional[datetime.timedelta] = None) -> Perf: + """Create a `Perf` instance starting now and ending now plus latency. + + Args: + latency: Latency in seconds. If given, end time will be now plus + latency. Otherwise end time will be a minimal interval plus start_time. + """ + + start_time = datetime.datetime.now() + if latency is not None: + end_time = start_time + latency + else: + end_time = start_time + datetime.timedelta(microseconds=1) + + return Perf(start_time=start_time, end_time=end_time) + @property def latency(self): """Latency in seconds.""" diff --git a/trulens_eval/trulens_eval/utils/json.py b/trulens_eval/trulens_eval/utils/json.py index 9e66a6d64..4ca830a65 100644 --- a/trulens_eval/trulens_eval/utils/json.py +++ b/trulens_eval/trulens_eval/utils/json.py @@ -99,10 +99,11 @@ def json_default(obj: Any) -> str: def jsonify_for_ui(*args, **kwargs): + """Options for jsonify common to UI displays. + + Redacts keys and hides special fields introduced by trulens. """ - Options for jsonify common to UI displays. Redact keys and hide special - fields. - """ + return jsonify(*args, **kwargs, redact_keys=True, skip_specials=True) @@ -134,7 +135,9 @@ def jsonify( include_excluded: include fields that are annotated to be excluded by pydantic. Returns: - The jsonified version of the given object. + The jsonified version of the given object. Jsonified means that the the + object is either a JSON base type, a list, or a dict with the containing + elements of the same. """ skip_excluded = not include_excluded # Hack so that our models do not get exludes dumped which causes many @@ -148,26 +151,26 @@ def jsonify( if instrument is None: instrument = Instrument() - dicted = dicted or dict() + dicted = dicted or {} if skip_specials: - recur_key = lambda k: isinstance( - k, JSON_BASES - ) and k not in ALL_SPECIAL_KEYS + def recur_key(k): + return isinstance(k, JSON_BASES) and k not in ALL_SPECIAL_KEYS else: - recur_key = lambda k: isinstance(k, JSON_BASES) and True + def recur_key(k): + return isinstance(k, JSON_BASES) if id(obj) in dicted: if skip_specials: return None - else: - return {CIRCLE: id(obj)} + + return {CIRCLE: id(obj)} if isinstance(obj, JSON_BASES): if redact_keys and isinstance(obj, str): return redact_value(obj) - else: - return obj + + return obj # TODO: remove eventually if isinstance(obj, SerialBytes): @@ -182,14 +185,15 @@ def jsonify( # TODO: should we include duplicates? If so, dicted needs to be adjusted. 
new_dicted = dict(dicted) - recur = lambda o: jsonify( - obj=o, - dicted=new_dicted, - instrument=instrument, - skip_specials=skip_specials, - redact_keys=redact_keys, - include_excluded=include_excluded - ) + def recur(o): + return jsonify( + obj=o, + dicted=new_dicted, + instrument=instrument, + skip_specials=skip_specials, + redact_keys=redact_keys, + include_excluded=include_excluded + ) content = None @@ -314,16 +318,17 @@ def jsonify( else: logger.debug( - f"Do not know how to jsonify an object '{str(obj)[0:32]}' of type '{type(obj)}'." + "Do not know how to jsonify an object '%s' of type '%s'.", str(obj)[0:32], type(obj) ) content = noserio(obj) # Add class information for objects that are to be instrumented, known as # "components". - if isinstance(content, dict) and not isinstance( + if not skip_specials and isinstance(content, dict) and not isinstance( obj, dict) and (instrument.to_instrument_object(obj) or isinstance(obj, WithClassInfo)): + content[CLASS_INFO] = Class.of_class( cls=obj.__class__, with_bases=True ).model_dump() diff --git a/trulens_eval/trulens_eval/utils/serial.py b/trulens_eval/trulens_eval/utils/serial.py index 96a220e23..fe45f48f1 100644 --- a/trulens_eval/trulens_eval/utils/serial.py +++ b/trulens_eval/trulens_eval/utils/serial.py @@ -10,13 +10,12 @@ import ast from ast import dump from ast import parse +from contextvars import ContextVar from copy import copy import logging -from pprint import PrettyPrinter -from typing import ( - Any, Callable, Dict, Generic, Hashable, Iterable, List, Optional, Sequence, - Sized, Tuple, TypeVar, Union -) +from typing import (Any, Callable, ClassVar, Dict, Generic, Hashable, Iterable, + List, Optional, Sequence, Set, Sized, Tuple, TypeVar, + Union) from merkle_json import MerkleJson from munch import Munch as Bunch @@ -24,11 +23,12 @@ from pydantic import GetCoreSchemaHandler from pydantic_core import core_schema from pydantic_core import CoreSchema +import rich from trulens_eval.utils.containers import iterable_peek +from trulens_eval.utils.python import class_name logger = logging.getLogger(__name__) -pp = PrettyPrinter() T = TypeVar("T") @@ -100,13 +100,46 @@ def model_dump(obj: Union[pydantic.BaseModel, pydantic.v1.BaseModel]) -> dict: else: raise ValueError("Not a pydantic.BaseModel.") - class SerialModel(pydantic.BaseModel): """ Trulens-specific additions on top of pydantic models. Includes utilities to help serialization mostly. """ + formatted_objects: ClassVar[ContextVar[Set[int]]] = ContextVar("formatted_objects") + + def __rich_repr__(self) -> rich.repr.Result: + """Requirement for pretty printing using the rich package.""" + + # yield class_name(type(self)) + + # If this is a root repr, create a new set for already-formatted objects. + tok = None + if SerialModel.formatted_objects.get(None) is None: + tok = SerialModel.formatted_objects.set(set()) + + formatted_objects = SerialModel.formatted_objects.get() + + if formatted_objects is None: + formatted_objects = set() + + if id(self) in formatted_objects: + yield f"{class_name(type(self))}@0x{id(self):x}" + + if tok is not None: + SerialModel.formatted_objects.reset(tok) + + return + + formatted_objects.add(id(self)) + + for k, v in self.__dict__.items(): + # This might result in recursive calls to __rich_repr__ of v. 
+ yield k, v + + if tok is not None: + SerialModel.formatted_objects.reset(tok) + def model_dump_json(self, **kwargs): from trulens_eval.utils.json import json_str_of_obj @@ -619,19 +652,34 @@ def __init__(self, path: Optional[Iterable[Step]] = None): super().__init__(path=tuple(path)) - def exists(self, obj: Any) -> bool: - """ - Check whether the path exists in the given object. + def existing_prefix(self, obj: Any) -> Lens: + """Get the Lens representing the longest prefix of the path that exists + in the given object. """ + last_lens = Lens() + current_lens = last_lens + + for i, step in enumerate(self.path): + last_lens = current_lens + current_lens = current_lens._append(step) + if not current_lens.exists(obj): + return last_lens + + return current_lens + + def exists(self, obj: Any) -> bool: + """Check whether the path exists in the given object.""" + try: for _ in self.get(obj): - return True + # Check that all named values exist, not just the first one. + pass except (KeyError, IndexError, ValueError): return False - - return False + + return True @staticmethod def of_string(s: str) -> Lens: @@ -958,7 +1006,7 @@ def get(self, obj: Any) -> Iterable[Any]: for last_selection in last_step.get(start_selection): yield last_selection - def _append(self, step: Step) -> 'Lens': + def _append(self, step: Step) -> Lens: return Lens(path=self.path + (step,)) def __getitem__(
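
The pieces above — the dev-notebook cell, the new `selector_check_warning` / `selector_nocheck` fields on `App`, `App.dummy_record`, and `Feedback.check_selectors` — fit together roughly as follows. This is a minimal usage sketch, not part of the diff: it assumes the LlamaIndex setup from the notebook (with `data/paul_graham_essay.txt` already downloaded), that the new `App` fields are accepted as constructor keyword arguments by `TruLlama`, and that the deliberately out-of-range `rets[42]` selector is only there to exercise the check.

```python
# Usage sketch (assumptions noted above); mirrors the dev-notebook cells.
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

from trulens_eval import Select
from trulens_eval.feedback.feedback import Feedback
from trulens_eval.feedback.provider.hugs import Dummy
from trulens_eval.tru_llama import TruLlama

# LlamaIndex app from the notebook; expects data/paul_graham_essay.txt on disk.
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# A selector that is unlikely to exist in a real record (item 42 of the
# retriever's return value), used here only to trigger the selector check.
f = Feedback(Dummy().language_match).on(
    Select.RecordCalls._retriever.retrieve.rets[42]
)

# With the defaults, App._tru_post_init builds a placeholder record via
# dummy_record() and check_selectors() raises on the bad selector. The new
# flags relax that: warn instead of raising, or skip the check entirely.
recorder = TruLlama(
    query_engine,
    feedbacks=[f],
    selector_check_warning=True,
    # selector_nocheck=True,
)

# The same placeholder record can be inspected directly.
placeholder = recorder.dummy_record()
print(placeholder.main_input, placeholder.main_output)
```

With neither flag set, construction would raise the `ValueError` from `check_selectors`; with `selector_check_warning=True` the rich-rendered hint built in `check_selectors` is printed and construction continues.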