update comprehensiveness + nb (#1064)
* update comprehensiveness + nb

* nb expansion

* fix typo

* meetingbank transcript data

* oss models in app

* test

* benchmarking gpt-3.5-turbo, gpt-4-turbo, and gpt-4o

* update path

* comprehensiveness benchmark

* updated summarization_eval nb

* fix normalization

* show improvement in comprehensiveness feedback functions

---------

Co-authored-by: Daniel <[email protected]>
joshreini1 and daniel-huang-1230 authored May 16, 2024
1 parent e41e513 commit 32de002
Showing 10 changed files with 35,648 additions and 1,336 deletions.
500 changes: 500 additions & 0 deletions trulens_eval/examples/expositional/use_cases/dialogsum.dev.jsonl

Large diffs are not rendered by default.

106 changes: 79 additions & 27 deletions trulens_eval/examples/expositional/use_cases/summarization_eval.ipynb
@@ -8,22 +8,25 @@
"source": [
"# Evaluating Summarization with TruLens\n",
"\n",
"In this notebook, we will evaluate a summarization application based on [DialogSum dataset](https://github.com/cylnlp/dialogsum). Using a number of different metrics. These will break down into two main categories: \n",
"In this notebook, we will evaluate a summarization application based on [DialogSum dataset](https://github.com/cylnlp/dialogsum) using a broad set of available metrics from TruLens. These metrics break down into three categories.\n",
"\n",
"1. Ground truth agreement: For these set of metrics, we will measure how similar the generated summary is to some human-created ground truth. We will use for different measures: BERT score, BLEU, ROUGE and a measure where an LLM is prompted to produce a similarity score.\n",
"2. Groundedness: For this measure, we will estimate if the generated summary can be traced back to parts of the original transcript.\n",
"2. Groundedness: Estimate if the generated summary can be traced back to parts of the original transcript both with LLM and NLI methods.\n",
"3. Comprehensivenss: Estimate if the generated summary contains all of the key points from the source text.\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/expositional/use_cases/summarization_eval.ipynb)"
]
},
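For the ground truth agreement metrics, the notebook builds a golden set from the DialogSum dev split (the construction cell is not part of this diff). A minimal sketch of its expected shape, assuming the query/response dictionary format that GroundTruthAgreement accepts in other TruLens examples; the dialogue and summary text below are invented for illustration:

```python
# Sketch only (not part of this commit): assumed shape of the golden set
# consumed by GroundTruthAgreement. The dialogue is used as the "query"
# and the human-written reference summary as the "response".
golden_set = [
    {
        "query": "#Person1#: Hi, how are you? #Person2#: Doing well, thanks.",
        "response": "Person1 greets Person2 and asks how they are doing.",
    },
    # ... one entry per DialogSum dev example used in the evaluation
]
```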
{
"cell_type": "code",
"execution_count": null,
"id": "428e524f",
"id": "8fb7429f",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
"os.environ['OPENAI_API_KEY'] = 'sk-...'\n",
"os.environ['HUGGINGFACE_API_KEY'] = \"hf_...\""
]
},
{
@@ -43,13 +46,7 @@
"metadata": {},
"outputs": [],
"source": [
"\"\"\"!pip install trulens_eval==0.18.0\n",
" bert_score==0.3.13 \\\n",
" evaluate==0.4.0 \\\n",
" absl-py==1.4.0 \\\n",
" rouge-score==0.1.2 \\\n",
" pandas \\\n",
" tenacity \"\"\""
"#!pip install trulens_eval bert_score evaluate absl-py rouge-score pandas tenacity"
]
},
{
@@ -156,7 +153,7 @@
" def summarize(self, dialog):\n",
" client = openai.OpenAI()\n",
" summary = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" model=\"gpt-4-turbo\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"\"\"Summarize the given dialog into 1-2 sentences based on the following criteria: \n",
" 1. Convey only the most salient information; \n",
@@ -166,7 +163,7 @@
" 5. Be written in formal language. \"\"\"},\n",
" {\"role\": \"user\", \"content\": dialog}\n",
" ]\n",
" )[\"choices\"][0][\"message\"][\"content\"]\n",
" ).choices[0].message.content\n",
" return summary"
]
},
@@ -188,6 +185,7 @@
"source": [
"from trulens_eval import Tru\n",
"tru = Tru()\n",
"tru.reset_database()\n",
"# If you have a database you can connect to, use a URL. For example:\n",
"# tru = Tru(database_url=\"postgresql://hostname/database?user=username&password=password\")"
]
@@ -199,7 +197,7 @@
"metadata": {},
"outputs": [],
"source": [
"tru.run_dashboard()"
"tru.start_dashboard(force=True)"
]
},
{
@@ -230,9 +228,7 @@
"outputs": [],
"source": [
"from trulens_eval import Feedback, feedback\n",
"from trulens_eval.feedback import GroundTruthAgreement\n",
"\n",
"from trulens_eval.feedback.provider import OpenAI"
"from trulens_eval.feedback import GroundTruthAgreement"
]
},
{
@@ -261,15 +257,45 @@
"metadata": {},
"outputs": [],
"source": [
"provider = OpenAI()\n",
"from trulens_eval.feedback.provider import OpenAI, Huggingface\n",
"from trulens_eval import Select\n",
"\n",
"provider = OpenAI(model_engine=\"gpt-4o\")\n",
"hug_provider = Huggingface()\n",
"\n",
"ground_truth_collection = GroundTruthAgreement(golden_set)\n",
"f_groundtruth = Feedback(ground_truth_collection.agreement_measure).on_input_output()\n",
"ground_truth_collection = GroundTruthAgreement(golden_set, provider=provider)\n",
"f_groundtruth = Feedback(ground_truth_collection.agreement_measure, name = \"Similarity (LLM)\").on_input_output()\n",
"f_bert_score = Feedback(ground_truth_collection.bert_score).on_input_output()\n",
"f_bleu = Feedback(ground_truth_collection.bleu).on_input_output()\n",
"f_rouge = Feedback(ground_truth_collection.rouge).on_input_output()\n",
"# Groundedness between each context chunk and the response.\n",
"f_groundedness = feedback.Feedback(provider.groundedness_measure_with_cot_reasons).on_input().on_output().aggregate(grounded.grounded_statements_aggregator)"
"\n",
"\n",
"f_groundedness_llm = (\n",
" Feedback(provider.groundedness_measure_with_cot_reasons, name = \"Groundedness - LLM Judge\")\n",
" .on(Select.RecordInput)\n",
" .on(Select.RecordOutput)\n",
")\n",
"f_groundedness_nli = (\n",
" Feedback(hug_provider.groundedness_measure_with_nli, name = \"Groundedness - NLI Judge\")\n",
" .on(Select.RecordInput)\n",
" .on(Select.RecordOutput)\n",
")\n",
"f_comprehensiveness = (Feedback(provider.comprehensiveness_with_cot_reasons,\n",
" name = \"Comprehensiveness\")\n",
" .on(Select.RecordInput)\n",
" .on(Select.RecordOutput))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40d2f78c",
"metadata": {},
"outputs": [],
"source": [
"provider.comprehensiveness_with_cot_reasons(\"the white house is white. obama is the president\", \"the white house is white. obama is the president\")"
]
},
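The cell above is a quick sanity check of the new comprehensiveness feedback. Per the base.py change further down, it returns a tuple of a normalized score and a reasons dictionary; a minimal sketch of unpacking that result, assuming the same `provider` object defined earlier:

```python
# Sketch only: unpack the (score, reasons) tuple returned by
# comprehensiveness_with_cot_reasons, per the base.py change below.
score, details = provider.comprehensiveness_with_cot_reasons(
    "the white house is white. obama is the president",
    "the white house is white. obama is the president",
)
print(f"Comprehensiveness score (0-1): {score:.2f}")
print(details["reasons"])  # per-key-point assessments, separated by blank lines
```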
{
@@ -298,7 +324,7 @@
"outputs": [],
"source": [
"app = DialogSummaryApp()\n",
"#print(app.summarize(dev_df.dialogue[498]))"
"print(app.summarize(dev_df.dialogue[498]))"
]
},
{
@@ -308,7 +334,14 @@
"metadata": {},
"outputs": [],
"source": [
"ta = TruCustomApp(app, app_id='Summarize_v1', feedbacks = [f_groundtruth, f_groundedness, f_bert_score, f_bleu, f_rouge])"
"tru_recorder = TruCustomApp(app, app_id='Summarize_v1',\n",
" feedbacks = [f_groundtruth,\n",
" f_groundedness_llm,\n",
" f_groundedness_nli,\n",
" f_comprehensiveness,\n",
" f_bert_score,\n",
" f_bleu,\n",
" f_rouge])"
]
},
{
@@ -327,7 +360,8 @@
"metadata": {},
"outputs": [],
"source": [
"ta.with_record(app.summarize, dialog=dev_df.dialogue[498])"
"with tru_recorder:\n",
" app.summarize(dialog=dev_df.dialogue[498])"
]
},
{
@@ -350,7 +384,7 @@
" retry,\n",
" stop_after_attempt,\n",
" wait_random_exponential,\n",
") # for exponential backoff\n"
") # for exponential backoff"
]
},
{
@@ -362,7 +396,7 @@
"source": [
"@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))\n",
"def run_with_backoff(doc):\n",
" return ta.with_record(app.summarize, dialog=doc)\n"
" return tru_recorder.with_record(app.summarize, dialog=doc)\n"
]
},
{
@@ -385,6 +419,24 @@
"source": [
"And that's it! This might take a few minutes to run, at the end of it, you can explore the dashboard to see how well your app does."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf5d49dc",
"metadata": {},
"outputs": [],
"source": [
"tru.run_dashboard()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "782dc6d2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -403,7 +455,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
"version": "3.11.9"
}
},
"nbformat": 4,
49 changes: 36 additions & 13 deletions trulens_eval/trulens_eval/feedback/prompts.py
@@ -109,35 +109,58 @@
STEREOTYPES_SYSTEM_PROMPT = v2.Stereotypes.system_prompt.template
STEREOTYPES_USER_PROMPT = v2.Stereotypes.user_prompt.template

GENERATE_KEY_POINTS_SYSTEM_PROMPT = """
INSTRUCTIONS:
1. Identify the key points in the provided source text.
2. Assign each point a high or low importance level.
3. Remove any points that are not assessed as high importance.
4. All remaining key points should now be of high importance. There is no need to mention a point's importance level.
Answer using the entire template below. Each key point must be on a new line.
TEMPLATE:
Key Point 1: <The key point from the source text>
Key Point 2: <The key point from the source text>
Key Point 3: <The key point from the source text>
...
"""

GENERATE_KEY_POINTS_USER_PROMPT = """
/SOURCE TEXT/
{source}
/END OF SOURCE TEXT/
"""

COMPREHENSIVENESS_SYSTEM_PROMPT = """
You are tasked with evaluating summarization quality. Please follow the instructions below.
INSTRUCTIONS:
1. Identify the key points in the provided source text and assign them high or low importance level.
2. Assess how well the summary captures these key points.
1. Given a key point, score how well the summary captures that key point.
Are the key points from the source text comprehensively included in the summary? More important key points matter more in the evaluation.
Scoring criteria:
0 - Capturing no key points with high importance level
5 - Capturing 70 percent of key points with high importance level
10 - Capturing all key points of high importance level
0 - The key point is not included in the summary.
5 - The key point is vaguely mentioned or partially included in the summary.
10 - The key point is fully included in the summary.
Answer using the entire template below.
TEMPLATE:
Score: <The score from 0 (capturing none of the important key points) to 10 (captures all key points of high importance).>
Criteria: <Mention key points from the source text that should be included in the summary>
Supporting Evidence: <Which key points are present and which key points are absent in the summary.>
Score: <The score from 0 (the key point is not captured at all) to 10 (the key point is fully captured).>
Key Point: <Mention the key point from the source text being evaluated>
Supporting Evidence: <Evidence of whether the key point is present or absent in the summary.>
"""

COMPOREHENSIVENESS_USER_PROMPT = """
/SOURCE TEXT/
{source}
/END OF SOURCE TEXT/
/KEY POINT/
{key_point}
/END OF KEY POINT/
/SUMMARY/
{summary}
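To illustrate how these templates fit together (a sketch that assumes the model follows the TEMPLATE blocks verbatim): the key-point generator returns one `Key Point N: ...` line per point, and each per-key-point assessment begins with a `Score: ...` line that base.py (below) parses with `re_0_10_rating` and normalizes to 0-1.

```python
# Sketch of the expected model outputs, assuming the model follows the
# TEMPLATE sections above verbatim (real completions may vary).
key_points_output = """Key Point 1: The committee approved the budget.
Key Point 2: The vote was unanimous.
Key Point 3: Implementation starts next quarter."""

# Each key point is assessed separately; only the first line carries the score.
assessment_output = """Score: 10
Key Point: The committee approved the budget.
Supporting Evidence: The summary states that the budget was approved."""

first_line = assessment_output.split('\n')[0]  # "Score: 10"
# base.py normalizes this 0-10 rating to 0-1: re_0_10_rating(first_line) / 10
```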
73 changes: 61 additions & 12 deletions trulens_eval/trulens_eval/feedback/provider/base.py
@@ -1022,6 +1022,50 @@ def _get_answer_agreement(
prompt=(prompts.AGREEMENT_SYSTEM % (prompt, check_response)) +
response
)

def _generate_key_points(self, source: str):
    """
    Uses the chat completion model to distill the main points of the source
    text, for use by the comprehensiveness feedback function.

    Args:
        source (str): Text corresponding to the source material.

    Returns:
        (str) The key points of the source text.
    """

    return self._create_chat_completion(
        prompt=prompts.GENERATE_KEY_POINTS_SYSTEM_PROMPT + str.format(
            prompts.GENERATE_KEY_POINTS_USER_PROMPT, source=source
        )
    )

def _assess_key_point_inclusion(self, key_points: str, summary: str) -> List:
    """
    Splits key points by newlines and assesses if each one is included in the summary.

    Args:
        key_points (str): Key points separated by newlines.
        summary (str): The summary text to check for inclusion of key points.

    Returns:
        List[str]: A list of strings indicating whether each key point is included in the summary.
    """
    key_points_list = key_points.split('\n')

    system_prompt = prompts.COMPREHENSIVENESS_SYSTEM_PROMPT
    inclusion_assessments = []
    for key_point in key_points_list:
        user_prompt = str.format(
            prompts.COMPOREHENSIVENESS_USER_PROMPT,
            key_point=key_point,
            summary=summary
        )
        inclusion_assessment = self._create_chat_completion(
            prompt=system_prompt + user_prompt
        )
        inclusion_assessments.append(inclusion_assessment)

    return inclusion_assessments

def comprehensiveness_with_cot_reasons(self, source: str,
summary: str) -> Tuple[float, Dict]:
@@ -1046,23 +1090,28 @@ def comprehensiveness_with_cot_reasons(self, source: str,
points missed).
"""

    system_prompt = prompts.COMPREHENSIVENESS_SYSTEM_PROMPT
    user_prompt = str.format(
        prompts.COMPOREHENSIVENESS_USER_PROMPT,
        source=source,
        summary=summary
    )
    return self.generate_score_and_reasons(system_prompt, user_prompt)

    key_points = self._generate_key_points(source)
    key_point_inclusion_assessments = self._assess_key_point_inclusion(key_points, summary)
    scores = []
    reasons = ""
    for assessment in key_point_inclusion_assessments:
        reasons += assessment + "\n\n"
        if assessment:
            first_line = assessment.split('\n')[0]
            score = re_0_10_rating(first_line) / 10
            scores.append(score)

    score = sum(scores) / len(scores) if scores else 0
    return score, {"reasons": reasons}

def summarization_with_cot_reasons(self, source: str,
summary: str) -> Tuple[float, Dict]:
"""
Summarization is deprecated in place of comprehensiveness. Defaulting to comprehensiveness_with_cot_reasons.
Summarization is deprecated in place of comprehensiveness. This function is no longer implemented.
"""
logger.warning(
"summarization_with_cot_reasons is deprecated, please use comprehensiveness_with_cot_reasons instead."
raise NotImplementedError(
"summarization_with_cot_reasons is deprecated and not implemented. Please use comprehensiveness_with_cot_reasons instead."
)
return self.comprehensiveness_with_cot_reasons(source, summary)

def stereotypes(self, prompt: str, response: str) -> float:
"""
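A short worked example of the aggregation in `comprehensiveness_with_cot_reasons` above, with invented per-key-point ratings: each 0-10 rating is normalized to 0-1 and the final comprehensiveness score is their mean.

```python
# Worked example of the aggregation above; the ratings are invented.
ratings = [10, 5, 0]                # per-key-point 0-10 ratings from the judge
scores = [r / 10 for r in ratings]  # normalized to [0, 1]
final = sum(scores) / len(scores)   # mean -> 0.5, the returned comprehensiveness score
```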