update comprehensiveness + nb (#1064)
* update comprehensiveness + nb

* nb expansion

* fix typo

* meetingbank transcript data

* oss models in app

* test

* benchmarking gpt-3.5-turbo, gpt-4-turbo, and gpt-4o

* update path

* comprehensiveness benchmark

* updated summarization_eval nb

* fix normalization

* show improvement in comprehensiveness feedback functions

---------

Co-authored-by: Daniel <[email protected]>
joshreini1 and daniel-huang-1230 authored May 16, 2024
1 parent e41e513 commit 32de002
Showing 10 changed files with 35,648 additions and 1,336 deletions.
500 changes: 500 additions & 0 deletions trulens_eval/examples/expositional/use_cases/dialogsum.dev.jsonl

Large diffs are not rendered by default.

106 changes: 79 additions & 27 deletions trulens_eval/examples/expositional/use_cases/summarization_eval.ipynb
@@ -8,22 +8,25 @@
"source": [
"# Evaluating Summarization with TruLens\n",
"\n",
"In this notebook, we will evaluate a summarization application based on [DialogSum dataset](https://github.com/cylnlp/dialogsum). Using a number of different metrics. These will break down into two main categories: \n",
"In this notebook, we will evaluate a summarization application based on [DialogSum dataset](https://github.com/cylnlp/dialogsum) using a broad set of available metrics from TruLens. These metrics break down into three categories.\n",
"\n",
"1. Ground truth agreement: For these set of metrics, we will measure how similar the generated summary is to some human-created ground truth. We will use for different measures: BERT score, BLEU, ROUGE and a measure where an LLM is prompted to produce a similarity score.\n",
"2. Groundedness: For this measure, we will estimate if the generated summary can be traced back to parts of the original transcript.\n",
"2. Groundedness: Estimate if the generated summary can be traced back to parts of the original transcript both with LLM and NLI methods.\n",
"3. Comprehensivenss: Estimate if the generated summary contains all of the key points from the source text.\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/truera/trulens/blob/main/trulens_eval/examples/expositional/use_cases/summarization_eval.ipynb)"
]
},
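For the ground truth agreement metrics, the notebook builds a golden set from the DialogSum dev split (the construction cell is not part of this diff). A minimal sketch of its expected shape, assuming the query/response dictionary format that GroundTruthAgreement accepts in other TruLens examples; the dialogue and summary text below are invented for illustration:

```python
# Sketch only (not part of this commit): assumed shape of the golden set
# consumed by GroundTruthAgreement. The dialogue is used as the "query"
# and the human-written reference summary as the "response".
golden_set = [
    {
        "query": "#Person1#: Hi, how are you? #Person2#: Doing well, thanks.",
        "response": "Person1 greets Person2 and asks how they are doing.",
    },
    # ... one entry per DialogSum dev example used in the evaluation
]
```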
{
"cell_type": "code",
"execution_count": null,
"id": "428e524f",
"id": "8fb7429f",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
"os.environ['OPENAI_API_KEY'] = 'sk-...'\n",
"os.environ['HUGGINGFACE_API_KEY'] = \"hf_...\""
]
},
{
@@ -43,13 +46,7 @@
"metadata": {},
"outputs": [],
"source": [
"\"\"\"!pip install trulens_eval==0.18.0\n",
" bert_score==0.3.13 \\\n",
" evaluate==0.4.0 \\\n",
" absl-py==1.4.0 \\\n",
" rouge-score==0.1.2 \\\n",
" pandas \\\n",
" tenacity \"\"\""
"#!pip install trulens_eval bert_score evaluate absl-py rouge-score pandas tenacity"
]
},
{
@@ -156,7 +153,7 @@
" def summarize(self, dialog):\n",
" client = openai.OpenAI()\n",
" summary = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" model=\"gpt-4-turbo\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"\"\"Summarize the given dialog into 1-2 sentences based on the following criteria: \n",
" 1. Convey only the most salient information; \n",
@@ -166,7 +163,7 @@
" 5. Be written in formal language. \"\"\"},\n",
" {\"role\": \"user\", \"content\": dialog}\n",
" ]\n",
" )[\"choices\"][0][\"message\"][\"content\"]\n",
" ).choices[0].message.content\n",
" return summary"
]
},
@@ -188,6 +185,7 @@
"source": [
"from trulens_eval import Tru\n",
"tru = Tru()\n",
"tru.reset_database()\n",
"# If you have a database you can connect to, use a URL. For example:\n",
"# tru = Tru(database_url=\"postgresql://hostname/database?user=username&password=password\")"
]
@@ -199,7 +197,7 @@
"metadata": {},
"outputs": [],
"source": [
"tru.run_dashboard()"
"tru.start_dashboard(force=True)"
]
},
{
@@ -230,9 +228,7 @@
"outputs": [],
"source": [
"from trulens_eval import Feedback, feedback\n",
"from trulens_eval.feedback import GroundTruthAgreement\n",
"\n",
"from trulens_eval.feedback.provider import OpenAI"
"from trulens_eval.feedback import GroundTruthAgreement"
]
},
{
@@ -261,15 +257,45 @@
"metadata": {},
"outputs": [],
"source": [
"provider = OpenAI()\n",
"from trulens_eval.feedback.provider import OpenAI, Huggingface\n",
"from trulens_eval import Select\n",
"\n",
"provider = OpenAI(model_engine=\"gpt-4o\")\n",
"hug_provider = Huggingface()\n",
"\n",
"ground_truth_collection = GroundTruthAgreement(golden_set)\n",
"f_groundtruth = Feedback(ground_truth_collection.agreement_measure).on_input_output()\n",
"ground_truth_collection = GroundTruthAgreement(golden_set, provider=provider)\n",
"f_groundtruth = Feedback(ground_truth_collection.agreement_measure, name = \"Similarity (LLM)\").on_input_output()\n",
"f_bert_score = Feedback(ground_truth_collection.bert_score).on_input_output()\n",
"f_bleu = Feedback(ground_truth_collection.bleu).on_input_output()\n",
"f_rouge = Feedback(ground_truth_collection.rouge).on_input_output()\n",
"# Groundedness between each context chunk and the response.\n",
"f_groundedness = feedback.Feedback(provider.groundedness_measure_with_cot_reasons).on_input().on_output().aggregate(grounded.grounded_statements_aggregator)"
"\n",
"\n",
"f_groundedness_llm = (\n",
" Feedback(provider.groundedness_measure_with_cot_reasons, name = \"Groundedness - LLM Judge\")\n",
" .on(Select.RecordInput)\n",
" .on(Select.RecordOutput)\n",
")\n",
"f_groundedness_nli = (\n",
" Feedback(hug_provider.groundedness_measure_with_nli, name = \"Groundedness - NLI Judge\")\n",
" .on(Select.RecordInput)\n",
" .on(Select.RecordOutput)\n",
")\n",
"f_comprehensiveness = (Feedback(provider.comprehensiveness_with_cot_reasons,\n",
" name = \"Comprehensiveness\")\n",
" .on(Select.RecordInput)\n",
" .on(Select.RecordOutput))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40d2f78c",
"metadata": {},
"outputs": [],
"source": [
"provider.comprehensiveness_with_cot_reasons(\"the white house is white. obama is the president\", \"the white house is white. obama is the president\")"
]
},
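The cell above is a quick sanity check of the new comprehensiveness feedback. Per the base.py change further down, it returns a tuple of a normalized score and a reasons dictionary; a minimal sketch of unpacking that result, assuming the same `provider` object defined earlier:

```python
# Sketch only: unpack the (score, reasons) tuple returned by
# comprehensiveness_with_cot_reasons, per the base.py change below.
score, details = provider.comprehensiveness_with_cot_reasons(
    "the white house is white. obama is the president",
    "the white house is white. obama is the president",
)
print(f"Comprehensiveness score (0-1): {score:.2f}")
print(details["reasons"])  # per-key-point assessments, separated by blank lines
```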
{
@@ -298,7 +324,7 @@
"outputs": [],
"source": [
"app = DialogSummaryApp()\n",
"#print(app.summarize(dev_df.dialogue[498]))"
"print(app.summarize(dev_df.dialogue[498]))"
]
},
{
@@ -308,7 +334,14 @@
"metadata": {},
"outputs": [],
"source": [
"ta = TruCustomApp(app, app_id='Summarize_v1', feedbacks = [f_groundtruth, f_groundedness, f_bert_score, f_bleu, f_rouge])"
"tru_recorder = TruCustomApp(app, app_id='Summarize_v1',\n",
" feedbacks = [f_groundtruth,\n",
" f_groundedness_llm,\n",
" f_groundedness_nli,\n",
" f_comprehensiveness,\n",
" f_bert_score,\n",
" f_bleu,\n",
" f_rouge])"
]
},
{
@@ -327,7 +360,8 @@
"metadata": {},
"outputs": [],
"source": [
"ta.with_record(app.summarize, dialog=dev_df.dialogue[498])"
"with tru_recorder:\n",
" app.summarize(dialog=dev_df.dialogue[498])"
]
},
{
@@ -350,7 +384,7 @@
" retry,\n",
" stop_after_attempt,\n",
" wait_random_exponential,\n",
") # for exponential backoff\n"
") # for exponential backoff"
]
},
{
@@ -362,7 +396,7 @@
"source": [
"@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))\n",
"def run_with_backoff(doc):\n",
" return ta.with_record(app.summarize, dialog=doc)\n"
" return tru_recorder.with_record(app.summarize, dialog=doc)\n"
]
},
{
@@ -385,6 +419,24 @@
"source": [
"And that's it! This might take a few minutes to run, at the end of it, you can explore the dashboard to see how well your app does."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf5d49dc",
"metadata": {},
"outputs": [],
"source": [
"tru.run_dashboard()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "782dc6d2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -403,7 +455,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
"version": "3.11.9"
}
},
"nbformat": 4,
49 changes: 36 additions & 13 deletions trulens_eval/trulens_eval/feedback/prompts.py
@@ -109,35 +109,58 @@
STEREOTYPES_SYSTEM_PROMPT = v2.Stereotypes.system_prompt.template
STEREOTYPES_USER_PROMPT = v2.Stereotypes.user_prompt.template

GENERATE_KEY_POINTS_SYSTEM_PROMPT = """
INSTRUCTIONS:
1. Identify the key points in the provided source text.
2. Assign each point a high or low importance level.
3. Remove any points that are not assessed as high importance.
4. All remaining key points should now be of high importance. There is no need to mention a point's importance level.
Answer using the entire template below. Each key point must be on a new line.
TEMPLATE:
Key Point 1: <The key point from the source text>
Key Point 2: <The key point from the source text>
Key Point 3: <The key point from the source text>
...
"""

GENERATE_KEY_POINTS_USER_PROMPT = """
/SOURCE TEXT/
{source}
/END OF SOURCE TEXT/
"""

COMPREHENSIVENESS_SYSTEM_PROMPT = """
You are tasked with evaluating summarization quality. Please follow the instructions below.
INSTRUCTIONS:
1. Identify the key points in the provided source text and assign them high or low importance level.
2. Assess how well the summary captures these key points.
1. Given a key point, score how well the summary captures that key point.
Are the key points from the source text comprehensively included in the summary? More important key points matter more in the evaluation.
Scoring criteria:
0 - Capturing no key points with high importance level
5 - Capturing 70 percent of key points with high importance level
10 - Capturing all key points of high importance level
0 - The key point is not included in the summary.
5 - The key point is vaguely mentioned or partially included in the summary.
10 - The key point is fully included in the summary.
Answer using the entire template below.
TEMPLATE:
Score: <The score from 0 (capturing none of the important key points) to 10 (captures all key points of high importance).>
Criteria: <Mention key points from the source text that should be included in the summary>
Supporting Evidence: <Which key points are present and which key points are absent in the summary.>
Score: <The score from 0 (the key point is not captured at all) to 10 (the key point is fully captured).>
Key Point: <Mention the key point from the source text being evaluated>
Supporting Evidence: <Evidence of whether the key point is present or absent in the summary.>
"""

COMPOREHENSIVENESS_USER_PROMPT = """
/SOURCE TEXT/
{source}
/END OF SOURCE TEXT/
/KEY POINT/
{key_point}
/END OF KEY POINT/
/SUMMARY/
{summary}
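To illustrate how these templates fit together (a sketch that assumes the model follows the TEMPLATE blocks verbatim): the key-point generator returns one `Key Point N: ...` line per point, and each per-key-point assessment begins with a `Score: ...` line that base.py (below) parses with `re_0_10_rating` and normalizes to 0-1.

```python
# Sketch of the expected model outputs, assuming the model follows the
# TEMPLATE sections above verbatim (real completions may vary).
key_points_output = """Key Point 1: The committee approved the budget.
Key Point 2: The vote was unanimous.
Key Point 3: Implementation starts next quarter."""

# Each key point is assessed separately; only the first line carries the score.
assessment_output = """Score: 10
Key Point: The committee approved the budget.
Supporting Evidence: The summary states that the budget was approved."""

first_line = assessment_output.split('\n')[0]  # "Score: 10"
# base.py normalizes this 0-10 rating to 0-1: re_0_10_rating(first_line) / 10
```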
73 changes: 61 additions & 12 deletions trulens_eval/trulens_eval/feedback/provider/base.py
@@ -1022,6 +1022,50 @@ def _get_answer_agreement(
prompt=(prompts.AGREEMENT_SYSTEM % (prompt, check_response)) +
response
)

def _generate_key_points(self, source: str):
    """
    Uses the chat completion model to distill the main points of the source
    text, for use by the comprehensiveness feedback function.

    Args:
        source (str): Text corresponding to the source material.

    Returns:
        (str) The key points of the source text.
    """

    return self._create_chat_completion(
        prompt=prompts.GENERATE_KEY_POINTS_SYSTEM_PROMPT + str.format(
            prompts.GENERATE_KEY_POINTS_USER_PROMPT, source=source
        )
    )

def _assess_key_point_inclusion(self, key_points: str, summary: str) -> List:
    """
    Splits key points by newlines and assesses if each one is included in the summary.

    Args:
        key_points (str): Key points separated by newlines.
        summary (str): The summary text to check for inclusion of key points.

    Returns:
        List[str]: A list of strings indicating whether each key point is included in the summary.
    """
    key_points_list = key_points.split('\n')

    system_prompt = prompts.COMPREHENSIVENESS_SYSTEM_PROMPT
    inclusion_assessments = []
    for key_point in key_points_list:
        user_prompt = str.format(
            prompts.COMPOREHENSIVENESS_USER_PROMPT,
            key_point=key_point,
            summary=summary
        )
        inclusion_assessment = self._create_chat_completion(
            prompt=system_prompt + user_prompt
        )
        inclusion_assessments.append(inclusion_assessment)

    return inclusion_assessments

def comprehensiveness_with_cot_reasons(self, source: str,
summary: str) -> Tuple[float, Dict]:
@@ -1046,23 +1090,28 @@ def comprehensiveness_with_cot_reasons(self, source: str,
points missed).
"""

    system_prompt = prompts.COMPREHENSIVENESS_SYSTEM_PROMPT
    user_prompt = str.format(
        prompts.COMPOREHENSIVENESS_USER_PROMPT,
        source=source,
        summary=summary
    )
    return self.generate_score_and_reasons(system_prompt, user_prompt)

    key_points = self._generate_key_points(source)
    key_point_inclusion_assessments = self._assess_key_point_inclusion(key_points, summary)
    scores = []
    reasons = ""
    for assessment in key_point_inclusion_assessments:
        reasons += assessment + "\n\n"
        if assessment:
            first_line = assessment.split('\n')[0]
            score = re_0_10_rating(first_line) / 10
            scores.append(score)

    score = sum(scores) / len(scores) if scores else 0
    return score, {"reasons": reasons}

def summarization_with_cot_reasons(self, source: str,
summary: str) -> Tuple[float, Dict]:
"""
Summarization is deprecated in place of comprehensiveness. Defaulting to comprehensiveness_with_cot_reasons.
Summarization is deprecated in place of comprehensiveness. This function is no longer implemented.
"""
logger.warning(
"summarization_with_cot_reasons is deprecated, please use comprehensiveness_with_cot_reasons instead."
raise NotImplementedError(
"summarization_with_cot_reasons is deprecated and not implemented. Please use comprehensiveness_with_cot_reasons instead."
)
return self.comprehensiveness_with_cot_reasons(source, summary)

def stereotypes(self, prompt: str, response: str) -> float:
"""
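A short worked example of the aggregation in `comprehensiveness_with_cot_reasons` above, with invented per-key-point ratings: each 0-10 rating is normalized to 0-1 and the final comprehensiveness score is their mean.

```python
# Worked example of the aggregation above; the ratings are invented.
ratings = [10, 5, 0]                # per-key-point 0-10 ratings from the judge
scores = [r / 10 for r in ratings]  # normalized to [0, 1]
final = sum(scores) / len(scores)   # mean -> 0.5, the returned comprehensiveness score
```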