Merge pull request #23 from ittia-research/dev
Fix exception handling of summary
etwk authored Sep 4, 2024
2 parents 80cb4e4 + c128727 commit 3d212f7
Showing 3 changed files with 92 additions and 28 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 ITTIA

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
60 changes: 43 additions & 17 deletions src/pipeline/__init__.py
@@ -28,13 +28,13 @@ class Union():
TODO:
- Add support of verdict standards.
- Make betetr use of the other data of web search.
- Generate or draw class data stracture.
- Make better use of the other data of web search.
- Generate or draw class data structure.
"""

def __init__(self, input: str):
"""Avoid run I/O intense functions here to better support async"""
self.input = input # raw input to analize
self.input = input # raw input to analyze
self.data = {} # contains all intermediate and final data

async def final(self):
@@ -43,8 +43,8 @@ async def final(self):
await asyncio.gather(*_task)

# update reports
_sum = [v['summary'] for v in self.data.values()]
self.reports = utils.generate_report_markdown(self.input, _sum)
_summaries = [v['summary'] for v in self.data.values()]
self.reports = utils.generate_report_markdown(self.input, _summaries)

return self.reports

@@ -65,7 +65,7 @@ async def _pipe_statement(self, data_statement):
self.update_summary(data_statement)

async def _pipe_source(self, data_source, statement):
"""Update docs and then update retriever, verdic, citation"""
"""Update docs and then update retriever, verdict, citation"""

# update docs
_task_docs = []
@@ -160,12 +160,29 @@ def update_verdict_citation(self, data_source, statement):
def update_summary(self, data_statement):
"""
Calculate and summarize the verdicts of multiple sources.
Choose the one verdict with highest weight and use its citation as final.
Introduce some weights:
- total: the number of total verdicts
- valid: the number of verdicts in the desired categories
- winning: the count of the winning verdict
- and count of verdicts of each desiered categories
- and count of verdicts of each desired categories
Exceptions:
- If no valid verdicts, generate summary with statement but verdict related keys set to None.
TODO: maybe output all citations instead of the winning one only.
"""

statement = data_statement['statement']

# initial summary
data_statement['summary'] = {
"statement": data_statement['statement'],
"verdict": None,
"citation": None,
"weights": None,
}

weight_total = 0
weight_valid = 0
@@ -176,40 +193,49 @@
"irrelevant": {"citation": [], "weight": 0},
}

for hostname, verdict in data_statement['sources'].items():
if verdict.get('valid') is False:
for hostname, source in data_statement['sources'].items():
# skip invalid source
if source.get('valid') is False:
continue

# generate citations, add to groups, calculate weights
weight_total += 1
v = verdict['verdict'].lower()
v = source['verdict'].lower()
if v in sum_citation:
weight_valid += 1
citation = f"{verdict['citation']} *source: {hostname}*\n\n"
citation = f"{source['citation']} *source: {hostname}*\n\n" # TODO: more accurate way to construct source
sum_citation[v]['citation'].append(citation)
sum_citation[v]['weight'] += 1
if v == 'true':
sum_score += 1
elif v == 'false':
sum_score -= 1

# if no valid verdict found
if weight_valid == 0:
logging.warning(f"No valid verdict found for statement: {statement}")
return # return with verdicts None

# get the final verdict
if sum_score > 0:
verdict = "true"
elif sum_score < 0:
verdict = "false"
else:
verdict = "irrelevant"

# generate the final citation
citation = ''.join(sum_citation[verdict]['citation'])
if not citation:
raise Exception("No citation found after summarize")

# add all weights to the summary
weights = {"total": weight_total, "valid": weight_valid, "winning": sum_citation[verdict]['weight']}
for key in sum_citation.keys():
weights[key] = sum_citation[key]['weight']

data_statement['summary'] = {
# set summary for this statement
data_statement['summary'].update({
"verdict": verdict,
"citation": citation,
"weights": weights,
"statement": data_statement['statement'],
}

})
return
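For reference, here is a standalone sketch of the weighting scheme documented in update_summary above. The helper name and the sample hostnames, verdicts, and citations are illustrative assumptions, not taken from the repository:

# Sketch only: mirrors the weighting logic of update_summary on toy data.
def summarize_verdicts(sources: dict) -> dict:
    groups = {"true": 0, "false": 0, "irrelevant": 0}
    total = valid = score = 0
    for hostname, source in sources.items():
        if source.get("valid") is False:
            continue  # skip invalid sources, as update_summary does
        total += 1
        v = source["verdict"].lower()
        if v in groups:
            valid += 1
            groups[v] += 1
            score += 1 if v == "true" else -1 if v == "false" else 0
    if valid == 0:
        # Exception path this commit adds: no valid verdicts at all,
        # so the verdict-related keys stay None.
        return {"verdict": None, "weights": None}
    verdict = "true" if score > 0 else "false" if score < 0 else "irrelevant"
    weights = {"total": total, "valid": valid, "winning": groups[verdict], **groups}
    return {"verdict": verdict, "weights": weights}

# Example: two agreeing sources and one invalid one.
print(summarize_verdicts({
    "example.org": {"valid": True, "verdict": "true"},
    "example.net": {"valid": True, "verdict": "true"},
    "spam.example": {"valid": False, "verdict": "false"},
}))
# -> {'verdict': 'true', 'weights': {'total': 2, 'valid': 2, 'winning': 2,
#     'true': 2, 'false': 0, 'irrelevant': 0}}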
39 changes: 28 additions & 11 deletions src/utils.py
@@ -35,23 +35,40 @@ def clear_md_links(text):

return text

def generate_report_markdown(input_text, verdicts):
def generate_report_markdown(input_text, summaries):
markdown = []

# Add original input
markdown.append("## Original Input\n")
markdown.append("```\n" + input_text + "\n```\n")

# Add verdicts
# Add summaries
markdown.append("## Fact Check\n")
for i, verdict in enumerate(verdicts, start=1):
weights = verdict['weights']
percentage = calculate_percentage(weights['winning'], weights['valid'])
for i, summary in enumerate(summaries, start=1):
markdown.append(f"### Statement {i}\n")
markdown.append(f"**Statement**: {verdict['statement']}\n")
markdown.append(f"**Verdict**: `{verdict['verdict'].capitalize()}`\n")
markdown.append(f"**Weight**: {percentage} (false: {weights['false']}, true: {weights['true']}, irrelevant: {weights['irrelevant']})\n")
markdown.append(f"**Citations**:\n\n{verdict['citation']}\n")
markdown.append(f"**Statement**: {summary['statement']}\n")

# Add verdict
verdict = summary['verdict']
if verdict:
markdown.append(f"**Verdict**: {verdict.capitalize()}\n")
else:
markdown.append("**Verdict**: None\n")

# Add weights
weights = summary['weights']
if weights:
percentage = calculate_percentage(weights['winning'], weights['valid'])
markdown.append(f"**Weight**: {percentage} (false: {weights['false']}, true: {weights['true']}, irrelevant: {weights['irrelevant']})\n")
else:
markdown.append("**Weight**: None\n")

# Add citation
citation = summary['citation']
if citation:
markdown.append(f"**Citations**:\n\n{citation}\n")
else:
markdown.append("**Citations**: None\n")

markdown_str = "\n".join(markdown)
return markdown_str
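A quick usage sketch of the updated generate_report_markdown, assuming the summary dict shape produced by update_summary above; the import path and sample data are illustrative:

from utils import generate_report_markdown  # assuming src/ is on the import path

# The second summary exercises the new None handling.
summaries = [
    {"statement": "The Earth orbits the Sun.", "verdict": "true",
     "citation": "Basic astronomy. *source: example.org*\n\n",
     "weights": {"total": 1, "valid": 1, "winning": 1,
                 "true": 1, "false": 0, "irrelevant": 0}},
    {"statement": "An unverifiable claim.", "verdict": None,
     "citation": None, "weights": None},
]
print(generate_report_markdown("raw input text", summaries))
# The second statement renders with None for verdict, weight, and citations
# instead of breaking the report.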
@@ -114,7 +131,7 @@ def search_json_to_docs(search_json):
Search JSON results to Llama-Index documents
Do not add metadata for now
cause LlamaIndex uses `node.get_content(metadata_mode=MetadataMode.EMBED)` which addeds metadata to text for generate embeddings
cause LlamaIndex uses `node.get_content(metadata_mode=MetadataMode.EMBED)` which adds metadata to text for generate embeddings
TODO: pr to llama-index for metadata_mode setting
"""
@@ -134,7 +151,7 @@ def search_result_to_doc(search_result):
Search result to Llama-Index document
Do not add metadata for now
cause LlamaIndex uses `node.get_content(metadata_mode=MetadataMode.EMBED)` which addeds metadata to text for generate embeddings
cause LlamaIndex uses `node.get_content(metadata_mode=MetadataMode.EMBED)` which adds metadata to text for generate embeddings
TODO: pr to llama-index for metadata_mode setting
"""
