Merge pull request #23 from ittia-research/dev
Fix exception handling of summary
etwk authored Sep 4, 2024
2 parents 80cb4e4 + c128727 commit 3d212f7
Showing 3 changed files with 92 additions and 28 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 ITTIA

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
60 changes: 43 additions & 17 deletions src/pipeline/__init__.py
@@ -28,13 +28,13 @@ class Union():
TODO:
- Add support of verdict standards.
- Make betetr use of the other data of web search.
- Generate or draw class data stracture.
- Make better use of the other data of web search.
- Generate or draw class data structure.
"""

def __init__(self, input: str):
"""Avoid run I/O intense functions here to better support async"""
self.input = input # raw input to analize
self.input = input # raw input to analyze
self.data = {} # contains all intermediate and final data

async def final(self):
@@ -43,8 +43,8 @@ async def final(self):
await asyncio.gather(*_task)

# update reports
_sum = [v['summary'] for v in self.data.values()]
self.reports = utils.generate_report_markdown(self.input, _sum)
_summaries = [v['summary'] for v in self.data.values()]
self.reports = utils.generate_report_markdown(self.input, _summaries)

return self.reports

@@ -65,7 +65,7 @@ async def _pipe_statement(self, data_statement):
self.update_summary(data_statement)

async def _pipe_source(self, data_source, statement):
"""Update docs and then update retriever, verdic, citation"""
"""Update docs and then update retriever, verdict, citation"""

# update docs
_task_docs = []
@@ -160,12 +160,29 @@ def update_verdict_citation(self, data_source, statement):
def update_summary(self, data_statement):
"""
Calculate and summarize the verdicts of multiple sources.
Choose the one verdict with highest weight and use its citation as final.
Introduce some weights:
- total: the number of total verdicts
- valid: the number of verdicts in the desired categories
- winning: the count of the winning verdict
- and count of verdicts of each desiered categories
- and count of verdicts of each desired categories
Exceptions:
- If no valid verdicts, generate summary with statement but verdict related keys set to None.
TODO: maybe output all citations instead of the winning one only.
"""

statement = data_statement['statement']

# initial summary
data_statement['summary'] = {
"statement": data_statement['statement'],
"verdict": None,
"citation": None,
"weights": None,
}

weight_total = 0
weight_valid = 0
@@ -176,40 +193,49 @@
"irrelevant": {"citation": [], "weight": 0},
}

for hostname, verdict in data_statement['sources'].items():
if verdict.get('valid') is False:
for hostname, source in data_statement['sources'].items():
# skip invalid source
if source.get('valid') is False:
continue

# generate citations, add to groups, calculate weights
weight_total += 1
v = verdict['verdict'].lower()
v = source['verdict'].lower()
if v in sum_citation:
weight_valid += 1
citation = f"{verdict['citation']} *source: {hostname}*\n\n"
citation = f"{source['citation']} *source: {hostname}*\n\n" # TODO: more accurate way to construct source
sum_citation[v]['citation'].append(citation)
sum_citation[v]['weight'] += 1
if v == 'true':
sum_score += 1
elif v == 'false':
sum_score -= 1

# if no valid verdict found
if weight_valid == 0:
logging.warning(f"No valid verdict found for statement: {statement}")
return # return with verdicts None

# get the final verdict
if sum_score > 0:
verdict = "true"
elif sum_score < 0:
verdict = "false"
else:
verdict = "irrelevant"

# generate the final citation
citation = ''.join(sum_citation[verdict]['citation'])
if not citation:
raise Exception("No citation found after summarize")

# add all weights to the summary
weights = {"total": weight_total, "valid": weight_valid, "winning": sum_citation[verdict]['weight']}
for key in sum_citation.keys():
weights[key] = sum_citation[key]['weight']

data_statement['summary'] = {
# set summary for this statement
data_statement['summary'].update({
"verdict": verdict,
"citation": citation,
"weights": weights,
"statement": data_statement['statement'],
}

})
return
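For reference, here is a standalone sketch of the weighting scheme documented in update_summary above. The helper name and the sample hostnames, verdicts, and citations are illustrative assumptions, not taken from the repository:

# Sketch only: mirrors the weighting logic of update_summary on toy data.
def summarize_verdicts(sources: dict) -> dict:
    groups = {"true": 0, "false": 0, "irrelevant": 0}
    total = valid = score = 0
    for hostname, source in sources.items():
        if source.get("valid") is False:
            continue  # skip invalid sources, as update_summary does
        total += 1
        v = source["verdict"].lower()
        if v in groups:
            valid += 1
            groups[v] += 1
            score += 1 if v == "true" else -1 if v == "false" else 0
    if valid == 0:
        # Exception path this commit adds: no valid verdicts at all,
        # so the verdict-related keys stay None.
        return {"verdict": None, "weights": None}
    verdict = "true" if score > 0 else "false" if score < 0 else "irrelevant"
    weights = {"total": total, "valid": valid, "winning": groups[verdict], **groups}
    return {"verdict": verdict, "weights": weights}

# Example: two agreeing sources and one invalid one.
print(summarize_verdicts({
    "example.org": {"valid": True, "verdict": "true"},
    "example.net": {"valid": True, "verdict": "true"},
    "spam.example": {"valid": False, "verdict": "false"},
}))
# -> {'verdict': 'true', 'weights': {'total': 2, 'valid': 2, 'winning': 2,
#     'true': 2, 'false': 0, 'irrelevant': 0}}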
39 changes: 28 additions & 11 deletions src/utils.py
@@ -35,23 +35,40 @@ def clear_md_links(text):

return text

def generate_report_markdown(input_text, verdicts):
def generate_report_markdown(input_text, summaries):
markdown = []

# Add original input
markdown.append("## Original Input\n")
markdown.append("```\n" + input_text + "\n```\n")

# Add verdicts
# Add summaries
markdown.append("## Fact Check\n")
for i, verdict in enumerate(verdicts, start=1):
weights = verdict['weights']
percentage = calculate_percentage(weights['winning'], weights['valid'])
for i, summary in enumerate(summaries, start=1):
markdown.append(f"### Statement {i}\n")
markdown.append(f"**Statement**: {verdict['statement']}\n")
markdown.append(f"**Verdict**: `{verdict['verdict'].capitalize()}`\n")
markdown.append(f"**Weight**: {percentage} (false: {weights['false']}, true: {weights['true']}, irrelevant: {weights['irrelevant']})\n")
markdown.append(f"**Citations**:\n\n{verdict['citation']}\n")
markdown.append(f"**Statement**: {summary['statement']}\n")

# Add verdict
verdict = summary['verdict']
if verdict:
markdown.append(f"**Verdict**: {verdict.capitalize()}\n")
else:
markdown.append("**Verdict**: None\n")

# Add weights
weights = summary['weights']
if weights:
percentage = calculate_percentage(weights['winning'], weights['valid'])
markdown.append(f"**Weight**: {percentage} (false: {weights['false']}, true: {weights['true']}, irrelevant: {weights['irrelevant']})\n")
else:
markdown.append("**Weight**: None\n")

# Add citation
citation = summary['citation']
if citation:
markdown.append(f"**Citations**:\n\n{citation}\n")
else:
markdown.append("**Citations**: None\n")

markdown_str = "\n".join(markdown)
return markdown_str
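A quick usage sketch of the updated generate_report_markdown, assuming the summary dict shape produced by update_summary above; the import path and sample data are illustrative:

from utils import generate_report_markdown  # assuming src/ is on the import path

# The second summary exercises the new None handling.
summaries = [
    {"statement": "The Earth orbits the Sun.", "verdict": "true",
     "citation": "Basic astronomy. *source: example.org*\n\n",
     "weights": {"total": 1, "valid": 1, "winning": 1,
                 "true": 1, "false": 0, "irrelevant": 0}},
    {"statement": "An unverifiable claim.", "verdict": None,
     "citation": None, "weights": None},
]
print(generate_report_markdown("raw input text", summaries))
# The second statement renders with None for verdict, weight, and citations
# instead of breaking the report.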
@@ -114,7 +131,7 @@ def search_json_to_docs(search_json):
Search JSON results to Llama-Index documents
Do not add metadata for now
cause LlamaIndex uses `node.get_content(metadata_mode=MetadataMode.EMBED)` which addeds metadata to text for generate embeddings
cause LlamaIndex uses `node.get_content(metadata_mode=MetadataMode.EMBED)` which adds metadata to text for generate embeddings
TODO: pr to llama-index for metadata_mode setting
"""
@@ -134,7 +151,7 @@ def search_result_to_doc(search_result):
Search result to Llama-Index document
Do not add metadata for now
cause LlamaIndex uses `node.get_content(metadata_mode=MetadataMode.EMBED)` which addeds metadata to text for generate embeddings
cause LlamaIndex uses `node.get_content(metadata_mode=MetadataMode.EMBED)` which adds metadata to text for generate embeddings
TODO: pr to llama-index for metadata_mode setting
"""
