Skip to content

Commit

Permalink
Merge pull request #25 from ittia-research/dev
Browse files Browse the repository at this point in the history
Add JSON return format support, add PyPI package for API connect, update pipeline
  • Loading branch information
etwk authored Sep 9, 2024
2 parents e1c3dc4 + d2a07a1 commit 35a1316
Show file tree
Hide file tree
Showing 9 changed files with 277 additions and 77 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
.*
__pycache__/

# poetry build
dist/
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ True, false, or just opinions? Maybe not binary, but a percentage.
Fact-checking tools to combat disinformation.

## Get Started
Fact-check:
- Online demo: `https://check.ittia.net`
- API docs: `https://check.ittia.net/docs`
Online demo: `https://check.ittia.net`

Search backend:
Use pip package `ittia-check` to connect to API: https://github.com/ittia-research/check/packages/ittia_check

API docs: `https://check.ittia.net/docs`

### Search backend
- Using `search.ittia.net` for better optimization.
- API doc: `https://search.ittia.net/docs`
- Features:
Expand Down
21 changes: 21 additions & 0 deletions packages/ittia_check/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
This package connects to the ITTIA Check API.

More on this project and how to self-host one: https://github.com/ittia-research/check

## How-to
Demo on how to fact-check a text:
```python
import asyncio
from ittia_check import Check

base_url = "https://check.ittia.net"
format = "json" # or markdown

check = Check(base_url=base_url, format=format)

query = "Germany hosted the 2024 Olympics"

result = asyncio.run(check(query))

print(result)
```
67 changes: 67 additions & 0 deletions packages/ittia_check/ittia_check/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@

API_BASE_URL = "https://check.ittia.net"

class Check():
    """
    Fact-check a string and return verdicts in markdown or JSON format.

    Connects to an ITTIA Check API server, streams the pipeline's
    multi-stage responses, and keeps only the `final` stage content.
    """

    def __init__(self,
                 base_url: str = API_BASE_URL,
                 format: str = 'markdown',
                 timeout: int = 600,
                 ):
        """
        Args:
          - base_url: API base URL
          - format: markdown | json, return format
          - timeout: API request timeout in seconds; set high because
            the fact-check process might take a long time
        """
        self.base_url = base_url
        self.format = format
        self.timeout = timeout

        # `Accept: text/event-stream` selects the streaming response;
        # `X-Return-Format` selects markdown vs JSON payloads.
        self.headers = {
            "X-Return-Format": self.format,
            "Accept": "text/event-stream",
        }
        self.client = httpx.AsyncClient(follow_redirects=True, timeout=self.timeout)

    async def __call__(self, query: str):
        """
        Fact-check a single text.

        Args:
          - query: text to check

        Returns:
            The `final` stage content (markdown str or JSON-decoded data,
            depending on `format`), or None if no final result arrived.

        Raises:
            httpx.HTTPStatusError: if the server replies with a 4xx/5xx status.
        """
        # Fix: percent-encode the query so characters such as '?', '#',
        # '%', or spaces cannot truncate or corrupt the request path.
        url = self.base_url + '/' + quote(query)
        result = None

        async with self.client.stream("GET", url, headers=self.headers) as response:
            # Fail fast on HTTP errors instead of silently parsing an error body.
            response.raise_for_status()

            decoder = json.JSONDecoder()
            buffer = ""
            async for chunk in response.aiter_text():
                if not chunk.strip():  # skip empty keep-alive chunks
                    continue
                buffer += chunk

                try:
                    # The stream is a concatenation of JSON objects:
                    # drain every complete object currently in the buffer.
                    while buffer:
                        rep, index = decoder.raw_decode(buffer)

                        # Keep the `final` stage only; log intermediate stages.
                        if rep['stage'] != 'final':
                            logging.debug(f"Stage {rep['stage']}: {rep['content']}")
                        else:
                            result = rep['content']

                        # Remove the consumed object and any leading whitespace.
                        buffer = buffer[index:].lstrip()
                except json.JSONDecodeError:
                    # Incomplete JSON object at the buffer tail; read more data.
                    continue

        if not result:
            logging.warning("No result found")

        return result
19 changes: 19 additions & 0 deletions packages/ittia_check/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[tool.poetry]
name = "ittia-check"
version = "0.1.0"
description = "Connect to the ITTIA Check API or self-hosted ones"
authors = ["ITTIA <[email protected]>"]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/ittia-research/check"
repository = "https://github.com/ittia-research/check"
keywords = ["fact-check", "ai", "llm", "rag"]

[tool.poetry.dependencies]
python = "^3.8"
httpx = "^0.27.2"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
52 changes: 38 additions & 14 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import asyncio
import logging
from fastapi import FastAPI, HTTPException, Header
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse, PlainTextResponse, RedirectResponse, StreamingResponse

import pipeline
Expand All @@ -16,9 +16,10 @@

app = FastAPI()


# TODO: multi-stage response
async def stream_response(path):
pipeline_check = pipeline.Check(path)
async def stream_response(input: str, format: str):
pipeline_check = pipeline.Check(input=input, format=format)
task = asyncio.create_task(pipeline_check.final())

# Stream response to prevent timeout, return multi-stage responses
Expand All @@ -28,44 +29,67 @@ async def stream_response(path):
if elapsed_time > settings.STREAM_TIME_OUT: # waiting timeout
raise Exception(f"Waiting fact check results reached time limit: {settings.STREAM_TIME_OUT} seconds")
if elapsed_time % 30 == 0: # return wait messages from time to time
yield utils.get_stream(stage='processing', content='### Processing ...')
yield utils.get_stream(stage='processing', content='processing ...')
await asyncio.sleep(_check_interval)
elapsed_time = round(elapsed_time + _check_interval, 1)

result = await task
yield utils.get_stream(stage='final', content=result)



@app.on_event("startup")
async def startup_event():
pass


"""Redirect /doc to /docs"""
@app.get("/doc", include_in_schema=False)
async def _doc_redirect():
return RedirectResponse(url="/docs")



@app.get("/health")
async def health():
return {"status": "ok"}


@app.get("/status")
async def status():
_status = utils.get_status()
return _status

# TODO: integrate error handle with output

@app.get("/{input:path}", response_class=PlainTextResponse)
async def catch_all(input: str, accept: str = Header(None)):
async def catch_all(input: str, request: Request):
"""
Headers:
- Accept: text/event-stream (Without this header returns the basic HTML page)
- X-Return-Format: markdown | json (Choose the return format, default markdown)
"""
# Catch all exception to avoid inner error message expose to public
try:
headers = request.headers

# Filter out browser automated and other invalid requests
if not utils.check_input(input):
return HTMLResponse(status_code=404, content='not found') # filter browser background requests
return HTMLResponse(status_code=404, content='not found')

if accept == "text/markdown":
if not input:
return utils.get_stream(stage='final', content=web.get_homepage())
return StreamingResponse(stream_response(input), media_type="text/event-stream")
else:
# Return static HTML page if not requesting stream.
# The HTML page will fetch the same URL with stream header and render the result.
if headers.get('accept') != "text/event-stream":
return HTMLResponse(content=web.html_browser)

# Homepage
if not input:
return utils.get_stream(stage='final', content=web.get_homepage())

# Get return format, default `markdown`
return_format = headers.get("X-Return-Format")
if return_format not in ['markdown', 'json']:
return_format = 'markdown'

# Streaming content
return StreamingResponse(stream_response(input=input, format=return_format), media_type="text/event-stream")
except HTTPException as e:
raise e
except Exception as e:
Expand Down
97 changes: 65 additions & 32 deletions src/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,34 @@ class Check():
- Generate or draw class data structure.
"""

def __init__(self, input: str):
"""Avoid run I/O intense functions here to better support async"""
self.input = input # raw input to analyze
def __init__(self, input: str, format: str = 'markdown'):
"""
Args:
- input: raw input to check
- format: markdown | json, format of the returning response
Notes: avoid run I/O intense functions here to better support async
"""
self.input = input
self.format = format
self.data = {} # contains all intermediate and final data

async def final(self):
await self.get_statements()
_task = [asyncio.create_task(self._pipe_statement(data_statement)) for data_statement in self.data.values()]
await asyncio.gather(*_task)

# update reports
_summaries = [v['summary'] for v in self.data.values()]
self.reports = utils.generate_report_markdown(self.input, _summaries)
# List of all summaries
summaries = [v['summary'] for v in self.data.values()]

# Update reports
if self.format == 'json':
self.reports = {
'input': self.input,
'summaries': summaries,
}
else:
self.reports = utils.generate_report_markdown(self.input, summaries)

return self.reports

Expand Down Expand Up @@ -173,6 +188,14 @@ def update_summary(self, data_statement):
- winning: the count of the winning verdict
- and count of verdicts of each desired categories
Response:
Verdicts:
- true
- false
- irrelevant
- tie: the numbers of true and false verdicts are equal and above zero
- None: no valid verdict found
Exceptions:
- If no valid verdicts, generate summary with statement but verdict related keys set to None.
Expand All @@ -181,21 +204,21 @@ def update_summary(self, data_statement):

statement = data_statement['statement']

# initial summary
data_statement['summary'] = {
"statement": data_statement['statement'],
# initialize summary
data_summary = data_statement['summary'] = {
"statement": statement,
"verdict": None,
"citation": None,
"weights": None,
"weights": {},
"citations": {},
}

weight_total = 0
weight_valid = 0
sum_score = 0
sum_citation = {
"true": {"citation": [], "weight": 0},
"false": {"citation": [], "weight": 0},
"irrelevant": {"citation": [], "weight": 0},
"true": {"citations": [], "weight": 0},
"false": {"citations": [], "weight": 0},
"irrelevant": {"citations": [], "weight": 0},
}

for hostname, source in data_statement['sources'].items():
Expand All @@ -206,41 +229,51 @@ def update_summary(self, data_statement):
# generate citations, add to groups, calculate weights
weight_total += 1
v = source['verdict'].lower()
if v in sum_citation:
if v in sum_citation: # Check whether the verdict is valid here; do not add invalid verdicts to sum_citation keys beforehand
weight_valid += 1
citation = f"{source['citation']} *source: {hostname}*\n\n" # TODO: more accurate way to construct source
sum_citation[v]['citation'].append(citation)
citation = {
'citation': source['citation'],
'source': f"http://{hostname}",
}
sum_citation[v]['citations'].append(citation)
sum_citation[v]['weight'] += 1
if v == 'true':
sum_score += 1
elif v == 'false':
sum_score -= 1

# if no valid verdict found
# Return None if no valid verdict found
if weight_valid == 0:
logging.warning(f"No valid verdict found for statement: {statement}")
return # return with verdicts None

# get the final verdict
"""
Get the final verdict.
TODO:
- Some source should have different weights. For example:
- A well-known reliable source compare to a average one.
- One has more latest info and the statement is time sensitive.
"""
if sum_score > 0:
verdict = "true"
elif sum_score < 0:
verdict = "false"
else:
verdict = "irrelevant"

# generate the final citation
citation = ''.join(sum_citation[verdict]['citation'])
# If positive/negative verdict are not 0, set verdict to tie.
if sum_citation['true']['weight'] > 0:
verdict = 'tie'
sum_citation['tie'] = {"citations": [], "weight": 0} # add keys for processing after
else:
verdict = "irrelevant"
data_summary['verdict'] = verdict

# add all weights to the summary
weights = {"total": weight_total, "valid": weight_valid, "winning": sum_citation[verdict]['weight']}
data_summary['weights'] = {"total": weight_total, "valid": weight_valid, "winning": sum_citation[verdict]['weight']}
for key in sum_citation.keys():
weights[key] = sum_citation[key]['weight']
data_summary['weights'][key] = sum_citation[key]['weight']

# set summary for this statement
data_statement['summary'].update({
"verdict": verdict,
"citation": citation,
"weights": weights,
})
return
# Gather all non-empty citations
for key, value in sum_citation.items():
if value['citations']:
data_summary['citations'][key] = value['citations']
Loading

0 comments on commit 35a1316

Please sign in to comment.