Skip to content

Commit

Permalink
Merge pull request #25 from ittia-research/dev
Browse files Browse the repository at this point in the history
Add JSON return format support, add PyPI package for API connect, update pipeline
  • Loading branch information
etwk authored Sep 9, 2024
2 parents e1c3dc4 + d2a07a1 commit 35a1316
Show file tree
Hide file tree
Showing 9 changed files with 277 additions and 77 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
.*
__pycache__/

# poetry build
dist/
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ True, false, or just opinions? Maybe not binary, but a percentage.
Fact-checking tools to combat disinformation.

## Get Started
Fact-check:
- Online demo: `https://check.ittia.net`
- API docs: `https://check.ittia.net/docs`
Online demo: `https://check.ittia.net`

Search backend:
Use pip package `ittia-check` to connect to API: https://github.com/ittia-research/check/packages/ittia_check

API docs: `https://check.ittia.net/docs`

### Search backend
- Using `search.ittia.net` for better optimization.
- API doc: `https://search.ittia.net/docs`
- Features:
Expand Down
21 changes: 21 additions & 0 deletions packages/ittia_check/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
This package connects to the ITTIA Check API.

More on this project and how to self-host one: https://github.com/ittia-research/check

## How-to
Demo on how to fact-check a text:
```python
import asyncio
from ittia_check import Check

base_url = "https://check.ittia.net"
format = "json" # or markdown

check = Check(base_url=base_url, format=format)

query = "Germany hosted the 2024 Olympics"

result = asyncio.run(check(query))

print(result)
```
67 changes: 67 additions & 0 deletions packages/ittia_check/ittia_check/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@

API_BASE_URL = "https://check.ittia.net"

class Check():
    """
    Fact-check a string and return verdicts in markdown or JSON format.

    Connects to an ITTIA Check API server, streams the pipeline's
    multi-stage responses, and keeps only the `final` stage content.
    """

    def __init__(self,
                 base_url: str = API_BASE_URL,
                 format: str = 'markdown',
                 timeout: int = 600,
                 ):
        """
        Args:
          - base_url: API base URL
          - format: markdown | json, return format
          - timeout: API request timeout in seconds; set high because
            the fact-check process might take a long time
        """
        self.base_url = base_url
        self.format = format
        self.timeout = timeout

        # `Accept: text/event-stream` selects the streaming response;
        # `X-Return-Format` selects markdown vs JSON payloads.
        self.headers = {
            "X-Return-Format": self.format,
            "Accept": "text/event-stream",
        }
        self.client = httpx.AsyncClient(follow_redirects=True, timeout=self.timeout)

    async def __call__(self, query: str):
        """
        Fact-check a single text.

        Args:
          - query: text to check

        Returns:
            The `final` stage content (markdown str or JSON-decoded data,
            depending on `format`), or None if no final result arrived.

        Raises:
            httpx.HTTPStatusError: if the server replies with a 4xx/5xx status.
        """
        # Fix: percent-encode the query so characters such as '?', '#',
        # '%', or spaces cannot truncate or corrupt the request path.
        url = self.base_url + '/' + quote(query)
        result = None

        async with self.client.stream("GET", url, headers=self.headers) as response:
            # Fail fast on HTTP errors instead of silently parsing an error body.
            response.raise_for_status()

            decoder = json.JSONDecoder()
            buffer = ""
            async for chunk in response.aiter_text():
                if not chunk.strip():  # skip empty keep-alive chunks
                    continue
                buffer += chunk

                try:
                    # The stream is a concatenation of JSON objects:
                    # drain every complete object currently in the buffer.
                    while buffer:
                        rep, index = decoder.raw_decode(buffer)

                        # Keep the `final` stage only; log intermediate stages.
                        if rep['stage'] != 'final':
                            logging.debug(f"Stage {rep['stage']}: {rep['content']}")
                        else:
                            result = rep['content']

                        # Remove the consumed object and any leading whitespace.
                        buffer = buffer[index:].lstrip()
                except json.JSONDecodeError:
                    # Incomplete JSON object at the buffer tail; read more data.
                    continue

        if not result:
            logging.warning("No result found")

        return result
19 changes: 19 additions & 0 deletions packages/ittia_check/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[tool.poetry]
name = "ittia-check"
version = "0.1.0"
description = "Connect to the ITTIA Check API or self-hosted ones"
authors = ["ITTIA <[email protected]>"]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/ittia-research/check"
repository = "https://github.com/ittia-research/check"
keywords = ["fact-check", "ai", "llm", "rag"]

[tool.poetry.dependencies]
python = "^3.8"
httpx = "^0.27.2"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
52 changes: 38 additions & 14 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import asyncio
import logging
from fastapi import FastAPI, HTTPException, Header
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse, PlainTextResponse, RedirectResponse, StreamingResponse

import pipeline
Expand All @@ -16,9 +16,10 @@

app = FastAPI()


# TODO: multi-stage response
async def stream_response(path):
pipeline_check = pipeline.Check(path)
async def stream_response(input: str, format: str):
pipeline_check = pipeline.Check(input=input, format=format)
task = asyncio.create_task(pipeline_check.final())

# Stream response to prevent timeout, return multi-stage responses
Expand All @@ -28,44 +29,67 @@ async def stream_response(path):
if elapsed_time > settings.STREAM_TIME_OUT: # waiting timeout
raise Exception(f"Waiting fact check results reached time limit: {settings.STREAM_TIME_OUT} seconds")
if elapsed_time % 30 == 0: # return wait messages from time to time
yield utils.get_stream(stage='processing', content='### Processing ...')
yield utils.get_stream(stage='processing', content='processing ...')
await asyncio.sleep(_check_interval)
elapsed_time = round(elapsed_time + _check_interval, 1)

result = await task
yield utils.get_stream(stage='final', content=result)



@app.on_event("startup")
async def startup_event():
pass


"""Redirect /doc to /docs"""
@app.get("/doc", include_in_schema=False)
async def _doc_redirect():
return RedirectResponse(url="/docs")



@app.get("/health")
async def health():
return {"status": "ok"}


@app.get("/status")
async def status():
_status = utils.get_status()
return _status

# TODO: integrate error handle with output

@app.get("/{input:path}", response_class=PlainTextResponse)
async def catch_all(input: str, accept: str = Header(None)):
async def catch_all(input: str, request: Request):
"""
Headers:
- Accept: text/event-stream (Without this header returns the basic HTML page)
- X-Return-Format: markdown | json (Choose the return format, default markdown)
"""
# Catch all exception to avoid inner error message expose to public
try:
headers = request.headers

# Filter out browser automated and other invalid requests
if not utils.check_input(input):
return HTMLResponse(status_code=404, content='not found') # filter browser background requests
return HTMLResponse(status_code=404, content='not found')

if accept == "text/markdown":
if not input:
return utils.get_stream(stage='final', content=web.get_homepage())
return StreamingResponse(stream_response(input), media_type="text/event-stream")
else:
# Return static HTML page if not requesting stream.
# The HTML page will fetch the same URL with stream header and render the result.
if headers.get('accept') != "text/event-stream":
return HTMLResponse(content=web.html_browser)

# Homepage
if not input:
return utils.get_stream(stage='final', content=web.get_homepage())

# Get return format, default `markdown`
return_format = headers.get("X-Return-Format")
if return_format not in ['markdown', 'json']:
return_format = 'markdown'

# Streaming content
return StreamingResponse(stream_response(input=input, format=return_format), media_type="text/event-stream")
except HTTPException as e:
raise e
except Exception as e:
Expand Down
97 changes: 65 additions & 32 deletions src/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,34 @@ class Check():
- Generate or draw class data structure.
"""

def __init__(self, input: str):
"""Avoid run I/O intense functions here to better support async"""
self.input = input # raw input to analyze
def __init__(self, input: str, format: str = 'markdown'):
"""
Args:
- input: raw input to check
- format: markdown | json, format of the returning response
Notes: avoid run I/O intense functions here to better support async
"""
self.input = input
self.format = format
self.data = {} # contains all intermediate and final data

async def final(self):
await self.get_statements()
_task = [asyncio.create_task(self._pipe_statement(data_statement)) for data_statement in self.data.values()]
await asyncio.gather(*_task)

# update reports
_summaries = [v['summary'] for v in self.data.values()]
self.reports = utils.generate_report_markdown(self.input, _summaries)
# List of all summaries
summaries = [v['summary'] for v in self.data.values()]

# Update reports
if self.format == 'json':
self.reports = {
'input': self.input,
'summaries': summaries,
}
else:
self.reports = utils.generate_report_markdown(self.input, summaries)

return self.reports

Expand Down Expand Up @@ -173,6 +188,14 @@ def update_summary(self, data_statement):
- winning: the count of the winning verdict
- and count of verdicts of each desired categories
Response:
Verdicts:
- true
- false
- irrelevant
- tie: the numbers of true and false verdicts are equal and above zero
- None: no valid verdict found
Exceptions:
- If no valid verdicts, generate summary with statement but verdict related keys set to None.
Expand All @@ -181,21 +204,21 @@ def update_summary(self, data_statement):

statement = data_statement['statement']

# initial summary
data_statement['summary'] = {
"statement": data_statement['statement'],
# initialize summary
data_summary = data_statement['summary'] = {
"statement": statement,
"verdict": None,
"citation": None,
"weights": None,
"weights": {},
"citations": {},
}

weight_total = 0
weight_valid = 0
sum_score = 0
sum_citation = {
"true": {"citation": [], "weight": 0},
"false": {"citation": [], "weight": 0},
"irrelevant": {"citation": [], "weight": 0},
"true": {"citations": [], "weight": 0},
"false": {"citations": [], "weight": 0},
"irrelevant": {"citations": [], "weight": 0},
}

for hostname, source in data_statement['sources'].items():
Expand All @@ -206,41 +229,51 @@ def update_summary(self, data_statement):
# generate citations, add to groups, calculate weights
weight_total += 1
v = source['verdict'].lower()
if v in sum_citation:
if v in sum_citation: # Check whether the verdict is valid here; do not add invalid verdicts to sum_citation keys beforehand
weight_valid += 1
citation = f"{source['citation']} *source: {hostname}*\n\n" # TODO: more accurate way to construct source
sum_citation[v]['citation'].append(citation)
citation = {
'citation': source['citation'],
'source': f"http://{hostname}",
}
sum_citation[v]['citations'].append(citation)
sum_citation[v]['weight'] += 1
if v == 'true':
sum_score += 1
elif v == 'false':
sum_score -= 1

# if no valid verdict found
# Return None if no valid verdict found
if weight_valid == 0:
logging.warning(f"No valid verdict found for statement: {statement}")
return # return with verdicts None

# get the final verdict
"""
Get the final verdict.
TODO:
- Some source should have different weights. For example:
- A well-known reliable source compare to a average one.
- One has more latest info and the statement is time sensitive.
"""
if sum_score > 0:
verdict = "true"
elif sum_score < 0:
verdict = "false"
else:
verdict = "irrelevant"

# generate the final citation
citation = ''.join(sum_citation[verdict]['citation'])
# If positive/negative verdict are not 0, set verdict to tie.
if sum_citation['true']['weight'] > 0:
verdict = 'tie'
sum_citation['tie'] = {"citations": [], "weight": 0} # add keys for processing after
else:
verdict = "irrelevant"
data_summary['verdict'] = verdict

# add all weights to the summary
weights = {"total": weight_total, "valid": weight_valid, "winning": sum_citation[verdict]['weight']}
data_summary['weights'] = {"total": weight_total, "valid": weight_valid, "winning": sum_citation[verdict]['weight']}
for key in sum_citation.keys():
weights[key] = sum_citation[key]['weight']
data_summary['weights'][key] = sum_citation[key]['weight']

# set summary for this statement
data_statement['summary'].update({
"verdict": verdict,
"citation": citation,
"weights": weights,
})
return
# Gather all non-empty citations
for key, value in sum_citation.items():
if value['citations']:
data_summary['citations'][key] = value['citations']
Loading

0 comments on commit 35a1316

Please sign in to comment.