diff --git a/README.md b/README.md
index e7f5163..034992b 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,17 @@ True, false, or just opinions? Maybe not binary, but a percentage.
 Fact-checking tools to combat disinformation.
 
 ## Get Started
-Online demo: https://check.ittia.net
+Fact-check:
+  - Online demo: `https://check.ittia.net`
+  - API docs: `https://check.ittia.net/doc`
+
+Search backend:
+  - Self-hosted for better optimization.
+  - API docs: `https://search.ittia.net/doc`
+  - Features:
+    - Uses a first-class search engine (currently Google).
+    - Customizable number of sources.
+    - Supports search sessions: streaming and resume.
 
 ## Design
 Input something.
@@ -14,7 +24,7 @@ Factcheck like what a researcher will do:
   * Use search engine as data source and AI as the verdit.
 
 Output analysis:
-  * MARKDOWN as the default format, JSON as one option.
+  * MARKDOWN as the default format, JSON optional.
 
 ### Pholosophy:
 - For new information, doubts as default, factcheck follows.
@@ -26,16 +36,9 @@ Input types:
 - questions
 
 Verdits:
-- true
 - false
-- uncheckable: can't check without more background
-- unavailable: service unavailable
-
-## Support
-Please contact if you can provide resources for this project:
-- AI API access
-- Hardware for hosting
-- Data sources
+- true
+- irrelevant: the processed context is irrelevant to the statement
 
 ## Todo
 ### Frontend
@@ -64,7 +67,6 @@ Retrieval
 
 ### pipeline
 DSPy:
-- [ ] make dspy.settings apply to sessions only in order to support multiple retrieve index
 - [ ] choose the right LLM temperature
 - [ ] better training datasets
 
@@ -82,7 +84,7 @@ DSPy:
 - [ ] Use multiple sources for factcheck.
 
 ### Stability
-- [ ] AI backend stress test, especially xinference.
+- [ ] Stress test.
 
 ### Extend
 - [ ] To other types of media: image, audio, video, etc.
@@ -97,9 +99,6 @@ DSPy:
 
 ### Logging
 - [ ] Full logging on chain of events for re-producing and debugging.
 
-### Doc
-- [ ] Show current tech stack.
-
 ### Checkout
 - [ ] Chroma #retrieve
@@ -110,12 +109,16 @@ DSPy:
 ## References
 ### Reports
 - [ ] AI-generated misinformation
+
 ### Factcheck
 - https://www.snopes.com
 - https://www.bmi.bund.de/SharedDocs/schwerpunkte/EN/disinformation/examples-of-russian-disinformation-and-the-facts.html
+
 ### Resources
-#### Inference
-- https://console.groq.com/docs/ (free tier)
+Inference:
+  - https://console.groq.com/docs/ (free tier)
+Search and fetch:
+  - https://jina.ai/read
 
 ## Acknowledgements
 - TPU Research Cloud team at Google
diff --git a/docs/changelog.md b/docs/changelog.md
index 36fe47a..5a0f501 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -3,6 +3,8 @@
 - Change from AutoGen to plain OpenAI, since AutoGen AssistantAgent adds system role which are not compateble with Gemma 2 + vllm.
 
 ## pipeline
+2024/9/2:
+  - Changed the search backend to `https://search.ittia.net` for better optimization.
 2024/8/26:
 - Changed to multi-sources mode (divide sources based on hostname), instead of use all web search results as one single source.
 2024/8/13:
diff --git a/infra/env.d/check b/infra/env.d/check
index b98b2b4..45323dc 100644
--- a/infra/env.d/check
+++ b/infra/env.d/check
@@ -22,6 +22,6 @@ RERANK_BASE_URL=http://infinity:7997
 RERANK_MODEL_DEPLOY=api
 RERANK_MODEL_NAME=jinaai/jina-reranker-v2-base-multilingual
 
-SEARCH_BASE_URL=https://s.jina.ai
+SEARCH_BASE_URL=https://search.ittia.net
 
 PROJECT_HOSTING_BASE_URL=http://127.0.0.1:8000
\ No newline at end of file
diff --git a/src/api/__init__.py b/src/api/__init__.py
index 5991ecf..4cfb7fd 100644
--- a/src/api/__init__.py
+++ b/src/api/__init__.py
@@ -1,2 +1,2 @@
-from .fetch import FetchUrl
+from .read import ReadUrl
 from .search import SearchWeb
diff --git a/src/api/fetch.py b/src/api/read.py
similarity index 75%
rename from src/api/fetch.py
rename to src/api/read.py
index daaca20..4151fd1 100644
--- a/src/api/fetch.py
+++ b/src/api/read.py
@@ -7,12 +7,12 @@
 
 client = httpx.AsyncClient(http2=True, follow_redirects=True)
 
-class FetchUrl():
-    """Fetch one single url via API fetch endpoint"""
+class ReadUrl():
+    """Read a single URL via the API read endpoint"""
 
     def __init__(self, url: str):
         self.url = url
-        self.api = settings.SEARCH_BASE_URL + '/fetch'
+        self.api = settings.SEARCH_BASE_URL + '/read'
         self.timeout = 120 # api request timeout, set higher cause api backend might need to try a few times
 
     @retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1), before_sleep=utils.retry_log_warning, reraise=True)
@@ -23,5 +23,5 @@ async def get(self):
         response = await client.post(self.api, json=_data, timeout=self.timeout)
         _r = response.json()
         if _r['status'] != 'ok':
-            raise Exception(f"Fetch url return status not ok: {self.url}")
+            raise Exception(f"Read url return status not ok: {self.url}") # TODO: avoid duplicated retry
         return _r['data']
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index 472b23c..93f4da9 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,9 +1,9 @@
 import asyncio
 import json
+import logging
 from fastapi import FastAPI, HTTPException, Request, Header
 from fastapi.concurrency import run_in_threadpool
 from fastapi.responses import Response, JSONResponse, HTMLResponse, PlainTextResponse, FileResponse, StreamingResponse
-import logging
 
 import pipeline, utils, web
 from modules import Search
@@ -17,64 +17,6 @@
 
 app = FastAPI()
 
-# """
-# Process input string, fact-check and output MARKDOWN
-# """
-# async def fact_check(input):
-#     status = 500
-#     logger.info(f"Fact checking: {input}")
-
-#     # get list of statements
-#     try:
-#         statements = await run_in_threadpool(pipeline.get_statements, input)
-#         logger.info(f"statements: {statements}")
-#     except Exception as e:
-#         logger.error(f"Get statements failed: {e}")
-#         raise HTTPException(status_code=status, detail="No statements found")
-
-#     verdicts = []
-#     fail_search = False
-#     for statement in statements:
-#         if not statement:
-#             continue
-#         logger.info(f"Statement: {statement}")
-
-#         # get search query
-#         try:
-#             query = await run_in_threadpool(pipeline.get_search_query, statement)
-#             logger.info(f"Search query: {query}")
-#         except Exception as e:
-#             logger.error(f"Getting search query from statement '{statement}' failed: {e}")
-#             continue
-
-#         # searching
-#         try:
-#             search = await Search(query)
-#             logger.info(f"Head of search results: {json.dumps(search)[0:500]}")
-#         except Exception as e:
-#             fail_search = True
-#             logger.error(f"Search '{query}' failed: {e}")
-#             continue
-
-#         # get verdict
-#         try:
-#             verdict = await run_in_threadpool(pipeline.get_verdict, search_json=search, statement=statement)
-#             logger.info(f"Verdict: {verdict}")
-#         except Exception as e:
-#             logger.error(f"Getting verdict for statement '{statement}' failed: {e}")
-#             continue
-
-#         verdicts.append(verdict)
-
-#     if not verdicts:
-#         if fail_search:
-#             raise HTTPException(status_code=status, detail="Search not available")
-#         else:
-#             raise HTTPException(status_code=status, detail="No verdicts found")
-
-#     report = utils.generate_report_markdown(input, verdicts)
-#     return report
-
 # TODO: multi-stage response
 async def stream_response(path):
     union = pipeline.Union(path)
@@ -98,10 +40,6 @@ async def stream_response(path):
 async def startup_event():
     pass
 
-@app.get("/robots.txt", response_class=FileResponse)
-async def robots():
-    return "web/robots.txt"
-
 @app.get("/health")
 async def health():
     return {"status": "ok"}
@@ -112,16 +50,16 @@ async def status():
     _status = utils.get_status()
     return _status
 
 # TODO: integrade error handle with output
-@app.get("/{path:path}", response_class=PlainTextResponse)
-async def catch_all(path: str, accept: str = Header(None)):
+@app.get("/{input:path}", response_class=PlainTextResponse)
+async def catch_all(input: str, accept: str = Header(None)):
     try:
-        if not utils.check_input(path):
-            return HTMLResponse(status_code=404, content="Invalid request") # filter brower background requests
+        if not utils.check_input(input):
+            return HTMLResponse(status_code=404, content='not found') # filter browser background requests
         if accept == "text/markdown":
-            if not path:
+            if not input:
                 return utils.get_stream(stage='final', content=web.get_homepage())
-            return StreamingResponse(stream_response(path), media_type="text/event-stream")
+            return StreamingResponse(stream_response(input), media_type="text/event-stream")
         else:
             return HTMLResponse(content=web.html_browser)
     except HTTPException as e:
diff --git a/src/pipeline/__init__.py b/src/pipeline/__init__.py
index 582142e..9d2b5aa 100644
--- a/src/pipeline/__init__.py
+++ b/src/pipeline/__init__.py
@@ -7,7 +7,7 @@
 from urllib.parse import urlparse
 
 import utils
-from api import FetchUrl, SearchWeb
+from api import ReadUrl, SearchWeb
 from modules import SearchQuery, Statements
 from modules import llm_long, Citation, LlamaIndexRM, ContextVerdict
 from settings import settings
@@ -22,9 +22,13 @@ class Union():
     Run the full cycle from raw input to verdicts of multiple statements.
     Keep data in the class.
 
+    Exception handling:
+    - Mark a doc as invalid if its content could not be read.
+
     TODO:
     - Add support of verdict standards.
     - Make betetr use of the other data of web search.
+    - Generate or draw the class data structure.
""" def __init__(self, input: str): @@ -65,16 +69,19 @@ async def _pipe_source(self, data_source, statement): # update docs _task_docs = [] for _, data_doc in data_source['docs'].items(): - if not data_doc.get('doc'): # TODO: better way to decide if update doc + if not data_doc.get('doc') and data_doc.get('valid') != False: # TODO: better way to decide if update doc _task_docs.append(asyncio.create_task(self.update_doc(data_doc))) await asyncio.gather(*_task_docs) # finish all docs processing # update retriever - docs = [v['doc'] for v in data_source['docs'].values()] - data_source["retriever"] = await run_in_threadpool(LlamaIndexRM, docs=docs) - - # update verdict, citation - await run_in_threadpool(self.update_verdict_citation, data_source, statement) + docs = [v['doc'] for v in data_source['docs'].values() if v.get('valid') != False] + if docs: + data_source["retriever"] = await run_in_threadpool(LlamaIndexRM, docs=docs) + + # update verdict, citation + await run_in_threadpool(self.update_verdict_citation, data_source, statement) + else: + data_source['valid'] = False # TODO: update status after add valid doc # Statements has retry set already, do not retry here async def get_statements(self): @@ -120,7 +127,12 @@ async def update_source_map(self, data_sources, query): async def update_doc(self, data_doc): """Update doc (URL content for now)""" - _rep = await FetchUrl(url=data_doc['url']).get() + try: + _rep = await ReadUrl(url=data_doc['url']).get() + except: + data_doc['valid'] = False + logging.warning(f"Failed to read URL, mark as invalid: {data_doc['url']}") + return data_doc['raw'] = _rep # dict including URL content and metadata, etc. data_doc['title'] = _rep['title'] data_doc['doc'] = utils.search_result_to_doc(_rep) # TODO: better process @@ -164,6 +176,8 @@ def update_summary(self, data_statement): } for hostname, verdict in data_statement['sources'].items(): + if verdict.get('valid') == False: + continue weight_total += 1 v = verdict['verdict'].lower() if v in sum_citation: diff --git a/src/settings.py b/src/settings.py index f0fef2f..b1c0cf7 100644 --- a/src/settings.py +++ b/src/settings.py @@ -12,7 +12,7 @@ def __init__(self): self.EMBEDDING_BASE_URL = os.environ.get("EMBEDDING_BASE_URL") or "http://ollama:11434" self.RERANK_BASE_URL = os.environ.get("RERANK_BASE_URL") or "http://xinference:9997/v1" self.PROJECT_HOSTING_BASE_URL = os.environ.get("PROJECT_HOSTING_BASE_URL") or "https://check.ittia.net" - self.SEARCH_BASE_URL = os.environ.get("SEARCH_BASE_URL") or "https://s.jina.ai" + self.SEARCH_BASE_URL = os.environ.get("SEARCH_BASE_URL") or "https://search.ittia.net" # set RAG models deploy mode self.EMBEDDING_MODEL_DEPLOY = os.environ.get("EMBEDDING_MODEL_DEPLOY") or "local" diff --git a/src/utils.py b/src/utils.py index e6842ce..82b3564 100644 --- a/src/utils.py +++ b/src/utils.py @@ -65,7 +65,7 @@ def check_input(input): # check invalid whole query invalid_path = ['YOUR_FACT_CHECK_QUERY'] - common_web_requests = ["favicon.ico"] + common_web_requests = ["robots.txt", "favicon.ico"] if input in itertools.chain(invalid_path, common_web_requests): return False diff --git a/src/web/robots.txt b/src/web/robots.txt deleted file mode 100644 index 6ffbc30..0000000 --- a/src/web/robots.txt +++ /dev/null @@ -1,3 +0,0 @@ -User-agent: * -Disallow: / -