
Merge pull request #24 from ittia-research/dev
Update URL read exception handle, update class name, fix typos
etwk authored Sep 8, 2024
2 parents 3d212f7 + 19cf052 commit e1c3dc4
Showing 10 changed files with 53 additions and 43 deletions.
34 changes: 17 additions & 17 deletions README.md
@@ -18,43 +18,43 @@ Search backend:
## Design
Input something.

Analize percentage of facts and opnions.
Analyze percentage of facts and opinions.

Factcheck like what a researcher will do:
* Use search engine as data source and AI as the verdit.
Fact-check like what a researcher will do:
* Use search engine as data source and AI as the verdict.

Output analysis:
* MARKDOWN as the default format, JSON optional.

### Pholosophy:
- For new information, doubts as default, factcheck follows.
### Philosophy:
- For new information, doubts as default, fact-check follows.

### Elements
Input types:
- facts
- opnions
- opinions
- questions

Verdits:
Verdicts:
- false
- true
- irrelevant: context processed irrelevant to the statement

## Todo
### Frontend
- [ ] API: Input string or url, output analysis
- [ ] Optional more detailed output: correction, explannation, references
- [ ] Optional more detailed output: correction, explanation, references

### Backend
- [ ] Get list of facts from input, improve performance
- [ ] Get search results of each facts and check if they are true or false
- [ ] Get weight of facts and opnions
- [ ] Get weight of facts and opinions
- [ ] Compare different search engines.
- [ ] Add support for URL input
- [ ] Performance benchmark.

LLM
- [ ] Better way to handle LLM output formating: list, JSON.
- [ ] Better way to handle LLM output formatting: list, JSON.

Embedding:
- [ ] chunk size optimize
@@ -71,7 +71,7 @@ DSPy:
- [ ] better training datasets

### Retrival
- [ ] Better retrival solution: high performance, concurrency, multiple index, index editable.
- [ ] Better retrieval solution: high performance, concurrency, multiple index, index editable.
- [ ] Getting more sources when needed.

### Verdict
@@ -81,17 +81,17 @@ DSPy:
- [ ] Evaluate MLOps pipeline
- https://kitops.ml
- [ ] Evaluate data quality of searching and url fetching. Better error handle.
- [ ] Use multiple sources for factcheck.
- [ ] Use multiple sources for fact-check.

### Stability
- [ ] Stress test.

### Extend
- [ ] To other types of media: image, audio, video, etc.
- [ ] Shall we try to anser questions if provided.
- [ ] Shall we try to answer questions if provided.
- [ ] Multi-language support.
- [ ] Add logging and long-term memory.
- [ ] Intergrate with other factcheck services.
- [ ] Integrate with other fact-check services.

### Calculate
- [ ] Shall we calculate percentage of true and false in the input? Any better calculation than items count?
@@ -103,14 +103,14 @@ DSPy:
- [ ] Chroma #retrieve

## Issues
- [ ] Uses many different types of models, diffcult for performance optimization and maintenance.
- [ ] LLM verdit wrong contradict to context provided.
- [ ] Uses many different types of models, difficult for performance optimization and maintenance.
- [ ] LLM verdict wrong contradict to context provided.

## References
### Reports
- [ ] AI-generated misinformation

### Factcheck
### Fact-check
- https://www.snopes.com
- https://www.bmi.bund.de/SharedDocs/schwerpunkte/EN/disinformation/examples-of-russian-disinformation-and-the-facts.html

4 changes: 2 additions & 2 deletions datasets/wiki_dpr/README.md
@@ -1,8 +1,8 @@
## About
- Dataset: https://github.com/facebookresearch/DPR/blob/main/dpr/data/download_data.py
- direct downlaod link: `https://dl.fbaipublicfiles.com/dpr/wikipedia_split/psgs_w100.tsv.gz`
- direct download link: `https://dl.fbaipublicfiles.com/dpr/wikipedia_split/psgs_w100.tsv.gz`
- Generate index for the ColBERTv2 model
- Downlaod the generated index: https://huggingface.co/datasets/ittia/wiki_dpr
- Download the generated index: https://huggingface.co/datasets/ittia/wiki_dpr
- Start a retrieve server

## How-to
12 changes: 6 additions & 6 deletions datasets/wiki_dpr/prepare_files.py
@@ -46,19 +46,19 @@ def move_files_subfolders(source_folder, destination_folder):

@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
def download_hf_folder(repo_dir, local_dir):
downlaod_dir = os.path.join(local_dir, '.downlaod')
download_dir = os.path.join(local_dir, '.download')

os.makedirs(downlaod_dir, exist_ok=True)
os.makedirs(download_dir, exist_ok=True)

snapshot_download(
repo_id=repo_id,
repo_type=repo_type,
revision=revision,
allow_patterns=f"{repo_dir}/*",
local_dir=downlaod_dir
local_dir=download_dir
)

return downlaod_dir
return download_dir

for map in dir_map:
repo_dir = map['repo_dir']
@@ -68,8 +68,8 @@ def download_hf_folder(repo_dir, local_dir):
print(f"local dir '{local_dir}' exists and not empty, skip download")
continue

downlaod_dir = download_hf_folder(repo_dir, local_dir)
_source_dir = os.path.join(downlaod_dir, repo_dir)
download_dir = download_hf_folder(repo_dir, local_dir)
_source_dir = os.path.join(download_dir, repo_dir)
move_files_subfolders(_source_dir, local_dir)

print(f"Downloaded: {repo_dir} to {local_dir}")
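
The helper above combines a tenacity retry with huggingface_hub's `snapshot_download`, staging files in a hidden `.download` directory before they are moved into place. A minimal standalone sketch of that pattern follows; the repo and folder names are placeholders, not the actual wiki_dpr layout:

```python
# Sketch only: retry a partial snapshot download into a staging directory.
# repo_id / repo_dir are illustrative values, not the real dataset paths.
import os
from huggingface_hub import snapshot_download
from tenacity import retry, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
def fetch_folder(repo_id: str, repo_dir: str, local_dir: str) -> str:
    download_dir = os.path.join(local_dir, '.download')  # staging area
    os.makedirs(download_dir, exist_ok=True)
    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        allow_patterns=f"{repo_dir}/*",  # fetch only this folder from the repo
        local_dir=download_dir,
    )
    return download_dir

# Files land under <local_dir>/.download/<repo_dir> and are then moved up,
# mirroring move_files_subfolders() in the script above.
```
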
2 changes: 1 addition & 1 deletion docs/changelog.md
@@ -1,6 +1,6 @@
## application
2024/8/3:
- Change from AutoGen to plain OpenAI, since AutoGen AssistantAgent adds system role which are not compateble with Gemma 2 + vllm.
- Change from AutoGen to plain OpenAI, since AutoGen AssistantAgent adds system role which are not compatible with Gemma 2 + vllm.

## pipeline
2028/9/2:
15 changes: 10 additions & 5 deletions src/api/read.py
@@ -7,7 +7,15 @@
client = httpx.AsyncClient(http2=True, follow_redirects=True)

class ReadUrl():
"""Read one single url via API fetch endpoint"""
"""
Read one single url via API fetch endpoint.
Retry failed read at API server end not here.
API response status:
- ok
- error
- not_implemented: fetch ok but not able to read content
"""

def __init__(self, url: str):
self.url = url
@@ -20,7 +28,4 @@ async def get(self):
'url': self.url,
}
response = await client.post(self.api, json=_data, timeout=self.timeout)
_r = response.json()
if _r['status'] != 'ok':
raise Exception(f"Read url return status not ok: {self.url}") # TODO: avoid duplicated retry
return _r['data']
return response.json()
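
With `get()` now returning the raw API response instead of raising on a non-ok status, callers inspect `status` and the content themselves, as the pipeline change further down does. A hedged usage sketch, assuming the fetch API responds with `status` and `content` fields and that `ReadUrl` is importable as shown:

```python
# Caller-side sketch (not part of this commit): decide validity from the
# returned status instead of relying on ReadUrl to raise.
import asyncio
from api.read import ReadUrl  # import path assumed from src/api/read.py

async def read_content(url: str):
    response = await ReadUrl(url=url).get()
    # status may be "ok", "error", or "not_implemented" per the docstring above
    if response.get('status') != 'ok' or not response.get('content'):
        return None  # leave retry/fallback decisions to the caller
    return response['content']

# asyncio.run(read_content("https://example.com"))
```
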
2 changes: 1 addition & 1 deletion src/api/search.py
@@ -46,7 +46,7 @@ async def get(self, num: int = 10, all: bool = False):
rep, index = json.JSONDecoder().raw_decode(buffer)
_url = rep['url']
# deduplication
if _url not in self.urls: # TODO: waht if the new one containes same url but better metadata
if _url not in self.urls: # TODO: what if the new one contains same url but better metadata
self.urls.append(_url)
yield rep

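
The loop above decodes one JSON object at a time from a text buffer with `raw_decode` and deduplicates results by URL. A self-contained sketch of that decoding pattern (the sample payload is made up):

```python
# Standalone illustration of raw_decode-based parsing with URL deduplication.
import json

def parse_results(buffer: str):
    decoder = json.JSONDecoder()
    seen_urls = []
    while buffer.strip():
        buffer = buffer.lstrip()         # raw_decode rejects leading whitespace
        rep, index = decoder.raw_decode(buffer)
        buffer = buffer[index:]          # keep the undecoded remainder
        if rep['url'] not in seen_urls:  # first occurrence wins
            seen_urls.append(rep['url'])
            yield rep

sample = '{"url": "https://a.example", "title": "A"} {"url": "https://a.example", "title": "A2"}'
print(list(parse_results(sample)))  # only the first object survives deduplication
```
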
14 changes: 7 additions & 7 deletions src/main.py
@@ -18,15 +18,15 @@

# TODO: multi-stage response
async def stream_response(path):
union = pipeline.Union(path)
task = asyncio.create_task(union.final())
pipeline_check = pipeline.Check(path)
task = asyncio.create_task(pipeline_check.final())

# Stream response to prevent timeout, return multi-stage reponses
# Stream response to prevent timeout, return multi-stage responses
elapsed_time = 0
_check_interval = 0.2
while not task.done():
if elapsed_time > settings.STREAM_TIME_OUT: # waitting timeout
raise Exception(f"Waitting fact check results reached time limit: {settings.STREAM_TIME_OUT} seconds")
if elapsed_time > settings.STREAM_TIME_OUT: # waiting timeout
raise Exception(f"Waiting fact check results reached time limit: {settings.STREAM_TIME_OUT} seconds")
if elapsed_time % 30 == 0: # return wait messages from time to time
yield utils.get_stream(stage='processing', content='### Processing ...')
await asyncio.sleep(_check_interval)
@@ -53,12 +53,12 @@ async def status():
_status = utils.get_status()
return _status

# TODO: integrade error handle with output
# TODO: integrate error handle with output
@app.get("/{input:path}", response_class=PlainTextResponse)
async def catch_all(input: str, accept: str = Header(None)):
try:
if not utils.check_input(input):
return HTMLResponse(status_code=404, content='not found') # filter brower background requests
return HTMLResponse(status_code=404, content='not found') # filter browser background requests

if accept == "text/markdown":
if not input:
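
`stream_response` keeps the HTTP connection alive by running the check as a background task and yielding periodic progress chunks until the result (or the timeout) arrives. A simplified sketch of that pattern; the job and the constants here are placeholders, the real code wraps `pipeline.Check(path).final()` and `settings.STREAM_TIME_OUT`:

```python
# Keep-alive streaming pattern sketch; slow_job and the constants are made up.
import asyncio

STREAM_TIME_OUT = 300   # seconds
PING_INTERVAL = 30      # emit a keep-alive chunk roughly every 30 s
CHECK_INTERVAL = 0.2

async def slow_job() -> str:
    await asyncio.sleep(3)
    return "### Final report\n"

async def stream_response():
    task = asyncio.create_task(slow_job())
    elapsed, next_ping = 0.0, 0.0
    while not task.done():
        if elapsed > STREAM_TIME_OUT:
            raise Exception(f"Waiting fact check results reached time limit: {STREAM_TIME_OUT} seconds")
        if elapsed >= next_ping:
            yield "### Processing ...\n"
            next_ping += PING_INTERVAL
        await asyncio.sleep(CHECK_INTERVAL)
        elapsed += CHECK_INTERVAL
    yield await task

async def main():
    async for chunk in stream_response():
        print(chunk, end="")

# asyncio.run(main())
```
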
2 changes: 1 addition & 1 deletion src/modules/context_verdict.py
@@ -33,7 +33,7 @@ class GenerateSearchQuery(dspy.Signature):

"""
SimplifiedBaleen module
Avoid unnecessary content in module cause MIPROv2 optimizer will analize modules.
Avoid unnecessary content in module cause MIPROv2 optimizer will analyze modules.
To-do:
- retrieve latest facts
9 changes: 7 additions & 2 deletions src/pipeline/__init__.py
@@ -18,7 +18,7 @@
context_verdict = ContextVerdict()
context_verdict.load(optimizer_path)

class Union():
class Check():
"""
Run the full cycle from raw input to verdicts of multiple statements.
Keep data in the class.
@@ -129,7 +129,12 @@ async def update_source_map(self, data_sources, query):
async def update_doc(self, data_doc):
"""Update doc (URL content for now)"""
try:
_rep = await ReadUrl(url=data_doc['url']).get()
_read_url = ReadUrl(url=data_doc['url'])
_rep = await _read_url.get()

# check source read `status` and content
if _rep['status'] != 'ok' or not _rep['content']:
raise Exception
except Exception:
data_doc['valid'] = False
logging.warning(f"Failed to read URL, mark as invalid: {data_doc['url']}")
2 changes: 1 addition & 1 deletion src/settings.py
@@ -35,7 +35,7 @@ def __init__(self):
"""
embedding batch:
- set higher to improve performance: overcome network latency, etc.
- embedding servers usually have the capacity to divide too large batch themself
- embedding servers usually have the capacity to divide too large batch on their own
"""
self.EMBEDDING_BATCH_SIZE = os.environ.get("EMBEDDING_BATCH_SIZE") or 1024

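
The comment above argues for a large default batch so each request to the embedding server amortizes network latency, while the server may still split batches it finds too big. An illustrative batching helper under that assumption (not code from the repo):

```python
# Hypothetical helper: chunk texts client-side using the configured batch size.
import os

EMBEDDING_BATCH_SIZE = int(os.environ.get("EMBEDDING_BATCH_SIZE") or 1024)

def batched(texts, batch_size: int = EMBEDDING_BATCH_SIZE):
    """Yield successive slices of `texts`, one per embedding request."""
    for start in range(0, len(texts), batch_size):
        yield texts[start:start + batch_size]

# for batch in batched(corpus): embeddings.extend(embed(batch))
```
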
