diff --git a/.env b/.env
deleted file mode 100644
index a4e95ee..0000000
--- a/.env
+++ /dev/null
@@ -1,14 +0,0 @@
-EMBEDDING_API_KEY=ollama:abc
-EMBEDDING_MODEL_DEPLOY=api
-EMBEDDING_MODEL_NAME=jina/jina-embeddings-v2-base-en
-INDEX_CHUNK_SIZES=[2048, 512, 128]
-LLM_MODEL_NAME=google/gemma-2-27b-it
-OLLAMA_BASE_URL=http://ollama:11434
-OPENAI_API_KEY=sk-proj-aaaaaaaaaaaaaaaaa
-OPENAI_BASE_URL=http://localhost:8000/v1
-PROJECT_HOSTING_BASE_URL=http://127.0.0.1:8000
-RERANK_MODEL_DEPLOY=local
-RERANK_MODEL_NAME=BAAI/bge-reranker-v2-m3
-RERANK_BASE_URL=http://xinference:9997/v1
-SEARCH_BASE_URL=https://s.jina.ai
-THREAD_BUILD_INDEX=12
\ No newline at end of file
diff --git a/datasets/HotPotQA/HotPotQA_statement_verdict.ipynb b/datasets/HotPotQA/HotPotQA_statement_verdict.ipynb
new file mode 100644
index 0000000..97b2dbf
--- /dev/null
+++ b/datasets/HotPotQA/HotPotQA_statement_verdict.ipynb
@@ -0,0 +1,399 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a29502b0-dc28-4ad3-8b8d-44bc0bee9581",
+   "metadata": {},
+   "source": [
+    "This script adds more fields to the HotPotQA dataset and generates more data,\n",
+    "for statement & verdict training:\n",
+    "  - choose a random direction: True or False\n",
+    "  - generate a statement based on the direction and the QA\n",
+    "  - generate some new statements that have no corresponding context in the retriever, labeled as \"irrelevant\"\n",
+    "  - save to file\n",
+    "\n",
+    "TODO:\n",
+    "  - quality control automatically\n",
+    "  - more diverse irrelevant statements"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "55f01e6b-2e11-4ae5-b734-a66ea1f7390d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "select_dataset_total = 100000  # how many to select from each portion of the dataset\n",
+    "concurrency = 24  # LLM calling\n",
+    "irrelevant_percent = 0.3  # ratio of irrelevant statements to create, relative to the total number of True & False statements\n",
+    "\n",
+    "OPENAI_BASE_URL = \"http://127.0.0.1:8010/v1/\"\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"aaaa\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "54dfa7e6-4f7f-4a40-a49e-047f9f0f60ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dspy\n",
+    "\n",
+    "llm = dspy.OpenAI(model='google/gemma-2-9b-it', api_base=OPENAI_BASE_URL, max_tokens=200, stop='\\n\\n') # model_type=\"chat\", \n",
+    "dspy.settings.configure(lm=llm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "43c096d3-1b9d-4cae-a48a-d154f16d68b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "from datasets import load_dataset\n",
+    "from dspy.datasets.dataset import Dataset\n",
+    "\n",
+    "class HotPotSV():\n",
+    "    \"\"\"\n",
+    "    HotPotQA to statements + verdicts\n",
+    "    \"\"\"\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        train_size = 50,\n",
+    "        validation_size = 50,\n",
+    "        keep_details = False,\n",
+    "    ):\n",
+    "\n",
+    "        hf_official_train = load_dataset(\"hotpot_qa\", 'fullwiki', split='train', trust_remote_code=True)\n",
+    "        hf_official_validation = load_dataset(\"hotpot_qa\", 'fullwiki', split='validation', trust_remote_code=True)\n",
+    "        # `test` split has no answer\n",
+    "\n",
+    "        self.train = self.process_dataset(hf_official_train, train_size)\n",
+    "        self.validation = self.process_dataset(hf_official_validation, validation_size)\n",
+    "\n",
+    "    def process_dataset(self, dataset, size, keep_details = False):\n",
+    "        rep = []\n",
+    "        for raw_example in dataset:\n",
+    "            if keep_details is True:\n",
+    "                keys = ['id', 'question', 'answer', 'type', 'supporting_facts', 'context', 'level']\n",
+    "            elif keep_details == 'validation_titles':\n",
+    "                keys = ['question', 'answer', 'supporting_facts', 'level']\n",
+    "            else:\n",
+    "                keys = ['question', 'answer', 'level']\n",
+    "\n",
+    "            example = {k: raw_example[k] for k in keys}\n",
+    "            \n",
+    "            if 'supporting_facts' in example:\n",
+    "                example['gold_titles'] = set(example['supporting_facts']['title'])\n",
+    "                del example['supporting_facts']\n",
+    "\n",
+    "            rep.append(example)\n",
+    "\n",
+    "        rng = random.Random(0)\n",
+    "        rng.shuffle(rep)\n",
+    "        rep = rep[:size]\n",
+    "        return rep"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fe310e0a-9f0b-42a6-b36b-2d0d911034e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_HotPotSV = HotPotSV(train_size=select_dataset_total, validation_size=select_dataset_total)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "c02f1843-6199-4cbf-9b3a-5d179aeab54c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ChangeAnswerSignature(dspy.Signature):\n",
+    "    \"\"\"Generate a new answer based on the question and correct answer\"\"\"\n",
+    "    question = dspy.InputField()\n",
+    "    answer = dspy.InputField(desc=\"correct answer\")\n",
+    "    output = dspy.OutputField(desc=\"incorrect answer\")\n",
+    "    \n",
+    "class CombineQA(dspy.Signature):\n",
+    "    \"\"\"Combine the question and answer into one statement of direct expression with context\"\"\"\n",
+    "    question = dspy.InputField()\n",
+    "    answer = dspy.InputField()\n",
+    "    statement = dspy.OutputField(desc=\"generated from the question and answer only, with all the original info\")\n",
+    "    \n",
+    "class GenerateStatement(dspy.Module):\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.change_answer = dspy.Predict(ChangeAnswerSignature)\n",
+    "        self.combine_qa = dspy.Predict(CombineQA)\n",
+    "\n",
+    "    def forward(self, question, answer, level=None):\n",
+    "        direction = random.choice([\"True\", \"False\"])\n",
+    "        # direction = 'False'\n",
+    "        answer_fake = None\n",
+    "        if direction == \"False\":\n",
+    "            new_answer = self.change_answer(question=question, answer=answer).output\n",
+    "            answer_fake = new_answer\n",
+    "            # print(f\"new_answer: {new_answer}\")\n",
+    "        else:\n",
+    "            new_answer = answer\n",
+    "        statement = self.combine_qa(question=question, answer=new_answer).statement\n",
+    "        # statement = self.generate_statement(question=question, answer=new_answer).statement\n",
+    "\n",
+    "        rep = {\n",
+    "            'question': question,\n",
+    "            'answer': answer,\n",
+    "            'verdict': direction,\n",
+    "            'statement': statement,\n",
+    "            'level': level,\n",
+    "        }\n",
+    "        \n",
+    "        if answer_fake:\n",
+    "            rep['answer_fake'] = answer_fake\n",
+    "        # print(f\"with statement: {rep}\\n\")\n",
+    "        return rep"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "349d1bba-112d-482b-916f-3b68b20cd052",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.11/site-packages/dsp/modules/gpt3.py:251: UserWarning: Persisting input arguments took 1.68s to run.If this happens often in your code, it can cause performance problems (results will be correct in all cases). The reason for this is probably some large input arguments for a wrapped function.\n",
+      "  return v1_cached_gpt3_request_v2(**kwargs)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[{'question': 'Which cartridge did John Browning design that has a rim that is the same in diameter as the .50 GI?', 'answer': '.45 ACP', 'verdict': 'False', 'statement': 'John Browning designed the .30-06 Springfield cartridge, which has a rim that is the same diameter as the .50 GI.', 'level': 'hard', 'answer_fake': '.30-06 Springfield'}, {'question': 'Mindless Self Indulgence and Tappi Tíkarrass are both what?', 'answer': 'band', 'verdict': 'False', 'statement': 'Mindless Self Indulgence and Tappi Tíkarrass are both musical instruments.', 'level': 'medium', 'answer_fake': 'musical instrument'}, {'question': \"What was Robert Tree Cody's adopted father's most prominent PSA role?\", 'answer': 'Keep America Beautiful', 'verdict': 'False', 'statement': \"Robert Tree Cody's adopted father was a spokesperson for the American Cancer Society.\", 'level': 'hard', 'answer_fake': 'He was a spokesperson for the American Cancer Society.'}]\n",
+      "\n",
+      "\n",
+      "[{'question': \"Which of Damon Stoudamire's cousins once played college basketball for the University of Kentucky?\", 'answer': 'Terrence Jones', 'verdict': 'True', 'statement': \"Damon Stoudamire's cousin Terrence Jones once played college basketball for the University of Kentucky.\", 'level': 'hard'}, {'question': 'What production company co-owned by Kevin Loader and Rodger Michell produced My Cousin Rachel?', 'answer': 'Free Range Films', 'verdict': 'False', 'statement': 'A24, a production company co-owned by Kevin Loader and Rodger Michell, produced My Cousin Rachel.', 'level': 'hard', 'answer_fake': 'A24'}, {'question': 'Alexander Kerensky was defeated and destroyed by the Bolsheviks in the course of a civil war that ended when ?', 'answer': 'October 1922', 'verdict': 'True', 'statement': 'Alexander Kerensky was defeated and destroyed by the Bolsheviks in the course of a civil war that ended in October 1922.', 'level': 'hard'}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import concurrent.futures\n",
+    "\n",
+    "def generate_datasets():\n",
+    "    def generate_statement(d):\n",
+    "        return GenerateStatement()(**d)\n",
+    "        \n",
+    "    train = []\n",
+    "    validation = []\n",
+    "\n",
+    "    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:\n",
+    "        train_futures = [executor.submit(generate_statement, d) for d in dataset_HotPotSV.train]\n",
+    "        validation_futures = [executor.submit(generate_statement, d) for d in dataset_HotPotSV.validation]\n",
+    "\n",
+    "        # Collecting results as they complete\n",
+    "        train = [future.result() for future in concurrent.futures.as_completed(train_futures)]\n",
+    "        validation = [future.result() for future in concurrent.futures.as_completed(validation_futures)]\n",
+    "\n",
+    "    return train, validation\n",
+    "\n",
+    "dataset_train_HotPotQA_generated, dataset_validation_HotPotQA_generated = generate_datasets()\n",
+    "\n",
+    "print(dataset_train_HotPotQA_generated[:3])\n",
+    "print(\"\\n\")\n",
+    "print(dataset_validation_HotPotQA_generated[:3])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "168438c4-cbc3-4a29-9b54-5251f8d790b6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[{'statement': 'In 2023, the Japanese sumo wrestler from Honolulu,  [Name of '\n",
+      "               \"wrestler], won the prestigious Emperor's Cup at the Autumn \"\n",
+      "               'Grand Sumo Tournament.',\n",
+      "  'verdict': 'irrelevant'},\n",
+      " {'statement': 'After 2020,  Charles Giblyn directed the short film \"The Last '\n",
+      "               'Supper,\" which premiered at the 2022 Cannes Film Festival.',\n",
+      "  'verdict': 'irrelevant'},\n",
+      " {'statement': 'In 2021, a French-Scottish collaborative project published a '\n",
+      "               'new translation of Kenneth Grahame\\'s \"The Wind in the '\n",
+      "               'Willows\" with an introduction by a prominent French '\n",
+      "               'philosopher.',\n",
+      "  'verdict': 'irrelevant'}]\n",
+      "\n",
+      "\n",
+      "[{'statement': 'In 2022, Cranium launched a new digital version of their '\n",
+      "               'classic board game, allowing players to compete online.',\n",
+      "  'verdict': 'irrelevant'},\n",
+      " {'statement': 'In 2021, a documentary film titled \"The Forgotten Revolution\" '\n",
+      "               'explored the lesser-known stories of individuals who opposed '\n",
+      "               'the Bolsheviks during the Russian Civil War, shedding new '\n",
+      "               'light on the complexities of that era.',\n",
+      "  'verdict': 'irrelevant'},\n",
+      " {'statement': 'In 2023, the average age of pilots in the French Air Force was '\n",
+      "               'higher than the average age of pilots in the German Air Force.',\n",
+      "  'verdict': 'irrelevant'}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "\"\"\"Generate `irrelevant` datasets.\"\"\"\n",
+    "\n",
+    "from dsp.utils import deduplicate\n",
+    "import re\n",
+    "\n",
+    "class GenerateLatestStatementSignature(dspy.Signature):\n",
+    "    \"\"\"Generate a fact with content similar to the input and happened after year 2020\"\"\"\n",
+    "    input = dspy.InputField()\n",
+    "    output = dspy.OutputField(desc=\"a fact that exists after year 2020\")\n",
+    "\n",
+    "import concurrent.futures\n",
+    "\n",
+    "def generate_latest_statements():\n",
+    "    \"\"\"Generate statements that don't have related context in retriever.\"\"\"\n",
+    "\n",
+    "    def contains_number_in_range(s):\n",
+    "        \"\"\"\n",
+    "        The retriever has data up to year 2017.\n",
+    "        Make sure the statement contains year number higher than 2020.\n",
+    "        \"\"\"\n",
+    "        numbers = re.findall(r'\\d+', s)\n",
+    "        \n",
+    "        # Check if any of these numbers are in the range 2021 to 2029 (inclusive)\n",
+    "        for number in numbers:\n",
+    "            num = int(number)\n",
+    "            if 2020 < num < 2030:\n",
+    "                return True\n",
+    "        \n",
+    "        return False\n",
+    "        \n",
+    "    def to_json(statements):\n",
+    "        return [{'statement': s, 'verdict': 'Irrelevant'} for s in statements]\n",
+    "\n",
+    "    def process_data(data, count):\n",
+    "        processed = []\n",
+    "        \n",
+    "        def process_single(d):\n",
+    "            return dspy.Predict(GenerateLatestStatementSignature)(input=d['statement']).output\n",
+    "\n",
+    "        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:\n",
+    "            futures = [executor.submit(process_single, d) for d in data]\n",
+    "            for future in concurrent.futures.as_completed(futures):\n",
+    "                s_new = future.result()\n",
+    "                if contains_number_in_range(s_new):\n",
+    "                    processed = deduplicate(processed + [s_new])\n",
+    "                if len(processed) >= count:\n",
+    "                    break\n",
+    "        \n",
+    "        return to_json(processed)\n",
+    "\n",
+    "    count_train = len(dataset_train_HotPotQA_generated) * irrelevant_percent\n",
+    "    train = process_data(dataset_train_HotPotQA_generated, count_train)\n",
+    "\n",
+    "    count_validation = len(dataset_validation_HotPotQA_generated) * irrelevant_percent\n",
+    "    validation = process_data(dataset_validation_HotPotQA_generated, count_validation)\n",
+    "\n",
+    "    return train, validation\n",
+    "\n",
+    "_irrelevant_train, _irrelevant_validation = generate_latest_statements()\n",
+    "\n",
+    "from pprint import pprint\n",
+    "pprint(_irrelevant_train[:3])\n",
+    "print(\"\\n\")\n",
+    "pprint(_irrelevant_validation[:3])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "87f7bfa2-5c2b-4f5d-8922-89f20d42449f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "117582\n",
+      "9627\n"
+     ]
+    }
+   ],
+   "source": [
+    "import random\n",
+    "\n",
+    "dataset_train_HotPotQA_generated += _irrelevant_train\n",
+    "dataset_validation_HotPotQA_generated += _irrelevant_validation\n",
+    "\n",
+    "rng = random.Random(1)\n",
+    "rng.shuffle(dataset_train_HotPotQA_generated)\n",
+    "rng.shuffle(dataset_validation_HotPotQA_generated)\n",
+    "\n",
+    "print(len(dataset_train_HotPotQA_generated))\n",
+    "print(len(dataset_validation_HotPotQA_generated))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "03bc664d-8677-404a-ab38-b73e5f4b25f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Save datasets to file\"\"\"\n",
+    "\n",
+    "import json\n",
+    "import os\n",
+    "\n",
+    "base = \"./datasets/HotPotQA\"\n",
+    "data_files = {\n",
+    "    \"train.json\": dataset_train_HotPotQA_generated,\n",
+    "    \"validation.json\": dataset_validation_HotPotQA_generated,\n",
+    "}\n",
+    "\n",
+    "os.makedirs(base, exist_ok=True)\n",
+    "\n",
+    "for filename, data in data_files.items():\n",
+    "    file_path = os.path.join(base, filename)\n",
+    "    with open(file_path, 'w') as file:\n",
+    "        json.dump(data, file)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docker-compose.yml b/docker-compose.yml
index 820a047..cc61cba 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,7 +4,7 @@ services:
       dockerfile: Dockerfile
     container_name: check
     env_file:
-      - .env
+      - ./infra/env.d/check
     ports:
       - 8000:8000
     restart: always
@@ -23,17 +23,19 @@ services:
               count: all
               capabilities: [gpu]
     restart: always
-  xinference:
-    build:
-      context: infra/xinference
-      dockerfile: Dockerfile
-    container_name: xinference
+
+  # infinity supports embedding and rerank models, v2 version supports serving multiple models
+  infinity:
+    image: michaelf34/infinity:latest
+    container_name: infinity
     ports:
-      - 9997:9997
+      - 7997:7997
     volumes:
-      - /data/volumes/xinference:/data
-    environment:
-      - XINFERENCE_HOME=/data
+      - /data/cache/huggingface:/cache/huggingface
+    env_file:
+      - ./infra/env.d/infinity
+      - ./infra/env.d/huggingface
+    command: ["v2"]
     deploy:
       resources:
         reservations:
diff --git a/infra/env.d/check b/infra/env.d/check
new file mode 100644
index 0000000..8a2069f
--- /dev/null
+++ b/infra/env.d/check
@@ -0,0 +1,19 @@
+EMBEDDING_API_KEY=<CHANGE_ME>
+EMBEDDING_BASE_URL=http://ollama:11434
+EMBEDDING_MODEL_DEPLOY=api
+EMBEDDING_MODEL_NAME=jina/jina-embeddings-v2-base-en
+INDEX_CHUNK_SIZES=[2048, 512, 128]
+THREAD_BUILD_INDEX=12
+
+LLM_MODEL_NAME=google/gemma-2-27b-it
+OPENAI_API_KEY=<CHANGE_ME>
+OPENAI_BASE_URL=http://localhost:8000/v1
+
+RERANK_API_KEY=<CHANGE_ME>
+RERANK_BASE_URL=http://infinity:7997
+RERANK_MODEL_DEPLOY=api
+RERANK_MODEL_NAME=jinaai/jina-reranker-v2-base-multilingual
+
+SEARCH_BASE_URL=https://s.jina.ai
+
+PROJECT_HOSTING_BASE_URL=http://127.0.0.1:8000
\ No newline at end of file
diff --git a/infra/env.d/huggingface b/infra/env.d/huggingface
new file mode 100644
index 0000000..e652558
--- /dev/null
+++ b/infra/env.d/huggingface
@@ -0,0 +1,3 @@
+HF_HOME=/cache/huggingface
+HUGGING_FACE_HUB_TOKEN=<CHANGE_ME>
+
diff --git a/infra/env.d/infinity b/infra/env.d/infinity
new file mode 100644
index 0000000..96cf367
--- /dev/null
+++ b/infra/env.d/infinity
@@ -0,0 +1,4 @@
+INFINITY_API_KEY=<CHANGE_ME>
+INFINITY_LOG_LEVEL=trace
+INFINITY_MODEL_ID=jinaai/jina-reranker-v2-base-multilingual
+
diff --git a/infra/xinference/Dockerfile b/infra/xinference/Dockerfile
deleted file mode 100644
index a1e1df1..0000000
--- a/infra/xinference/Dockerfile
+++ /dev/null
@@ -1,8 +0,0 @@
-# Reference: https://github.com/xorbitsai/inference/issues/1431
-
-FROM xprobe/xinference:latest
-RUN apt-get install -y tini && \
-  rm -rf /var/lib/apt/lists/*
-COPY init.sh /init.sh
-ENTRYPOINT ["/usr/bin/tini", "--", "/init.sh"]
-
diff --git a/infra/xinference/init.sh b/infra/xinference/init.sh
deleted file mode 100755
index 9adac43..0000000
--- a/infra/xinference/init.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-xinference-local -H 0.0.0.0 &
-PID1=$!
-while true; do
-  if curl -s "http://localhost:9997" > /dev/null; then
-    break
-  else
-    sleep 1
-  fi
-done
-xinference launch --model-name jina-reranker-v2 --model-type rerank &
-PID2=$!
-wait $PID1 $PID2
-
diff --git a/src/modules/retrieve.py b/src/modules/retrieve.py
index c42c925..35bc0df 100644
--- a/src/modules/retrieve.py
+++ b/src/modules/retrieve.py
@@ -2,9 +2,9 @@
 LlamaIndexCustomRetriever
 """
 
-import os, logging
-from typing import Optional
+import logging
 import concurrent.futures
+from typing import Optional
 
 from llama_index.core import (
     Document,
@@ -23,8 +23,7 @@
 import utils
 from settings import settings
 
-import llama_index.postprocessor.jinaai_rerank.base as jinaai_rerank  # todo: shall we lock package version?
-jinaai_rerank.API_URL = settings.RERANK_BASE_URL + "/rerank"  # switch to on-premise
+from llama_index.postprocessor.jinaai_rerank import JinaRerank
 
 # todo: high lantency between client and the ollama embedding server will slow down embedding a lot
 from . import OllamaEmbedding
@@ -33,9 +32,10 @@
 if settings.EMBEDDING_MODEL_DEPLOY == "local":
     embed_model="local:" + settings.EMBEDDING_MODEL_NAME
 else:
+    # TODO: debug Ollama embedding with chunk sizes [4096, 2048, 1024] compared to local
     embed_model = OllamaEmbedding(
         model_name=settings.EMBEDDING_MODEL_NAME,
-        base_url=os.environ.get("OLLAMA_BASE_URL"),  # todo: any other configs here?
+        base_url=settings.EMBEDDING_BASE_URL,
     )
 Settings.embed_model = embed_model
 
@@ -116,7 +116,12 @@ def get_automerging_query_engine(
                 top_n=rerank_top_n, model=settings.RERANK_MODEL_NAME,
             )  # TODO: add support `trust_remote_code=True`
         else:
-            rerank = jinaai_rerank.JinaRerank(api_key='', top_n=rerank_top_n, model=settings.RERANK_MODEL_NAME)
+            rerank = JinaRerank(
+                base_url = settings.RERANK_BASE_URL,
+                api_key=settings.RERANK_API_KEY, 
+                top_n=rerank_top_n, 
+                model=settings.RERANK_MODEL_NAME,
+            )
         
         auto_merging_engine = RetrieverQueryEngine.from_args(
             retriever, node_postprocessors=[rerank]
@@ -134,15 +139,22 @@ def build_index(self, docs):
             )  # TODO: try to retrieve directly
         
     def retrieve(self, query):
+        rerank_top_n=self.similarity_top_k
         query_engine = self.get_automerging_query_engine(
             automerging_index=self.index,
             storage_context=self.storage_context,
-            similarity_top_k=self.similarity_top_k * 3,
-            rerank_top_n=self.similarity_top_k
+            similarity_top_k=rerank_top_n * 3,
+            rerank_top_n=rerank_top_n
         )
         self.query_engine = query_engine
         auto_merging_response = self.query_engine.query(query)
         contexts = utils.llama_index_nodes_to_list(auto_merging_response.source_nodes)
+
+        # select top_n here because some rerank services do not support the feature
+        if len(contexts) > rerank_top_n:
+            contexts.sort(key=lambda x: x['score'], reverse=True)  # sort by score in descending order
+            contexts = contexts[:rerank_top_n]
+            
         return contexts
 
 import dspy
diff --git a/src/settings.py b/src/settings.py
index 9a052b9..294de5d 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -8,6 +8,7 @@ def __init__(self):
         self.RERANK_MODEL_NAME = os.environ.get("RERANK_MODEL_NAME") or "BAAI/bge-reranker-v2-m3"
         
         self.OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL") or "https://api.openai.com/v1"
+        self.EMBEDDING_BASE_URL = os.environ.get("EMBEDDING_BASE_URL") or "http://ollama:11434"
         self.RERANK_BASE_URL = os.environ.get("RERANK_BASE_URL") or "http://xinference:9997/v1"
         self.PROJECT_HOSTING_BASE_URL = os.environ.get("PROJECT_HOSTING_BASE_URL") or "https://check.ittia.net"
         self.SEARCH_BASE_URL = os.environ.get("SEARCH_BASE_URL") or "https://s.jina.ai"
@@ -27,5 +28,6 @@ def __init__(self):
 
         # keys
         self.EMBEDDING_API_KEY = os.environ.get("EMBEDDING_API_KEY") or ""
+        self.RERANK_API_KEY = os.environ.get("RERANK_API_KEY") or ""
     
 settings = Settings()
\ No newline at end of file