diff --git a/.gitignore b/.gitignore index 7094b42f..6f0276a4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,10 @@ +# temp files of examples +model-cache/ +notebooks/dataset/ +notebooks/toy_data/nv_embedding/ +notebooks/toy_data/jordan.png +notebooks/embed + # Python Exclusions .venv **__pycache__** diff --git a/notebooks/01_dataloader.ipynb b/notebooks/01_dataloader.ipynb index 4fe75990..862b6751 100644 --- a/notebooks/01_dataloader.ipynb +++ b/notebooks/01_dataloader.ipynb @@ -7,7 +7,12 @@ "source": [ "# Press Release Chat Bot\n", "\n", - "As part of this generative AI workflow, we create a NVIDIA PR chatbot that answers questions from the NVIDIA news and blogs from years of 2022 and 2023. For this, we have created a REST FastAPI server that wraps llama-index. The API server has two methods, ```upload_document``` and ```generate```. The ```upload_document``` method takes a document from the user's computer and uploads it to a Milvus vector database after splitting, chunking and embedding the document. The ```generate``` API method generates an answer from the provided prompt optionally sourcing information from a vector database. " + "As part of this generative AI workflow, we create a NVIDIA PR chatbot that answers questions from the NVIDIA news and blogs from years of 2022 and 2023. For this, we have created a REST FastAPI server that wraps llama-index. The API server has two methods, ```upload_document``` and ```generate```. The ```upload_document``` method takes a document from the user's computer and uploads it to a Milvus vector database after splitting, chunking and embedding the document. The ```generate``` API method generates an answer from the provided prompt optionally sourcing information from a vector database.\n", + "\n", + "\n", + "**Pre-requisites:** follow the [Using NVIDIA NIM for LLMs](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html) tutorial, up to the [Build and Start the Containers](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html#build-and-start-the-containers) section. This tutorial requires \\~39 GB disk space for container images (\\~29 GB) and model checkpoints (\\~9GB).\n", + "\n", + "> To run this example on a machine _without_ any GPU, you need to obtain a key for the NVIDIA API Catalog ([steps](https://nvidia.github.io/GenerativeAIExamples/latest/api-catalog.html#get-an-api-key-for-the-accessing-models-on-the-api-catalog)), then modify step 5A to `export NVIDIA_API_KEY=nvapi-xxx`. This will make step 5B to run containers that will invoke NVIDIA AI APIs." ] }, { @@ -15,9 +20,9 @@ "id": "4c74eaf2", "metadata": {}, "source": [ - "#### Step-1: Load the pdf files from the dataset folder.\n", + "## Step-1: Load the pdf files from the dataset folder.\n", "\n", - "You can upload the pdf files containing the NVIDIA blogs to ```query:8081/uploadDocument``` API endpoint" + "You can upload the pdf files containing the NVIDIA blogs to ```http://chain_server:8081/documents``` API endpoint." 
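For reference, a single upload against that endpoint can be sketched with `requests` alone. This is an illustrative sketch only: it assumes the chain server is reachable on `localhost:8081` (as configured in the cells below) and that the `documents` endpoint accepts a multipart upload under a `file` field, which is how the notebook's `upload_document` helper (not shown in this hunk) is assumed to behave.

```python
# Hedged sketch: upload one PDF to the chain server's /documents endpoint.
# The multipart field name "file" and the sample file name are assumptions.
import mimetypes
import requests

chain_server = "localhost"  # assumes the API server is co-located with JupyterLab
upload_url = f"http://{chain_server}:8081/documents"

def upload_one(file_path: str) -> str:
    mime_type, _ = mimetypes.guess_type(file_path)
    with open(file_path, "rb") as f:
        resp = requests.post(
            upload_url,
            files={"file": (file_path, f, mime_type)},
            timeout=600,  # splitting + embedding a large PDF can take a while
        )
    resp.raise_for_status()
    return resp.text

# Example (hypothetical file name):
# print(upload_one("dataset/some_nvidia_blog.pdf"))
```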
] }, { @@ -38,6 +43,8 @@ "metadata": {}, "outputs": [], "source": [ + "chain_server = 'localhost' # Assume the API server is co-located with the JupyterLab server.\n", + "\n", "import os\n", "import requests\n", "import mimetypes\n", @@ -63,7 +70,7 @@ " file_path = os.path.join(folder_path, files)\n", " print(upload_document(file_path, upload_url))\n", " i += 1\n", - " if i > num_files:\n", + " if i >= num_files:\n", " break" ] }, @@ -78,7 +85,7 @@ "\n", "start_time = time.time()\n", "NUM_DOCS_TO_UPLOAD=100\n", - "upload_pdf_files(\"dataset\", \"http://chain-server:8081/documents\", NUM_DOCS_TO_UPLOAD)\n", + "upload_pdf_files(\"dataset\", f\"http://{chain_server}:8081/documents\", NUM_DOCS_TO_UPLOAD)\n", "print(f\"--- {time.time() - start_time} seconds ---\")" ] }, @@ -87,8 +94,8 @@ "id": "830882ef", "metadata": {}, "source": [ - "#### Step-2 : Ask a question without referring to the knowledge base\n", - "Ask Tensorrt LLM llama-2 13B model a question about \"the nvidia grace superchip\" without seeking help from the vectordb/knowledge base by setting ```use_knowledge_base``` to ```false```" + "## Step-2 : Ask a question without referring to the knowledge base\n", + "Ask TensorRT-LLM llama-2 13B model a question about \"the nvidia grace superchip\" without seeking help from the vectordb/knowledge base by setting ```use_knowledge_base``` to ```false```" ] }, { @@ -112,7 +119,7 @@ " \"max_tokens\": 256\n", "}\n", "\n", - "url = \"http://chain-server:8081/generate\"\n", + "url = f\"http://{chain_server}:8081/generate\"\n", "\n", "start_time = time.time()\n", "with requests.post(url, stream=True, json=data) as req:\n", @@ -155,7 +162,7 @@ " \"max_tokens\": 50\n", "}\n", "\n", - "url = \"http://chain-server:8081/generate\"\n", + "url = f\"http://{chain_server}:8081/generate\"\n", "\n", "start_time = time.time()\n", "tokens_generated = 0\n", @@ -180,7 +187,7 @@ "id": "58954d15", "metadata": {}, "source": [ - "#### Next steps\n", + "## Next steps\n", "\n", "We have setup a playground UI for you to upload files and get answers from, the UI is available on the same IP address as the notebooks: `host_ip:8090/converse`" ] diff --git a/notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb b/notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb index fdc1fb56..36f1db3c 100644 --- a/notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb +++ b/notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb @@ -79,7 +79,7 @@ "# test run and see that you can genreate a respond successfully\n", "from langchain_nvidia_ai_endpoints import ChatNVIDIA\n", "\n", - "llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", + "llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", "\n", "result = llm.invoke(\"Write a ballad about LangChain.\")\n", "print(result.content)" @@ -111,10 +111,10 @@ "source": [ "from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings\n", "\n", - "embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n", + "embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\")\n", "\n", "# Alternatively, if you want to specify whether it will use the query or passage type\n", - "# embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"passage\")" + "# embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"passage\")" ] }, { @@ -248,7 +248,20 @@ "id": "1421512a", "metadata": {}, "source": [ - "### Step 6c - Read the previously processed & saved Faiss vectore store back" + "### Step 6c - Read the 
previously processed & saved FAISS vector store back\n",
+    "\n",
+    "**Note on the `allow_dangerous_deserialization` ValueError:** langchain>=0.1.17 requires the kwarg `allow_dangerous_deserialization=True`\n",
+    "to be passed to FAISS.load_local() to avoid this error: \"The de-serialization relies loading a pickle file.\n",
+    "Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.\n",
+    "You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you\n",
+    "trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the\n",
+    "file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on\n",
+    "the internet.)\" (ref).\n",
+    "
" ] }, { @@ -259,8 +272,7 @@ "outputs": [], "source": [ "# Load the vectorestore back.\n", - "\n", - "store = FAISS.load_local(\"./toy_data/nv_embedding\", embedder)\n" + "store = FAISS.load_local(\"./toy_data/nv_embedding\", embedder, allow_dangerous_deserialization=True)" ] }, { @@ -278,7 +290,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "retriever = store.as_retriever()\n", "\n", "prompt = ChatPromptTemplate.from_messages(\n", diff --git a/notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb b/notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb index 56769a71..a940bba6 100644 --- a/notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb +++ b/notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb @@ -79,7 +79,7 @@ "source": [ "# test run and see that you can genreate a respond successfully\n", "from langchain_nvidia_ai_endpoints import ChatNVIDIA\n", - "llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", + "llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", "result = llm.invoke(\"Write a ballad about LangChain.\")\n", "print(result.content)" ] @@ -101,13 +101,14 @@ "source": [ "# Create and dl embeddings instance wrapping huggingface embedding into langchain embedding\n", "# Bring in embeddings wrapper\n", - "from llama_index.embeddings import LangchainEmbedding\n", + "from llama_index.legacy.embeddings import LangchainEmbedding\n", "\n", "from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings\n", - "nv_embedding = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n", + "\n", + "nv_embedding = NVIDIAEmbeddings(model=\"NV-Embed-QA\")\n", "li_embedding=LangchainEmbedding(nv_embedding)\n", "# Alternatively, if you want to specify whether it will use the query or passage type\n", - "# embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"passage\")\n" + "# embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"passage\")" ] }, { @@ -144,7 +145,7 @@ " embed_model=li_embedding\n", ")\n", "# And set the service context\n", - "set_global_service_context(service_context)\n" + "set_global_service_context(service_context)" ] }, { @@ -159,15 +160,16 @@ "cell_type": "code", "execution_count": null, "id": "47a17ce2", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "#create query engine with cross encoder reranker\n", "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n", - "import torch\n", "\n", "documents = SimpleDirectoryReader(\"./toy_data\").load_data()\n", - "index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n" + "index = VectorStoreIndex.from_documents(documents, service_context=service_context)" ] }, { diff --git a/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb b/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb index b38f28dd..06c63700 100644 --- a/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb +++ b/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb @@ -34,13 +34,13 @@ "At the end of the day, as below illustrated, we would like to have a UI which allow user to upload image of their choice and have the agent choose tools to do visual reasoning. 
\n", "\n", "![interactive UI](./imgs/visual_reasoning.png) \n", - "Note: As one can see, since we are using NVIDIA AI Catalog as an API, there is no further requirement in the prerequisites about GPUs as compute hardware\n" + "Note: As one can see, since we are using NVIDIA AI Catalog as an API, there is no further requirement in the prerequisites about GPUs as compute hardware" ] }, { "cell_type": "code", "execution_count": null, - "id": "86843453", + "id": "2699edba-8f2a-4473-8cfb-b8667051a619", "metadata": {}, "outputs": [], "source": [ @@ -72,11 +72,11 @@ "# del os.environ['NVIDIA_API_KEY'] ## delete key and reset\n", "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", " print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + " nvapi_key = os.environ[\"NVIDIA_API_KEY\"]\n", "else:\n", " nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n", " assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... is not a valid key\"\n", - " os.environ[\"NVIDIA_API_KEY\"] = nvapi_key\n", - "global nvapi_key" + " os.environ[\"NVIDIA_API_KEY\"] = nvapi_key" ] }, { @@ -125,14 +125,10 @@ "\n", "\n", "\n", - "def fuyu(prompt,img_path):\n", + "def fuyu(prompt, img_path, nvapi_key=nvapi_key):\n", " invoke_url = \"https://ai.api.nvidia.com/v1/vlm/adept/fuyu-8b\"\n", " stream = True\n", - " \n", - " \n", " image_b64=img2base64_string(img_path)\n", - " \n", - " \n", " assert len(image_b64) < 200_000, \\\n", " \"To upload larger images, use the assets API (see docs)\"\n", "\n", @@ -140,7 +136,7 @@ " \"Authorization\": f\"Bearer {nvapi_key}\",\n", " \"Accept\": \"text/event-stream\" if stream else \"application/json\"\n", " }\n", - " \n", + "\n", " payload = {\n", " \"messages\": [\n", " {\n", @@ -154,9 +150,9 @@ " \"seed\": 0,\n", " \"stream\": stream\n", " }\n", - " \n", + "\n", " response = requests.post(invoke_url, headers=headers, json=payload)\n", - " \n", + "\n", " if stream:\n", " output=[]\n", " for line in response.iter_lines():\n", @@ -216,7 +212,7 @@ "source": [ "# test run and see that you can genreate a respond successfully\n", "from langchain_nvidia_ai_endpoints import ChatNVIDIA\n", - "llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n" + "llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)" ] }, { diff --git a/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb b/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb index 2f945afb..02e17aa8 100644 --- a/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb +++ b/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb @@ -9,7 +9,7 @@ "\n", "In this notebook we demonstrate how to build a RAG using [NVIDIA AI Endpoints for LangChain](https://python.langchain.com/docs/integrations/text_embedding/nvidia_ai_endpoints). We create a vector store by downloading web pages and generating their embeddings using FAISS. We then showcase two different chat chains for querying the vector store. For this example, we use the NVIDIA Triton documentation website, though the code can be easily modified to use any other source. 
\n", "\n", - "### First stage is to load NVIDIA Triton documentation from the web, chunkify the data, and generate embeddings using FAISS\n", + "## First stage is to load NVIDIA Triton documentation from the web, chunkify the data, and generate embeddings using FAISS\n", "\n", "To get started:\n", "\n", @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "980506c9", "metadata": {}, "outputs": [], @@ -70,25 +70,19 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "bf9a84ac", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Enter your NVIDIA API key: ······································································\n" - ] - } - ], + "outputs": [], "source": [ "import getpass\n", "\n", "if not os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", " nvapi_key = getpass.getpass(\"Enter your NVIDIA API key: \")\n", " assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... is not a valid key\"\n", - " os.environ[\"NVIDIA_API_KEY\"] = nvapi_key" + " os.environ[\"NVIDIA_API_KEY\"] = nvapi_key\n", + "else:\n", + " nvapi_key = os.environ.get(\"NVIDIA_API_KEY\")" ] }, { @@ -96,12 +90,25 @@ "id": "91fcd102", "metadata": {}, "source": [ - "Helper functions for loading html files, which we'll use to generate the embeddings. We'll use this later to load the relevant html documents from the Triton documentation website and convert to a vector store." + "Helper functions for loading html files, which we'll use to generate the embeddings. We'll use this later to load the relevant html documents from the Triton documentation website and convert to a vector store.\n", + "\n", + "
\n", + "Notes on allow_dangerous_serialization on FAISS.load_local()\n", + "\n", + "

langchain>=0.1.17 requires kwarg allow_dangerous_deserialization=True\n", + "added to FAISS.load_local() to avoid this error: \"The de-serialization relies loading a pickle file.\n", + "Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.\n", + "You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you\n", + "trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the\n", + "file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on \"\n", + "the internet.)\" (ref) .\n", + "

\n", + "
\n" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "d84c5ef5", "metadata": {}, "outputs": [], @@ -145,7 +152,7 @@ " text = soup.get_text()\n", "\n", " # Remove excess whitespace and newlines\n", - " text = re.sub(\"\\s+\", \" \", text).strip()\n", + " text = re.sub(r\"\\s+\", \" \", text).strip()\n", "\n", " return text\n", " except Exception as e:\n", @@ -166,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "6f48635f", "metadata": {}, "outputs": [], @@ -206,12 +213,12 @@ "id": "942934e8", "metadata": {}, "source": [ - "Generate embeddings using NVIDIA AI Endpoints for LangChain and save embeddings to offline vector store in the /embed directory for future re-use" + "Generate embeddings using NVIDIA AI Endpoints for LangChain and save embeddings to offline vector store in the /embed directory for future re-use." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "27d1aced", "metadata": {}, "outputs": [], @@ -229,7 +236,7 @@ " Returns:\n", " None\n", " \"\"\"\n", - " embeddings = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n", + " embeddings = NVIDIAEmbeddings(model=\"NV-Embed-QA\")\n", "\n", " for document in documents:\n", " texts = splitter.split_text(document.page_content)\n", @@ -239,7 +246,7 @@ "\n", " # create embeddings and add to vector store\n", " if os.path.exists(dest_embed_dir):\n", - " update = FAISS.load_local(folder_path=dest_embed_dir, embeddings=embeddings)\n", + " update = FAISS.load_local(folder_path=dest_embed_dir, embeddings=embeddings, allow_dangerous_deserialization=True)\n", " update.add_texts(texts, metadatas=metadatas)\n", " update.save_local(folder_path=dest_embed_dir)\n", " else:\n", @@ -252,7 +259,7 @@ "id": "9831f7ba", "metadata": {}, "source": [ - "### Second stage is to load the embeddings from the vector store and build a RAG using NVIDIAEmbeddings\n", + "## Second stage is to load the embeddings from the vector store and build a RAG using NVIDIAEmbeddings\n", "\n", "Create the embeddings model using NVIDIA Retrieval QA Embedding endpoint. This model represents words, phrases, or other entities as vectors of numbers and understands the relation between words and phrases. 
See here for reference: https://build.nvidia.com/nvidia/embed-qa-4" ] @@ -262,21 +269,10 @@ "execution_count": null, "id": "f56cadd0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Storing embeddings to ./embed\n" - ] - } - ], + "outputs": [], "source": [ - "\n", - "\n", "create_embeddings()\n", - "\n", - "embedding_model = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n" + "embedding_model = NVIDIAEmbeddings(model=\"NV-Embed-QA\")" ] }, { @@ -296,7 +292,7 @@ "source": [ "# Embed documents\n", "embedding_path = \"embed/\"\n", - "docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embedding_model)" + "docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embedding_model, allow_dangerous_deserialization=True)" ] }, { @@ -320,7 +316,7 @@ "\n", "question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)\n", "\n", - "chat = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", temperature=0.1, max_tokens=1000, top_p=1.0)\n", + "chat = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", temperature=0.1, max_tokens=1000, top_p=1.0)\n", "\n", "doc_chain = load_qa_chain(chat , chain_type=\"stuff\", prompt=QA_PROMPT)\n", "\n", @@ -348,7 +344,7 @@ "outputs": [], "source": [ "query = \"What is Triton?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -368,7 +364,7 @@ "outputs": [], "source": [ "query = \"What interfaces does Triton support?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -388,7 +384,7 @@ "outputs": [], "source": [ "query = \"But why?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -438,7 +434,7 @@ "outputs": [], "source": [ "query = \"What is Triton?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -458,7 +454,7 @@ "outputs": [], "source": [ "query = \"Does Triton support ONNX?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -478,7 +474,7 @@ "outputs": [], "source": [ "query = \"But why?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] } diff --git a/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb b/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb index 8a85c771..4270977d 100644 --- a/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb +++ b/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb @@ -82,6 +82,7 @@ "# del os.environ['NVIDIA_API_KEY'] ## delete key and reset\n", "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", " print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + " nvapi_key = os.environ.get('NVIDIA_API_KEY')\n", "else:\n", " nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n", " assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... is not a valid key\"\n", @@ -129,7 +130,7 @@ "source": [ "## Step 3 - Retriever from FAISS vector store\n", "\n", - "We need to process a toy example, here we use `Sweden.txt` from the `toy_data` folder." + "We need to process a toy example, here we use `Sweden.txt` from the `toy_data` folder. 
Please review the path where we'll store the embeddings, and update as necessary." ] }, { @@ -139,6 +140,8 @@ "metadata": {}, "outputs": [], "source": [ + "embed_path = \"/workspace/save_embedding/sv'\n", + "\n", "import os\n", "from tqdm import tqdm\n", "from pathlib import Path\n", @@ -177,7 +180,7 @@ "\n", "# you only need to do this once, in the future, when re-run this notebook, skip to below and load the vector store from disk\n", "store = FAISS.from_texts(docs, embedder , metadatas=metadatas)\n", - "store.save_local('/workspace/save_embedding/sv')\n" + "store.save_local(embed_path)\n" ] }, { @@ -188,7 +191,7 @@ "outputs": [], "source": [ "## If you previously preprocessed and saved the vector store to disk, then reload it here\n", - "faissDB = FAISS.load_local(\"/workspace/save_embedding/sv\", embedder, allow_dangerous_deserialization=True)\n", + "faissDB = FAISS.load_local(embed_path, embedder, allow_dangerous_deserialization=True)\n", "retriever = faissDB.as_retriever()" ] }, diff --git a/notebooks/07_Chat_with_nvidia_financial_reports.ipynb b/notebooks/07_Chat_with_nvidia_financial_reports.ipynb index 9ad5c16a..43b77663 100644 --- a/notebooks/07_Chat_with_nvidia_financial_reports.ipynb +++ b/notebooks/07_Chat_with_nvidia_financial_reports.ipynb @@ -24,7 +24,7 @@ "id": "612375a9", "metadata": {}, "source": [ - "### Step 1 - Export the NVIDIA_API_KEY\n", + "## Step 1 - Export the NVIDIA_API_KEY\n", "Supply the NVIDIA_API_KEY in this notebook when you run the cell below" ] }, @@ -39,6 +39,7 @@ "import os\n", "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", " print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + " nvapi_key = os.environ['NVIDIA_API_KEY']\n", "else:\n", " nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n", " assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... 
is not a valid key\"\n", @@ -50,7 +51,7 @@ "id": "5b4afb52", "metadata": {}, "source": [ - "### Step 2 - initialize the LLM and Embedding Model\n", + "## Step 2 - initialize the LLM and Embedding Model\n", "Here we will use **mixtral_8x7b** " ] }, @@ -63,12 +64,11 @@ "source": [ "# test run and see that you can genreate a respond successfully\n", "from langchain_nvidia_ai_endpoints import ChatNVIDIA,NVIDIAEmbeddings\n", - "llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", + "llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", "from langchain.vectorstores import Milvus\n", - "import torch\n", - "import time\n", - "embedder_document = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"passage\")\n", - "embedder_query = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"query\")" + "\n", + "embedder_document = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"passage\")\n", + "embedder_query = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"query\")" ] }, { @@ -76,7 +76,7 @@ "id": "d2104106", "metadata": {}, "source": [ - "### Step 3 - Ingest http files" + "## Step 3 - Ingest http files" ] }, { @@ -84,7 +84,7 @@ "id": "a4e45a22-c883-40cc-b2f4-e6f51866e52b", "metadata": {}, "source": [ - "#### 3.1 Download http files covering financial reports from Fiscal year 2020 to 2024" + "### 3.1 Download http files covering financial reports from Fiscal year 2020 to 2024" ] }, { @@ -107,8 +107,7 @@ " if quarter == \"fourth\":\n", " urls_content.append(requests.get(url_template2.format(**args)).content)\n", " else:\n", - " urls_content.append(requests.get(url_template1.format(**args)).content)\n", - "\n" + " urls_content.append(requests.get(url_template1.format(**args)).content)" ] }, { @@ -116,7 +115,7 @@ "id": "8cbada93-6b0c-49df-a6b9-781f1a2400fb", "metadata": {}, "source": [ - "#### 3.2 Parse html files" + "### 3.2 Parse html files" ] }, { @@ -159,8 +158,7 @@ " soup = BeautifulSoup(url_content, 'html.parser')\n", " url, title, content, tables = extract_url_title_time(soup)\n", " parsed_htmls.append({\"url\":url, \"title\":title, \"content\":content, \"tables\":tables})\n", - "\n", - "\n" + "print(f\"Document count: {len(parsed_htmls)}\")" ] }, { @@ -168,14 +166,18 @@ "id": "d985444a-8ef2-47d7-ba13-1a4da369f468", "metadata": {}, "source": [ - "#### 3.3 Summarize tables" + "### 3.3 Summarize tables\n", + "\n", + "It takes ~4 minutes to summarize the tables in the 2020-2024 reports." 
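If you only want a quick smoke test rather than the full ~4-minute pass, the summarization loop can be capped. The sketch below is optional: `MAX_TABLES` is a hypothetical limit, and it reuses `parsed_htmls`, `llm`, and the `get_table_summary` helper defined in the following code cell.

```python
import time

MAX_TABLES = 5  # hypothetical cap for a quick smoke test; remove for the full run

start = time.time()
summarized = 0
for parsed_item in parsed_htmls:
    for idx, table in enumerate(parsed_item["tables"]):
        if summarized >= MAX_TABLES:
            break
        # get_table_summary() comes from the next code cell in this notebook
        parsed_item["tables"][idx] = get_table_summary(table, parsed_item["title"], llm)
        summarized += 1
print(f"Summarized {summarized} tables in {time.time() - start:.1f} s")
```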
] }, { "cell_type": "code", "execution_count": null, "id": "8bb88c6c-8167-4b6e-9b83-5a6937f9f9fb", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# summarize tables\n", @@ -200,11 +202,10 @@ " finally:\n", " return res\n", "\n", - "\n", - "for parsed_item in parsed_htmls:\n", + "for doc_idx, parsed_item in enumerate(parsed_htmls):\n", " title = parsed_item['title']\n", " for idx, table in enumerate(parsed_item['tables']):\n", - " print(f\"parsing tables in {title}...\")\n", + " print(f\"Document {doc_idx} -- parsing tables {idx} in {title}...\")\n", " table = get_table_summary(table, title, llm)\n", " parsed_item['tables'][idx] = table\n" ] @@ -214,7 +215,7 @@ "id": "ccfd4607-3479-4cdd-b120-1f416959cb23", "metadata": {}, "source": [ - "#### 3.4 Split the text/table in chunks, extract embedding for each chunk, and store the embeddinges into milvus vectordb" + "### 3.4 Split the text/table in chunks, extract embedding for each chunk, and store the embeddinges into milvus vectordb" ] }, { @@ -252,6 +253,14 @@ "print(f\"obtain {len(documents)} chunks\")" ] }, + { + "cell_type": "markdown", + "id": "2ba6dd03-adeb-493d-92b4-a03104d9f190", + "metadata": {}, + "source": [ + "Deploy a local Milvus server -- see step 7 on this [tutorial section](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html#build-and-start-the-containers). Then, proceed to the next cell." + ] + }, { "cell_type": "code", "execution_count": null, @@ -261,15 +270,19 @@ "source": [ "COLLECTION_NAME = \"NVIDIA_Finance\"\n", "from langchain.vectorstores import Milvus\n", + "\n", "vectorstore = Milvus(\n", " embedding_function=embedder_document,\n", " collection_name=COLLECTION_NAME,\n", " connection_args={\n", - " \"host\": \"milvus\",\n", + " \"host\": \"localhost\",\n", " \"port\": \"19530\"},\n", " drop_old = True,\n", " auto_id = True\n", " )\n", + "\n", + "# Use langchain-core<=0.2.10 to avoid TypeError: 'NoneType' object is not subscriptable\n", + "# See: https://github.com/langchain-ai/langchain/issues/24116#issuecomment-2223984425\n", "vectorstore.add_documents(documents)\n", "docs = vectorstore.similarity_search(\"what are 2024 Q3 revenues? \")" ] @@ -289,7 +302,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "from langchain.prompts.prompt import PromptTemplate\n", "\n", "PROMPT_TEMPLATE = \"\"\"[INST]You are a friendly virtual assistant and maintain a conversational, polite, patient, friendly and gender neutral tone throughout the conversation.\n", @@ -321,7 +333,6 @@ "prompt_template = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=[\"context\", \"question\"])\n", "\n", "\n", - "\n", "def build_context(chunks):\n", " context = \"\"\n", " for chunk in chunks:\n", @@ -342,6 +353,22 @@ "\n", "generate_answer(llm, vectorstore, prompt_template, question)" ] + }, + { + "cell_type": "markdown", + "id": "4bf7b05e-53bf-4a9c-87d1-50b0077d6b2e", + "metadata": {}, + "source": [ + "Lastly, remember to stop the Milvus container. See step 1 in this [tutorial section](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html#stopping-the-containers)." 
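Before stopping Milvus, it can be worth confirming that the `NVIDIA_Finance` collection was actually populated. This optional sketch talks to the same local server (`localhost:19530`) with `pymilvus`, which must already be present for the LangChain `Milvus` vector store above to work:

```python
# Optional sanity check against the local Milvus server used above.
from pymilvus import Collection, connections, utility

connections.connect(host="localhost", port="19530")

if utility.has_collection("NVIDIA_Finance"):
    col = Collection("NVIDIA_Finance")
    print(f"NVIDIA_Finance currently holds {col.num_entities} entities")
else:
    print("Collection not found -- re-run the ingestion cells above")

connections.disconnect("default")
```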
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15970f1e-fbc7-4134-adf5-f60d8d10221a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt index c8002465..ccbc1fb6 100644 --- a/notebooks/requirements.txt +++ b/notebooks/requirements.txt @@ -4,7 +4,7 @@ python-multipart==0.0.6 langchain==0.1.9 unstructured[all-docs]==0.11.2 sentence-transformers==2.2.2 -llama-index==0.9.22 +llama-index<0.10.0 dataclass-wizard==0.22.2 opencv-python==4.8.0.74 llama-hub==0.0.43