diff --git a/experimental/knowledge_graph_rag/README.md b/experimental/knowledge_graph_rag/README.md index 2ec888009..0fe5642f3 100644 --- a/experimental/knowledge_graph_rag/README.md +++ b/experimental/knowledge_graph_rag/README.md @@ -85,17 +85,22 @@ python3 -m virtualenv venv source venv/bin/activate ``` -### 4. Install the required packages +### 4. Install external dependencies +```bash +sudo apt install poppler-utils ffmpeg libsm6 libxext6 tesseract-ocr libtesseract-dev +``` + +### 5. Install the required packages ```bash pip install -r requirements.txt ``` -### 5. Setup a hosted Milvus vector database +### 6. Setup a hosted Milvus vector database Follow the instructions [here](https://milvus.io/docs/install_standalone-docker.md) to deploy a hosted Milvus instance for the vector database backend. Note that it must be Milvus 2.4 or better to support [hybrid search](https://milvus.io/docs/multi-vector-search.md). We do not support disabling this feature for previous versions of Milvus as of now. -### 5. Launch the Streamlit frontend +### 7. Launch the Streamlit frontend ```bash streamlit run app.py @@ -103,7 +108,7 @@ streamlit run app.py Open the URL in your browser to access the UI and chatbot! -### 6. Upload Docs and Train Model +### 8. Upload Docs and Train Model Upload your own documents to a folder, or use an existing folder for the knowledge graph creation. Note that the implementation currently focuses on text from PDFs only. It can be extended to other text file formats using the Unstructured.io data loader in LangChain. diff --git a/experimental/knowledge_graph_rag/app.py b/experimental/knowledge_graph_rag/app.py index e19edc817..315c940cf 100644 --- a/experimental/knowledge_graph_rag/app.py +++ b/experimental/knowledge_graph_rag/app.py @@ -25,6 +25,9 @@ from vectorstore.search import SearchHandler from langchain_nvidia_ai_endpoints import ChatNVIDIA +import nltk +nltk.download('averaged_perceptron_tagger') + def load_data(input_dir, num_workers): reader = SimpleDirectoryReader(input_dir=input_dir) documents = reader.load_data(num_workers=num_workers) diff --git a/experimental/knowledge_graph_rag/requirements.txt b/experimental/knowledge_graph_rag/requirements.txt index 82a9a1410..a0ad3b38b 100644 --- a/experimental/knowledge_graph_rag/requirements.txt +++ b/experimental/knowledge_graph_rag/requirements.txt @@ -7,8 +7,8 @@ llama_index==0.10.50 networkx==3.2.1 numpy==1.24.1 pandas==2.2.2 -pymilvus==2.4.3 -Requests==2.32.3 +pymilvus[model]==2.4.3 +Requests==2.31.0 streamlit==1.30.0 unstructured[all-docs] tqdm==4.66.1 diff --git a/experimental/knowledge_graph_rag/utils/lc_graph.py b/experimental/knowledge_graph_rag/utils/lc_graph.py index 5a5a81ac2..fc763dc8e 100644 --- a/experimental/knowledge_graph_rag/utils/lc_graph.py +++ b/experimental/knowledge_graph_rag/utils/lc_graph.py @@ -15,7 +15,7 @@ from langchain_nvidia_ai_endpoints import ChatNVIDIA import concurrent.futures -from preprocessor import extract_triples +from utils.preprocessor import extract_triples from tqdm import tqdm from langchain_community.document_loaders import DirectoryLoader from langchain.text_splitter import RecursiveCharacterTextSplitter