Testing RAG implementation and local deployment #2

Open · wants to merge 6 commits into base: main

1 change: 1 addition & 0 deletions 2_vector_db/embeddings/different_types_of_embeddings.py
@@ -22,6 +22,7 @@ def word_embeddings():
# - min_count=1: Ignores all words with total frequency lower than this
# - workers=4: Number of CPU cores to use for training
print("Word Embedding for 'cat':", model.wv['cat'])
print("Word Embedding for 'dog':", model.wv['dog'])

# Sentence Embeddings
def sentence_embeddings():
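
For context, the model used above is presumably a gensim Word2Vec model trained earlier in word_embeddings(); only the min_count and workers values appear in the comments shown in this hunk. A minimal sketch under those assumptions (the toy corpus and the remaining parameters are illustrative, not taken from this diff):

from gensim.models import Word2Vec

# Toy corpus: a list of tokenized sentences (assumed for illustration)
sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "chased", "the", "cat"],
]

# Train a small Word2Vec model; min_count=1 and workers=4 match the comments above
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Every word that survived min_count now has a dense vector in model.wv
print("Word Embedding for 'cat':", model.wv['cat'])
print("Word Embedding for 'dog':", model.wv['dog'])
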
15 changes: 8 additions & 7 deletions 2_vector_db/embeddings/simple_embedding.py
@@ -13,15 +13,16 @@

# Plotting the embeddings
plt.figure(figsize=(10, 8))

for word, embedding in word_embeddings.items():
plt.scatter(embedding[0], embedding[1], label=word)
plt.annotate(word, (embedding[0], embedding[1]))
plt.scatter(embedding[0], embedding[1], marker='o', s=100, edgecolors='black', alpha=0.75)
plt.annotate(word, (embedding[0], embedding[1]), textcoords="offset points", xytext=(5, 5), ha='center', fontsize=10)

plt.title("2D Word Embeddings", fontsize=14)
plt.xlabel("Dimension 1", fontsize=12)
plt.ylabel("Dimension 2", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)

plt.title("Simple 2D Word Embeddings")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.legend()
plt.grid(True)
plt.show()

# Print out the embeddings
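
The loop above assumes word_embeddings is a dict mapping each word to a 2-D vector. A minimal runnable sketch of the revised plotting block under that assumption (the sample words and vectors are made up for illustration):

import numpy as np
import matplotlib.pyplot as plt

# Assumed input shape: word -> 2-D embedding
word_embeddings = {
    "cat": np.array([0.2, 0.8]),
    "dog": np.array([0.3, 0.7]),
    "car": np.array([0.9, 0.1]),
}

plt.figure(figsize=(10, 8))
for word, embedding in word_embeddings.items():
    plt.scatter(embedding[0], embedding[1], marker='o', s=100, edgecolors='black', alpha=0.75)
    plt.annotate(word, (embedding[0], embedding[1]), textcoords="offset points",
                 xytext=(5, 5), ha='center', fontsize=10)

plt.title("2D Word Embeddings", fontsize=14)
plt.xlabel("Dimension 1", fontsize=12)
plt.ylabel("Dimension 2", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

Note that plt.legend() from the old version is intentionally gone: the new scatter calls no longer pass label=word, and the offset annotations carry the word names instead.
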
18 changes: 13 additions & 5 deletions 2_vector_db/faiss_annoy_pinecone/creating_vector_db.py
@@ -1,13 +1,21 @@
from pinecone import Pinecone, ServerlessSpec
from pinecone import Pinecone
from pinecone import ServerlessSpec
from sentence_transformers import SentenceTransformer
import os
from dotenv import load_dotenv

load_dotenv()

try:
# Initialize Pinecone
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
# Initialize the Pinecone API key (prompt for it if it is not set in the environment)
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
PINECONE_API_KEY = input("Please enter your PINECONE API key: ")
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# Alternatively, the Pinecone API key could be hard-coded here (not recommended):
# pc = Pinecone(api_key= 'pcsk_27UvEa_PHyZ3n7E6euuUHXg7CPCjSb7HmEDe8j1XqzqmHLZHgpswygWhAHu7weNmuqWDrT')

print("Pinecone initialized successfully.")

# Load a pre-trained model
@@ -30,7 +38,7 @@
print(f"Vector dimension: {dimension}")

# Define index name
index_name = "ragudemy"
index_name = "firstPineconeIndex"

# Check if index already exists
existing_indexes = pc.list_indexes()
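
The rest of this script is outside the hunk, but it presumably creates the index and upserts the embeddings. A hedged sketch of how that typically looks with the v3+ Pinecone client and the SentenceTransformer loaded above; the cloud/region values, the sample texts, and the model variable name are assumptions:

# Create the index if it does not exist yet
if index_name not in existing_indexes.names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # assumed cloud/region
    )

index = pc.Index(index_name)

# Embed a few sample sentences and upsert them with string IDs (sample data assumed)
texts = ["Tesla builds electric cars.", "Pinecone stores vectors."]
vectors = model.encode(texts)
index.upsert(vectors=[(str(i), vec.tolist()) for i, vec in enumerate(vectors)])
print(index.describe_index_stats())
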
2 changes: 1 addition & 1 deletion 2_vector_db/vectors_search/image_search.py
@@ -46,7 +46,7 @@ def search_similar_images(query_image_path, vector_db):
print("Creating vector database...")
vector_db = create_vector_db(image_files)

query_image_path = "2_vector_db/3.png"
query_image_path = "2_vector_db/2.png"
print(f"\nPerforming similarity search with query image: {query_image_path}")
results = search_similar_images(query_image_path, vector_db)

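create_vector_db and search_similar_images themselves are not shown in this hunk. For readers skimming only the diff, here is a hypothetical sketch of the kind of CLIP-plus-FAISS flow such functions commonly implement; the model name, return format, and k are assumptions, not the repository's actual code:

import faiss
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer

clip_model = SentenceTransformer("clip-ViT-B-32")  # assumed image embedding model

def create_vector_db(image_files):
    # Encode every image into an embedding and index the vectors with FAISS
    embeddings = np.array([clip_model.encode(Image.open(p)) for p in image_files], dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return {"index": index, "paths": image_files}

def search_similar_images(query_image_path, vector_db, k=3):
    # Embed the query image and return the k nearest stored images with their distances
    query = np.array([clip_model.encode(Image.open(query_image_path))], dtype="float32")
    distances, ids = vector_db["index"].search(query, k)
    return [(vector_db["paths"][i], float(d)) for i, d in zip(ids[0], distances[0])]
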
1,000 changes: 500 additions & 500 deletions 3_building_simple_rag_pipeline/tesla_motors_data.csv

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions 4_advanced_RAG/multi-vector.py
@@ -35,6 +35,7 @@ def fetch_html(url):
try:
response = requests.get(url, headers=headers)
response.raise_for_status()
response.encoding = 'utf-8' # Force UTF-8 encoding
return response.text
except requests.RequestException as e:
print(f"Error fetching the website: {e}")
@@ -45,8 +46,10 @@ def process_website(url):
if not html_content:
raise ValueError("No content could be fetched from the website.")

with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.html') as temp_file:
temp_file.write(html_content)
cleaned_text = html_content.encode('utf-8', errors='ignore').decode('utf-8')

with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.html', encoding='utf-8') as temp_file:
temp_file.write(cleaned_text)
temp_file_path = temp_file.name

try:
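
After the cleaned HTML is written to the temp file, the remainder of process_website (inside the try block, not shown here) presumably loads and chunks it for multi-vector retrieval. A minimal sketch assuming LangChain's BSHTMLLoader and RecursiveCharacterTextSplitter; the chunk sizes are illustrative:

from langchain_community.document_loaders import BSHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the UTF-8 temp file and strip the HTML markup
loader = BSHTMLLoader(temp_file_path, open_encoding="utf-8")
documents = loader.load()

# Split into chunks; each chunk later gets its own vector (sizes are illustrative)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(documents)
print(f"Split website into {len(chunks)} chunks")
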
5,001 changes: 5,001 additions & 0 deletions 6_sql_rag/tesla_motors_data.csv

Large diffs are not rendered by default.

Binary file modified 6_sql_rag/tesla_motors_data.db
Binary file not shown.
3 changes: 2 additions & 1 deletion 9_prompt_caching/with_claude.py
@@ -2,7 +2,8 @@
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain_anthropic import ChatAnthropic, AnthropicEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_community.embeddings import AnthropicEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
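
If the installed langchain_community version does not actually provide AnthropicEmbeddings (Anthropic does not currently offer a first-party embeddings API, so this import may fail), a local embedding model is a common stand-in. A hedged sketch using the other imports this file already has; the embedding and Claude model names are assumptions:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_anthropic import ChatAnthropic
from langchain.chains import RetrievalQA

# Local sentence-transformers model as the embedding backend (assumed choice)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# chunks: list of text strings produced by CharacterTextSplitter earlier in the script
vector_store = FAISS.from_texts(chunks, embeddings)

llm = ChatAnthropic(model="claude-3-5-sonnet-20241022")  # assumed model name
qa = RetrievalQA.from_chain_type(llm=llm, retriever=vector_store.as_retriever())
print(qa.invoke({"query": "What does the retrieved page say about prompt caching?"}))
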
1 change: 0 additions & 1 deletion Requirements.txt
@@ -88,7 +88,6 @@ pdfplumber
pillow
pinecone
pinecone-plugin-inference
pinecone-plugin-interface
platformdirs
pooch
protobuf
3 changes: 3 additions & 0 deletions flagged/log.csv
@@ -0,0 +1,3 @@
Upload PDF,Enter your question,Answer,Processing Log,flag,username,timestamp
flagged\Upload PDF\addd6a8e72f9ebd9faee\Use long short-term memory to enhance Internet of Things for combinedsewer overflow monitoring.pdf,what is sewer overflow?,,,,,2025-03-28 14:57:45.291407
flagged\Upload PDF\fb9ce6b62de17c74f929\introlinux4-spring22.pdf,what is bash shell?,,,,,2025-03-31 20:47:38.308820