diff --git a/README.md b/README.md
index bf56e54b8..af9545ef6 100644
--- a/README.md
+++ b/README.md
@@ -118,24 +118,23 @@ Allow unauthenticated request : Yes
 |-------------------------|--------------------|---------------|--------------------------------------------------------------------------------------------------|
 | | | **BACKEND ENV**
-| OPENAI_API_KEY | Mandatory | |An OpenAPI Key is required to use open LLM model to authenticate andn track requests |
+| OPENAI_API_KEY | Optional | |An OpenAI API key, required when using OpenAI embeddings |
 | DIFFBOT_API_KEY | Mandatory | |API key is required to use Diffbot's NLP service to extraction entities and relatioship from unstructured data|
-| BUCKET | Mandatory | |bucket name to store uploaded file on GCS |
+| BUCKET_UPLOAD_FILE | Optional | |Bucket name to store uploaded files on GCS |
+| BUCKET_FAILED_FILE | Optional | |Bucket name to store files on GCS that fail during extraction |
 | NEO4J_USER_AGENT | Optional | llm-graph-builder | Name of the user agent to track neo4j database activity |
 | ENABLE_USER_AGENT | Optional | true | Boolean value to enable/disable neo4j user agent |
-| DUPLICATE_TEXT_DISTANCE | Mandatory | 5 | This value used to find distance for all node pairs in the graph and calculated based on node properties |
-| DUPLICATE_SCORE_VALUE | Mandatory | 0.97 | Node score value to match duplicate node |
-| EFFECTIVE_SEARCH_RATIO | Mandatory | 1 | |
-| GRAPH_CLEANUP_MODEL | Optional | 0.97 | Model name to clean-up graph in post processing |
+| DUPLICATE_TEXT_DISTANCE | Optional | 5 | Distance threshold used to compare all node pairs in the graph, calculated from node properties |
+| DUPLICATE_SCORE_VALUE | Optional | 0.97 | Score threshold used to match duplicate nodes |
+| EFFECTIVE_SEARCH_RATIO | Optional | 5 | |
+| GRAPH_CLEANUP_MODEL | Optional | openai_gpt_4o | Model used to clean up the graph during post-processing |
 | MAX_TOKEN_CHUNK_SIZE | Optional | 10000 | Maximum token size to process file content |
-| YOUTUBE_TRANSCRIPT_PROXY| Optional | | Proxy key to process youtube video for getting transcript |
+| YOUTUBE_TRANSCRIPT_PROXY| Mandatory | | Proxy required to fetch transcripts for YouTube videos |
 | EMBEDDING_MODEL | Optional | all-MiniLM-L6-v2 | Model for generating the text embedding (all-MiniLM-L6-v2 , openai , vertexai) |
 | IS_EMBEDDING | Optional | true | Flag to enable text embedding |
-| KNN_MIN_SCORE | Optional | 0.94 | Minimum score for KNN algorithm |
+| KNN_MIN_SCORE | Optional | 0.8 | Minimum score for KNN algorithm |
 | GEMINI_ENABLED | Optional | False | Flag to enable Gemini |
 | GCP_LOG_METRICS_ENABLED | Optional | False | Flag to enable Google Cloud logs |
-| NUMBER_OF_CHUNKS_TO_COMBINE | Optional | 5 | Number of chunks to combine when processing embeddings |
-| UPDATE_GRAPH_CHUNKS_PROCESSED | Optional | 20 | Number of chunks processed before updating progress |
 | NEO4J_URI | Optional | neo4j://database:7687 | URI for Neo4j database |
 | NEO4J_USERNAME | Optional | neo4j | Username for Neo4j database |
 | NEO4J_PASSWORD | Optional | password | Password for Neo4j database |
diff --git a/backend/README.md b/backend/README.md
index 1ab091216..7cb99a484 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -62,10 +62,6 @@ Update the environment variable in `.env` file. Refer example.env in backend fol
 `NEO4J_PASSWORD` : Neo4j database user password
-`AWS_ACCESS_KEY_ID` : AWS Access key ID
-
-`AWS_SECRET_ACCESS_KEY` : AWS secret access key
-
 ## Contact
 For questions or support, feel free to contact us at christopher.crosbie@neo4j.com or michael.hunger@neo4j.com
diff --git a/backend/example.env b/backend/example.env
index 4a66791a9..576c32a6b 100644
--- a/backend/example.env
+++ b/backend/example.env
@@ -1,32 +1,35 @@
-OPENAI_API_KEY = "" #This is required if you are using openai embedding model
-EMBEDDING_MODEL = "all-MiniLM-L6-v2" #this can be openai or vertexai or by default all-MiniLM-L6-v2
-RAGAS_EMBEDDING_MODEL = "openai" #Keep blank if you want to use all-MiniLM-L6-v2 for ragas embeddings
-IS_EMBEDDING = "TRUE"
-KNN_MIN_SCORE = "0.94"
-# Enable Gemini (default is False) | Can be False or True
-GEMINI_ENABLED = False
-# Enable Google Cloud logs (default is False) | Can be False or True
-GCP_LOG_METRICS_ENABLED = False
-NUMBER_OF_CHUNKS_TO_COMBINE = 6
-UPDATE_GRAPH_CHUNKS_PROCESSED = 20
-NEO4J_URI = ""
-NEO4J_USERNAME = ""
-NEO4J_PASSWORD = ""
-NEO4J_DATABASE = ""
-AWS_ACCESS_KEY_ID = ""
-AWS_SECRET_ACCESS_KEY = ""
-LANGCHAIN_API_KEY = ""
-LANGCHAIN_PROJECT = ""
-LANGCHAIN_TRACING_V2 = ""
-LANGCHAIN_ENDPOINT = ""
-GCS_FILE_CACHE = "" #save the file into GCS or local, SHould be True or False
-NEO4J_USER_AGENT=""
-ENABLE_USER_AGENT = ""
+GET_VALUE_FROM_SECRET_MANAGER= "" #OPTIONAL- Default_Value = False -- Set True to read values from Secret Manager first, falling back to the .env file when a key is not found
+OPENAI_API_KEY = "" #OPTIONAL- Default_Value = "openai_api_key" #Required if you are using the openai embedding model
+EMBEDDING_MODEL = "" #OPTIONAL- Default_Value ="" #can be openai, vertexai, or sentence_transformer (all-MiniLM-L6-v2); defaults to sentence_transformer
+RAGAS_EMBEDDING_MODEL = "" #OPTIONAL- Default_Value ="openai" #Keep blank to use all-MiniLM-L6-v2 for ragas embeddings
+IS_EMBEDDING = "" #OPTIONAL- Default_Value ="True" --Flag to enable text embedding
+BUCKET_UPLOAD_FILE = "" #OPTIONAL- Default_Value ="gcs bucket name" -- GCS bucket used when uploading local files to GCS
+BUCKET_FAILED_FILE = "" #OPTIONAL- Default_Value ="gcs bucket name" -- GCS bucket used to store files that fail during extraction
+KNN_MIN_SCORE = "" #OPTIONAL- Default_Value ="0.8" --Minimum score for KNN algorithm
+GEMINI_ENABLED = "" #OPTIONAL- Default_Value ="False" -- Enable Gemini; can be False or True
+GCP_LOG_METRICS_ENABLED = "" #OPTIONAL- Default_Value = "False" -- Enable logging metrics to GCP Cloud Logging
+NEO4J_URI = "" #OPTIONAL- Default_Value ="Neo4j URL"
+NEO4J_USERNAME = "" #OPTIONAL- Default_Value = "Neo4J database username"
+NEO4J_PASSWORD = "" #OPTIONAL- Default_Value = "Neo4j database user password"
+NEO4J_DATABASE = "" #OPTIONAL- Default_Value = "Neo4j database user database"
+LANGCHAIN_API_KEY ="" #OPTIONAL- Default_Value = "API key for Langchain"
+LANGCHAIN_PROJECT ="" #OPTIONAL- Default_Value = "Project for Langchain"
+LANGCHAIN_TRACING_V2 = "" #OPTIONAL- Default_Value = "Flag to enable Langchain tracing"
+LANGCHAIN_ENDPOINT = "" #OPTIONAL- Default_Value = "https://api.smith.langchain.com" -- Endpoint for Langchain API
+GCS_FILE_CACHE = "" #OPTIONAL- Default_Value = "False" #Save files to GCS or locally; should be True or False
+USER_AGENT="" #OPTIONAL- Default_Value = "LLM-Graph-Builder"
+ENABLE_USER_AGENT = "" #OPTIONAL- Default_Value = "False"
+MAX_TOKEN_CHUNK_SIZE="" #OPTIONAL- Default_Value = "10000" #Maximum tokens used to process/extract the file content
+ENTITY_EMBEDDING="" #OPTIONAL- Default_Value = "False" -- Whether to create embeddings for entities (used by entity vector mode)
+DUPLICATE_SCORE_VALUE = "" #OPTIONAL- Default_Value = "0.97" -- Score threshold used to match duplicate nodes
+DUPLICATE_TEXT_DISTANCE = "" #OPTIONAL- Default_Value = "3" --Distance threshold used to compare all node pairs in the graph, calculated from node properties
+DEFAULT_DIFFBOT_CHAT_MODEL="" #OPTIONAL- Default_Value = "openai_gpt_4o" #whichever model is specified here needs a matching LLM_MODEL_CONFIG entry in the format below
+GRAPH_CLEANUP_MODEL="" #OPTIONAL- Default_Value = "openai_gpt_4o" -- Model used to clean up the graph during post-processing
+BEDROCK_EMBEDDING_MODEL="" #Mandatory when using bedrock embeddings - Default_Value = "model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.titan-embed-text-v1"
+YOUTUBE_TRANSCRIPT_PROXY="" #Mandatory --Proxy required to fetch transcripts for youtube videos --Sample Value ="https://user:pass@domain:port"
+EFFECTIVE_SEARCH_RATIO="" #OPTIONAL- Default_Value = "5"
+
 LLM_MODEL_CONFIG_model_version=""
-ENTITY_EMBEDDING="TRUE" # TRUE or FALSE based on whether to create embeddings for entities suitable for entity vector mode
-DUPLICATE_SCORE_VALUE =0.97
-DUPLICATE_TEXT_DISTANCE =3
-DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model specified here , need to add config for that model in below format)
 #examples
 LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key"
 LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key"
@@ -43,13 +46,8 @@ LLM_MODEL_CONFIG_anthropic_claude_3_5_sonnet="model_name,anthropic_api_key"
 LLM_MODEL_CONFIG_fireworks_llama_v3_70b="model_name,fireworks_api_key"
 LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet="model_name,aws_access_key_id,aws_secret__access_key,region_name"
 LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
-YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
-EFFECTIVE_SEARCH_RATIO=5
-GRAPH_CLEANUP_MODEL="openai_gpt_4o"
-BEDROCK_EMBEDDING_MODEL="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.titan-embed-text-v1"
 LLM_MODEL_CONFIG_bedrock_nova_micro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-micro-v1:0"
 LLM_MODEL_CONFIG_bedrock_nova_lite_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-lite-v1:0"
 LLM_MODEL_CONFIG_bedrock_nova_pro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-pro-v1:0"
 LLM_MODEL_CONFIG_fireworks_deepseek_r1="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-r1"
-LLM_MODEL_CONFIG_fireworks_deepseek_v3="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-v3"
-MAX_TOKEN_CHUNK_SIZE=2000 #Max token used to process/extract the file content.
\ No newline at end of file
+LLM_MODEL_CONFIG_fireworks_deepseek_v3="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-v3"
\ No newline at end of file
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 6761a63c7..293831d37 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -61,3 +61,4 @@ rouge_score==0.1.2
 langchain-neo4j==0.3.0
 pypandoc-binary==1.15
 chardet==5.2.0
+google-cloud-secret-manager==2.23.1
\ No newline at end of file
diff --git a/backend/score.py b/backend/score.py
index e668788c3..d7dbdc965 100644
--- a/backend/score.py
+++ b/backend/score.py
@@ -1,6 +1,8 @@
 from fastapi import FastAPI, File, UploadFile, Form, Request, HTTPException
 from fastapi_health import health
 from fastapi.middleware.cors import CORSMiddleware
+from dotenv import load_dotenv
+load_dotenv()
 from src.main import *
 from src.QA_integration import *
 from src.shared.common_fn import *
@@ -112,7 +114,7 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):
 )
 app.add_middleware(SessionMiddleware, secret_key=os.urandom(24))
-is_gemini_enabled = os.environ.get("GEMINI_ENABLED", "False").lower() in ("true", "1", "yes")
+is_gemini_enabled = get_value_from_env_or_sm("GEMINI_ENABLED", "False", "bool")
 if is_gemini_enabled:
     add_routes(app,ChatVertexAI(), path="/vertexai")
@@ -381,7 +383,7 @@ async def post_processing(uri=Form(None), userName=Form(None), password=Form(Non
             api_name = 'post_processing/enable_hybrid_search_and_fulltext_search_in_bloom'
             logging.info(f'Full Text index created')
-        if os.environ.get('ENTITY_EMBEDDING','False').upper()=="TRUE" and "materialize_entity_similarities" in tasks:
+        if get_value_from_env_or_sm("ENTITY_EMBEDDING","False","bool") and "materialize_entity_similarities" in tasks:
             await asyncio.to_thread(create_entity_embedding, graph)
             api_name = 'post_processing/create_entity_embedding'
             logging.info(f'Entity Embeddings created')
@@ -551,13 +553,13 @@ async def connect(uri=Form(None), userName=Form(None), password=Form(None), data
         start = time.time()
         graph = create_graph_database_connection(uri, userName, password, database)
         result = await asyncio.to_thread(connection_check_and_get_vector_dimensions, graph, database)
-        gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
+        gcs_cache = get_value_from_env_or_sm("GCS_FILE_CACHE","False","bool")
         end = time.time()
         elapsed_time = end - start
         json_obj = {'api_name':'connect','db_url':uri, 'userName':userName, 'database':database, 'count':1, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email}
         logger.log_struct(json_obj, "INFO")
         result['elapsed_api_time'] = f'{elapsed_time:.2f}'
-        result['gcs_file_cache'] = gcs_file_cache
+        result['gcs_file_cache'] = gcs_cache
         return create_api_response('Success',data=result)
     except Exception as e:
         job_status = "Failed"
@@ -1034,11 +1036,11 @@ async def fetch_chunktext(
 async def backend_connection_configuration():
     try:
         start = time.time()
-        uri = os.getenv('NEO4J_URI')
-        username= os.getenv('NEO4J_USERNAME')
-        database= os.getenv('NEO4J_DATABASE')
-        password= os.getenv('NEO4J_PASSWORD')
-        gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
+        uri = get_value_from_env_or_sm("NEO4J_URI")
+        username= get_value_from_env_or_sm("NEO4J_USERNAME")
+        database= get_value_from_env_or_sm("NEO4J_DATABASE")
+        password= get_value_from_env_or_sm("NEO4J_PASSWORD")
+        gcs_cache = get_value_from_env_or_sm("GCS_FILE_CACHE","False","bool")
         if all([uri, username, database, password]):
             graph = Neo4jGraph()
             logging.info(f'login connection status of object: {graph}')
@@ -1046,7 +1048,7 @@ async def backend_connection_configuration():
             graph_connection = True
             graphDb_data_Access = graphDBdataAccess(graph)
             result = graphDb_data_Access.connection_check_and_get_vector_dimensions(database)
-            result['gcs_file_cache'] = gcs_file_cache
+            result['gcs_file_cache'] = gcs_cache
             result['uri'] = uri
             end = time.time()
             elapsed_time = end - start
diff --git a/backend/src/QA_integration.py b/backend/src/QA_integration.py
index 67e74e6eb..264e275db 100644
--- a/backend/src/QA_integration.py
+++ b/backend/src/QA_integration.py
@@ -33,12 +33,11 @@
 # Local imports
 from src.llm import get_llm
-from src.shared.common_fn import load_embedding_model
+from src.shared.common_fn import *
 from src.shared.constants import *
 load_dotenv()
-
-EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
-EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)
+embedding_model = get_value_from_env_or_sm("EMBEDDING_MODEL", "sentence_transformer")
+EMBEDDING_FUNCTION, _ = load_embedding_model(embedding_model)
 class SessionChatHistory:
     history_dict = {}
@@ -397,7 +396,7 @@ def get_neo4j_retriever(graph, document_names,chat_mode_settings, score_threshol
         neo_db = initialize_neo4j_vector(graph, chat_mode_settings)
         # document_names= list(map(str.strip, json.loads(document_names)))
         search_k = chat_mode_settings["top_k"]
-        ef_ratio = int(os.getenv("EFFECTIVE_SEARCH_RATIO", "2")) if os.getenv("EFFECTIVE_SEARCH_RATIO", "2").isdigit() else 2
+        ef_ratio = get_value_from_env_or_sm("EFFECTIVE_SEARCH_RATIO", 5, "int")
         retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold,ef_ratio)
         return retriever
     except Exception as e:
@@ -410,7 +409,7 @@ def setup_chat(model, graph, document_names, chat_mode_settings):
     start_time = time.time()
     try:
         if model == "diffbot":
-            model = os.getenv('DEFAULT_DIFFBOT_CHAT_MODEL')
+            model = get_value_from_env_or_sm("DEFAULT_DIFFBOT_CHAT_MODEL","openai_gpt_4o")
         llm, model_name = get_llm(model=model)
         logging.info(f"Model called in chat: {model} (version: {model_name})")
diff --git a/backend/src/communities.py b/backend/src/communities.py
index 0ecf493cc..146b98687 100644
--- a/backend/src/communities.py
+++ b/backend/src/communities.py
@@ -5,7 +5,7 @@
 from langchain_core.output_parsers import StrOutputParser
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import os
-from src.shared.common_fn import load_embedding_model
+from src.shared.common_fn import *
 COMMUNITY_PROJECTION_NAME = "communities"
@@ -194,9 +194,9 @@ def get_gds_driver(uri, username, password, database):
     try:
         if all(v is None for v in [username, password]):
-            username= os.getenv('NEO4J_USERNAME')
-            database= os.getenv('NEO4J_DATABASE')
-            password= os.getenv('NEO4J_PASSWORD')
+            username= get_value_from_env_or_sm('NEO4J_USERNAME')
+            database= get_value_from_env_or_sm('NEO4J_DATABASE')
+            password= get_value_from_env_or_sm('NEO4J_PASSWORD')
         gds = GraphDataScience(
             endpoint=uri,
@@ -351,9 +351,9 @@ def create_community_summaries(gds, model):
 def create_community_embeddings(gds):
     try:
-        embedding_model = os.getenv('EMBEDDING_MODEL')
+        embedding_model = get_value_from_env_or_sm("EMBEDDING_MODEL","sentence_transformer")
         embeddings, dimension = load_embedding_model(embedding_model)
-        logging.info(f"Embedding model '{embedding_model}' loaded successfully.")
+        logging.info(f"Embedding model '{embeddings}' loaded successfully.")
         logging.info("Fetching community details.")
         rows = gds.run_cypher(GET_COMMUNITY_DETAILS)
diff --git a/backend/src/create_chunks.py b/backend/src/create_chunks.py
index 523d2b77c..af994277b 100644
--- a/backend/src/create_chunks.py
+++ b/backend/src/create_chunks.py
@@ -1,5 +1,6 @@
 from langchain_text_splitters import TokenTextSplitter
 from langchain.docstore.document import Document
+from src.shared.common_fn import get_value_from_env_or_sm
 from langchain_neo4j import Neo4jGraph
 import logging
 from src.document_sources.youtube import get_chunks_with_timestamps, get_calculated_timestamps
@@ -26,7 +27,7 @@ def split_file_into_chunks(self,token_chunk_size, chunk_overlap):
         """
         logging.info("Split file into smaller chunks")
         text_splitter = TokenTextSplitter(chunk_size=token_chunk_size, chunk_overlap=chunk_overlap)
-        MAX_TOKEN_CHUNK_SIZE = int(os.getenv('MAX_TOKEN_CHUNK_SIZE', 10000))
+        MAX_TOKEN_CHUNK_SIZE = get_value_from_env_or_sm("MAX_TOKEN_CHUNK_SIZE", 10000, "int")
         chunk_to_be_created = int(MAX_TOKEN_CHUNK_SIZE / token_chunk_size)
         if 'page' in self.pages[0].metadata:
diff --git a/backend/src/document_sources/wikipedia.py b/backend/src/document_sources/wikipedia.py
index 163f971b4..2eb070d35 100644
--- a/backend/src/document_sources/wikipedia.py
+++ b/backend/src/document_sources/wikipedia.py
@@ -4,8 +4,8 @@
 def get_documents_from_Wikipedia(wiki_query:str, language:str):
   try:
-    pages = WikipediaLoader(query=wiki_query.strip(), lang=language, load_all_available_meta=False,doc_content_chars_max=100000,load_max_docs=1).load()
     file_name = wiki_query.strip()
+    pages = WikipediaLoader(query=wiki_query.strip(), lang=language, load_all_available_meta=False,doc_content_chars_max=100000,load_max_docs=1).load()
     logging.info(f"Total Pages from Wikipedia = {len(pages)}")
     return file_name, pages
   except Exception as e:
diff --git a/backend/src/document_sources/youtube.py b/backend/src/document_sources/youtube.py
index 82e9a9219..22a55a264 100644
--- a/backend/src/document_sources/youtube.py
+++ b/backend/src/document_sources/youtube.py
@@ -1,4 +1,5 @@
 from langchain.docstore.document import Document
+
 from src.shared.llm_graph_builder_exception import LLMGraphBuilderException
 from youtube_transcript_api import YouTubeTranscriptApi
 import logging
@@ -8,10 +9,11 @@
 from src.shared.constants import YOUTUBE_CHUNK_SIZE_SECONDS
 import os
 import re
+from src.shared.common_fn import get_value_from_env_or_sm
 def get_youtube_transcript(youtube_id):
   try:
-    proxy = os.environ.get("YOUTUBE_TRANSCRIPT_PROXY")
+    proxy = get_value_from_env_or_sm("YOUTUBE_TRANSCRIPT_PROXY")
     proxies = { 'https': proxy }
     transcript_pieces = YouTubeTranscriptApi.get_transcript(youtube_id, proxies = proxies)
     return transcript_pieces
diff --git a/backend/src/graphDB_dataAccess.py b/backend/src/graphDB_dataAccess.py
index 13fa1c53a..e978d8775 100644
--- a/backend/src/graphDB_dataAccess.py
+++ b/backend/src/graphDB_dataAccess.py
@@ -1,11 +1,12 @@
 import logging
 import os
 from langchain_neo4j import Neo4jGraph
-from src.shared.common_fn import create_gcs_bucket_folder_name_hashed, delete_uploaded_local_file, load_embedding_model
+from src.shared.common_fn import create_gcs_bucket_folder_name_hashed, delete_uploaded_local_file, get_value_from_env_or_sm, load_embedding_model
 from src.document_sources.gcs_bucket import delete_file_from_gcs
-from src.shared.constants import BUCKET_UPLOAD,NODEREL_COUNT_QUERY_WITH_COMMUNITY, NODEREL_COUNT_QUERY_WITHOUT_COMMUNITY
+from src.shared.constants import NODEREL_COUNT_QUERY_WITH_COMMUNITY, NODEREL_COUNT_QUERY_WITHOUT_COMMUNITY
 from src.entities.source_node import sourceNode
 from src.communities import MAX_COMMUNITY_LEVELS
+from src.shared.common_fn import *
 import json
 from dotenv import load_dotenv
@@ -147,7 +148,7 @@ def update_KNN_graph(self):
         """
         index = self.graph.query("""show indexes yield * where type = 'VECTOR' and name = 'vector'""")
         # logging.info(f'show index vector: {index}')
-        knn_min_score = os.environ.get('KNN_MIN_SCORE')
+        knn_min_score = get_value_from_env_or_sm("KNN_MIN_SCORE", 0.8, "float")
         if len(index) > 0:
             logging.info('update KNN graph')
             self.graph.query("""MATCH (c:Chunk)
@@ -237,8 +238,7 @@ def connection_check_and_get_vector_dimensions(self,database):
         result_chunks = self.graph.query("""match (c:Chunk) return size(c.embedding) as embeddingSize, count(*) as chunks, count(c.embedding) as hasEmbedding """)
-
-        embedding_model = os.getenv('EMBEDDING_MODEL')
+        embedding_model = get_value_from_env_or_sm("EMBEDDING_MODEL", "sentence_transformer")
         embeddings, application_dimension = load_embedding_model(embedding_model)
         logging.info(f'embedding model:{embeddings} and dimesion:{application_dimension}')
@@ -281,13 +281,14 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
         filename_list= list(map(str.strip, json.loads(filenames)))
         source_types_list= list(map(str.strip, json.loads(source_types)))
-        gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
+        gcs_cache = get_value_from_env_or_sm("GCS_FILE_CACHE","False","bool")
+        gcs_bucket_name_upload = get_value_from_env_or_sm("BUCKET_UPLOAD_FILE","llm-graph-builder-upload")
         for (file_name,source_type) in zip(filename_list, source_types_list):
             merged_file_path = os.path.join(merged_dir, file_name)
-            if source_type == 'local file' and gcs_file_cache == 'True':
+            if source_type == 'local file' and gcs_cache:
                 folder_name = create_gcs_bucket_folder_name_hashed(uri, file_name)
-                delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name)
+                delete_file_from_gcs(gcs_bucket_name_upload,folder_name,file_name)
             else:
                 logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
                 delete_uploaded_local_file(merged_file_path,file_name)
@@ -385,8 +386,8 @@ def delete_unconnected_nodes(self,unconnected_entities_list):
         return self.execute_query(query,param)
     def get_duplicate_nodes_list(self):
-        score_value = float(os.environ.get('DUPLICATE_SCORE_VALUE'))
-        text_distance = int(os.environ.get('DUPLICATE_TEXT_DISTANCE'))
+        score_value = get_value_from_env_or_sm("DUPLICATE_SCORE_VALUE", 0.97, "float")
+        text_distance = get_value_from_env_or_sm("DUPLICATE_TEXT_DISTANCE", 3, "int")
         query_duplicate_nodes = """
                 MATCH (n:!Chunk&!Session&!Document&!`__Community__`) with n
                 WHERE n.embedding is not null and n.id is not null // and size(toString(n.id)) > 3
@@ -458,7 +459,7 @@ def drop_create_vector_index(self, isVectorIndexExist):
         """
         drop and create the vector index when vector index dimesion are different.
""" - embedding_model = os.getenv('EMBEDDING_MODEL') + embedding_model = get_value_from_env_or_sm("EMBEDDING_MODEL", "sentence_transformer") embeddings, dimension = load_embedding_model(embedding_model) if isVectorIndexExist == 'true': diff --git a/backend/src/graph_query.py b/backend/src/graph_query.py index f8054061f..353b157a9 100644 --- a/backend/src/graph_query.py +++ b/backend/src/graph_query.py @@ -1,4 +1,5 @@ import logging +from src.shared.common_fn import get_value_from_env_or_sm from neo4j import time from neo4j import GraphDatabase import os @@ -17,13 +18,13 @@ def get_graphDB_driver(uri, username, password,database="neo4j"): try: logging.info(f"Attempting to connect to the Neo4j database at {uri}") if all(v is None for v in [username, password]): - username= os.getenv('NEO4J_USERNAME') - database= os.getenv('NEO4J_DATABASE') - password= os.getenv('NEO4J_PASSWORD') + username= get_value_from_env_or_sm('NEO4J_USERNAME') + database= get_value_from_env_or_sm('NEO4J_DATABASE') + password= get_value_from_env_or_sm('NEO4J_PASSWORD') - enable_user_agent = os.environ.get("ENABLE_USER_AGENT", "False").lower() in ("true", "1", "yes") + enable_user_agent = get_value_from_env_or_sm("ENABLE_USER_AGENT","False","bool") if enable_user_agent: - driver = GraphDatabase.driver(uri, auth=(username, password),database=database, user_agent=os.environ.get('NEO4J_USER_AGENT')) + driver = GraphDatabase.driver(uri, auth=(username, password),database=database, user_agent= get_value_from_env_or_sm("USER_AGENT","LLM-Graph-Builder")) else: driver = GraphDatabase.driver(uri, auth=(username, password),database=database) logging.info("Connection successful") diff --git a/backend/src/llm.py b/backend/src/llm.py index e16c149d9..c3334c798 100644 --- a/backend/src/llm.py +++ b/backend/src/llm.py @@ -1,6 +1,7 @@ import logging from langchain.docstore.document import Document import os +from src.shared.common_fn import * from langchain_openai import ChatOpenAI, AzureChatOpenAI from langchain_google_vertexai import ChatVertexAI from langchain_groq import ChatGroq @@ -20,7 +21,7 @@ def get_llm(model: str): """Retrieve the specified language model based on the model name.""" model = model.lower().strip() env_key = f"LLM_MODEL_CONFIG_{model}" - env_value = os.environ.get(env_key) + env_value = get_value_from_env_or_sm(env_key) if not env_value: err = f"Environment variable '{env_key}' is not defined as per format or missing" diff --git a/backend/src/logger.py b/backend/src/logger.py index 6a9822787..8eadda6cd 100644 --- a/backend/src/logger.py +++ b/backend/src/logger.py @@ -1,9 +1,11 @@ import os from google.cloud import logging as gclogger +from src.shared.common_fn import get_value_from_env_or_sm + class CustomLogger: def __init__(self): - self.is_gcp_log_enabled = os.environ.get("GCP_LOG_METRICS_ENABLED", "False").lower() in ("true", "1", "yes") + self.is_gcp_log_enabled = get_value_from_env_or_sm("GCP_LOG_METRICS_ENABLED", "False", "bool") if self.is_gcp_log_enabled: self.logging_client = gclogger.Client() self.logger_name = "llm_experiments_metrics" diff --git a/backend/src/main.py b/backend/src/main.py index c21e26f5a..16eabcbf5 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -1,5 +1,6 @@ from langchain_neo4j import Neo4jGraph -from src.shared.constants import (BUCKET_UPLOAD,BUCKET_FAILED_FILE, PROJECT_ID, QUERY_TO_GET_CHUNKS, +from src.shared.common_fn import * +from src.shared.constants import (QUERY_TO_GET_CHUNKS, QUERY_TO_DELETE_EXISTING_ENTITIES, QUERY_TO_GET_LAST_PROCESSED_CHUNK_POSITION, 
                                  QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY,
@@ -8,7 +9,6 @@
                                   DELETE_ENTITIES_AND_START_FROM_BEGINNING,
                                   QUERY_TO_GET_NODES_AND_RELATIONS_OF_A_DOCUMENT)
 from src.shared.schema_extraction import schema_extraction_from_text
-from dotenv import load_dotenv
 from datetime import datetime
 import logging
 from src.create_chunks import CreateChunksofDocument
@@ -20,7 +20,7 @@
 from src.document_sources.s3_bucket import *
 from src.document_sources.wikipedia import *
 from src.document_sources.youtube import *
-from src.shared.common_fn import *
+
 from src.make_relationships import *
 from src.document_sources.web_pages import *
 import re
@@ -33,7 +33,7 @@
 from src.shared.llm_graph_builder_exception import LLMGraphBuilderException
 warnings.filterwarnings("ignore")
-load_dotenv()
+
 logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
 def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id, aws_secret_access_key, source_type):
@@ -229,11 +229,13 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
 async def extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, fileName, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions):
     logging.info(f'Process file name :{fileName}')
+    gcs_file_cache = get_value_from_env_or_sm("GCS_FILE_CACHE","False","bool")
+    gcs_bucket_name_upload = get_value_from_env_or_sm("BUCKET_UPLOAD_FILE","llm-graph-builder-upload")
     if not retry_condition:
-        gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
-        if gcs_file_cache == 'True':
+        if gcs_file_cache:
+            project_id = get_value_from_env_or_sm("PROJECT_ID", "llm-experiments-387609")
             folder_name = create_gcs_bucket_folder_name_hashed(uri, fileName)
-            file_name, pages = get_documents_from_gcs( PROJECT_ID, BUCKET_UPLOAD, folder_name, fileName)
+            file_name, pages = get_documents_from_gcs( project_id, gcs_bucket_name_upload, folder_name, fileName)
         else:
             file_name, pages, file_extension = get_documents_from_file_by_path(merged_file_path,fileName)
         if pages==None or len(pages)==0:
@@ -365,7 +367,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
             uri_latency["update_source_node"] = f'{elapsed_update_source_node:.2f}'
             logging.info('Update the status as Processing')
-            update_graph_chunk_processed = int(os.environ.get('UPDATE_GRAPH_CHUNKS_PROCESSED'))
+            update_graph_chunk_processed = get_value_from_env_or_sm("UPDATE_GRAPH_CHUNKS_PROCESSED", 20, "int")
             # selected_chunks = []
             is_cancelled_status = False
             job_status = "Completed"
@@ -424,17 +426,16 @@ async def processing_source(uri, userName, password, database, model, file_name,
                 graphDb_data_Access.update_source_node(obj_source_node)
                 graphDb_data_Access.update_node_relationship_count(file_name)
+                gcs_file_cache = get_value_from_env_or_sm("GCS_FILE_CACHE","False","bool")
+                gcs_bucket_name_upload = get_value_from_env_or_sm("BUCKET_UPLOAD_FILE", "llm-graph-builder-upload")
                 logging.info('Updated the nodeCount and relCount properties in Document node')
                 logging.info(f'file:{file_name} extraction has been completed')
-
-                # merged_file_path have value only when file uploaded from local
                 if is_uploaded_from_local:
-                    gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
-                    if gcs_file_cache == 'True':
+                    if gcs_file_cache:
                         folder_name = create_gcs_bucket_folder_name_hashed(uri, file_name)
-                        delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name)
+                        delete_file_from_gcs(gcs_bucket_name_upload, folder_name, file_name)
                     else:
                         delete_uploaded_local_file(merged_file_path, file_name)
                 processing_source_func = time.time() - processing_source_start_time
@@ -629,13 +630,11 @@ def merge_chunks_local(file_name, total_chunks, chunk_dir, merged_dir):
 def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, originalname, uri, chunk_dir, merged_dir):
-
-    gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
-    logging.info(f'gcs file cache: {gcs_file_cache}')
-
-    if gcs_file_cache == 'True':
+    gcs_file_cache = get_value_from_env_or_sm("GCS_FILE_CACHE","False","bool")
+    gcs_bucket_name_upload = get_value_from_env_or_sm("BUCKET_UPLOAD_FILE", "llm-graph-builder-upload")
+    if gcs_file_cache:
         folder_name = create_gcs_bucket_folder_name_hashed(uri,originalname)
-        upload_file_to_gcs(chunk, chunk_number, originalname, BUCKET_UPLOAD, folder_name)
+        upload_file_to_gcs(chunk, chunk_number, originalname, gcs_bucket_name_upload, folder_name)
     else:
         if not os.path.exists(chunk_dir):
             os.mkdir(chunk_dir)
@@ -648,8 +647,8 @@ def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, origina
     if int(chunk_number) == int(total_chunks):
         # If this is the last chunk, merge all chunks into a single file
-        if gcs_file_cache == 'True':
-            file_size = merge_file_gcs(BUCKET_UPLOAD, originalname, folder_name, int(total_chunks))
+        if gcs_file_cache:
+            file_size = merge_file_gcs(gcs_bucket_name_upload, originalname, folder_name, int(total_chunks))
         else:
             file_size = merge_chunks_local(originalname, int(total_chunks), chunk_dir, merged_dir)
@@ -695,7 +694,8 @@ def manually_cancelled_job(graph, filenames, source_types, merged_dir, uri):
     filename_list= list(map(str.strip, json.loads(filenames)))
     source_types_list= list(map(str.strip, json.loads(source_types)))
-    gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
+    gcs_file_cache = get_value_from_env_or_sm("GCS_FILE_CACHE","False","bool")
+    gcs_bucket_name_upload = get_value_from_env_or_sm("BUCKET_UPLOAD_FILE", "llm-graph-builder-upload")
     for (file_name,source_type) in zip(filename_list, source_types_list):
         obj_source_node = sourceNode()
@@ -708,9 +708,9 @@
         count_response = graphDb_data_Access.update_node_relationship_count(file_name)
         obj_source_node = None
         merged_file_path = os.path.join(merged_dir, file_name)
-        if source_type == 'local file' and gcs_file_cache == 'True':
+        if source_type == 'local file' and gcs_file_cache:
             folder_name = create_gcs_bucket_folder_name_hashed(uri, file_name)
-            delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name)
+            delete_file_from_gcs(gcs_bucket_name_upload, folder_name, file_name)
         else:
             logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
             delete_uploaded_local_file(merged_file_path,file_name)
@@ -748,12 +748,14 @@ def set_status_retry(graph, file_name, retry_condition):
     graphDb_data_Access.update_source_node(obj_source_node)
 def failed_file_process(uri,file_name, merged_file_path):
-    gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
-    if gcs_file_cache == 'True':
+    gcs_file_cache = get_value_from_env_or_sm("GCS_FILE_CACHE","False","bool")
+    gcs_bucket_name_upload = get_value_from_env_or_sm("BUCKET_UPLOAD_FILE", "llm-graph-builder-upload")
+    gcs_bucket_name_failed = get_value_from_env_or_sm("BUCKET_FAILED_FILE", "llm-graph-builder-failed")
+    if gcs_file_cache:
         folder_name = create_gcs_bucket_folder_name_hashed(uri,file_name)
-        copy_failed_file(BUCKET_UPLOAD, BUCKET_FAILED_FILE, folder_name, file_name)
+        copy_failed_file(gcs_bucket_name_upload, gcs_bucket_name_failed, folder_name, file_name)
         time.sleep(5)
-        delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name)
+        delete_file_from_gcs(gcs_bucket_name_upload,folder_name,file_name)
     else:
         logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
         delete_uploaded_local_file(merged_file_path,file_name)
\ No newline at end of file
diff --git a/backend/src/make_relationships.py b/backend/src/make_relationships.py
index bccfa1ddd..b5ec7c295 100644
--- a/backend/src/make_relationships.py
+++ b/backend/src/make_relationships.py
@@ -1,17 +1,15 @@
 from langchain_neo4j import Neo4jGraph
 from langchain.docstore.document import Document
-from src.shared.common_fn import load_embedding_model
+from src.shared.common_fn import *
 import logging
 from typing import List
-import os
 import hashlib
 import time
 from langchain_neo4j import Neo4jVector
 logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
-
-EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
-EMBEDDING_FUNCTION , EMBEDDING_DIMENSION = load_embedding_model(EMBEDDING_MODEL)
+embedding_model = get_value_from_env_or_sm("EMBEDDING_MODEL", "sentence_transformer")
+EMBEDDING_FUNCTION, EMBEDDING_DIMENSION = load_embedding_model(embedding_model)
 def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
     batch_data = []
@@ -37,14 +35,14 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
 def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
-    isEmbedding = os.getenv('IS_EMBEDDING')
+    isEmbedding = get_value_from_env_or_sm("IS_EMBEDDING", "True", "bool")
     embeddings, dimension = EMBEDDING_FUNCTION , EMBEDDING_DIMENSION
     logging.info(f'embedding model:{embeddings} and dimesion:{dimension}')
     data_for_query = []
     logging.info(f"update embedding and vector index for chunks")
     for row in chunkId_chunkDoc_list:
-        if isEmbedding.upper() == "TRUE":
+        if isEmbedding:
             embeddings_arr = embeddings.embed_query(row['chunk_doc'].page_content)
             data_for_query.append({
diff --git a/backend/src/post_processing.py b/backend/src/post_processing.py
index cdc8b06d3..5d54f98ec 100644
--- a/backend/src/post_processing.py
+++ b/backend/src/post_processing.py
@@ -4,7 +4,7 @@
 from langchain_neo4j import Neo4jGraph
 import os
 from src.graph_query import get_graphDB_driver
-from src.shared.common_fn import load_embedding_model
+from src.shared.common_fn import *
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from src.shared.constants import GRAPH_CLEANUP_PROMPT
@@ -131,7 +131,7 @@ def create_fulltext(driver,type):
 def create_vector_fulltext_indexes(uri, username, password, database):
     types = ["entities", "hybrid"]
-    embedding_model = os.getenv('EMBEDDING_MODEL')
+    embedding_model = get_value_from_env_or_sm("EMBEDDING_MODEL", "sentence_transformer")
     embeddings, dimension = load_embedding_model(embedding_model)
     if not dimension:
         dimension = CHUNK_VECTOR_EMBEDDING_DIMENSION
@@ -184,7 +184,7 @@ def fetch_entities_for_embedding(graph):
     return [{"elementId": record["elementId"], "text": record["text"]} for record in result]
 def update_embeddings(rows, graph):
-    embedding_model = os.getenv('EMBEDDING_MODEL')
+    embedding_model = get_value_from_env_or_sm("EMBEDDING_MODEL", "sentence_transformer")
     embeddings, dimension = load_embedding_model(embedding_model)
     logging.info(f"update embedding for entities")
     for row in rows:
@@ -204,7 +204,7 @@ def graph_schema_consolidation(graph):
     messages=[("system", GRAPH_CLEANUP_PROMPT), ("human", "{input}")],
     partial_variables={"format_instructions": parser.get_format_instructions()}
     )
-    graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o')
+    graph_cleanup_model = get_value_from_env_or_sm("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o')
     llm, _ = get_llm(graph_cleanup_model)
     chain = prompt | llm | parser
diff --git a/backend/src/ragas_eval.py b/backend/src/ragas_eval.py
index 251ab71c0..ebf7701b4 100644
--- a/backend/src/ragas_eval.py
+++ b/backend/src/ragas_eval.py
@@ -6,7 +6,7 @@
 from dotenv import load_dotenv
 from ragas import evaluate
 from ragas.metrics import answer_relevancy, faithfulness,context_entity_recall
-from src.shared.common_fn import load_embedding_model
+from src.shared.common_fn import get_value_from_env_or_sm, load_embedding_model
 from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics import RougeScore, SemanticSimilarity, ContextEntityRecall
 from ragas.llms import LangchainLLMWrapper
@@ -16,9 +16,9 @@
 nltk.download('punkt')
 load_dotenv()
-EMBEDDING_MODEL = os.getenv("RAGAS_EMBEDDING_MODEL")
-logging.info(f"Loading embedding model '{EMBEDDING_MODEL}' for ragas evaluation")
-EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)
+ragas_embedding_model = get_value_from_env_or_sm("RAGAS_EMBEDDING_MODEL","openai")
+logging.info(f"Loading embedding model '{ragas_embedding_model}' for ragas evaluation")
+EMBEDDING_FUNCTION, _ = load_embedding_model(ragas_embedding_model)
 def get_ragas_metrics(question: str, context: list, answer: list, model: str):
     """Calculates RAGAS metrics."""
diff --git a/backend/src/shared/common_fn.py b/backend/src/shared/common_fn.py
index d95626bb3..649b81b51 100644
--- a/backend/src/shared/common_fn.py
+++ b/backend/src/shared/common_fn.py
@@ -1,6 +1,5 @@
 import hashlib
 import logging
-from src.document_sources.youtube import create_youtube_url
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_google_vertexai import VertexAIEmbeddings
 from langchain_openai import OpenAIEmbeddings
@@ -9,6 +8,11 @@
 from typing import List
 import re
 import os
+import json
+import logging
+from typing import Any
+from google.cloud import secretmanager
+from google.api_core.exceptions import NotFound, PermissionDenied
 from pathlib import Path
 from urllib.parse import urlparse
 import boto3
@@ -16,6 +20,7 @@
 def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
     language=''
+    from src.document_sources.youtube import create_youtube_url
     try:
         logging.info(f"incoming URL: {yt_url}")
         if source_type == 'youtube':
@@ -30,15 +35,11 @@ def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
         wiki_query_id=''
         #pattern = r"https?:\/\/([a-zA-Z0-9\.\,\_\-\/]+)\.wikipedia\.([a-zA-Z]{2,3})\/wiki\/([a-zA-Z0-9\.\,\_\-\/]+)"
         wikipedia_url_regex = r'https?:\/\/(www\.)?([a-zA-Z]{2,3})\.wikipedia\.org\/wiki\/(.*)'
-        wiki_id_pattern = r'^[a-zA-Z0-9 _\-\.\,\:\(\)\[\]\{\}\/]*$'
         match = re.search(wikipedia_url_regex, wiki_query.strip())
         if match:
             language = match.group(2)
             wiki_query_id = match.group(3)
-        # else :
-        #     languages.append("en")
-        #     wiki_query_ids.append(wiki_url.strip())
         else:
             raise Exception(f'Not a valid wikipedia url: {wiki_query} ')
@@ -59,9 +60,9 @@ def get_chunk_and_graphDocument(graph_document_list, chunkId_chunkDoc_list):
     return lst_chunk_chunkId_document
 def create_graph_database_connection(uri, userName, password, database):
-    enable_user_agent = os.environ.get("ENABLE_USER_AGENT", "False").lower() in ("true", "1", "yes")
+    enable_user_agent = get_value_from_env_or_sm("ENABLE_USER_AGENT", "False", "bool")
     if enable_user_agent:
-        graph = Neo4jGraph(url=uri, database=database, username=userName, password=password, refresh_schema=False, sanitize=True,driver_config={'user_agent':os.environ.get('NEO4J_USER_AGENT')})
+        graph = Neo4jGraph(url=uri, database=database, username=userName, password=password, refresh_schema=False, sanitize=True,driver_config={'user_agent':get_value_from_env_or_sm("USER_AGENT","LLM-Graph-Builder")})
     else:
         graph = Neo4jGraph(url=uri, database=database, username=userName, password=password, refresh_schema=False, sanitize=True)
     return graph
@@ -82,7 +83,7 @@ def load_embedding_model(embedding_model_name: str):
         embeddings = get_bedrock_embeddings()
         dimension = 1536
         logging.info(f"Embedding: Using bedrock titan Embeddings , Dimension:{dimension}")
-    else:
+    elif embedding_model_name == "sentence_transformer":
         embeddings = HuggingFaceEmbeddings(
             model_name="all-MiniLM-L6-v2"#, cache_folder="/embedding_model"
         )
@@ -150,7 +151,7 @@ def get_bedrock_embeddings():
         BedrockEmbeddings: An instance of the BedrockEmbeddings class.
     """
     try:
-        env_value = os.getenv("BEDROCK_EMBEDDING_MODEL")
+        env_value = get_value_from_env_or_sm("BEDROCK_EMBEDDING_MODEL")
         if not env_value:
             raise ValueError("Environment variable 'BEDROCK_EMBEDDING_MODEL' is not set.")
         try:
@@ -174,3 +175,61 @@ def get_bedrock_embeddings():
     except Exception as e:
         print(f"An unexpected error occurred: {e}")
         raise
+
+def get_value_from_env_or_sm(key_name: str, default_value: Any = None, data_type: str = "str"):
+    """
+    Fetches a configuration value, converted to the specified data type.
+    If GET_VALUE_FROM_SECRET_MANAGER is True, the value is read from Google Cloud Secret Manager,
+    falling back to the environment when the secret is missing; otherwise it is read from the local .env file.
+    Args:
+        key_name (str): Name of the secret or environment variable.
+        default_value (Any): Default value returned when the key is not found anywhere.
+        data_type (str): Expected data type ("str", "int", "float", "bool", "list", "dict").
+    Returns:
+        Converted value of the secret or environment variable.
+    """
+    use_secret_manager = os.getenv("GET_VALUE_FROM_SECRET_MANAGER","False").lower() in ["true", "1", "yes"]
+    try:
+        if use_secret_manager:
+            project_id = os.getenv("PROJECT_ID", "llm-experiments-387609")
+            client = secretmanager.SecretManagerServiceClient()
+            secret_path = f"projects/{project_id}/secrets/{key_name}/versions/latest"
+
+            response = client.access_secret_version(request={"name": secret_path})
+            value = response.payload.data.decode("UTF-8")
+        else:
+            value = os.getenv(key_name, None)
+    except (NotFound, PermissionDenied):
+        try:
+            logging.warning(f"key {key_name} not found in Secret Manager. Checking environment variable.")
+            env_value = os.getenv(key_name, None)
+            if env_value is None and default_value is not None:
+                return convert_type(default_value, data_type)
+            elif env_value is None and default_value is None:
+                raise Exception(f"env {key_name} value not found")
+            else:
+                return convert_type(env_value, data_type)
+        except Exception as e:
+            raise Exception(f"env {key_name} value not found")
+
+    if value is None and default_value is not None:
+        return convert_type(default_value, data_type) # Return the default when the key is found neither in Secret Manager nor in .env
+
+    return convert_type(value, data_type)
+
+
+def convert_type(value: Any, data_type: str = "str"):
+    """Convert a value to the specified data type."""
+    try:
+        if data_type == "int":
+            return int(value)
+        elif data_type == "float":
+            return float(value)
+        elif data_type == "bool":
+            return value.lower() in ["true", "1", "yes"]
+        elif data_type in ("list", "dict"):
+            return json.loads(value)  # Convert JSON strings to list/dict
+        return value  # Default to string
+    except Exception as e:
+        logging.error(f"Type conversion error: {e}")
+        return None
\ No newline at end of file
diff --git a/backend/src/shared/constants.py b/backend/src/shared/constants.py
index b85654c8c..fdb6ef55a 100644
--- a/backend/src/shared/constants.py
+++ b/backend/src/shared/constants.py
@@ -1,11 +1,7 @@
 OPENAI_MODELS = ["openai-gpt-3.5", "openai-gpt-4o", "openai-gpt-4o-mini"]
 GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"]
 GROQ_MODELS = ["groq-llama3"]
-BUCKET_UPLOAD = 'llm-graph-builder-upload'
-BUCKET_FAILED_FILE = 'llm-graph-builder-failed'
-PROJECT_ID = 'llm-experiments-387609'
-GRAPH_CHUNK_LIMIT = 50
-
+GRAPH_CHUNK_LIMIT = 50
 #query
 GRAPH_QUERY = """
diff --git a/backend/test_integrationqa.py b/backend/test_integrationqa.py
index 2d7a5c5e5..9f0210d15 100644
--- a/backend/test_integrationqa.py
+++ b/backend/test_integrationqa.py
@@ -8,14 +8,13 @@
 from dotenv import load_dotenv
 from src.main import *
 from src.QA_integration import QA_RAG
-from src.ragas_eval import get_ragas_metrics
-from datasets import Dataset
 # Load environment variables
 load_dotenv()
-URI = os.getenv('NEO4J_URI')
-USERNAME = os.getenv('NEO4J_USERNAME')
-PASSWORD = os.getenv('NEO4J_PASSWORD')
-DATABASE = os.getenv('NEO4J_DATABASE')
+
+URI = get_value_from_env_or_sm('NEO4J_URI')
+USERNAME = get_value_from_env_or_sm('NEO4J_USERNAME')
+PASSWORD = get_value_from_env_or_sm('NEO4J_PASSWORD')
+DATABASE = get_value_from_env_or_sm('NEO4J_DATABASE')
 # Logging configuration
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 # Directory Paths
diff --git a/docker-compose.yml b/docker-compose.yml
index 4b166f490..3087027a7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,28 +11,21 @@ services:
         - NEO4J_URI=${NEO4J_URI-neo4j://database:7687}
         - NEO4J_PASSWORD=${NEO4J_PASSWORD-password}
         - NEO4J_USERNAME=${NEO4J_USERNAME-neo4j}
-        - OPENAI_API_KEY=${OPENAI_API_KEY-}
-        - DIFFBOT_API_KEY=${DIFFBOT_API_KEY-}
-        - EMBEDDING_MODEL=${EMBEDDING_MODEL-all-MiniLM-L6-v2}
-        - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-}
-        - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-}
-        - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT-}
-        - LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY-}
-        - KNN_MIN_SCORE=${KNN_MIN_SCORE-0.94}
-        - IS_EMBEDDING=${IS_EMBEDDING-true}
-        - GEMINI_ENABLED=${GEMINI_ENABLED-False}
-        - GCP_LOG_METRICS_ENABLED=${GCP_LOG_METRICS_ENABLED-False}
-        - UPDATE_GRAPH_CHUNKS_PROCESSED=${UPDATE_GRAPH_CHUNKS_PROCESSED-20}
-        - NUMBER_OF_CHUNKS_TO_COMBINE=${NUMBER_OF_CHUNKS_TO_COMBINE-6}
-        - ENTITY_EMBEDDING=${ENTITY_EMBEDDING-False}
-        - GCS_FILE_CACHE=${GCS_FILE_CACHE-False}
-#        - LLM_MODEL_CONFIG_anthropic_claude_35_sonnet=${LLM_MODEL_CONFIG_anthropic_claude_35_sonnet-}
-#        - LLM_MODEL_CONFIG_fireworks_llama_v3_70b=${LLM_MODEL_CONFIG_fireworks_llama_v3_70b-}
-#        - LLM_MODEL_CONFIG_azure_ai_gpt_4o=${LLM_MODEL_CONFIG_azure_ai_gpt_4o-}
-#        - LLM_MODEL_CONFIG_azure_ai_gpt_35=${LLM_MODEL_CONFIG_azure_ai_gpt_35-}
-#        - LLM_MODEL_CONFIG_groq_llama3_70b=${LLM_MODEL_CONFIG_groq_llama3_70b-}
-#        - LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet=${LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet-}
-#        - LLM_MODEL_CONFIG_fireworks_qwen_72b=${LLM_MODEL_CONFIG_fireworks_qwen_72b-}
+        - OPENAI_API_KEY=${OPENAI_API_KEY}
+        - DIFFBOT_API_KEY=${DIFFBOT_API_KEY}
+        - EMBEDDING_MODEL=${EMBEDDING_MODEL}
+        - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT}
+        - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2}
+        - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
+        - LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
+        - KNN_MIN_SCORE=${KNN_MIN_SCORE}
+        - IS_EMBEDDING=${IS_EMBEDDING}
+        - GEMINI_ENABLED=${GEMINI_ENABLED}
+        - GCP_LOG_METRICS_ENABLED=${GCP_LOG_METRICS_ENABLED}
+        - UPDATE_GRAPH_CHUNKS_PROCESSED=${UPDATE_GRAPH_CHUNKS_PROCESSED}
+        - NUMBER_OF_CHUNKS_TO_COMBINE=${NUMBER_OF_CHUNKS_TO_COMBINE}
+        - ENTITY_EMBEDDING=${ENTITY_EMBEDDING}
+        - GCS_FILE_CACHE=${GCS_FILE_CACHE}
         - LLM_MODEL_CONFIG_ollama_llama3=${LLM_MODEL_CONFIG_ollama_llama3-}
     container_name: backend
     extra_hosts:
diff --git a/docs/project_docs.adoc b/docs/project_docs.adoc
index 22e997217..fa827d731 100644
--- a/docs/project_docs.adoc
+++ b/docs/project_docs.adoc
@@ -61,7 +61,6 @@ LANGCHAIN_API_KEY = ""
 LANGCHAIN_PROJECT = ""
 LANGCHAIN_TRACING_V2 = ""
 LANGCHAIN_ENDPOINT = ""
-NUMBER_OF_CHUNKS_TO_COMBINE = ""
 ....
 == Architecture
diff --git a/frontend/example.env b/frontend/example.env
index 1576bbea0..306e6a21f 100644
--- a/frontend/example.env
+++ b/frontend/example.env
@@ -6,7 +6,7 @@ VITE_ENV="DEV"
 VITE_TIME_PER_PAGE=50
 VITE_CHUNK_SIZE=5242880
 VITE_CHUNK_OVERLAP=20
-VITE_TOKENS_PER_CHUNK=100
+VITE_TOKENS_PER_CHUNK=200
 VITE_CHUNK_TO_COMBINE=1
 VITE_LARGE_FILE_SIZE=5242880
 VITE_GOOGLE_CLIENT_ID=""
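
A minimal usage sketch of the new get_value_from_env_or_sm helper added in backend/src/shared/common_fn.py, assuming GET_VALUE_FROM_SECRET_MANAGER is left unset so every lookup falls back to the local .env file; the keys and defaults shown mirror the ones used elsewhere in this change:

from src.shared.common_fn import get_value_from_env_or_sm

# When GET_VALUE_FROM_SECRET_MANAGER is true, each call queries Google Cloud
# Secret Manager first and falls back to the environment; otherwise it reads
# the environment directly, returning the converted default when a key is absent.
knn_min_score = get_value_from_env_or_sm("KNN_MIN_SCORE", 0.8, "float")        # -> 0.8 (float) when unset
gcs_file_cache = get_value_from_env_or_sm("GCS_FILE_CACHE", "False", "bool")   # -> False (bool) when unset
upload_bucket = get_value_from_env_or_sm("BUCKET_UPLOAD_FILE", "llm-graph-builder-upload")  # -> str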