diff --git a/backend/cdk/glue/scripts/clean-survey-monkey-data.py b/backend/cdk/glue/scripts/clean-survey-monkey-data.py
new file mode 100644
index 0000000..0eb610f
--- /dev/null
+++ b/backend/cdk/glue/scripts/clean-survey-monkey-data.py
@@ -0,0 +1,825 @@
+import boto3
+import pandas as pd
+import numpy as np
+import re
+import sys
+import datetime
+from fuzzywuzzy import fuzz
+import collections
+comprehend = boto3.client('comprehend', region_name='ca-central-1')
+
+# Glue parameters
+from awsglue.utils import getResolvedOptions
+# In the Job details tab, each parameter name has an extra -- prefix; Glue expects it.
+# This retrieves the job parameters from the Job details tab into the Glue Python environment.
+args = getResolvedOptions(sys.argv, ["BUCKET_NAME", "INSTITUTION_DATA_S3_URI", "SURVEY_MONKEY_S3URI"])
+
+BUCKET_NAME = args["BUCKET_NAME"]
+INSTITUTION_DATA_S3_URI = args["INSTITUTION_DATA_S3_URI"]
+SURVEY_MONKEY_S3URI = args["SURVEY_MONKEY_S3URI"]
+
+def return_df(bucket, data_key):
+
+    if "s3://" in data_key:  # TIEN a full s3 URI is passed
+        data_location = data_key
+    else:
+        data_location = 's3://{}/{}'.format(bucket, data_key)
+
+    if data_location.endswith('csv'):
+        df = pd.read_csv(data_location)
+    else:
+        df = pd.read_excel(data_location)
+
+    delete_columns = []
+
+    if '"Small TLEF Project Proposal" completion status' in list(df.columns):
+        delete_columns.append('"Small TLEF Project Proposal" completion status')
+    if '"Small TLEF Project Proposal" completion date/time' in list(df.columns):
+        delete_columns.append('"Small TLEF Project Proposal" completion date/time')
+
+    df = df.drop(columns=delete_columns)
+
+    if 'Project Title (200 characters max.)' in list(df.columns):
+        df.dropna(subset=['Project Title (200 characters max.)'], inplace=True)
+    return df
+
+def empty_to_zero(df):
+    return df.fillna(0)
+
+def empty_to_blank(df):
+    return df.fillna('')
+
+def change_na_or_no_to_blank(df, column_name):
+
+    # Blank out exact "not applicable" style answers, e.g. "No", "None.", "Not applicable", "n/a", "NA".
+    df[column_name] = df[column_name].apply(
+        lambda x: re.sub(
+            r'^(No(t\sapplicable\.?|ne\.?)|n(o(t\sapplicable\.?|ne\.?)|/a\.?)|N(/[Aa]|[Aa])\.?)$', '', str(x)
+        ))
+
+    # Blank out answers that begin with "No" as a whole word (e.g., "No special requirements.").
+    # Anchored at the start of the string so legitimate text that merely contains "no"
+    # (e.g., "Piano lab required") is preserved.
+    df[column_name] = df[column_name].apply(
+        lambda x: re.sub(
+            r'^[Nn]o\b.*$', '', str(x)
+        ))
+
+    return df
+
+def course_info_formatting(s):
+    # Insert spaces at the boundaries between letter runs and digit runs,
+    # splitting a compact course string into subject code, number, and suffix.
+    parts = []
+    current_part = ''
+
+    # Iterate over each character in the input string.
+    for char in s:
+        if char.isdigit():
+            if current_part and not current_part[-1].isdigit():
+                # A letter run just ended: flush it and start a digit run.
+                parts.append(current_part)
+                current_part = char
+            else:
+                current_part += char
+        else:
+            if current_part and current_part[-1].isdigit():
+                # A digit run just ended: flush it and start a letter run.
+                parts.append(current_part)
+                current_part = char
+            else:
+                current_part += char
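+    # For example (derived from the rules above): 'MICB300' -> 'MICB 300',
+    # and 'BIOL111A' -> 'BIOL 111 A' once the final part is flushed below.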
+ + # After finishing the loop, if there is any remaining part that hasn't been added to the parts list: + if current_part: + parts.append(current_part) # Add the final part to the parts list. + + return ' '.join(parts) + +def course_info_mapping(df, column_list): + # Compile a regular expression pattern to match course codes (e.g., "CS101") and optional suffixes (e.g., "A/B/C"). + course_info_pattern = re.compile(r'([A-Z]+\d+)([A-Z/]*)') + + # Iterate over each column name provided in column_list. + for column_name in column_list: + new_column_values = [] # Initialize a list to store the new, cleaned values for the current column. + + # Iterate over each value in the current column by its index. + for value_idx in range(len(list(df[column_name]))): + + this_course_info = ''.join(str(df[column_name].iloc[value_idx]).upper().split()) # Extract and clean the current value from the column, making it uppercase and removing spaces. + course_info = re.findall(course_info_pattern, this_course_info) # Use the compiled regex to find all matches of the course code pattern in the cleaned string. + + # If at least one course code is found in the string: + if course_info: + course_code = course_info[0][0] # Extract the base course code from the first match. + suffix = course_info[0][1].strip('/') # Extract and clean the suffix from the first match, removing any trailing slashes. + + # If a suffix exists after cleaning: + if suffix: + # If the suffix contains slashes, indicating multiple course codes are mentioned: + if '/' in suffix: + all_course_codes = [] # Initialize a list to hold all formatted course codes. + detailed_codes = suffix.split('/') # Split the suffix at slashes to get individual codes. + for d_code in detailed_codes: # Iterate over each individual code from the suffix. + this_course_code = course_code + d_code # Append the individual code to the base course code. + all_course_codes.append(course_info_formatting(this_course_code)) # Format and add the full course code to the list. + new_column_values.append(all_course_codes) # Add the list of formatted course codes to the new column values. + continue # Skip to the next value in the column, as processing for this value is complete. + + else: + # If there's a suffix without slashes, append it directly to the base course code. + course_code += suffix + + # Format the complete course code (with or without suffix). + course_code = course_info_formatting(course_code) + new_column_values.append([course_code]) # Add the formatted course code to the new column values as a list containing a single element. + + else: + # If no course code is found, append a blank to the new column values. 
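+                # (Illustrative: free-text entries such as 'Multiple sections' or 'TBD'
+                # contain no LETTERS+DIGITS course pattern, so they fall through to this branch.)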
+ new_column_values.append('') + + df['cleaned_'+column_name] = new_column_values + + return df + +def fill_in_years(mentioned_years): + mentioned_years = [int(year) for year in mentioned_years] + + min_year = min(mentioned_years) + max_year = max(mentioned_years) + + elapsed_years = list(set([str(i) for i in range(min_year, max_year + 1)])) + return elapsed_years + +def check_string_for_20(s): + + # Check if '2020' is a standalone year in the string + if '2020' in s: + return s.replace('20', ''), True + + # Find all occurrences of '20' in the string + index = 0 + while index < len(s): + index = s.find('20', index) + if index == -1: + break + + # Check if '20' is part of a 21st century year e.g., "2023", "2024-2025" + if (index > 0 and s[index - 1].isdigit() and s[index + 2:index + 4].isdigit()): + # Move past this occurrence + index += 2 + continue + + # Check if '20' is after a '-' or standalone e.g., "20" or "2019-20" AND there is nothing after "20" or it's not a number + if (index == 0 or s[index - 1] == '-') and (index + 2 >= len(s) or not s[index + 2].isdigit()): + return s.replace('20', ''), True + + # Move to the next character to continue the search + index += 1 + + # If none of the conditions are met, return False + return s.replace('20', ''), False + +def remove_except_numbers_dash_and_onward_no_space(s): + result = "" + i = 0 + + # Iterate over each character in the string + while i < len(s): + if s[i].isdigit() or s[i] == '-' or s[i] == '/' or s[i] == '&': + result += s[i] + elif s[i:i+6].lower() == 'onward': + result += 'onward' + i += 5 # Skip the length of 'onward' minus one + elif s[i:i+4].lower() == 'and': + result += 'and' + i += 3 # Skip the length of 'and' minus one + i += 1 + + return result + +def remove_numeric_after_slash(s): + result = "" + i = 0 + while i < len(s): + # If the current character is '/' and the next character is a digit + if s[i] == '/' and i + 1 < len(s) and s[i + 1].isdigit(): + # Skip the '/' and the following numeric characters + i += 1 + while i < len(s) and s[i].isdigit(): + i += 1 + else: + # Add the current character to the result and move to the next character + result += s[i] + i += 1 + return result + +def process_colon(s): + # Only extracting year data if the input was '2024-05-01 00:00:00' + if ':' in s: + return s[:4] + else: + return s + +def term_mapping(df, column_list): + terms = ['sep', 'jan', 'may', 'jul'] + for column_name in column_list: + + for value_idx in range(len(list(df[column_name]))): + mentioned_terms = [] + + for term in terms: + if term in str(df[column_name].iloc[value_idx]).lower(): + mentioned_terms.append(term) + + if mentioned_terms: + df[column_name].iloc[value_idx] = ','.join(mentioned_terms) + else: + df[column_name].iloc[value_idx] = '' + + return df + +def year_mapping(df, column_list, year_start=0, upto_future_year=5): + today = datetime.date.today() + this_year = int(str(today.year)[-2:]) + upto_future_year # Up to 'upto_future_year' years into the future + years = [str(j) if j>=10 else '0'+str(j) for j in range(year_start, this_year)] + + for column_name in column_list: + new_column_values = [] + + for value_idx in range(len(list(df[column_name]))): + + mentioned_years = [] + + this_year_data = str(df[column_name].iloc[value_idx]) + if this_year_data == '20245/2025': # Exception + new_column_values.append('') # Add a blank + continue + this_year_data = process_colon(this_year_data) + this_year_data = remove_except_numbers_dash_and_onward_no_space(this_year_data) + this_year_data = 
remove_numeric_after_slash(this_year_data)
+            this_year_data, is_2020 = check_string_for_20(this_year_data)
+
+            if is_2020:
+                mentioned_years.append('2020')
+
+            for year in years:
+                if str(year) in this_year_data:
+                    mentioned_years.append('20'+str(year))
+
+                    if "onward" in this_year_data:
+                        today = datetime.date.today()
+                        this_year = int(str(today.year)[-2:])  # the actual current two-digit year, without the future padding used above
+                        for elapsed_year in range(int(year)+1, this_year):
+                            mentioned_years.append('20'+str(elapsed_year))  # fill in each year after the mentioned one, up to the current year
+
+            if mentioned_years:
+                mentioned_years = list(set(mentioned_years))
+
+                if '-' in this_year_data:
+                    mentioned_years = fill_in_years(mentioned_years)
+                mentioned_years.sort()
+                new_column_values.append(mentioned_years)
+
+            else:
+                new_column_values.append('')  # add a blank
+
+        df['cleaned_'+column_name] = new_column_values
+
+    return df
+
+def get_faculty(input_text):
+
+    faculty_list = [
+        "Faculty of Applied Science",
+        "Faculty of Arts",
+        "Faculty of Dentistry",
+        "Faculty of Education",
+        "First Nations House of Learning",
+        "Faculty of Forestry",
+        "Faculty of Graduate & Postdoctoral Studies",
+        "Faculty of Graduate and Postdoctoral Studies",
+        "Faculty of Land & Food Systems",
+        "Faculty of Land and Food Systems",
+        "Allard School of Law",
+        "Faculty of Medicine",
+        "Faculty of Pharmaceutical Sciences",
+        "Sauder School of Business",
+        "Faculty of Science",
+        "UBC Health",
+        "UBC Library",
+        "Vantage College",
+        "VP Academic",
+        "VP Students"
+    ]
+
+    mentioned_faculties = {}
+
+    # Record the starting index of every (case-insensitive) faculty mention.
+    for faculty in faculty_list:
+        this_mentioned = []
+        start = 0
+        while True:
+            index = input_text.lower().find(faculty.lower(), start)
+            if index == -1:
+                break
+            this_mentioned.append(index)
+            start = index+1
+
+        if this_mentioned:
+            for idx in this_mentioned:
+                mentioned_faculties[idx] = faculty
+
+    # Order mentions by their position in the text.
+    ordered_faculties = collections.OrderedDict(sorted(mentioned_faculties.items()))
+    return ordered_faculties
+
+def ner_mapping(df, column, bucket, data_key, faculty_code_dict):
+
+    df_institution_data = return_df(
+        bucket=bucket,
+        data_key=data_key
+    )
+
+    full_name_list = list(df_institution_data['PREFERRED_FULL_NAME'])
+    department_list = list(df_institution_data['PRIMARY_DEPARTMENT_AFFILIATION'])
+    faculty_list = list(df_institution_data['PRIMARY_FACULTY_AFFILIATION'])
+    rank_list = list(df_institution_data['PRIMARY_ACADEMIC_RANK'])
+    emails_list = list(df_institution_data['EMAIL_ADDRESS'])
+    campus_list = list(df_institution_data['PRIMARY_CAMPUS_LOCATION'])
+
+    source_data = df[column]
+    destination_data = []
+
+    for this_free_text_idx in range(len(source_data)):
+        this_free_text = source_data.iloc[this_free_text_idx]
+        if isinstance(this_free_text, str) and this_free_text != '':  # only accept valid, non-empty strings
+            associated_entities = retrieve_member_info(this_free_text)
+            cleaned_associated_entities = add_institution_info(
+                associated_entities,
+                full_name_list,
+                department_list,
+                faculty_list,
+                rank_list,
+                emails_list,
+                campus_list,
+                faculty_code_dict
+            )
+            destination_data.append(cleaned_associated_entities)
+        else:
+            destination_data.append(this_free_text)
+    df['cleaned_Team Members'] = destination_data
+    return df
+
+def retrieve_member_info(input_text):
+    detected_entities = comprehend.detect_entities(Text=input_text, LanguageCode='en')['Entities']
+    mentioned_faculties = get_faculty(input_text)
+    faculties_idx = list(mentioned_faculties.keys())
+
+    names_dict = {}
+    names = []
+    remaining_names = []
+
+    emails = []
+    emails_idx = []
+    remaining_emails = []
+
+    for entity in detected_entities:
+        if 
entity['Type'] == 'PERSON': + if ' ' in entity['Text'] and not ('President' in entity['Text'] or 'Dean' in entity['Text']): + names_dict[entity['BeginOffset']] = entity['Text'] + + elif entity['Type'] == 'OTHER': + if '@' in entity['Text']: + emails.append(entity['Text']) + emails_idx.append(entity['BeginOffset']) + + names = list(names_dict.values()) + remaining_names = names.copy() + remaining_emails = emails.copy() + + associated_entities = {} + + if len(names) == len(emails): + for i in range(len(names)): + associated_entities[names[i]] = {} + associated_entities[names[i]]['Team Member Email'] = emails[i] + remaining_names = [] + remaining_emails = [] + + elif len(emails) < len(names): + for i in range(len(emails)): + remove_name = "" + remove_email = "" + + this_fuzz_seq = {} + this_fuzz_seq[0] = "blank" + for name in names: + this_fuzz_ratio = fuzz.ratio(emails[i].split('@')[0].lower(), name.lower()) + if this_fuzz_ratio > list(this_fuzz_seq.keys())[0]: + del this_fuzz_seq[list(this_fuzz_seq.keys())[0]] + this_fuzz_seq[int(fuzz.ratio(emails[i].split('@')[0].lower(), name.lower()))] = name + + associated_entities[this_fuzz_seq[list(this_fuzz_seq.keys())[0]]] = {} + associated_entities[this_fuzz_seq[list(this_fuzz_seq.keys())[0]]]['Team Member Email'] = emails[i] + remove_name = str(this_fuzz_seq[list(this_fuzz_seq.keys())[0]]) + remove_email = emails[i] + + if remove_name in remaining_names: + remaining_names.remove(remove_name) + + if remove_email in remaining_emails: + remaining_emails.remove(remove_email) + + if remaining_names: + for r_name in remaining_names: + associated_entities[r_name] = {} + associated_entities[r_name]['Team Member Email'] = "" + + elif len(names) < len(emails): + for i in range(len(names)): + remove_name = "" + remove_email = "" + + this_fuzz_seq = {} + this_fuzz_seq[0] = "blank" + for email in emails: + this_fuzz_ratio = fuzz.ratio(email.split('@')[0].lower(), names[i].lower()) + if this_fuzz_ratio > list(this_fuzz_seq.keys())[0]: + del this_fuzz_seq[list(this_fuzz_seq.keys())[0]] + this_fuzz_seq[ + int(fuzz.ratio(email.split('@')[0].lower(), names[i].lower()))] = names[i] + + associated_entities[this_fuzz_seq[list(this_fuzz_seq.keys())[0]]] = {} + associated_entities[this_fuzz_seq[list(this_fuzz_seq.keys())[0]]]['Team Member Email'] = email + remove_name = str(this_fuzz_seq[list(this_fuzz_seq.keys())[0]]) + remove_email = email + + if remove_name in remaining_names: + remaining_names.remove(remove_name) + + if remove_email in remaining_emails: + remaining_emails.remove(remove_email) + + # Finally, update the faculty info + for n_idx in names_dict.keys(): + higher_f_idx = [f_idx for f_idx in faculties_idx if f_idx > n_idx] + if higher_f_idx: + this_key = names_dict[n_idx] + associated_entities[this_key]['Team Member Faculty'] = mentioned_faculties[min(higher_f_idx)] + + return associated_entities + +def add_institution_info( + associated_entities, + full_name_list, + department_list, + faculty_list, + rank_list, + emails_list, + campus_list, + faculty_code_dict +): + + cleaned_associated_entities = {} + + for extracted_full_name in associated_entities.keys(): + + selected_name = "" + selected_rank = "" + selected_department = "" + selected_campus = "" + selected_faculty = None + + match_score = 0 + for i in range(len(full_name_list)): + this_fuzz_ratio = fuzz.ratio(extracted_full_name.lower(), full_name_list[i].lower()) + if this_fuzz_ratio > 50 and this_fuzz_ratio > match_score: + if 'Team Member Email' in 
list(associated_entities[extracted_full_name].keys()): + if associated_entities[extracted_full_name]['Team Member Email'].split('@')[0].lower() == str(emails_list[i]).split('@')[0].lower(): + + selected_name = full_name_list[i] + selected_rank = rank_list[i] + selected_department = department_list[i] + + if 'UBC Vancouver' in campus_list[i]: + selected_campus = "UBCV" + elif 'UBC Okanagan' in campus_list[i]: + selected_campus = "UBCO" + else: + selected_campus = "External" + + if not 'Faculty' in list(associated_entities[extracted_full_name].keys()): + selected_faculty = faculty_list[i] + + if not selected_name == "": + cleaned_associated_entities[selected_name] = {} + cleaned_associated_entities[selected_name] = associated_entities[extracted_full_name] + cleaned_associated_entities[selected_name]['Team Member Department'] = selected_department + cleaned_associated_entities[selected_name]['Team Member Title'] = selected_rank + cleaned_associated_entities[selected_name]['Campus'] = selected_campus + + if selected_faculty: + if selected_faculty in list(faculty_code_dict.keys()): + cleaned_associated_entities[selected_name]['Team Member Faculty'] = faculty_code_dict[selected_faculty] + else: + cleaned_associated_entities[selected_name]['Team Member Faculty'] = 'NA' + + else: + cleaned_associated_entities[extracted_full_name] = associated_entities[extracted_full_name] + if 'Team Member Faculty' in list(cleaned_associated_entities[extracted_full_name].keys()): + if cleaned_associated_entities[extracted_full_name]['Team Member Faculty'] in list(faculty_code_dict.keys()): + cleaned_associated_entities[extracted_full_name]['Team Member Faculty'] = faculty_code_dict[cleaned_associated_entities[extracted_full_name]['Team Member Faculty']] + return cleaned_associated_entities + +def assign_faculty_code(df, faculty_code_dict, col_name="Project Faculty"): + new_project_faculty = [] + project_faculty = list(df[col_name]) + + for proj_fac in project_faculty: + if proj_fac in list(faculty_code_dict.keys()): + new_project_faculty.append(faculty_code_dict[proj_fac]) + else: + new_project_faculty.append(proj_fac) + + df[col_name] = new_project_faculty + + return df + +def add_project_type_col(df): + project_types = None + + if 'Application Title' in list(df.columns): + application_titles = list(df['Application Title']) + project_types = [] + project_types = ['Small' if 'SP' in application_titles[i] else 'Large' for i in range(len(application_titles))] + + if project_types: + df['Project Type'] = project_types + + return df + +def add_funding_year_col(df): + funding_years = None + + if 'Application Title' in list(df.columns): + application_titles = list(df['Application Title']) + funding_years = [] + funding_years = [application_titles[i].split('-')[0] for i in range(len(application_titles))] + + if funding_years: + df['Funding Year'] = funding_years + + return df + +def remove_text_inside_brackets(text): + # Regular expression to match text inside round brackets including the brackets + pattern = r'\(.*?\)\s*' + # Replace the matched text with an empty string + cleaned_text = re.sub(pattern, '', text) + return cleaned_text.strip() + +def focus_areas_list_change(df): + new_focus_areas = None + if 'Project Focus Areas' in list(df.columns): + new_focus_areas = [] + new_focus_areas = [remove_text_inside_brackets( + focus_text).split( + ',') for focus_text in list( + df['Project Focus Areas'] + )] + + if new_focus_areas: + df['Project Focus Areas'] = new_focus_areas + + return df + +def rename_columns(df): + 
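+    # Note: pandas de-duplicates repeated spreadsheet headers with '.1', '.2', ...
+    # suffixes; the replace() calls below keep those suffixes, so a header like
+    # 'Students Reached by the Project | | Course Code.3' becomes 'project_course_code.3'.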
new_col_names = {} + + new_col_names['Project Title (200 characters max.)'] = 'Project Title' + new_col_names['What project year does this proposal pertain to?'] = 'Project Year' + new_col_names['Principal Applicant | Principal Applicant’s name:'] = 'PI' + new_col_names['Principal Applicant | Principal Applicant’s title(s) (e.g. Assistant Professor, Lecturer, Professor of Teaching, etc.):'] = 'PI Title' + new_col_names['Principal Applicant | Principal Applicant’s primary (UBC) email address:'] = 'PI Email' + new_col_names['Principal Applicant | Principal Applicant’s role:'] = 'PI Role' + new_col_names['Principal Applicant | Principal Applicant’s Faculty, College, or administrative unit:'] = 'Project Faculty' + new_col_names['Principal Applicant | Principal Applicant’s Department, School, or unit:'] = 'Department' + new_col_names['Special Classroom or Facilities Requirements (150 words max.)'] = 'Special Classroom or Facilities Requirements' + new_col_names['Co-Applicants & Project Team Members (500 words max.)'] = 'Team Members' + + for col in list(df.columns): + if 'Students Reached by the Project | | Academic Year' in col: + new_col_names[col] = col.replace('Students Reached by the Project | | Academic Year', 'project_academic_year') + + if 'Students Reached by the Project | | Course Code' in col: + new_col_names[col] = col.replace('Students Reached by the Project | | Course Code', 'project_course_code') + + if 'Students Reached by the Project | | Term (Sep/Jan/May)' in col: + new_col_names[col] = col.replace('Students Reached by the Project | | Term (Sep/Jan/May)', 'project_term') + + if 'Students Reached by the Project | | Term (Sep/Jan/May)' in col: + new_col_names[col] = col.replace('Students Reached by the Project | | Term (Sep/Jan/May)', 'project_term') + + df = df.rename(columns=new_col_names) + + return df + +def generate_faculty_engagement_xlsx(df, ner_col_name='cleaned_Team Members'): + + funding_year = [] + project_type = [] + application_title = [] + project_faculty = [] + team_member_name = [] + team_member_title = [] + team_member_stream = [] + campus = [] + team_member_faculty = [] + team_member_department = [] + team_member_email = [] + + + for proj_idx in range(len(df)): + this_member_info = df[ner_col_name].iloc[proj_idx] + if not isinstance(this_member_info, int) and this_member_info != '': # Not an integer or a blank string + for this_team_member_name in list(this_member_info.keys()): + + funding_year.append(df['Funding Year'].iloc[proj_idx]) + project_type.append(df['Project Type'].iloc[proj_idx]) + application_title.append(df['Application Title'].iloc[proj_idx]) + project_faculty.append(df['Project Faculty'].iloc[proj_idx]) + + team_member_name.append(this_team_member_name) + + if 'Team Member Title' in list(this_member_info[this_team_member_name].keys()): + team_member_title.append(this_member_info[this_team_member_name]['Team Member Title']) + else: + team_member_title.append('') + + if 'Team Member Stream' in list(this_member_info[this_team_member_name].keys()): + team_member_stream.append(this_member_info[this_team_member_name]['Team Member Stream']) + else: + team_member_stream.append('') + + if 'Campus' in list(this_member_info[this_team_member_name].keys()): + campus.append(this_member_info[this_team_member_name]['Campus']) + else: + campus.append('') + + if 'Team Member Faculty' in list(this_member_info[this_team_member_name].keys()): + team_member_faculty.append(this_member_info[this_team_member_name]['Team Member Faculty']) + else: + 
team_member_faculty.append('') + + if 'Team Member Department' in list(this_member_info[this_team_member_name].keys()): + team_member_department.append(this_member_info[this_team_member_name]['Team Member Department']) + else: + team_member_department.append('') + + if 'Team Member Email' in list(this_member_info[this_team_member_name].keys()): + team_member_email.append(this_member_info[this_team_member_name]['Team Member Email']) + else: + team_member_email.append('') + + n_rows = len(funding_year) + + faculty_engagement_df = pd.DataFrame({ + 'funding_year': funding_year, + 'project_type': project_type, + 'project_id': application_title, + 'grant_id': [''] * n_rows, # This info is not present in the raw survey monkey dataset + 'project_faculty': project_faculty, + 'member_name': team_member_name, + 'member_title': team_member_title, + 'member_stream': team_member_stream, + 'member_campus': campus, + 'member_faculty': team_member_faculty, + 'member_unit': team_member_department, + 'member_other': [''] * n_rows # This info is not available in the institution_data.csv file + }) + + # Unused: team_member_email + + return faculty_engagement_df + +def generate_project_details_xlsx(df): + funding_year = df['Funding Year'] + n_rows = len(funding_year) + project_type = df['Project Type'] + grant_id = df['Application Title'] + project_id = [''] * n_rows # This info is not present in the raw survey monkey dataset + project_faculty = df['Project Faculty'] + pi_name = df['PI'] + pi_unit = df['Department'] + funding_amount = df['Total Project Budget'] + title = df['Project Title'] + summary = df['Summary of Work Accomplished to Date (1000 words max.)'] + co_applicants = df['Team Members'] + generated_grant_id = [''] * n_rows # This info is not generated at this stage + project_year = [''] * n_rows # This info is not present in the raw survey monkey dataset + project_status = [''] * n_rows # This info is not present in the raw survey monkey dataset + + project_details_df = pd.DataFrame({ + 'funding_year': funding_year, + 'project_type': project_type, + 'grant_id': grant_id, + 'project_id': project_id, + 'project_faculty': project_faculty, + 'pi_name': pi_name, + 'pi_unit': pi_unit, + 'funding_amount': funding_amount, + 'title': title, + 'summary': summary, + 'co_applicants': co_applicants, + 'generated_grant_id': generated_grant_id, + 'project_year': project_year, + 'project_status': project_status + }) + + return project_details_df + +def tlef_raw_data_preprocessing(bucket, raw_data_key, institution_data_key): + + faculty_code_dict = { + "Faculty of Applied Science": "APSC", + "Faculty of Arts": "ARTS", + "Faculty of Dentistry": "DENT", + "Faculty of Education": "EDUC", + "First Nations House of Learning": "ARTS", + "Faculty of Forestry": "FRST", + "Faculty of Graduate & Postdoctoral Studies": "GRAD", + "Faculty of Graduate and Postdoctoral Studies": "GRAD", + "Faculty of Land & Food Systems": "LFS", + "Faculty of Land and Food Systems": "LFS", + "Allard School of Law": "LAW", + "Faculty of Medicine": "MED", + "Faculty of Pharmaceutical Sciences": "PHAR", + "Sauder School of Business": "COMM", + "Faculty of Science": "SCI", + "UBC Health": "HLTH", + "UBC Library": "LIBR", + "Vantage College": "VANT", + "VP Academic": "VPA", + "VP Students": "VPS" + } + + df = return_df( + bucket=bucket, + data_key=raw_data_key + ) + + print("1. Transforming empty values to blanks...") + df = empty_to_blank(df) + + print("\n\n2. 
Blanking 'No'/'N/A'-style answers in the special requirements column...")
+    df = change_na_or_no_to_blank(df, 'Special Classroom or Facilities Requirements (150 words max.)')
+
+    course_code_columns_list = [f'Students Reached by the Project | | Course Code{f".{i}" if i > 0 else ""}' for i in range(10)]
+    print("\n\n3. Mapping course codes...")
+    df = course_info_mapping(df, course_code_columns_list)
+
+    years_columns_list = [f'Students Reached by the Project | | Academic Year{f".{i}" if i > 0 else ""}' for i in range(10)]
+    print("\n\n4. Mapping years...")
+    df = year_mapping(df, years_columns_list)
+
+    terms_columns_list = [f'Students Reached by the Project | | Term (Sep/Jan/May){f".{i}" if i > 0 else ""}' for i in range(10)]
+    print("\n\n5. Mapping terms...")
+    df = term_mapping(df, terms_columns_list)
+
+    ner_column = 'Co-Applicants & Project Team Members (500 words max.)'
+
+    print("\n\n6. Extracting named entities (team member names and emails)...")
+    df = ner_mapping(df, ner_column, bucket, institution_data_key, faculty_code_dict)
+
+    print("\n\n7. Adding a 'Project Type' column indicating whether the project was Large or Small...")
+    df = add_project_type_col(df)
+
+    print("\n\n8. Adding a 'Funding Year' column with the year in which the project received funding...")
+    df = add_funding_year_col(df)
+
+    print("\n\n9. Converting the Focus Areas column into a list of the focus areas relevant to each project...")
+    df = focus_areas_list_change(df)
+
+    print("\n\n10. Rename columns...")
+    df = rename_columns(df)
+
+    print("\n\n11. Change full faculty name to its faculty code...")
+    df = assign_faculty_code(df, faculty_code_dict, col_name="Project Faculty")
+
+    print("\n\n12. Generate faculty engagement xlsx dataset...")
+    faculty_engagement_df = generate_faculty_engagement_xlsx(df)
+
+    print("\n\n13. 
Generate project details xlsx dataset...") + project_details_df = generate_project_details_xlsx(df) + + print("Process completed!") + + return df, faculty_engagement_df, project_details_df + +clean_df, faculty_engagement_df, project_details_df = tlef_raw_data_preprocessing( + bucket=BUCKET_NAME, + raw_data_key = SURVEY_MONKEY_S3URI, + institution_data_key = INSTITUTION_DATA_S3_URI +) + +current_year = project_details_df["funding_year"][0] # TIEN, get the year + +# save directly to s3 +#clean_df.to_excel(f's3://{BUCKET_NAME}/raw/clean_survey_monkey_{current_year}.xlsx', index=False) # TIEN +faculty_engagement_df.to_excel(f's3://{BUCKET_NAME}/raw/faculty_engagement/faculty_engagement_{current_year}.xlsx', index=False) # TIEN +project_details_df.to_excel(f's3://{BUCKET_NAME}/raw/project_details/project_details_{current_year}.xlsx', index=False) # TIEN \ No newline at end of file diff --git a/backend/cdk/glue/scripts/generate-embeddings-and-similar-projects.py b/backend/cdk/glue/scripts/generate-embeddings-and-similar-projects.py new file mode 100644 index 0000000..57eab58 --- /dev/null +++ b/backend/cdk/glue/scripts/generate-embeddings-and-similar-projects.py @@ -0,0 +1,282 @@ +from sentence_transformers import SentenceTransformer, util +import boto3 +from io import BytesIO +import pandas as pd +import numpy as np +import pickle +import os +import sys +from scipy.spatial.distance import cdist + +# Glue parameters +from awsglue.utils import getResolvedOptions +args = getResolvedOptions(sys.argv, ["BUCKET_NAME", "EMBEDDINGS_BUCKET", "PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI"]) + +BUCKET_NAME = args["BUCKET_NAME"] +EMBEDDINGS_BUCKET = args["EMBEDDINGS_BUCKET"] +PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI = args["PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI"] + +def createDir(path): + os.makedirs(path, exist_ok=True) # using os.markdirs to also create intermediate directories + return os.path.join(os.getcwd(), path) + +model_custom_path = createDir("cache/torch/sentence_transformers") + +def return_df(bucket, data_key): # TIEN + + if "s3://" in data_key: # TIEN a full s3 URI is passed + data_location = data_key + else: + data_location = 's3://{}/{}'.format(bucket, data_key) + + if ".parquet" in data_key: + df = pd.read_parquet(data_location) + else: + df = pd.read_excel(data_location) + + return df + +project_details_df = return_df( + bucket = BUCKET_NAME, #previously 'clean-full-data', + data_key = PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI # previously 'project_details_with_new_ids.xlsx' +) + +def find_all_summaries(model='all-mpnet-base-v2'): + + embedding_model = SentenceTransformer('all-mpnet-base-v2', cache_folder = model_custom_path) + + for i in range(len(list(project_details_df.project_id))): + + this_project_id = project_details_df.project_id[i] + if this_project_id != this_project_id: # i.e., project_id is NaN + this_project_id = project_details_df.generated_grant_id[i] # use the automatically generated grant ID instead + + # Find all rows where either the corresponding 'project_id' column values are equal to this_project_id + this_relevant_df = project_details_df.loc[project_details_df.project_id == this_project_id] + this_project_context = "" + this_project_context_embedding = None + + if this_relevant_df.empty: + # Whatever we have right now is the only occurrence + this_title = project_details_df.title[i] # TIEN project_title -> title + this_summary = project_details_df.summary[i] # TIEN project_summary -> summary + + if not this_title != this_title and not this_summary != this_summary: # 
i.e., both title and summary are not NaN
+
+                this_title_summary = this_title + '. ' + this_summary + ' '  # concatenate title and summary into one string
+                # create embeddings with the concatenated string
+                this_title_summary_embedding = embedding_model.encode(this_title_summary, convert_to_tensor=True)
+
+                this_project_context = this_title_summary
+                this_project_context_embedding = this_title_summary_embedding
+
+        else:
+            # Generate one context for embedding
+            this_project_context, this_project_context_embedding = generate_context_embeddings(this_relevant_df, embedding_model)
+
+        store_context_and_embeddings(
+            this_project_context,
+            this_project_context_embedding,
+            bucket = EMBEDDINGS_BUCKET, #'tlef-project-summary-embeddings',
+            data_key = this_project_id
+        )
+
+def check_and_update_embeddings(bucket, data_key, current_context, current_embeddings, threshold=0.96):
+    """
+    Check if embeddings exist, compare them, and update if similarity is above threshold.
+    """
+    s3_client = boto3.client('s3')
+    context_key = f'{data_key}.pkl'
+    embeddings_key = f'{data_key}_embeddings.pkl'
+
+    try:
+        # Check if the context and embeddings files exist
+        context_object = s3_client.get_object(Bucket=bucket, Key=context_key)
+        embeddings_object = s3_client.get_object(Bucket=bucket, Key=embeddings_key)
+
+        # Load existing context and embeddings
+        existing_context = pickle.loads(context_object['Body'].read())
+        existing_embeddings = pickle.loads(embeddings_object['Body'].read())
+
+        # Compare embeddings using cosine similarity
+        similarity = util.pytorch_cos_sim(current_embeddings, existing_embeddings)
+
+        if similarity > threshold:
+            # Update context and embeddings if similarity is high
+            updated_context = existing_context + " " + current_context
+            updated_embeddings = pickle.dumps(current_embeddings)  # Assuming current_embeddings is the updated embeddings
+
+            # Save updated context and embeddings
+            s3_client.put_object(Bucket=bucket, Key=context_key, Body=pickle.dumps(updated_context))
+            s3_client.put_object(Bucket=bucket, Key=embeddings_key, Body=updated_embeddings)
+            return True
+
+    except s3_client.exceptions.NoSuchKey:
+        # If the files do not exist, proceed with the original saving process
+        return False
+
+    return False
+
+def store_context_and_embeddings(project_context, project_context_embedding, bucket, data_key):
+    if project_context_embedding is not None:  # If it is not NaN
+        if not check_and_update_embeddings(bucket, data_key, project_context, project_context_embedding):
+            # If the embeddings are not updated due to no existing files or low similarity, save them as new
+            s3_client = boto3.client('s3')
+            context_bytes = pickle.dumps(project_context)
+            embeddings_bytes = pickle.dumps(project_context_embedding)
+
+            context_key = f'{data_key}.pkl'
+            embeddings_key = f'{data_key}_embeddings.pkl'
+
+            s3_client.put_object(Bucket=bucket, Key=context_key, Body=context_bytes)
+            s3_client.put_object(Bucket=bucket, Key=embeddings_key, Body=embeddings_bytes)
+
+def generate_context_embeddings(relevant_df, embedding_model):
+
+    context = ""
+    context_embedding = None
+
+    # iterate through each row of the "relevant" df
+    for index, row in relevant_df.iterrows():
+        if context_embedding is None:
+            current_context_embedding = embedding_model.encode(context, convert_to_tensor=True)
+        else:
+            current_context_embedding = context_embedding
+
+        this_title = row.title  # TIEN project_title -> title
+        this_summary = row.summary  # TIEN project_summary -> summary
+
+        if not this_title != this_title and not this_summary != this_summary:  # i.e., both title 
and summary are not NaN + + this_title_summary = this_title + '. ' + this_summary + ' ' + this_title_summary_embedding = embedding_model.encode(this_title_summary, convert_to_tensor=True) + + if util.cos_sim(current_context_embedding, this_title_summary_embedding)[0] < 0.96: # The new title+summary has some differences + context += this_title_summary + context_embedding = embedding_model.encode(context, convert_to_tensor=True) + + return context, context_embedding + +find_all_summaries() + + +############## END OF GENERATING EMBEDDINGS PART + +############## BEGIN OF COMPUTING SIMILARITY PART + +def generate_embeddings_database(bucket_name): + # Initialize a boto3 client + s3 = boto3.client('s3') + + # Create a reusable Paginator + paginator = s3.get_paginator('list_objects_v2') + + # Create a PageIterator from the Paginator + page_iterator = paginator.paginate(Bucket=bucket_name) + + embeddings_database = {} + + for page in page_iterator: + for obj in page['Contents']: + key = obj['Key'] + if 'embeddings' in key: + # Get the object from S3 + response = s3.get_object(Bucket=bucket_name, Key=key) + body = response['Body'].read() + + # Deserialize the embeddings + this_embedding = pickle.loads(body) + + embeddings_database[key] = this_embedding + + return embeddings_database + +def generate_similar_projects_database(bucket_name, embeddings_database, top_k=10): + # Initialize a boto3 client + s3 = boto3.client('s3') + + # Create a reusable Paginator + paginator = s3.get_paginator('list_objects_v2') + + # Create a PageIterator from the Paginator + page_iterator = paginator.paginate(Bucket=bucket_name) + + similar_projects_database = {} + + for page in page_iterator: + for obj in page['Contents']: + key = obj['Key'] + if 'embeddings' in key: + # Get the object from S3 + s3_response = s3.get_object(Bucket=bucket_name, Key=key) + body = s3_response['Body'].read() + + # Deserialize the embeddings + this_embedding = pickle.loads(body) + + # Convert embeddings_database values into a list for comparison + database_embeddings_list = list(embeddings_database.values()) + database_keys_list = list(embeddings_database.keys()) + + print(key) + print(type(this_embedding)) + print(this_embedding) + print("-----------") + print(type(database_embeddings_list)) + + # Assuming embeddings_database is a dict with file names as keys and embeddings as values + embeddings_matrix = np.stack(database_embeddings_list) + this_embedding_matrix = np.array([this_embedding]) + + print("Here") + + # Calculate cosine similarities (using 1 - cosine distance) + similarities = 1 - cdist(this_embedding_matrix, embeddings_matrix, 'cosine') + sorted_indices = np.argsort(similarities[0])[::-1] # Sorting indices in descending order + + print("Here2") + + # Store top k similar projects + similar_projects_database[key.replace("_embeddings.pkl", "")] = [database_keys_list[i].replace("_embeddings.pkl", "") for i in sorted_indices[1:top_k+1]] + + print("End") + + return similar_projects_database + +def save_similar_projects_database(similar_projects_database, bucket_name, file_name): # TIEN + # Convert the similar projects database to a DataFrame for easy Excel writing + df = pd.DataFrame(list(similar_projects_database.items()), columns=['project_key', 'similar_projects']) + + # Convert the list of similar projects into a string for better Excel visualization + df['similar_projects'] = df['similar_projects'].apply(lambda x: ', '.join(x)) + + df.to_parquet(f"s3://{bucket_name}/{file_name}") + + # # Create a BytesIO buffer to hold the Excel 
file in memory + # excel_buffer = BytesIO() + # # Write the DataFrame to an Excel file in the buffer + # with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer: + # df.to_excel(writer, sheet_name='similar_projects', index=False) + + # # Go to the beginning of the BytesIO buffer + # excel_buffer.seek(0) + + # # Initialize a boto3 client + # s3_client = boto3.client('s3') + # # Upload the Excel file from the buffer to the S3 bucket + # s3_client.put_object(Bucket=bucket_name, Key=file_name, Body=excel_buffer.getvalue(), ContentType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet') + + # # Close the BytesIO buffer + # excel_buffer.close() + +def main(): + + embeddings_database = generate_embeddings_database(EMBEDDINGS_BUCKET) #'tlef-project-summary-embeddings' + + similar_projects_database = generate_similar_projects_database(EMBEDDINGS_BUCKET, embeddings_database) #'tlef-project-summary-embeddings' + + save_similar_projects_database(similar_projects_database, BUCKET_NAME, "staging/similar_projects.parquet") # similar_projects.xlsx + + +if __name__ == "__main__": + main() diff --git a/backend/cdk/glue/scripts/generate-new-grant-ids.py b/backend/cdk/glue/scripts/generate-new-grant-ids.py new file mode 100644 index 0000000..cb6b08f --- /dev/null +++ b/backend/cdk/glue/scripts/generate-new-grant-ids.py @@ -0,0 +1,162 @@ +import pandas as pd +import boto3 +import s3fs + +import sys +import io +import boto3 +from botocore.exceptions import ClientError + +# Glue parameters +from awsglue.utils import getResolvedOptions +args = getResolvedOptions(sys.argv, ["BUCKET_NAME", "PROJECT_DETAILS_S3URI"]) + +BUCKET_NAME = args["BUCKET_NAME"] +PROJECT_DETAILS_S3URI = args["PROJECT_DETAILS_S3URI"] + + +def return_df(bucket, data_key): + + if "s3://" in data_key: # TIEN a full s3 URI is passed + data_location = data_key + else: + data_location = 's3://{}/{}'.format(bucket, data_key) + + df = pd.read_excel(data_location) + + return df + +project_details_df = return_df( + bucket = BUCKET_NAME, + data_key = PROJECT_DETAILS_S3URI +) + +def store_df(df, bucket, data_key): + + data_location = 's3://{}/{}'.format(bucket, data_key) + + if '.parquet' in data_key: + df.to_parquet(data_location, index=False) + else: + df.to_excel(data_location, index=False) + +def assign_grant_ids(project_details_df): + + grant_ids = [] + + for proj_idx in range(len(project_details_df)): + + this_proj_funding_year = project_details_df.funding_year.iloc[proj_idx] + + if ';' in project_details_df.project_faculty.iloc[proj_idx]: # There are multiple project faculties + this_proj_faculty = "" + + faculty_list = project_details_df.project_faculty.iloc[proj_idx].split(';') # Add all faculties to the grant ID + for this_faculty in faculty_list: + this_proj_faculty += this_faculty.strip() + '-' + + this_proj_faculty = this_proj_faculty[:-1] # Remove the last '-' + + else: + this_proj_faculty = project_details_df.project_faculty.iloc[proj_idx].strip() + + this_proj_type = "" + + existing_grant_id = project_details_df.grant_id.iloc[proj_idx] + + if not existing_grant_id != existing_grant_id: # Existing Grant ID is not NaN + if len(existing_grant_id.split('-')) >= 5: # Existing Grant ID has 5 components or 6 in some cases ('2015-TLEF-LP1-ARTS-Giltrow-A' and '2015-TLEF-LP1-ARTS-Giltrow-B') + + if "SP" in existing_grant_id.split('-')[2] or "LP" in existing_grant_id.split('-')[2]: + this_proj_type = existing_grant_id.split('-')[2] # Extract project type from the existing Grant ID + + elif "FL" in 
existing_grant_id.split('-')[1]:  # Convert it into an LP but keep the number, e.g., FL2 -> LP2
+                    if existing_grant_id.split('-')[1][-1].isdigit():  # E.g., "FL2"
+                        this_proj_type = "LP" + existing_grant_id.split('-')[1][-1]
+
+                    else:
+                        this_proj_type = "LP"
+
+                elif "UDL" in existing_grant_id.split('-')[1]:  # Convert it into an SP but keep the number, e.g., UDL2 -> SP2
+                    if existing_grant_id.split('-')[1][-1].isdigit():  # E.g., "UDL2"
+                        this_proj_type = "SP" + existing_grant_id.split('-')[1][-1]
+
+                    else:
+                        this_proj_type = "SP"
+
+                elif "ITTG" in existing_grant_id.split('-')[2]:  # E.g., Grant ID = 2020-ITTG-ARTS-Mawani
+                    this_proj_id = project_details_df.project_id.iloc[proj_idx]  # Extract the corresponding Project ID, e.g., 2020-SP-ARTS-008
+
+                    if "SP" in this_proj_id.split('-')[1] or "LP" in this_proj_id.split('-')[1]:
+                        this_proj_type = this_proj_id.split('-')[1]  # Extract and store project type from the project ID
+
+        if this_proj_type == "":
+            this_proj_type = "SP" if project_details_df.project_type.iloc[proj_idx] == "Small" else "LP"
+
+        if ';' in project_details_df.pi_name.iloc[proj_idx]:  # If there are multiple PIs
+            this_proj_PI_surname = ""
+
+            PI_list = project_details_df.pi_name.iloc[proj_idx].split(';')
+            for PI_name in PI_list:
+                this_PI_surname = PI_name.split(' ')[-1]
+                this_proj_PI_surname += this_PI_surname.strip() + '-'  # Add all PI surnames to the grant ID
+
+            this_proj_PI_surname = this_proj_PI_surname[:-1]  # Remove the last '-'
+
+        else:
+            this_proj_PI_surname = project_details_df.pi_name.iloc[proj_idx].split(' ')[-1].strip()
+
+        this_proj_grant_id = str(this_proj_funding_year) + "-TLEF-" + this_proj_type + '-' + this_proj_faculty + '-' + this_proj_PI_surname
+
+        if this_proj_grant_id in grant_ids:  # This PI has multiple TLEF projects in the same year
+            existing_grant_id_idx = grant_ids.index(this_proj_grant_id)
+            grant_ids[existing_grant_id_idx] = this_proj_grant_id + '-A'
+            grant_ids.append(this_proj_grant_id + '-B')
+
+        else:
+            grant_ids.append(this_proj_grant_id)
+
+    project_details_df["generated_grant_id"] = grant_ids
+    return project_details_df
+
+def convert_summary_to_strings(project_details_df):
+
+    summary_list = []
+    for proj_idx in range(len(project_details_df)):
+
+        this_summary = project_details_df.summary.iloc[proj_idx]
+        if this_summary != this_summary:  # True only when the value is NaN
+            summary_list.append("")
+        else:
+            summary_list.append(this_summary)
+
+    project_details_df.summary = summary_list
+
+    return project_details_df
+
+
+
+current_year = project_details_df["funding_year"][0]
+file_key = f"staging/project_details/project_details_with_new_ids_{current_year}.parquet"
+
+store_df(
+    df=convert_summary_to_strings(
+        assign_grant_ids(
+            project_details_df)),
+    bucket = BUCKET_NAME,
+    data_key = file_key
+)
+
+
+# Start the generate-embeddings-and-similar-projects Glue job
+arguments = {
+    "--BUCKET_NAME": BUCKET_NAME,
+    "--EMBEDDINGS_BUCKET": "tlef-project-summary-embeddings",
+    "--PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI": f"s3://{BUCKET_NAME}/{file_key}",
+    "--additional-python-modules": "sentence-transformers"
+}
+glue_client = boto3.client("glue")
+glue_client.start_job_run(
+    JobName="tlef-generate-embeddings-and-similar-projects.py",  # must match the job name defined in datapipeline-stack.ts
+    Arguments=arguments
+)
diff --git a/backend/cdk/lambda/createFolders/createFolders.py b/backend/cdk/lambda/createFolders/createFolders.py
new file mode 100644
index 0000000..69b2621
--- /dev/null
+++ b/backend/cdk/lambda/createFolders/createFolders.py
@@ -0,0 +1,16 @@
+import os
+import boto3 + +BUCKET_NAME = os.environ["BUCKET_NAME"] + +s3 = boto3.client("s3") + +def lambda_handler(event, context): + + # CREATE A FOLDER STRUCTURE FOR THE AMPLIFY S3 STORAGE with a public/ top level folder + + s3.put_object(Bucket=BUCKET_NAME, Key="raw-data/") + s3.put_object(Bucket=BUCKET_NAME, Key="staging-data/") + s3.put_object(Bucket=BUCKET_NAME, Key="production-data/") + + print("Trigger Function scripts finished execution successfully!") \ No newline at end of file diff --git a/backend/cdk/lib/datapipeline-stack.ts b/backend/cdk/lib/datapipeline-stack.ts new file mode 100644 index 0000000..1ae76e9 --- /dev/null +++ b/backend/cdk/lib/datapipeline-stack.ts @@ -0,0 +1,236 @@ +import * as cdk from "aws-cdk-lib"; +import { Construct } from "constructs"; +import { VpcStack } from "./vpc-stack"; +import * as s3 from "aws-cdk-lib/aws-s3"; +import * as s3deploy from "aws-cdk-lib/aws-s3-deployment"; +import * as iam from "aws-cdk-lib/aws-iam"; +import * as glue from "aws-cdk-lib/aws-glue"; +import * as lambda from "aws-cdk-lib/aws-lambda"; + +export class DataPipelineStack extends cdk.Stack { + constructor( + scope: Construct, + id: string, + vpcStack: VpcStack, + props?: cdk.StackProps + ) { + super(scope, id, props); + + // files storage bucket + const dataBucket = new s3.Bucket(this, "data-s3bucket", { + removalPolicy: cdk.RemovalPolicy.DESTROY, + autoDeleteObjects: true, + versioned: false, + publicReadAccess: false, + blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, + encryption: s3.BucketEncryption.S3_MANAGED, + }); + + // bucket for embeddings + const embeddingsBucket = new s3.Bucket(this, "embeddings-s3bucket", { + removalPolicy: cdk.RemovalPolicy.DESTROY, + autoDeleteObjects: true, + versioned: false, + publicReadAccess: false, + blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, + encryption: s3.BucketEncryption.S3_MANAGED, + }); + + // Glue deployment bucket + const glueS3Bucket = new s3.Bucket(this, "glue-s3bucket", { + removalPolicy: cdk.RemovalPolicy.DESTROY, + autoDeleteObjects: true, + versioned: false, + publicReadAccess: false, + blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, + encryption: s3.BucketEncryption.S3_MANAGED, + }); + + // const triggerLambda = new cdk.triggers.TriggerFunction( + // this, + // "createFolders-lambda", + // { + // functionName: "createFolders", + // runtime: lambda.Runtime.PYTHON_3_9, + // handler: "createFolders.lambda_handler", + // timeout: cdk.Duration.seconds(300), + // memorySize: 512, + // environment: { + // BUCKET_NAME: dataBucket.bucketName, + // }, + // vpc: vpcStack.vpc, + // code: lambda.Code.fromAsset("./lambda/createFolders"), + // layers: [], + // executeAfter: [dataBucket], + // } + // ); + + // triggerLambda.addToRolePolicy( + // new iam.PolicyStatement({ + // effect: iam.Effect.ALLOW, + // actions: [ + // "s3:ListBucket", + // "s3:ListObjectsV2", + // "s3:PutObject", + // "s3:PutObjectAcl", + // "s3:GetObject", + // ], + // resources: [ + // `arn:aws:s3:::${dataBucket.bucketName}`, + // `arn:aws:s3:::${dataBucket.bucketName}/*`, + // ], + // }) + // ); + // triggerLambda.addToRolePolicy( + // new iam.PolicyStatement({ + // effect: iam.Effect.ALLOW, + // actions: [ + // // CloudWatch Logs + // "logs:CreateLogGroup", + // "logs:CreateLogStream", + // "logs:PutLogEvents", + // ], + // resources: ["arn:aws:logs:*:*:*"], + // }) + // ); + + // Create new Glue Role. DO NOT RENAME THE ROLE!!! 
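+
+    // Illustrative usage note (not part of the stack): the jobs defined below ship
+    // with "n/a" placeholder arguments; a run can also be started from the CLI with
+    // the real values, e.g.
+    //   aws glue start-job-run --job-name tlef-clean-survey-monkey-data \
+    //     --arguments '{"--SURVEY_MONKEY_S3URI":"s3://<data-bucket>/raw-data/survey.xlsx"}'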
+    const roleName = "AWSGlueServiceRole-gluedatapipeline";
+    const glueRole = new iam.Role(this, roleName, {
+      assumedBy: new iam.ServicePrincipal("glue.amazonaws.com"),
+      description: "Glue Service Role",
+      roleName: roleName,
+    });
+
+    // Add the required managed policies to the Glue service role
+    const glueServiceRolePolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "service-role/AWSGlueServiceRole"
+    );
+    const glueComprehendPolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "ComprehendFullAccess"
+    );
+    const glueS3policy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "AmazonS3FullAccess"
+    );
+    const glueEventBridgePolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "AmazonEventBridgeFullAccess"
+    );
+    const glueSNSPolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "AmazonSNSFullAccess"
+    );
+    const glueCloudwatchEventsPolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "CloudWatchEventsFullAccess"
+    );
+
+    glueRole.addManagedPolicy(glueServiceRolePolicy);
+    glueRole.addManagedPolicy(glueS3policy);
+    glueRole.addManagedPolicy(glueComprehendPolicy);
+    glueRole.addManagedPolicy(glueEventBridgePolicy);
+    glueRole.addManagedPolicy(glueSNSPolicy);
+    glueRole.addManagedPolicy(glueCloudwatchEventsPolicy);
+
+    /* SHARED CONFIGURATION FOR THE GLUE JOBS */
+    const PYTHON_VER = "3.9";
+    const GLUE_VER = "3.0";
+    const MAX_RETRIES = 0; // no retries; each job executes only once
+    const MAX_CAPACITY = 1; // DPUs (pythonshell jobs accept 0.0625 or 1)
+    const MAX_CONCURRENT_RUNS = 1; // at most 1 concurrent run of the same job
+    const TIMEOUT = 120; // 120 min timeout
+
+    // clean-survey-monkey-data
+    const glueJob1Name = "tlef-clean-survey-monkey-data";
+    const glueJob1 = new glue.CfnJob(this, glueJob1Name, {
+      name: glueJob1Name,
+      role: glueRole.roleArn,
+      command: {
+        name: "pythonshell",
+        pythonVersion: PYTHON_VER,
+        scriptLocation:
+          "s3://" + glueS3Bucket.bucketName + "/scripts/clean-survey-monkey-data.py",
+      },
+      executionProperty: {
+        maxConcurrentRuns: MAX_CONCURRENT_RUNS,
+      },
+      maxRetries: MAX_RETRIES,
+      maxCapacity: MAX_CAPACITY,
+      timeout: TIMEOUT,
+      glueVersion: GLUE_VER,
+      defaultArguments: {
+        "--additional-python-modules": "openpyxl,fuzzywuzzy",
+        "library-set": "analytics",
+        "--BUCKET_NAME": dataBucket.bucketName,
+        "--SURVEY_MONKEY_S3URI": "n/a", // placeholder; the client pastes the real URI in the console
+        // key must match the parameter name read by getResolvedOptions in the script
+        "--INSTITUTION_DATA_S3_URI": `s3://${dataBucket.bucketName}/INSTITUTION_DATA/institution_data.csv`, // hardcoded folder and file name
+      },
+    });
+
+    // generate-new-grant-ids
+    const glueJob2Name = "tlef-generate-new-grant-ids";
+    const glueJob2 = new glue.CfnJob(this, glueJob2Name, {
+      name: glueJob2Name,
+      role: glueRole.roleArn,
+      command: {
+        name: "pythonshell",
+        pythonVersion: PYTHON_VER,
+        scriptLocation:
+          "s3://" + glueS3Bucket.bucketName + "/scripts/generate-new-grant-ids.py",
+      },
+      executionProperty: {
+        maxConcurrentRuns: MAX_CONCURRENT_RUNS,
+      },
+      maxRetries: MAX_RETRIES,
+      maxCapacity: MAX_CAPACITY,
+      timeout: TIMEOUT,
+      glueVersion: GLUE_VER,
+      defaultArguments: {
+        "library-set": "analytics",
+        "--BUCKET_NAME": dataBucket.bucketName,
+        "--PROJECT_DETAILS_S3URI": "n/a", // placeholder; the client pastes the real URI in the console
+      },
+    });
+
+    // generate-embeddings-and-similar-projects
+    // (this job name is also referenced by the boto3 start_job_run call in generate-new-grant-ids.py)
+    const glueJob3Name = "tlef-generate-embeddings-and-similar-projects.py";
+    const glueJob3 = new glue.CfnJob(this, glueJob3Name, {
+      name: glueJob3Name,
+      role: glueRole.roleArn,
+      command: {
+        name: "pythonshell",
+        pythonVersion: PYTHON_VER,
+        scriptLocation:
+          "s3://" + glueS3Bucket.bucketName + 
"/scripts/generate-embeddings-and-similar-projects.py", + }, + executionProperty: { + maxConcurrentRuns: MAX_CONCURRENT_RUNS, + }, + maxRetries: MAX_RETRIES, + maxCapacity: MAX_CAPACITY, + timeout: TIMEOUT, + glueVersion: GLUE_VER, + defaultArguments: { + "--additional-python-modules": "sentence-transformers", + "library-set": "analytics", + "--BUCKET_NAME": dataBucket.bucketName, + "--EMBEDDINGS_BUCKET": embeddingsBucket.bucketName, + "--PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI": "n/a", // will be filled by the 2nd glue job + }, + }); + + // Deploy glue job to glue S3 bucket + new s3deploy.BucketDeployment(this, "DeployGlueJobFiles1", { + sources: [s3deploy.Source.asset("./glue/scripts")], + destinationBucket: glueS3Bucket, + destinationKeyPrefix: "scripts", + }); + + // Grant S3 read/write role to Glue + glueS3Bucket.grantReadWrite(glueRole); + dataBucket.grantReadWrite(glueRole); + + // Destroy Glue related resources when PatentDataStack is deleted + glueJob1.applyRemovalPolicy(cdk.RemovalPolicy.DESTROY); + glueJob2.applyRemovalPolicy(cdk.RemovalPolicy.DESTROY); + glueJob3.applyRemovalPolicy(cdk.RemovalPolicy.DESTROY); + } +} diff --git a/backend/cdk/lib/vpc-stack.ts b/backend/cdk/lib/vpc-stack.ts index b70b0ba..0128bdc 100644 --- a/backend/cdk/lib/vpc-stack.ts +++ b/backend/cdk/lib/vpc-stack.ts @@ -32,12 +32,6 @@ export class VpcStack extends Stack { }); this.vpc.addFlowLog("vpcFlowLog"); - const defaultSecutiryGroup = ec2.SecurityGroup.fromSecurityGroupId( - this, - id, - this.vpc.vpcDefaultSecurityGroup - ); - this.vpc.addInterfaceEndpoint("ECR Endpoint", { service: ec2.InterfaceVpcEndpointAwsService.ECR, subnets: { subnetType: ec2.SubnetType.PRIVATE_ISOLATED }, diff --git a/backend/cdk/package-lock.json b/backend/cdk/package-lock.json index a765033..b78b564 100644 --- a/backend/cdk/package-lock.json +++ b/backend/cdk/package-lock.json @@ -1763,12 +1763,14 @@ "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==" + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "dev": true }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "dev": true, "dependencies": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -1979,7 +1981,8 @@ "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==" + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true }, "node_modules/constructs": { "version": "10.3.0", @@ -3393,6 +3396,7 @@ "version": "3.1.2", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "dev": true, "dependencies": { "brace-expansion": "^1.1.7" }, @@ -3731,6 +3735,7 @@ "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", "integrity": 
"sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "dev": true, "bin": { "semver": "bin/semver.js" }