diff --git a/backend/cdk/glue/scripts/clean-survey-monkey-data.py b/backend/cdk/glue/scripts/clean-survey-monkey-data.py
new file mode 100644
index 0000000..0eb610f
--- /dev/null
+++ b/backend/cdk/glue/scripts/clean-survey-monkey-data.py
@@ -0,0 +1,825 @@
+import boto3
+import pandas as pd
+import numpy as np
+import re
+import sys
+import datetime
+from fuzzywuzzy import fuzz
+import collections
+comprehend = boto3.client('comprehend', region_name='ca-central-1')
+
+# Glue parameters
+from awsglue.utils import getResolvedOptions
+# In the Job details tab, each parameter name has an extra -- prefix; Glue expects it.
+# This retrieves the job parameters from the Job details tab into the Glue Python environment.
+args = getResolvedOptions(sys.argv, ["BUCKET_NAME", "INSTITUTION_DATA_S3_URI", "SURVEY_MONKEY_S3URI"])
+
+BUCKET_NAME = args["BUCKET_NAME"]
+INSTITUTION_DATA_S3_URI = args["INSTITUTION_DATA_S3_URI"]
+SURVEY_MONKEY_S3URI = args["SURVEY_MONKEY_S3URI"]
+
+def return_df(bucket, data_key):
+
+    if "s3://" in data_key:  # TIEN a full s3 URI is passed
+        data_location = data_key
+    else:
+        data_location = 's3://{}/{}'.format(bucket, data_key)
+
+    if data_location.endswith('csv'):
+        df = pd.read_csv(data_location)
+    else:
+        df = pd.read_excel(data_location)
+
+    delete_columns = []
+
+    if '"Small TLEF Project Proposal" completion status' in list(df.columns):
+        delete_columns.append('"Small TLEF Project Proposal" completion status')
+    if '"Small TLEF Project Proposal" completion date/time' in list(df.columns):
+        delete_columns.append('"Small TLEF Project Proposal" completion date/time')
+
+    df = df.drop(columns=delete_columns)
+
+    if 'Project Title (200 characters max.)' in list(df.columns):
+        df.dropna(subset=['Project Title (200 characters max.)'], inplace=True)
+    return df
+
+def empty_to_zero(df):
+    return df.fillna(0)
+
+def empty_to_blank(df):
+    return df.fillna('')
+
+def change_na_or_no_to_blank(df, column_name):
+
+    # Blank out exact "not applicable" style answers, e.g. "No", "None.", "Not applicable", "n/a", "NA".
+    df[column_name] = df[column_name].apply(
+        lambda x: re.sub(
+            r'^(No(t\sapplicable\.?|ne\.?)|n(o(t\sapplicable\.?|ne\.?)|/a\.?)|N(/[Aa]|[Aa])\.?)$', '', str(x)
+        ))
+
+    # Blank out answers that begin with "No" as a whole word (e.g., "No special requirements.").
+    # Anchored at the start of the string so legitimate text that merely contains "no"
+    # (e.g., "Piano lab required") is preserved.
+    df[column_name] = df[column_name].apply(
+        lambda x: re.sub(
+            r'^[Nn]o\b.*$', '', str(x)
+        ))
+
+    return df
+
+def course_info_formatting(s):
+    # Insert spaces at the boundaries between letter runs and digit runs,
+    # splitting a compact course string into subject code, number, and suffix.
+    parts = []
+    current_part = ''
+
+    # Iterate over each character in the input string.
+    for char in s:
+        if char.isdigit():
+            if current_part and not current_part[-1].isdigit():
+                # A letter run just ended: flush it and start a digit run.
+                parts.append(current_part)
+                current_part = char
+            else:
+                current_part += char
+        else:
+            if current_part and current_part[-1].isdigit():
+                # A digit run just ended: flush it and start a letter run.
+                parts.append(current_part)
+                current_part = char
+            else:
+                current_part += char
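+    # For example (derived from the rules above): 'MICB300' -> 'MICB 300',
+    # and 'BIOL111A' -> 'BIOL 111 A' once the final part is flushed below.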
+ + # After finishing the loop, if there is any remaining part that hasn't been added to the parts list: + if current_part: + parts.append(current_part) # Add the final part to the parts list. + + return ' '.join(parts) + +def course_info_mapping(df, column_list): + # Compile a regular expression pattern to match course codes (e.g., "CS101") and optional suffixes (e.g., "A/B/C"). + course_info_pattern = re.compile(r'([A-Z]+\d+)([A-Z/]*)') + + # Iterate over each column name provided in column_list. + for column_name in column_list: + new_column_values = [] # Initialize a list to store the new, cleaned values for the current column. + + # Iterate over each value in the current column by its index. + for value_idx in range(len(list(df[column_name]))): + + this_course_info = ''.join(str(df[column_name].iloc[value_idx]).upper().split()) # Extract and clean the current value from the column, making it uppercase and removing spaces. + course_info = re.findall(course_info_pattern, this_course_info) # Use the compiled regex to find all matches of the course code pattern in the cleaned string. + + # If at least one course code is found in the string: + if course_info: + course_code = course_info[0][0] # Extract the base course code from the first match. + suffix = course_info[0][1].strip('/') # Extract and clean the suffix from the first match, removing any trailing slashes. + + # If a suffix exists after cleaning: + if suffix: + # If the suffix contains slashes, indicating multiple course codes are mentioned: + if '/' in suffix: + all_course_codes = [] # Initialize a list to hold all formatted course codes. + detailed_codes = suffix.split('/') # Split the suffix at slashes to get individual codes. + for d_code in detailed_codes: # Iterate over each individual code from the suffix. + this_course_code = course_code + d_code # Append the individual code to the base course code. + all_course_codes.append(course_info_formatting(this_course_code)) # Format and add the full course code to the list. + new_column_values.append(all_course_codes) # Add the list of formatted course codes to the new column values. + continue # Skip to the next value in the column, as processing for this value is complete. + + else: + # If there's a suffix without slashes, append it directly to the base course code. + course_code += suffix + + # Format the complete course code (with or without suffix). + course_code = course_info_formatting(course_code) + new_column_values.append([course_code]) # Add the formatted course code to the new column values as a list containing a single element. + + else: + # If no course code is found, append a blank to the new column values. 
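+                # (Illustrative: free-text entries such as 'Multiple sections' or 'TBD'
+                # contain no LETTERS+DIGITS course pattern, so they fall through to this branch.)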
+ new_column_values.append('') + + df['cleaned_'+column_name] = new_column_values + + return df + +def fill_in_years(mentioned_years): + mentioned_years = [int(year) for year in mentioned_years] + + min_year = min(mentioned_years) + max_year = max(mentioned_years) + + elapsed_years = list(set([str(i) for i in range(min_year, max_year + 1)])) + return elapsed_years + +def check_string_for_20(s): + + # Check if '2020' is a standalone year in the string + if '2020' in s: + return s.replace('20', ''), True + + # Find all occurrences of '20' in the string + index = 0 + while index < len(s): + index = s.find('20', index) + if index == -1: + break + + # Check if '20' is part of a 21st century year e.g., "2023", "2024-2025" + if (index > 0 and s[index - 1].isdigit() and s[index + 2:index + 4].isdigit()): + # Move past this occurrence + index += 2 + continue + + # Check if '20' is after a '-' or standalone e.g., "20" or "2019-20" AND there is nothing after "20" or it's not a number + if (index == 0 or s[index - 1] == '-') and (index + 2 >= len(s) or not s[index + 2].isdigit()): + return s.replace('20', ''), True + + # Move to the next character to continue the search + index += 1 + + # If none of the conditions are met, return False + return s.replace('20', ''), False + +def remove_except_numbers_dash_and_onward_no_space(s): + result = "" + i = 0 + + # Iterate over each character in the string + while i < len(s): + if s[i].isdigit() or s[i] == '-' or s[i] == '/' or s[i] == '&': + result += s[i] + elif s[i:i+6].lower() == 'onward': + result += 'onward' + i += 5 # Skip the length of 'onward' minus one + elif s[i:i+4].lower() == 'and': + result += 'and' + i += 3 # Skip the length of 'and' minus one + i += 1 + + return result + +def remove_numeric_after_slash(s): + result = "" + i = 0 + while i < len(s): + # If the current character is '/' and the next character is a digit + if s[i] == '/' and i + 1 < len(s) and s[i + 1].isdigit(): + # Skip the '/' and the following numeric characters + i += 1 + while i < len(s) and s[i].isdigit(): + i += 1 + else: + # Add the current character to the result and move to the next character + result += s[i] + i += 1 + return result + +def process_colon(s): + # Only extracting year data if the input was '2024-05-01 00:00:00' + if ':' in s: + return s[:4] + else: + return s + +def term_mapping(df, column_list): + terms = ['sep', 'jan', 'may', 'jul'] + for column_name in column_list: + + for value_idx in range(len(list(df[column_name]))): + mentioned_terms = [] + + for term in terms: + if term in str(df[column_name].iloc[value_idx]).lower(): + mentioned_terms.append(term) + + if mentioned_terms: + df[column_name].iloc[value_idx] = ','.join(mentioned_terms) + else: + df[column_name].iloc[value_idx] = '' + + return df + +def year_mapping(df, column_list, year_start=0, upto_future_year=5): + today = datetime.date.today() + this_year = int(str(today.year)[-2:]) + upto_future_year # Up to 'upto_future_year' years into the future + years = [str(j) if j>=10 else '0'+str(j) for j in range(year_start, this_year)] + + for column_name in column_list: + new_column_values = [] + + for value_idx in range(len(list(df[column_name]))): + + mentioned_years = [] + + this_year_data = str(df[column_name].iloc[value_idx]) + if this_year_data == '20245/2025': # Exception + new_column_values.append('') # Add a blank + continue + this_year_data = process_colon(this_year_data) + this_year_data = remove_except_numbers_dash_and_onward_no_space(this_year_data) + this_year_data = 
remove_numeric_after_slash(this_year_data)
+            this_year_data, is_2020 = check_string_for_20(this_year_data)
+
+            if is_2020:
+                mentioned_years.append('2020')
+
+            for year in years:
+                if str(year) in this_year_data:
+                    mentioned_years.append('20'+str(year))
+
+                    if "onward" in this_year_data:
+                        today = datetime.date.today()
+                        this_year = int(str(today.year)[-2:])  # the actual current two-digit year, without the future padding used above
+                        for elapsed_year in range(int(year)+1, this_year):
+                            mentioned_years.append('20'+str(elapsed_year))  # fill in each year after the mentioned one, up to the current year
+
+            if mentioned_years:
+                mentioned_years = list(set(mentioned_years))
+
+                if '-' in this_year_data:
+                    mentioned_years = fill_in_years(mentioned_years)
+                mentioned_years.sort()
+                new_column_values.append(mentioned_years)
+
+            else:
+                new_column_values.append('')  # add a blank
+
+        df['cleaned_'+column_name] = new_column_values
+
+    return df
+
+def get_faculty(input_text):
+
+    faculty_list = [
+        "Faculty of Applied Science",
+        "Faculty of Arts",
+        "Faculty of Dentistry",
+        "Faculty of Education",
+        "First Nations House of Learning",
+        "Faculty of Forestry",
+        "Faculty of Graduate & Postdoctoral Studies",
+        "Faculty of Graduate and Postdoctoral Studies",
+        "Faculty of Land & Food Systems",
+        "Faculty of Land and Food Systems",
+        "Allard School of Law",
+        "Faculty of Medicine",
+        "Faculty of Pharmaceutical Sciences",
+        "Sauder School of Business",
+        "Faculty of Science",
+        "UBC Health",
+        "UBC Library",
+        "Vantage College",
+        "VP Academic",
+        "VP Students"
+    ]
+
+    mentioned_faculties = {}
+
+    # Record the starting index of every (case-insensitive) faculty mention.
+    for faculty in faculty_list:
+        this_mentioned = []
+        start = 0
+        while True:
+            index = input_text.lower().find(faculty.lower(), start)
+            if index == -1:
+                break
+            this_mentioned.append(index)
+            start = index+1
+
+        if this_mentioned:
+            for idx in this_mentioned:
+                mentioned_faculties[idx] = faculty
+
+    # Order mentions by their position in the text.
+    ordered_faculties = collections.OrderedDict(sorted(mentioned_faculties.items()))
+    return ordered_faculties
+
+def ner_mapping(df, column, bucket, data_key, faculty_code_dict):
+
+    df_institution_data = return_df(
+        bucket=bucket,
+        data_key=data_key
+    )
+
+    full_name_list = list(df_institution_data['PREFERRED_FULL_NAME'])
+    department_list = list(df_institution_data['PRIMARY_DEPARTMENT_AFFILIATION'])
+    faculty_list = list(df_institution_data['PRIMARY_FACULTY_AFFILIATION'])
+    rank_list = list(df_institution_data['PRIMARY_ACADEMIC_RANK'])
+    emails_list = list(df_institution_data['EMAIL_ADDRESS'])
+    campus_list = list(df_institution_data['PRIMARY_CAMPUS_LOCATION'])
+
+    source_data = df[column]
+    destination_data = []
+
+    for this_free_text_idx in range(len(source_data)):
+        this_free_text = source_data.iloc[this_free_text_idx]
+        if isinstance(this_free_text, str) and this_free_text != '':  # only accept valid, non-empty strings
+            associated_entities = retrieve_member_info(this_free_text)
+            cleaned_associated_entities = add_institution_info(
+                associated_entities,
+                full_name_list,
+                department_list,
+                faculty_list,
+                rank_list,
+                emails_list,
+                campus_list,
+                faculty_code_dict
+            )
+            destination_data.append(cleaned_associated_entities)
+        else:
+            destination_data.append(this_free_text)
+    df['cleaned_Team Members'] = destination_data
+    return df
+
+def retrieve_member_info(input_text):
+    detected_entities = comprehend.detect_entities(Text=input_text, LanguageCode='en')['Entities']
+    mentioned_faculties = get_faculty(input_text)
+    faculties_idx = list(mentioned_faculties.keys())
+
+    names_dict = {}
+    names = []
+    remaining_names = []
+
+    emails = []
+    emails_idx = []
+    remaining_emails = []
+
+    for entity in detected_entities:
+        if 
entity['Type'] == 'PERSON': + if ' ' in entity['Text'] and not ('President' in entity['Text'] or 'Dean' in entity['Text']): + names_dict[entity['BeginOffset']] = entity['Text'] + + elif entity['Type'] == 'OTHER': + if '@' in entity['Text']: + emails.append(entity['Text']) + emails_idx.append(entity['BeginOffset']) + + names = list(names_dict.values()) + remaining_names = names.copy() + remaining_emails = emails.copy() + + associated_entities = {} + + if len(names) == len(emails): + for i in range(len(names)): + associated_entities[names[i]] = {} + associated_entities[names[i]]['Team Member Email'] = emails[i] + remaining_names = [] + remaining_emails = [] + + elif len(emails) < len(names): + for i in range(len(emails)): + remove_name = "" + remove_email = "" + + this_fuzz_seq = {} + this_fuzz_seq[0] = "blank" + for name in names: + this_fuzz_ratio = fuzz.ratio(emails[i].split('@')[0].lower(), name.lower()) + if this_fuzz_ratio > list(this_fuzz_seq.keys())[0]: + del this_fuzz_seq[list(this_fuzz_seq.keys())[0]] + this_fuzz_seq[int(fuzz.ratio(emails[i].split('@')[0].lower(), name.lower()))] = name + + associated_entities[this_fuzz_seq[list(this_fuzz_seq.keys())[0]]] = {} + associated_entities[this_fuzz_seq[list(this_fuzz_seq.keys())[0]]]['Team Member Email'] = emails[i] + remove_name = str(this_fuzz_seq[list(this_fuzz_seq.keys())[0]]) + remove_email = emails[i] + + if remove_name in remaining_names: + remaining_names.remove(remove_name) + + if remove_email in remaining_emails: + remaining_emails.remove(remove_email) + + if remaining_names: + for r_name in remaining_names: + associated_entities[r_name] = {} + associated_entities[r_name]['Team Member Email'] = "" + + elif len(names) < len(emails): + for i in range(len(names)): + remove_name = "" + remove_email = "" + + this_fuzz_seq = {} + this_fuzz_seq[0] = "blank" + for email in emails: + this_fuzz_ratio = fuzz.ratio(email.split('@')[0].lower(), names[i].lower()) + if this_fuzz_ratio > list(this_fuzz_seq.keys())[0]: + del this_fuzz_seq[list(this_fuzz_seq.keys())[0]] + this_fuzz_seq[ + int(fuzz.ratio(email.split('@')[0].lower(), names[i].lower()))] = names[i] + + associated_entities[this_fuzz_seq[list(this_fuzz_seq.keys())[0]]] = {} + associated_entities[this_fuzz_seq[list(this_fuzz_seq.keys())[0]]]['Team Member Email'] = email + remove_name = str(this_fuzz_seq[list(this_fuzz_seq.keys())[0]]) + remove_email = email + + if remove_name in remaining_names: + remaining_names.remove(remove_name) + + if remove_email in remaining_emails: + remaining_emails.remove(remove_email) + + # Finally, update the faculty info + for n_idx in names_dict.keys(): + higher_f_idx = [f_idx for f_idx in faculties_idx if f_idx > n_idx] + if higher_f_idx: + this_key = names_dict[n_idx] + associated_entities[this_key]['Team Member Faculty'] = mentioned_faculties[min(higher_f_idx)] + + return associated_entities + +def add_institution_info( + associated_entities, + full_name_list, + department_list, + faculty_list, + rank_list, + emails_list, + campus_list, + faculty_code_dict +): + + cleaned_associated_entities = {} + + for extracted_full_name in associated_entities.keys(): + + selected_name = "" + selected_rank = "" + selected_department = "" + selected_campus = "" + selected_faculty = None + + match_score = 0 + for i in range(len(full_name_list)): + this_fuzz_ratio = fuzz.ratio(extracted_full_name.lower(), full_name_list[i].lower()) + if this_fuzz_ratio > 50 and this_fuzz_ratio > match_score: + if 'Team Member Email' in 
list(associated_entities[extracted_full_name].keys()): + if associated_entities[extracted_full_name]['Team Member Email'].split('@')[0].lower() == str(emails_list[i]).split('@')[0].lower(): + + selected_name = full_name_list[i] + selected_rank = rank_list[i] + selected_department = department_list[i] + + if 'UBC Vancouver' in campus_list[i]: + selected_campus = "UBCV" + elif 'UBC Okanagan' in campus_list[i]: + selected_campus = "UBCO" + else: + selected_campus = "External" + + if not 'Faculty' in list(associated_entities[extracted_full_name].keys()): + selected_faculty = faculty_list[i] + + if not selected_name == "": + cleaned_associated_entities[selected_name] = {} + cleaned_associated_entities[selected_name] = associated_entities[extracted_full_name] + cleaned_associated_entities[selected_name]['Team Member Department'] = selected_department + cleaned_associated_entities[selected_name]['Team Member Title'] = selected_rank + cleaned_associated_entities[selected_name]['Campus'] = selected_campus + + if selected_faculty: + if selected_faculty in list(faculty_code_dict.keys()): + cleaned_associated_entities[selected_name]['Team Member Faculty'] = faculty_code_dict[selected_faculty] + else: + cleaned_associated_entities[selected_name]['Team Member Faculty'] = 'NA' + + else: + cleaned_associated_entities[extracted_full_name] = associated_entities[extracted_full_name] + if 'Team Member Faculty' in list(cleaned_associated_entities[extracted_full_name].keys()): + if cleaned_associated_entities[extracted_full_name]['Team Member Faculty'] in list(faculty_code_dict.keys()): + cleaned_associated_entities[extracted_full_name]['Team Member Faculty'] = faculty_code_dict[cleaned_associated_entities[extracted_full_name]['Team Member Faculty']] + return cleaned_associated_entities + +def assign_faculty_code(df, faculty_code_dict, col_name="Project Faculty"): + new_project_faculty = [] + project_faculty = list(df[col_name]) + + for proj_fac in project_faculty: + if proj_fac in list(faculty_code_dict.keys()): + new_project_faculty.append(faculty_code_dict[proj_fac]) + else: + new_project_faculty.append(proj_fac) + + df[col_name] = new_project_faculty + + return df + +def add_project_type_col(df): + project_types = None + + if 'Application Title' in list(df.columns): + application_titles = list(df['Application Title']) + project_types = [] + project_types = ['Small' if 'SP' in application_titles[i] else 'Large' for i in range(len(application_titles))] + + if project_types: + df['Project Type'] = project_types + + return df + +def add_funding_year_col(df): + funding_years = None + + if 'Application Title' in list(df.columns): + application_titles = list(df['Application Title']) + funding_years = [] + funding_years = [application_titles[i].split('-')[0] for i in range(len(application_titles))] + + if funding_years: + df['Funding Year'] = funding_years + + return df + +def remove_text_inside_brackets(text): + # Regular expression to match text inside round brackets including the brackets + pattern = r'\(.*?\)\s*' + # Replace the matched text with an empty string + cleaned_text = re.sub(pattern, '', text) + return cleaned_text.strip() + +def focus_areas_list_change(df): + new_focus_areas = None + if 'Project Focus Areas' in list(df.columns): + new_focus_areas = [] + new_focus_areas = [remove_text_inside_brackets( + focus_text).split( + ',') for focus_text in list( + df['Project Focus Areas'] + )] + + if new_focus_areas: + df['Project Focus Areas'] = new_focus_areas + + return df + +def rename_columns(df): + 
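+    # Note: pandas de-duplicates repeated spreadsheet headers with '.1', '.2', ...
+    # suffixes; the replace() calls below keep those suffixes, so a header like
+    # 'Students Reached by the Project | | Course Code.3' becomes 'project_course_code.3'.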
new_col_names = {} + + new_col_names['Project Title (200 characters max.)'] = 'Project Title' + new_col_names['What project year does this proposal pertain to?'] = 'Project Year' + new_col_names['Principal Applicant | Principal Applicant’s name:'] = 'PI' + new_col_names['Principal Applicant | Principal Applicant’s title(s) (e.g. Assistant Professor, Lecturer, Professor of Teaching, etc.):'] = 'PI Title' + new_col_names['Principal Applicant | Principal Applicant’s primary (UBC) email address:'] = 'PI Email' + new_col_names['Principal Applicant | Principal Applicant’s role:'] = 'PI Role' + new_col_names['Principal Applicant | Principal Applicant’s Faculty, College, or administrative unit:'] = 'Project Faculty' + new_col_names['Principal Applicant | Principal Applicant’s Department, School, or unit:'] = 'Department' + new_col_names['Special Classroom or Facilities Requirements (150 words max.)'] = 'Special Classroom or Facilities Requirements' + new_col_names['Co-Applicants & Project Team Members (500 words max.)'] = 'Team Members' + + for col in list(df.columns): + if 'Students Reached by the Project | | Academic Year' in col: + new_col_names[col] = col.replace('Students Reached by the Project | | Academic Year', 'project_academic_year') + + if 'Students Reached by the Project | | Course Code' in col: + new_col_names[col] = col.replace('Students Reached by the Project | | Course Code', 'project_course_code') + + if 'Students Reached by the Project | | Term (Sep/Jan/May)' in col: + new_col_names[col] = col.replace('Students Reached by the Project | | Term (Sep/Jan/May)', 'project_term') + + if 'Students Reached by the Project | | Term (Sep/Jan/May)' in col: + new_col_names[col] = col.replace('Students Reached by the Project | | Term (Sep/Jan/May)', 'project_term') + + df = df.rename(columns=new_col_names) + + return df + +def generate_faculty_engagement_xlsx(df, ner_col_name='cleaned_Team Members'): + + funding_year = [] + project_type = [] + application_title = [] + project_faculty = [] + team_member_name = [] + team_member_title = [] + team_member_stream = [] + campus = [] + team_member_faculty = [] + team_member_department = [] + team_member_email = [] + + + for proj_idx in range(len(df)): + this_member_info = df[ner_col_name].iloc[proj_idx] + if not isinstance(this_member_info, int) and this_member_info != '': # Not an integer or a blank string + for this_team_member_name in list(this_member_info.keys()): + + funding_year.append(df['Funding Year'].iloc[proj_idx]) + project_type.append(df['Project Type'].iloc[proj_idx]) + application_title.append(df['Application Title'].iloc[proj_idx]) + project_faculty.append(df['Project Faculty'].iloc[proj_idx]) + + team_member_name.append(this_team_member_name) + + if 'Team Member Title' in list(this_member_info[this_team_member_name].keys()): + team_member_title.append(this_member_info[this_team_member_name]['Team Member Title']) + else: + team_member_title.append('') + + if 'Team Member Stream' in list(this_member_info[this_team_member_name].keys()): + team_member_stream.append(this_member_info[this_team_member_name]['Team Member Stream']) + else: + team_member_stream.append('') + + if 'Campus' in list(this_member_info[this_team_member_name].keys()): + campus.append(this_member_info[this_team_member_name]['Campus']) + else: + campus.append('') + + if 'Team Member Faculty' in list(this_member_info[this_team_member_name].keys()): + team_member_faculty.append(this_member_info[this_team_member_name]['Team Member Faculty']) + else: + 
team_member_faculty.append('') + + if 'Team Member Department' in list(this_member_info[this_team_member_name].keys()): + team_member_department.append(this_member_info[this_team_member_name]['Team Member Department']) + else: + team_member_department.append('') + + if 'Team Member Email' in list(this_member_info[this_team_member_name].keys()): + team_member_email.append(this_member_info[this_team_member_name]['Team Member Email']) + else: + team_member_email.append('') + + n_rows = len(funding_year) + + faculty_engagement_df = pd.DataFrame({ + 'funding_year': funding_year, + 'project_type': project_type, + 'project_id': application_title, + 'grant_id': [''] * n_rows, # This info is not present in the raw survey monkey dataset + 'project_faculty': project_faculty, + 'member_name': team_member_name, + 'member_title': team_member_title, + 'member_stream': team_member_stream, + 'member_campus': campus, + 'member_faculty': team_member_faculty, + 'member_unit': team_member_department, + 'member_other': [''] * n_rows # This info is not available in the institution_data.csv file + }) + + # Unused: team_member_email + + return faculty_engagement_df + +def generate_project_details_xlsx(df): + funding_year = df['Funding Year'] + n_rows = len(funding_year) + project_type = df['Project Type'] + grant_id = df['Application Title'] + project_id = [''] * n_rows # This info is not present in the raw survey monkey dataset + project_faculty = df['Project Faculty'] + pi_name = df['PI'] + pi_unit = df['Department'] + funding_amount = df['Total Project Budget'] + title = df['Project Title'] + summary = df['Summary of Work Accomplished to Date (1000 words max.)'] + co_applicants = df['Team Members'] + generated_grant_id = [''] * n_rows # This info is not generated at this stage + project_year = [''] * n_rows # This info is not present in the raw survey monkey dataset + project_status = [''] * n_rows # This info is not present in the raw survey monkey dataset + + project_details_df = pd.DataFrame({ + 'funding_year': funding_year, + 'project_type': project_type, + 'grant_id': grant_id, + 'project_id': project_id, + 'project_faculty': project_faculty, + 'pi_name': pi_name, + 'pi_unit': pi_unit, + 'funding_amount': funding_amount, + 'title': title, + 'summary': summary, + 'co_applicants': co_applicants, + 'generated_grant_id': generated_grant_id, + 'project_year': project_year, + 'project_status': project_status + }) + + return project_details_df + +def tlef_raw_data_preprocessing(bucket, raw_data_key, institution_data_key): + + faculty_code_dict = { + "Faculty of Applied Science": "APSC", + "Faculty of Arts": "ARTS", + "Faculty of Dentistry": "DENT", + "Faculty of Education": "EDUC", + "First Nations House of Learning": "ARTS", + "Faculty of Forestry": "FRST", + "Faculty of Graduate & Postdoctoral Studies": "GRAD", + "Faculty of Graduate and Postdoctoral Studies": "GRAD", + "Faculty of Land & Food Systems": "LFS", + "Faculty of Land and Food Systems": "LFS", + "Allard School of Law": "LAW", + "Faculty of Medicine": "MED", + "Faculty of Pharmaceutical Sciences": "PHAR", + "Sauder School of Business": "COMM", + "Faculty of Science": "SCI", + "UBC Health": "HLTH", + "UBC Library": "LIBR", + "Vantage College": "VANT", + "VP Academic": "VPA", + "VP Students": "VPS" + } + + df = return_df( + bucket=bucket, + data_key=raw_data_key + ) + + print("1. Transforming empty values to blanks...") + df = empty_to_blank(df) + + print("\n\n2. 
Blanking 'No'/'N/A'-style answers in the special requirements column...")
+    df = change_na_or_no_to_blank(df, 'Special Classroom or Facilities Requirements (150 words max.)')
+
+    course_code_columns_list = [f'Students Reached by the Project | | Course Code{f".{i}" if i > 0 else ""}' for i in range(10)]
+    print("\n\n3. Mapping course codes...")
+    df = course_info_mapping(df, course_code_columns_list)
+
+    years_columns_list = [f'Students Reached by the Project | | Academic Year{f".{i}" if i > 0 else ""}' for i in range(10)]
+    print("\n\n4. Mapping years...")
+    df = year_mapping(df, years_columns_list)
+
+    terms_columns_list = [f'Students Reached by the Project | | Term (Sep/Jan/May){f".{i}" if i > 0 else ""}' for i in range(10)]
+    print("\n\n5. Mapping terms...")
+    df = term_mapping(df, terms_columns_list)
+
+    ner_column = 'Co-Applicants & Project Team Members (500 words max.)'
+
+    print("\n\n6. Extracting named entities (team member names and emails)...")
+    df = ner_mapping(df, ner_column, bucket, institution_data_key, faculty_code_dict)
+
+    print("\n\n7. Adding a 'Project Type' column indicating whether the project was Large or Small...")
+    df = add_project_type_col(df)
+
+    print("\n\n8. Adding a 'Funding Year' column with the year in which the project received funding...")
+    df = add_funding_year_col(df)
+
+    print("\n\n9. Converting the Focus Areas column into a list of the focus areas relevant to each project...")
+    df = focus_areas_list_change(df)
+
+    print("\n\n10. Rename columns...")
+    df = rename_columns(df)
+
+    print("\n\n11. Change full faculty name to its faculty code...")
+    df = assign_faculty_code(df, faculty_code_dict, col_name="Project Faculty")
+
+    print("\n\n12. Generate faculty engagement xlsx dataset...")
+    faculty_engagement_df = generate_faculty_engagement_xlsx(df)
+
+    print("\n\n13. 
Generate project details xlsx dataset...") + project_details_df = generate_project_details_xlsx(df) + + print("Process completed!") + + return df, faculty_engagement_df, project_details_df + +clean_df, faculty_engagement_df, project_details_df = tlef_raw_data_preprocessing( + bucket=BUCKET_NAME, + raw_data_key = SURVEY_MONKEY_S3URI, + institution_data_key = INSTITUTION_DATA_S3_URI +) + +current_year = project_details_df["funding_year"][0] # TIEN, get the year + +# save directly to s3 +#clean_df.to_excel(f's3://{BUCKET_NAME}/raw/clean_survey_monkey_{current_year}.xlsx', index=False) # TIEN +faculty_engagement_df.to_excel(f's3://{BUCKET_NAME}/raw/faculty_engagement/faculty_engagement_{current_year}.xlsx', index=False) # TIEN +project_details_df.to_excel(f's3://{BUCKET_NAME}/raw/project_details/project_details_{current_year}.xlsx', index=False) # TIEN \ No newline at end of file diff --git a/backend/cdk/glue/scripts/generate-embeddings-and-similar-projects.py b/backend/cdk/glue/scripts/generate-embeddings-and-similar-projects.py new file mode 100644 index 0000000..57eab58 --- /dev/null +++ b/backend/cdk/glue/scripts/generate-embeddings-and-similar-projects.py @@ -0,0 +1,282 @@ +from sentence_transformers import SentenceTransformer, util +import boto3 +from io import BytesIO +import pandas as pd +import numpy as np +import pickle +import os +import sys +from scipy.spatial.distance import cdist + +# Glue parameters +from awsglue.utils import getResolvedOptions +args = getResolvedOptions(sys.argv, ["BUCKET_NAME", "EMBEDDINGS_BUCKET", "PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI"]) + +BUCKET_NAME = args["BUCKET_NAME"] +EMBEDDINGS_BUCKET = args["EMBEDDINGS_BUCKET"] +PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI = args["PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI"] + +def createDir(path): + os.makedirs(path, exist_ok=True) # using os.markdirs to also create intermediate directories + return os.path.join(os.getcwd(), path) + +model_custom_path = createDir("cache/torch/sentence_transformers") + +def return_df(bucket, data_key): # TIEN + + if "s3://" in data_key: # TIEN a full s3 URI is passed + data_location = data_key + else: + data_location = 's3://{}/{}'.format(bucket, data_key) + + if ".parquet" in data_key: + df = pd.read_parquet(data_location) + else: + df = pd.read_excel(data_location) + + return df + +project_details_df = return_df( + bucket = BUCKET_NAME, #previously 'clean-full-data', + data_key = PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI # previously 'project_details_with_new_ids.xlsx' +) + +def find_all_summaries(model='all-mpnet-base-v2'): + + embedding_model = SentenceTransformer('all-mpnet-base-v2', cache_folder = model_custom_path) + + for i in range(len(list(project_details_df.project_id))): + + this_project_id = project_details_df.project_id[i] + if this_project_id != this_project_id: # i.e., project_id is NaN + this_project_id = project_details_df.generated_grant_id[i] # use the automatically generated grant ID instead + + # Find all rows where either the corresponding 'project_id' column values are equal to this_project_id + this_relevant_df = project_details_df.loc[project_details_df.project_id == this_project_id] + this_project_context = "" + this_project_context_embedding = None + + if this_relevant_df.empty: + # Whatever we have right now is the only occurrence + this_title = project_details_df.title[i] # TIEN project_title -> title + this_summary = project_details_df.summary[i] # TIEN project_summary -> summary + + if not this_title != this_title and not this_summary != this_summary: # 
i.e., both title and summary are not NaN
+
+                this_title_summary = this_title + '. ' + this_summary + ' '  # concatenate title and summary into one string
+                # create embeddings with the concatenated string
+                this_title_summary_embedding = embedding_model.encode(this_title_summary, convert_to_tensor=True)
+
+                this_project_context = this_title_summary
+                this_project_context_embedding = this_title_summary_embedding
+
+        else:
+            # Generate one context for embedding
+            this_project_context, this_project_context_embedding = generate_context_embeddings(this_relevant_df, embedding_model)
+
+        store_context_and_embeddings(
+            this_project_context,
+            this_project_context_embedding,
+            bucket = EMBEDDINGS_BUCKET, #'tlef-project-summary-embeddings',
+            data_key = this_project_id
+        )
+
+def check_and_update_embeddings(bucket, data_key, current_context, current_embeddings, threshold=0.96):
+    """
+    Check if embeddings exist, compare them, and update if similarity is above threshold.
+    """
+    s3_client = boto3.client('s3')
+    context_key = f'{data_key}.pkl'
+    embeddings_key = f'{data_key}_embeddings.pkl'
+
+    try:
+        # Check if the context and embeddings files exist
+        context_object = s3_client.get_object(Bucket=bucket, Key=context_key)
+        embeddings_object = s3_client.get_object(Bucket=bucket, Key=embeddings_key)
+
+        # Load existing context and embeddings
+        existing_context = pickle.loads(context_object['Body'].read())
+        existing_embeddings = pickle.loads(embeddings_object['Body'].read())
+
+        # Compare embeddings using cosine similarity
+        similarity = util.pytorch_cos_sim(current_embeddings, existing_embeddings)
+
+        if similarity > threshold:
+            # Update context and embeddings if similarity is high
+            updated_context = existing_context + " " + current_context
+            updated_embeddings = pickle.dumps(current_embeddings)  # Assuming current_embeddings is the updated embeddings
+
+            # Save updated context and embeddings
+            s3_client.put_object(Bucket=bucket, Key=context_key, Body=pickle.dumps(updated_context))
+            s3_client.put_object(Bucket=bucket, Key=embeddings_key, Body=updated_embeddings)
+            return True
+
+    except s3_client.exceptions.NoSuchKey:
+        # If the files do not exist, proceed with the original saving process
+        return False
+
+    return False
+
+def store_context_and_embeddings(project_context, project_context_embedding, bucket, data_key):
+    if project_context_embedding is not None:  # If it is not NaN
+        if not check_and_update_embeddings(bucket, data_key, project_context, project_context_embedding):
+            # If the embeddings are not updated due to no existing files or low similarity, save them as new
+            s3_client = boto3.client('s3')
+            context_bytes = pickle.dumps(project_context)
+            embeddings_bytes = pickle.dumps(project_context_embedding)
+
+            context_key = f'{data_key}.pkl'
+            embeddings_key = f'{data_key}_embeddings.pkl'
+
+            s3_client.put_object(Bucket=bucket, Key=context_key, Body=context_bytes)
+            s3_client.put_object(Bucket=bucket, Key=embeddings_key, Body=embeddings_bytes)
+
+def generate_context_embeddings(relevant_df, embedding_model):
+
+    context = ""
+    context_embedding = None
+
+    # iterate through each row of the "relevant" df
+    for index, row in relevant_df.iterrows():
+        if context_embedding is None:
+            current_context_embedding = embedding_model.encode(context, convert_to_tensor=True)
+        else:
+            current_context_embedding = context_embedding
+
+        this_title = row.title  # TIEN project_title -> title
+        this_summary = row.summary  # TIEN project_summary -> summary
+
+        if not this_title != this_title and not this_summary != this_summary:  # i.e., both title 
and summary are not NaN + + this_title_summary = this_title + '. ' + this_summary + ' ' + this_title_summary_embedding = embedding_model.encode(this_title_summary, convert_to_tensor=True) + + if util.cos_sim(current_context_embedding, this_title_summary_embedding)[0] < 0.96: # The new title+summary has some differences + context += this_title_summary + context_embedding = embedding_model.encode(context, convert_to_tensor=True) + + return context, context_embedding + +find_all_summaries() + + +############## END OF GENERATING EMBEDDINGS PART + +############## BEGIN OF COMPUTING SIMILARITY PART + +def generate_embeddings_database(bucket_name): + # Initialize a boto3 client + s3 = boto3.client('s3') + + # Create a reusable Paginator + paginator = s3.get_paginator('list_objects_v2') + + # Create a PageIterator from the Paginator + page_iterator = paginator.paginate(Bucket=bucket_name) + + embeddings_database = {} + + for page in page_iterator: + for obj in page['Contents']: + key = obj['Key'] + if 'embeddings' in key: + # Get the object from S3 + response = s3.get_object(Bucket=bucket_name, Key=key) + body = response['Body'].read() + + # Deserialize the embeddings + this_embedding = pickle.loads(body) + + embeddings_database[key] = this_embedding + + return embeddings_database + +def generate_similar_projects_database(bucket_name, embeddings_database, top_k=10): + # Initialize a boto3 client + s3 = boto3.client('s3') + + # Create a reusable Paginator + paginator = s3.get_paginator('list_objects_v2') + + # Create a PageIterator from the Paginator + page_iterator = paginator.paginate(Bucket=bucket_name) + + similar_projects_database = {} + + for page in page_iterator: + for obj in page['Contents']: + key = obj['Key'] + if 'embeddings' in key: + # Get the object from S3 + s3_response = s3.get_object(Bucket=bucket_name, Key=key) + body = s3_response['Body'].read() + + # Deserialize the embeddings + this_embedding = pickle.loads(body) + + # Convert embeddings_database values into a list for comparison + database_embeddings_list = list(embeddings_database.values()) + database_keys_list = list(embeddings_database.keys()) + + print(key) + print(type(this_embedding)) + print(this_embedding) + print("-----------") + print(type(database_embeddings_list)) + + # Assuming embeddings_database is a dict with file names as keys and embeddings as values + embeddings_matrix = np.stack(database_embeddings_list) + this_embedding_matrix = np.array([this_embedding]) + + print("Here") + + # Calculate cosine similarities (using 1 - cosine distance) + similarities = 1 - cdist(this_embedding_matrix, embeddings_matrix, 'cosine') + sorted_indices = np.argsort(similarities[0])[::-1] # Sorting indices in descending order + + print("Here2") + + # Store top k similar projects + similar_projects_database[key.replace("_embeddings.pkl", "")] = [database_keys_list[i].replace("_embeddings.pkl", "") for i in sorted_indices[1:top_k+1]] + + print("End") + + return similar_projects_database + +def save_similar_projects_database(similar_projects_database, bucket_name, file_name): # TIEN + # Convert the similar projects database to a DataFrame for easy Excel writing + df = pd.DataFrame(list(similar_projects_database.items()), columns=['project_key', 'similar_projects']) + + # Convert the list of similar projects into a string for better Excel visualization + df['similar_projects'] = df['similar_projects'].apply(lambda x: ', '.join(x)) + + df.to_parquet(f"s3://{bucket_name}/{file_name}") + + # # Create a BytesIO buffer to hold the Excel 
file in memory + # excel_buffer = BytesIO() + # # Write the DataFrame to an Excel file in the buffer + # with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer: + # df.to_excel(writer, sheet_name='similar_projects', index=False) + + # # Go to the beginning of the BytesIO buffer + # excel_buffer.seek(0) + + # # Initialize a boto3 client + # s3_client = boto3.client('s3') + # # Upload the Excel file from the buffer to the S3 bucket + # s3_client.put_object(Bucket=bucket_name, Key=file_name, Body=excel_buffer.getvalue(), ContentType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet') + + # # Close the BytesIO buffer + # excel_buffer.close() + +def main(): + + embeddings_database = generate_embeddings_database(EMBEDDINGS_BUCKET) #'tlef-project-summary-embeddings' + + similar_projects_database = generate_similar_projects_database(EMBEDDINGS_BUCKET, embeddings_database) #'tlef-project-summary-embeddings' + + save_similar_projects_database(similar_projects_database, BUCKET_NAME, "staging/similar_projects.parquet") # similar_projects.xlsx + + +if __name__ == "__main__": + main() diff --git a/backend/cdk/glue/scripts/generate-new-grant-ids.py b/backend/cdk/glue/scripts/generate-new-grant-ids.py new file mode 100644 index 0000000..cb6b08f --- /dev/null +++ b/backend/cdk/glue/scripts/generate-new-grant-ids.py @@ -0,0 +1,162 @@ +import pandas as pd +import boto3 +import s3fs + +import sys +import io +import boto3 +from botocore.exceptions import ClientError + +# Glue parameters +from awsglue.utils import getResolvedOptions +args = getResolvedOptions(sys.argv, ["BUCKET_NAME", "PROJECT_DETAILS_S3URI"]) + +BUCKET_NAME = args["BUCKET_NAME"] +PROJECT_DETAILS_S3URI = args["PROJECT_DETAILS_S3URI"] + + +def return_df(bucket, data_key): + + if "s3://" in data_key: # TIEN a full s3 URI is passed + data_location = data_key + else: + data_location = 's3://{}/{}'.format(bucket, data_key) + + df = pd.read_excel(data_location) + + return df + +project_details_df = return_df( + bucket = BUCKET_NAME, + data_key = PROJECT_DETAILS_S3URI +) + +def store_df(df, bucket, data_key): + + data_location = 's3://{}/{}'.format(bucket, data_key) + + if '.parquet' in data_key: + df.to_parquet(data_location, index=False) + else: + df.to_excel(data_location, index=False) + +def assign_grant_ids(project_details_df): + + grant_ids = [] + + for proj_idx in range(len(project_details_df)): + + this_proj_funding_year = project_details_df.funding_year.iloc[proj_idx] + + if ';' in project_details_df.project_faculty.iloc[proj_idx]: # There are multiple project faculties + this_proj_faculty = "" + + faculty_list = project_details_df.project_faculty.iloc[proj_idx].split(';') # Add all faculties to the grant ID + for this_faculty in faculty_list: + this_proj_faculty += this_faculty.strip() + '-' + + this_proj_faculty = this_proj_faculty[:-1] # Remove the last '-' + + else: + this_proj_faculty = project_details_df.project_faculty.iloc[proj_idx].strip() + + this_proj_type = "" + + existing_grant_id = project_details_df.grant_id.iloc[proj_idx] + + if not existing_grant_id != existing_grant_id: # Existing Grant ID is not NaN + if len(existing_grant_id.split('-')) >= 5: # Existing Grant ID has 5 components or 6 in some cases ('2015-TLEF-LP1-ARTS-Giltrow-A' and '2015-TLEF-LP1-ARTS-Giltrow-B') + + if "SP" in existing_grant_id.split('-')[2] or "LP" in existing_grant_id.split('-')[2]: + this_proj_type = existing_grant_id.split('-')[2] # Extract project type from the existing Grant ID + + elif "FL" in 
existing_grant_id.split('-')[1]:  # Convert it into an LP but keep the number, e.g., FL2 -> LP2
+                    if existing_grant_id.split('-')[1][-1].isdigit():  # E.g., "FL2"
+                        this_proj_type = "LP" + existing_grant_id.split('-')[1][-1]
+
+                    else:
+                        this_proj_type = "LP"
+
+                elif "UDL" in existing_grant_id.split('-')[1]:  # Convert it into an SP but keep the number, e.g., UDL2 -> SP2
+                    if existing_grant_id.split('-')[1][-1].isdigit():  # E.g., "UDL2"
+                        this_proj_type = "SP" + existing_grant_id.split('-')[1][-1]
+
+                    else:
+                        this_proj_type = "SP"
+
+                elif "ITTG" in existing_grant_id.split('-')[2]:  # E.g., Grant ID = 2020-ITTG-ARTS-Mawani
+                    this_proj_id = project_details_df.project_id.iloc[proj_idx]  # Extract the corresponding Project ID, e.g., 2020-SP-ARTS-008
+
+                    if "SP" in this_proj_id.split('-')[1] or "LP" in this_proj_id.split('-')[1]:
+                        this_proj_type = this_proj_id.split('-')[1]  # Extract and store project type from the project ID
+
+        if this_proj_type == "":
+            this_proj_type = "SP" if project_details_df.project_type.iloc[proj_idx] == "Small" else "LP"
+
+        if ';' in project_details_df.pi_name.iloc[proj_idx]:  # If there are multiple PIs
+            this_proj_PI_surname = ""
+
+            PI_list = project_details_df.pi_name.iloc[proj_idx].split(';')
+            for PI_name in PI_list:
+                this_PI_surname = PI_name.split(' ')[-1]
+                this_proj_PI_surname += this_PI_surname.strip() + '-'  # Add all PI surnames to the grant ID
+
+            this_proj_PI_surname = this_proj_PI_surname[:-1]  # Remove the last '-'
+
+        else:
+            this_proj_PI_surname = project_details_df.pi_name.iloc[proj_idx].split(' ')[-1].strip()
+
+        this_proj_grant_id = str(this_proj_funding_year) + "-TLEF-" + this_proj_type + '-' + this_proj_faculty + '-' + this_proj_PI_surname
+
+        if this_proj_grant_id in grant_ids:  # This PI has multiple TLEF projects in the same year
+            existing_grant_id_idx = grant_ids.index(this_proj_grant_id)
+            grant_ids[existing_grant_id_idx] = this_proj_grant_id + '-A'
+            grant_ids.append(this_proj_grant_id + '-B')
+
+        else:
+            grant_ids.append(this_proj_grant_id)
+
+    project_details_df["generated_grant_id"] = grant_ids
+    return project_details_df
+
+def convert_summary_to_strings(project_details_df):
+
+    summary_list = []
+    for proj_idx in range(len(project_details_df)):
+
+        this_summary = project_details_df.summary.iloc[proj_idx]
+        if this_summary != this_summary:  # True only when the value is NaN
+            summary_list.append("")
+        else:
+            summary_list.append(this_summary)
+
+    project_details_df.summary = summary_list
+
+    return project_details_df
+
+
+
+current_year = project_details_df["funding_year"][0]
+file_key = f"staging/project_details/project_details_with_new_ids_{current_year}.parquet"
+
+store_df(
+    df=convert_summary_to_strings(
+        assign_grant_ids(
+            project_details_df)),
+    bucket = BUCKET_NAME,
+    data_key = file_key
+)
+
+
+# Start the generate-embeddings-and-similar-projects Glue job
+arguments = {
+    "--BUCKET_NAME": BUCKET_NAME,
+    "--EMBEDDINGS_BUCKET": "tlef-project-summary-embeddings",
+    "--PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI": f"s3://{BUCKET_NAME}/{file_key}",
+    "--additional-python-modules": "sentence-transformers"
+}
+glue_client = boto3.client("glue")
+glue_client.start_job_run(
+    JobName="tlef-generate-embeddings-and-similar-projects.py",  # must match the job name defined in datapipeline-stack.ts
+    Arguments=arguments
+)
diff --git a/backend/cdk/lambda/createFolders/createFolders.py b/backend/cdk/lambda/createFolders/createFolders.py
new file mode 100644
index 0000000..69b2621
--- /dev/null
+++ b/backend/cdk/lambda/createFolders/createFolders.py
@@ -0,0 +1,16 @@
+import os
+import boto3 + +BUCKET_NAME = os.environ["BUCKET_NAME"] + +s3 = boto3.client("s3") + +def lambda_handler(event, context): + + # CREATE A FOLDER STRUCTURE FOR THE AMPLIFY S3 STORAGE with a public/ top level folder + + s3.put_object(Bucket=BUCKET_NAME, Key="raw-data/") + s3.put_object(Bucket=BUCKET_NAME, Key="staging-data/") + s3.put_object(Bucket=BUCKET_NAME, Key="production-data/") + + print("Trigger Function scripts finished execution successfully!") \ No newline at end of file diff --git a/backend/cdk/lib/datapipeline-stack.ts b/backend/cdk/lib/datapipeline-stack.ts new file mode 100644 index 0000000..1ae76e9 --- /dev/null +++ b/backend/cdk/lib/datapipeline-stack.ts @@ -0,0 +1,236 @@ +import * as cdk from "aws-cdk-lib"; +import { Construct } from "constructs"; +import { VpcStack } from "./vpc-stack"; +import * as s3 from "aws-cdk-lib/aws-s3"; +import * as s3deploy from "aws-cdk-lib/aws-s3-deployment"; +import * as iam from "aws-cdk-lib/aws-iam"; +import * as glue from "aws-cdk-lib/aws-glue"; +import * as lambda from "aws-cdk-lib/aws-lambda"; + +export class DataPipelineStack extends cdk.Stack { + constructor( + scope: Construct, + id: string, + vpcStack: VpcStack, + props?: cdk.StackProps + ) { + super(scope, id, props); + + // files storage bucket + const dataBucket = new s3.Bucket(this, "data-s3bucket", { + removalPolicy: cdk.RemovalPolicy.DESTROY, + autoDeleteObjects: true, + versioned: false, + publicReadAccess: false, + blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, + encryption: s3.BucketEncryption.S3_MANAGED, + }); + + // bucket for embeddings + const embeddingsBucket = new s3.Bucket(this, "embeddings-s3bucket", { + removalPolicy: cdk.RemovalPolicy.DESTROY, + autoDeleteObjects: true, + versioned: false, + publicReadAccess: false, + blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, + encryption: s3.BucketEncryption.S3_MANAGED, + }); + + // Glue deployment bucket + const glueS3Bucket = new s3.Bucket(this, "glue-s3bucket", { + removalPolicy: cdk.RemovalPolicy.DESTROY, + autoDeleteObjects: true, + versioned: false, + publicReadAccess: false, + blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, + encryption: s3.BucketEncryption.S3_MANAGED, + }); + + // const triggerLambda = new cdk.triggers.TriggerFunction( + // this, + // "createFolders-lambda", + // { + // functionName: "createFolders", + // runtime: lambda.Runtime.PYTHON_3_9, + // handler: "createFolders.lambda_handler", + // timeout: cdk.Duration.seconds(300), + // memorySize: 512, + // environment: { + // BUCKET_NAME: dataBucket.bucketName, + // }, + // vpc: vpcStack.vpc, + // code: lambda.Code.fromAsset("./lambda/createFolders"), + // layers: [], + // executeAfter: [dataBucket], + // } + // ); + + // triggerLambda.addToRolePolicy( + // new iam.PolicyStatement({ + // effect: iam.Effect.ALLOW, + // actions: [ + // "s3:ListBucket", + // "s3:ListObjectsV2", + // "s3:PutObject", + // "s3:PutObjectAcl", + // "s3:GetObject", + // ], + // resources: [ + // `arn:aws:s3:::${dataBucket.bucketName}`, + // `arn:aws:s3:::${dataBucket.bucketName}/*`, + // ], + // }) + // ); + // triggerLambda.addToRolePolicy( + // new iam.PolicyStatement({ + // effect: iam.Effect.ALLOW, + // actions: [ + // // CloudWatch Logs + // "logs:CreateLogGroup", + // "logs:CreateLogStream", + // "logs:PutLogEvents", + // ], + // resources: ["arn:aws:logs:*:*:*"], + // }) + // ); + + // Create new Glue Role. DO NOT RENAME THE ROLE!!! 
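+
+    // Illustrative usage note (not part of the stack): the jobs defined below ship
+    // with "n/a" placeholder arguments; a run can also be started from the CLI with
+    // the real values, e.g.
+    //   aws glue start-job-run --job-name tlef-clean-survey-monkey-data \
+    //     --arguments '{"--SURVEY_MONKEY_S3URI":"s3://<data-bucket>/raw-data/survey.xlsx"}'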
+    const roleName = "AWSGlueServiceRole-gluedatapipeline";
+    const glueRole = new iam.Role(this, roleName, {
+      assumedBy: new iam.ServicePrincipal("glue.amazonaws.com"),
+      description: "Glue Service Role",
+      roleName: roleName,
+    });
+
+    // Add the required managed policies to the Glue service role
+    const glueServiceRolePolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "service-role/AWSGlueServiceRole"
+    );
+    const glueComprehendPolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "ComprehendFullAccess"
+    );
+    const glueS3policy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "AmazonS3FullAccess"
+    );
+    const glueEventBridgePolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "AmazonEventBridgeFullAccess"
+    );
+    const glueSNSPolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "AmazonSNSFullAccess"
+    );
+    const glueCloudwatchEventsPolicy = iam.ManagedPolicy.fromAwsManagedPolicyName(
+      "CloudWatchEventsFullAccess"
+    );
+
+    glueRole.addManagedPolicy(glueServiceRolePolicy);
+    glueRole.addManagedPolicy(glueS3policy);
+    glueRole.addManagedPolicy(glueComprehendPolicy);
+    glueRole.addManagedPolicy(glueEventBridgePolicy);
+    glueRole.addManagedPolicy(glueSNSPolicy);
+    glueRole.addManagedPolicy(glueCloudwatchEventsPolicy);
+
+    /* SHARED CONFIGURATION FOR THE GLUE JOBS */
+    const PYTHON_VER = "3.9";
+    const GLUE_VER = "3.0";
+    const MAX_RETRIES = 0; // no retries; each job executes only once
+    const MAX_CAPACITY = 1; // DPUs (pythonshell jobs accept 0.0625 or 1)
+    const MAX_CONCURRENT_RUNS = 1; // at most 1 concurrent run of the same job
+    const TIMEOUT = 120; // 120 min timeout
+
+    // clean-survey-monkey-data
+    const glueJob1Name = "tlef-clean-survey-monkey-data";
+    const glueJob1 = new glue.CfnJob(this, glueJob1Name, {
+      name: glueJob1Name,
+      role: glueRole.roleArn,
+      command: {
+        name: "pythonshell",
+        pythonVersion: PYTHON_VER,
+        scriptLocation:
+          "s3://" + glueS3Bucket.bucketName + "/scripts/clean-survey-monkey-data.py",
+      },
+      executionProperty: {
+        maxConcurrentRuns: MAX_CONCURRENT_RUNS,
+      },
+      maxRetries: MAX_RETRIES,
+      maxCapacity: MAX_CAPACITY,
+      timeout: TIMEOUT,
+      glueVersion: GLUE_VER,
+      defaultArguments: {
+        "--additional-python-modules": "openpyxl,fuzzywuzzy",
+        "library-set": "analytics",
+        "--BUCKET_NAME": dataBucket.bucketName,
+        "--SURVEY_MONKEY_S3URI": "n/a", // placeholder; the client pastes the real URI in the console
+        // key must match the parameter name read by getResolvedOptions in the script
+        "--INSTITUTION_DATA_S3_URI": `s3://${dataBucket.bucketName}/INSTITUTION_DATA/institution_data.csv`, // hardcoded folder and file name
+      },
+    });
+
+    // generate-new-grant-ids
+    const glueJob2Name = "tlef-generate-new-grant-ids";
+    const glueJob2 = new glue.CfnJob(this, glueJob2Name, {
+      name: glueJob2Name,
+      role: glueRole.roleArn,
+      command: {
+        name: "pythonshell",
+        pythonVersion: PYTHON_VER,
+        scriptLocation:
+          "s3://" + glueS3Bucket.bucketName + "/scripts/generate-new-grant-ids.py",
+      },
+      executionProperty: {
+        maxConcurrentRuns: MAX_CONCURRENT_RUNS,
+      },
+      maxRetries: MAX_RETRIES,
+      maxCapacity: MAX_CAPACITY,
+      timeout: TIMEOUT,
+      glueVersion: GLUE_VER,
+      defaultArguments: {
+        "library-set": "analytics",
+        "--BUCKET_NAME": dataBucket.bucketName,
+        "--PROJECT_DETAILS_S3URI": "n/a", // placeholder; the client pastes the real URI in the console
+      },
+    });
+
+    // generate-embeddings-and-similar-projects
+    // (this job name is also referenced by the boto3 start_job_run call in generate-new-grant-ids.py)
+    const glueJob3Name = "tlef-generate-embeddings-and-similar-projects.py";
+    const glueJob3 = new glue.CfnJob(this, glueJob3Name, {
+      name: glueJob3Name,
+      role: glueRole.roleArn,
+      command: {
+        name: "pythonshell",
+        pythonVersion: PYTHON_VER,
+        scriptLocation:
+          "s3://" + glueS3Bucket.bucketName + 
"/scripts/generate-embeddings-and-similar-projects.py", + }, + executionProperty: { + maxConcurrentRuns: MAX_CONCURRENT_RUNS, + }, + maxRetries: MAX_RETRIES, + maxCapacity: MAX_CAPACITY, + timeout: TIMEOUT, + glueVersion: GLUE_VER, + defaultArguments: { + "--additional-python-modules": "sentence-transformers", + "library-set": "analytics", + "--BUCKET_NAME": dataBucket.bucketName, + "--EMBEDDINGS_BUCKET": embeddingsBucket.bucketName, + "--PROJECT_DETAILS_WITH_NEW_GRANT_IDS_S3URI": "n/a", // will be filled by the 2nd glue job + }, + }); + + // Deploy glue job to glue S3 bucket + new s3deploy.BucketDeployment(this, "DeployGlueJobFiles1", { + sources: [s3deploy.Source.asset("./glue/scripts")], + destinationBucket: glueS3Bucket, + destinationKeyPrefix: "scripts", + }); + + // Grant S3 read/write role to Glue + glueS3Bucket.grantReadWrite(glueRole); + dataBucket.grantReadWrite(glueRole); + + // Destroy Glue related resources when PatentDataStack is deleted + glueJob1.applyRemovalPolicy(cdk.RemovalPolicy.DESTROY); + glueJob2.applyRemovalPolicy(cdk.RemovalPolicy.DESTROY); + glueJob3.applyRemovalPolicy(cdk.RemovalPolicy.DESTROY); + } +} diff --git a/backend/cdk/lib/vpc-stack.ts b/backend/cdk/lib/vpc-stack.ts index b70b0ba..0128bdc 100644 --- a/backend/cdk/lib/vpc-stack.ts +++ b/backend/cdk/lib/vpc-stack.ts @@ -32,12 +32,6 @@ export class VpcStack extends Stack { }); this.vpc.addFlowLog("vpcFlowLog"); - const defaultSecutiryGroup = ec2.SecurityGroup.fromSecurityGroupId( - this, - id, - this.vpc.vpcDefaultSecurityGroup - ); - this.vpc.addInterfaceEndpoint("ECR Endpoint", { service: ec2.InterfaceVpcEndpointAwsService.ECR, subnets: { subnetType: ec2.SubnetType.PRIVATE_ISOLATED }, diff --git a/backend/cdk/package-lock.json b/backend/cdk/package-lock.json index a765033..b78b564 100644 --- a/backend/cdk/package-lock.json +++ b/backend/cdk/package-lock.json @@ -1763,12 +1763,14 @@ "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==" + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "dev": true }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "dev": true, "dependencies": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -1979,7 +1981,8 @@ "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==" + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true }, "node_modules/constructs": { "version": "10.3.0", @@ -3393,6 +3396,7 @@ "version": "3.1.2", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "dev": true, "dependencies": { "brace-expansion": "^1.1.7" }, @@ -3731,6 +3735,7 @@ "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", "integrity": 
"sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "dev": true, "bin": { "semver": "bin/semver.js" }