helper.py

import re
import pandas as pd
import numpy as np
from typing import Tuple, List
from sklearn.metrics import confusion_matrix
import seaborn as sns

PROGRAM = "Program"


def clean_text(text):
    text_input = re.sub('[^a-zA-Z1-9]+', ' ', str(text))
    output = re.sub(r'\d+', '', text_input)
    return output.lower().strip()


def get_num_courses_per_program():
    df = pd.read_csv('program_courses.csv')
    return df.groupby([PROGRAM])[PROGRAM].count()


def load_data(num_majors=20, include_majors=[]) -> Tuple[List[str], np.ndarray]:
    """
    Loads and preprocesses `course_sentences` data.
    """
    courses = pd.read_csv("course_sentences.csv").drop(["course"], axis=1).dropna()
    descriptions = pd.read_csv("program_descriptions.csv").rename(columns={"description": "sentence"}).dropna()
    df = pd.concat([courses, descriptions], axis=0, ignore_index=True)
    majors = list(df.groupby("program").count().sort_values(by=["sentence"], ascending=False).index)
    majors = include_majors + majors
    majors = majors[:num_majors]
    df = df[df["program"].isin(majors)]
    sentences = list(df["sentence"])
    labels = np.array(df["program"])

    return sentences, labels

def plot_confusion_matrix(y_true:List[str], y_pred:List[str], classes:List[str]):
    """Plots a confusion matrix"""
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    cm_df=pd.DataFrame(data=cm, index=classes, columns=classes)
    sns.heatmap(cm_df, annot=True)
    

def get_recommendations(probs:np.ndarray, labels:List[str], n=5) -> List[List[str]]:
    """
    Args:
        `probs`: predictions array of shape (n_inputs,n_classes)
        `labels`: class labels of shape (n_classes,)
        `n`: number of recommendations
    Returns:
        Top labels based on a probability distribution
    """
    np_labels = np.array(labels)
    return np_labels[(-probs).argsort(-1)[:,:n]]