akzare · AdithyaS99 · Oct 21, 2021 · Nov 18, 2021
diff --git a/src/main/python/Fuzzy-Genetic-Recommender-System/.gitignore b/src/main/python/Fuzzy-Genetic-Recommender-System/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+*.pyc
+.idea
diff --git a/src/main/python/Fuzzy-Genetic-Recommender-System/README.md b/src/main/python/Fuzzy-Genetic-Recommender-System/README.md
@@ -0,0 +1,13 @@
+# Hybrid Fuzzy-Genetic Approach to Recommendation Systems
+
+Implementation of [Fuzzy-genetic approach to recommender systems based on a novel hybrid user model](http://www.sciencedirect.com/science/article/pii/S095741740700351X) in Python, using the pandas and NumPy libraries.
+
+# Requirements (tested on)
+1. Python 3.6.6
+2. numpy==1.15.4
+3. pandas==0.23.4
+
+# How to Execute
+`python fgrs.py`
+
+Sources: [MovieLens 100K dataset](http://grouplens.org/datasets/movielens/100k/)
diff --git a/src/main/python/Fuzzy-Genetic-Recommender-System/fgrs.py b/src/main/python/Fuzzy-Genetic-Recommender-System/fgrs.py
@@ -0,0 +1,183 @@
+"""Driver script for the fuzzy-genetic recommender.
+
+Builds per-user feature models, finds the neighbours of active users with a
+weighted fuzzy distance, predicts ratings from those neighbours, and
+re-estimates the feature weights with ``genetic_optimize`` on every iteration.
+"""
+
+# Import pandas and numpy
+import pandas as pd
+import numpy as np
+import operator
+
+from settings import NO_OF_GENRES, NO_OF_FEATURES
+import load_data
+import fuzzy_sets
+import gim
+from genetic import genetic_optimize, cost_function
+
+# Constants
+# Initial feature weights: one random value per feature, replaced after every
+# iteration of the training loop at the bottom of this file.
+WEIGHTS=np.random.rand(NO_OF_FEATURES)
+# Number of sample/predict/optimize rounds run by the training loop.
+NO_OF_ITERATIONS=10
+# Number of passive users kept as neighbours for each active user.
+NO_OF_NEIGHBOURS=20
+
+# Local alias for the merged frame exposed by load_data; this file reads its
+# 'user_id', 'movie_id' and 'rating' columns.
+items_merged = load_data.items_merged
+
+# Create objects for Age and GIM to use for fuzzy sets
+age = fuzzy_sets.Age()
+gim_obj = fuzzy_sets.GIM()
+
+# Column layout of a user model: the genre columns first, then 'age', then
+# 'user_id' as the last column.
+m_cols = ['unknown', 'Action', 'Adventure',
+          'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
+          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'age',
+          'user_id']
+
+# Initialize empty dataFrames for active and passive users
+# (both are reassigned inside the training loop).
+model_data_active_users = pd.DataFrame(columns=m_cols)
+model_data_passive_users = pd.DataFrame(columns=m_cols)
+
+
+def euclidean_dist(list_a, list_b):
+    """Compute the L2 (straight-line) distance between two equal-length sequences.
+
+    Args:
+        list_a: first sequence of numbers.
+        list_b: second sequence of numbers, same length as ``list_a``.
+
+    Returns:
+        The Euclidean norm of the element-wise difference.
+    """
+    difference = np.array(list_a) - np.array(list_b)
+    return np.linalg.norm(difference)
+
+
+def fuzzy_dist(first_point, second_point, fuzzy_set_first_point, fuzzy_set_second_point):
+    """Scale the crisp gap between two values by the gap between their fuzzy sets.
+
+    Args:
+        first_point: crisp value of the first user for one feature.
+        second_point: crisp value of the second user for the same feature.
+        fuzzy_set_first_point: membership values of ``first_point``.
+        fuzzy_set_second_point: membership values of ``second_point``.
+
+    Returns:
+        ``|first_point - second_point|`` multiplied by the Euclidean distance
+        between the two membership vectors.
+    """
+    crisp_gap = abs(first_point - second_point)
+    membership_gap = euclidean_dist(fuzzy_set_first_point, fuzzy_set_second_point)
+    return crisp_gap * membership_gap
+
+
+def fuzzy_distance(ui, uj):
+    """Return the per-feature fuzzy distances between two user models.
+
+    Args:
+        ui: model row of the first user (genre values by position, plus the
+            'age' and 'user_id' labels).
+        uj: model row of the second user, same layout as ``ui``.
+
+    Returns:
+        A list of length NO_OF_FEATURES: one fuzzy distance per genre, the age
+        distance at index NO_OF_GENRES, and ``uj``'s user_id in the last slot.
+    """
+
+    fuzzy_dis = [0] * NO_OF_FEATURES
+
+    # Get fuzzy set values for movie genres
+    for i in range(0, NO_OF_GENRES):
+        ui_gim = gim_obj.get_fuzzy_set(ui[i])
+        uj_gim = gim_obj.get_fuzzy_set(uj[i])
+        fuzzy_dis[i] = fuzzy_dist(ui[i], uj[i], ui_gim, uj_gim)
+
+    # Get fuzzy set values for age. The age slot sits right after the genres;
+    # reusing the loop variable here would re-read the last genre and overwrite
+    # its distance instead of computing the age distance.
+    ui_age = ui['age']
+    uj_age = uj['age']
+    fuzzy_dis[NO_OF_GENRES] = fuzzy_dist(ui_age, uj_age,
+                                         age.get_fuzzy_set(ui_age),
+                                         age.get_fuzzy_set(uj_age))
+
+    # adding user_id of second user
+    fuzzy_dis[NO_OF_FEATURES-1] = uj['user_id']
+    return fuzzy_dis
+
+
+def get_neighbours(model_active_users, model_passive_users):
+    """Find the closest passive users for every active user.
+
+    Args:
+        model_active_users: DataFrame of active-user models (needs 'user_id').
+        model_passive_users: DataFrame of passive-user models.
+
+    Returns:
+        DataFrame with one column per active user_id; each column lists the
+        user_ids of up to NO_OF_NEIGHBOURS passive users, nearest first.
+    """
+
+    # Save active users and its neighbours in a data-frame with active users' id as column name
+    user_neighbours = pd.DataFrame(columns=model_active_users['user_id'])
+
+    # Iterate over active users model and save neighbours of each active users in user_neighbours
+    for _, active in model_active_users.iterrows():
+        scored = []
+        for _, passive in model_passive_users.iterrows():
+            per_feature = fuzzy_distance(active, passive)
+
+            # The last slot holds the passive user's id, not a distance, so it
+            # is excluded from the weighted sum (and so is the last weight).
+            total = np.sum(np.multiply(WEIGHTS[:-1], np.array(per_feature[:-1]))) ** 0.5
+            scored.append([total, per_feature[-1]])
+
+        # Ascending order: the smallest distance is the nearest neighbour.
+        # Sorting descending would keep the most dissimilar users instead.
+        scored.sort(key=operator.itemgetter(0))
+        user_neighbours[active['user_id']] = [uid for _, uid in scored[:NO_OF_NEIGHBOURS]]
+    return user_neighbours
+
+
+def model_for_users(users_data):
+    """Build one model row per row of the given users data.
+
+    Args:
+        users_data: DataFrame of merged movies, items, and users; every row
+            must provide 'user_id' and 'age'.
+
+    Returns:
+        DataFrame with the ``m_cols`` columns, holding for each input row the
+        feature list from ``gim.gim_final`` followed by the age and the user_id.
+    """
+    model_data_for_users = pd.DataFrame(columns=m_cols)
+
+    for row_no, (_, user_row) in enumerate(users_data.iterrows()):
+        uid = user_row['user_id']
+
+        # All rows of the merged data that belong to this user
+        rated_by_user = items_merged.loc[items_merged['user_id'] == uid]
+
+        # Feature list for this user's movies; the two slots after the genres
+        # are filled with age and user_id
+        features = gim.gim_final(rated_by_user, uid)
+        features[NO_OF_GENRES] = user_row['age']
+        features[NO_OF_GENRES + 1] = uid
+
+        model_data_for_users.loc[row_no] = features
+    return model_data_for_users
+
+
+def recommend(nearest_neighbours, test_users_data):
+    """Predict ratings for the test rows from the ratings of nearest neighbours.
+
+    Args:
+        nearest_neighbours: DataFrame as returned by ``get_neighbours`` (one
+            column per active user_id, holding that user's neighbour ids), or
+            a plain iterable of neighbour user ids applied to every test row.
+        test_users_data: DataFrame with 'user_id', 'movie_id' and 'rating'.
+
+    Returns:
+        Tuple ``(actual, predicted)`` of two lists aligned with the test rows.
+        A prediction is the mean rating the neighbours gave the movie, or 0
+        when no neighbour rated it.
+    """
+    predicated_rat = []
+    actual_rat = []
+    per_user_neighbours = isinstance(nearest_neighbours, pd.DataFrame)
+
+    for _, item in test_users_data.iterrows():
+        if per_user_neighbours:
+            # Iterating a DataFrame yields its column labels (the active users'
+            # own ids), so the neighbour ids must be read from the column that
+            # belongs to the user of this test row.
+            if item['user_id'] in nearest_neighbours.columns:
+                column = nearest_neighbours.loc[:, item['user_id']]
+                neighbour_ids = pd.Series(np.ravel(column.values)).dropna().tolist()
+            else:
+                neighbour_ids = []
+        else:
+            neighbour_ids = nearest_neighbours
+
+        # Narrow down to this movie once, then pick each neighbour's rating
+        movie_rows = items_merged.loc[items_merged['movie_id'] == item['movie_id']]
+        n_ratings = []
+        for neighbour_id in neighbour_ids:
+            n_ratings.extend(movie_rows.loc[movie_rows['user_id'] == neighbour_id, 'rating'])
+
+        predicated_rat.append(float(sum(n_ratings)) / len(n_ratings) if n_ratings else 0)
+        actual_rat.append(item['rating'])
+    return actual_rat, predicated_rat
+
+# The 497 users with the most ratings: a Series indexed by user_id whose values
+# are rating counts. (The original note says these are the users with at least
+# 60 rated movies; that threshold is not checked here.)
+top_users = load_data.items_merged.groupby('user_id').size().sort_values(ascending=False)[:497]
+
+model_error = []
+
+# Train model for given iterations
+for i in range(0, NO_OF_ITERATIONS):
+
+    # Get random 10% of the top_users as active users and remaining are passive users
+    active_users = top_users.sample(frac=0.10)
+
+    # Random 34% of the active users are picked; their rows supply both the
+    # training slice and the test slice below.
+    training_active_users = active_users.sample(frac=0.34)
+
+    # passive_users will be used as training examples
+    passive_users = top_users.drop(active_users.index)
+
+    # Get active and passive users' data from merged movies, items, and users.
+    # The user ids live in the index of these Series (the values are rating
+    # counts), so membership has to be tested against ``.index``.
+    is_training_active = items_merged['user_id'].isin(training_active_users.index)
+    training_active_users_data = items_merged.loc[is_training_active][:10]
+    test_active_users_data = items_merged.loc[is_training_active][10:]
+    passive_users_data = items_merged.loc[items_merged['user_id'].isin(passive_users.index)][:10]
+
+    # Get model for active users
+    model_data_active_users = model_for_users(training_active_users_data)
+
+    # Get model for passive users
+    model_data_passive_users = model_for_users(passive_users_data)
+
+    # Get neighbour users of active users
+    active_users_neighbours = get_neighbours(model_data_active_users, model_data_passive_users)
+
+    # Recommend users based on neighbours
+    actual_ratings, predicated_ratings = recommend(active_users_neighbours, test_active_users_data)
+
+    # Get error for predictions of test users
+    cur_error = cost_function(actual_ratings, predicated_ratings)
+
+    # Optimize weights using genetic algorithm approach and update weights
+    WEIGHTS = genetic_optimize(actual_ratings, predicated_ratings).flatten()
+
+    # Add current iteration error to model error for MAE of the model
+    model_error.append(cur_error)
+
+    # Log details for current iteration
+    print("Iteration : ", i)
+    print("Error: ", cur_error)
+    print('Weights after iteration is: ', WEIGHTS)
+print("Mean absolute error for all iterations: ", sum(model_error)/len(model_error))
diff --git a/src/main/python/Fuzzy-Genetic-Recommender-System/fuzzy_sets.py b/src/main/python/Fuzzy-Genetic-Recommender-System/fuzzy_sets.py
@@ -0,0 +1,88 @@
+class Age:
+    """Fuzzy membership functions for age in three sets: young, middle, and old."""
+
+    def __init__(self):
+        """Age keeps no state; the constructor exists for symmetry with GIM."""
+        # The original spelled this ``__int__``, which did not define a
+        # constructor and instead installed a broken int() conversion hook.
+        pass
+
+    def young(self, age):
+        """Get value for young fuzzy set for given age.
+
+        1.0 below 20, falling linearly to 0.0 at 35, and 0.0 from 35 on.
+        """
+        if age < 20.0:
+            return 1.0
+        elif 20.0 <= age < 35.0:
+            return float((35-age)/15.0)
+        else:
+            return 0.0
+
+    def middle(self, age):
+        """Get value for middle fuzzy set for given age.
+
+        0.0 up to 20, rising linearly to 1.0 at 35, flat at 1.0 up to 45,
+        falling linearly to 0.0 at 60, and 0.0 above 60.
+        """
+        if age <= 20 or age > 60:
+            return 0.0
+        elif 20 < age <= 35:
+            return float(age-20)/15
+        elif 35 < age <= 45:
+            return 1.0
+        elif 45 < age <= 60:
+            return (60-age)/15.0
+
+    def old(self, age):
+        """Get value for old fuzzy set for given age.
+
+        0.0 up to 45, rising linearly to 1.0 at 60, and 1.0 above 60.
+        """
+        if age <= 45:
+            return 0.0
+        elif 45 < age <= 60:
+            return (age-45.0)/15
+        else:
+            return 1.0
+
+    def get_fuzzy_set(self, age):
+        """Get fuzzy set values of given age as ``[young, middle, old]``."""
+        return [self.young(age),
+                self.middle(age),
+                self.old(age)]
+
+class GIM:
+    """Fuzzy membership functions for the Genre Interestingness Measure (GIM)."""
+
+    def __init__(self):
+        pass
+
+    def gim_a(self, gim, i):
+        """Triangular membership shared by bad, average, good and very_good.
+
+        Non-zero only for ``i - 2 < gim < i``: rises from 0 at ``i - 2`` to 1
+        at ``i - 1`` and falls back to 0 at ``i``.
+        """
+        lower, peak = i - 2, i - 1
+        if gim <= lower or gim > i:
+            return 0.0
+        if lower < gim <= peak:
+            return gim - i + 2.0
+        if peak < gim <= i:
+            return float(i - gim)
+        # Reached only when no comparison holds (e.g. NaN input)
+        return None
+
+    def very_bad(self, gim):
+        """Membership in very_bad: 1.0 up to a GIM of 1.0, otherwise 0.0."""
+        return 1.0 if gim <= 1.0 else 0.0
+
+    def bad(self, gim):
+        """Membership in bad (triangle ending at 2.0)."""
+        return self.gim_a(gim, 2.0)
+
+    def average(self, gim):
+        """Membership in average (triangle ending at 3.0)."""
+        return self.gim_a(gim, 3.0)
+
+    def good(self, gim):
+        """Membership in good (triangle ending at 4.0)."""
+        return self.gim_a(gim, 4.0)
+
+    def very_good(self, gim):
+        """Membership in very_good (triangle ending at 5.0)."""
+        return self.gim_a(gim, 5.0)
+
+    def excellent(self, gim):
+        """Membership in excellent: 0.0 up to 4.0, then ``gim - 4.0``."""
+        return 0.0 if gim <= 4.0 else (gim - 4.0)
+
+    def get_fuzzy_set(self, gim_value):
+        """Return the six memberships of ``gim_value``, from very_bad to excellent."""
+        memberships = (self.very_bad, self.bad, self.average,
+                       self.good, self.very_good, self.excellent)
+        return [membership(gim_value) for membership in memberships]
diff --git a/src/main/python/Fuzzy-Genetic-Recommender-System/genetic.py b/src/main/python/Fuzzy-Genetic-Recommender-System/genetic.py
@@ -0,0 +1,54 @@
+import numpy as np
+
+# Constants
+# Number of candidate solutions in every generation.
+INITIAL_POPULATION=50
+# Probability that a single gene is multiplied by a random factor.
+MUTATION_RATE=0.05
+# Number of select/crossover/mutate rounds per genetic_optimize call.
+NUM_OF_GENERATIONS=50
+# Number of genes per candidate solution.
+# NOTE(review): presumably has to match NO_OF_FEATURES in settings - confirm.
+FEATURE_LENGTH=21
+# Number of tournament winners copied into the next generation.
+WINNER_PER_GEN=5
+
+
+def cost_function(actual_ratings, predicted_ratings):
+    """Mean absolute error between predicted and actual ratings.
+
+    Args:
+        actual_ratings: sequence of true ratings.
+        predicted_ratings: sequence of predicted ratings, same length.
+
+    Returns:
+        Sum of the absolute differences divided by the number of ratings.
+    """
+    absolute_errors = abs(np.array(predicted_ratings) - np.array(actual_ratings))
+    total_error = sum(absolute_errors)
+    return total_error / len(actual_ratings)
+
+
+def genetic_optimize(actual_ratings, predicted_ratings):
+    """Get genetic feature weights using tournament selection, crossover and mutation.
+
+    Args:
+        actual_ratings: sequence of true ratings.
+        predicted_ratings: sequence of predicted ratings, same length.
+
+    Returns:
+        numpy.ndarray of shape (1, FEATURE_LENGTH) holding the selected weight
+        vector; callers flatten it to one dimension.
+    """
+
+    # Generate initial binary population (as floats, since mutation scales genes)
+    cur_gen_pop = np.random.randint(2, size=(INITIAL_POPULATION, FEATURE_LENGTH)).astype(float)
+    fitness_vector = np.zeros((INITIAL_POPULATION, 2))
+    tournament_size = int(WINNER_PER_GEN / 2)
+    # np.repeat needs an integer count; true division would hand it a float
+    offspring_per_winner = (INITIAL_POPULATION - WINNER_PER_GEN) // WINNER_PER_GEN
+
+    for _ in range(NUM_OF_GENERATIONS):
+
+        # Each row is [candidate index, error].
+        # NOTE(review): the error is computed from the two rating lists only,
+        # so every candidate receives the same fitness and the weights are not
+        # actually evaluated. Fixing that needs a fitness function that takes
+        # the candidate weights, which is outside this function's interface.
+        error = cost_function(actual_ratings, predicted_ratings)
+        fitness_vector = np.array([[x, error] for x in range(INITIAL_POPULATION)], dtype=float)
+
+        # Tournament selection: draw a few random candidates, keep the best one
+        winners = np.zeros((WINNER_PER_GEN, FEATURE_LENGTH))
+        for n in range(WINNER_PER_GEN):
+            selected = np.random.choice(INITIAL_POPULATION, tournament_size, replace=False)
+            wnr = np.argmin(fitness_vector[selected, 1])
+            winners[n] = cur_gen_pop[int(fitness_vector[selected[wnr]][0])]
+
+        next_gen_pop = np.empty_like(cur_gen_pop)
+        next_gen_pop[:WINNER_PER_GEN] = winners  # populate new gen with winners
+
+        # Crossover via permutation: for every gene position, shuffle the
+        # winners' values for that gene across the offspring
+        next_gen_pop[WINNER_PER_GEN:] = np.array(
+            [np.random.permutation(np.repeat(winners[:, x], offspring_per_winner))
+             for x in range(FEATURE_LENGTH)]).T
+
+        # Random mutation: with probability MUTATION_RATE a gene is multiplied
+        # by a normal(0, 2) sample, otherwise it is left unchanged
+        mutate = np.random.random(next_gen_pop.shape) < MUTATION_RATE
+        factors = np.where(mutate, np.random.normal(0, 2, next_gen_pop.shape), 1.0)
+        cur_gen_pop = next_gen_pop * factors
+
+    best_soln = np.array(cur_gen_pop[np.argmin(fitness_vector[:, 1])])
+    # Keep the 2-D (1, FEATURE_LENGTH) shape the matrix-based version returned
+    return best_soln.reshape(1, -1)