diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..4edea80 Binary files /dev/null and b/.DS_Store differ diff --git a/Part 1 - Data Preprocessing/.DS_Store b/Part 1 - Data Preprocessing/.DS_Store new file mode 100644 index 0000000..9117c32 Binary files /dev/null and b/Part 1 - Data Preprocessing/.DS_Store differ diff --git a/Part 1 - Data Preprocessing/Data.csv b/Part 1 - Data Preprocessing/Data.csv new file mode 100644 index 0000000..564b65b --- /dev/null +++ b/Part 1 - Data Preprocessing/Data.csv @@ -0,0 +1,11 @@ +Country,Age,Salary,Purchased +France,44,72000,No +Spain,27,48000,Yes +Germany,30,54000,No +Spain,38,61000,No +Germany,40,,Yes +France,35,58000,Yes +Spain,,52000,No +France,48,79000,Yes +Germany,50,83000,No +France,37,67000,Yes \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/categorical_data.R b/Part 1 - Data Preprocessing/categorical_data.R new file mode 100644 index 0000000..84614db --- /dev/null +++ b/Part 1 - Data Preprocessing/categorical_data.R @@ -0,0 +1,20 @@ +# Data Preprocessing + +# Importing the dataset +dataset = read.csv('Data.csv') + +# Taking care of missing data +dataset$Age = ifelse(is.na(dataset$Age), + ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)), + dataset$Age) +dataset$Salary = ifelse(is.na(dataset$Salary), + ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)), + dataset$Salary) + +# Encoding categorical data +dataset$Country = factor(dataset$Country, + levels = c('France', 'Spain', 'Germany'), + labels = c(1, 2, 3)) +dataset$Purchased = factor(dataset$Purchased, + levels = c('No', 'Yes'), + labels = c(0, 1)) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/categorical_data.py b/Part 1 - Data Preprocessing/categorical_data.py new file mode 100644 index 0000000..35c9b37 --- /dev/null +++ b/Part 1 - Data Preprocessing/categorical_data.py @@ -0,0 +1,28 @@ +# Data Preprocessing + +# Importing the libraries +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 3].values + +# Taking care of missing data +from sklearn.preprocessing import Imputer +imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0) +imputer = imputer.fit(X[:, 1:3]) +X[:, 1:3] = imputer.transform(X[:, 1:3]) + +# Encoding categorical data +# Encoding the Independent Variable +from sklearn.preprocessing import LabelEncoder, OneHotEncoder +labelencoder_X = LabelEncoder() +X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) +onehotencoder = OneHotEncoder(categorical_features = [0]) +X = onehotencoder.fit_transform(X).toarray() +# Encoding the Dependent Variable +labelencoder_y = LabelEncoder() +y = labelencoder_y.fit_transform(y) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/data_preprocessing.py b/Part 1 - Data Preprocessing/data_preprocessing.py new file mode 100644 index 0000000..96e8bdd --- /dev/null +++ b/Part 1 - Data Preprocessing/data_preprocessing.py @@ -0,0 +1,40 @@ +# Data Preprocessing Template + +# Importing the libraries +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 3].values + +#Take care of misssing data +from sklearn.preprocessing import Imputer +imputer=Imputer(missing_values="NaN", strategy="mean", axis=0) +imputer = imputer.fit(X[:, 1:3]) +X[:,1:3]=imputer.transform(X[:,1:3]) + + +#Encoding Categorical data +from sklearn.preprocessing import OneHotEncoder, LabelEncoder +labelencoder_x = LabelEncoder() +X[:,0] = labelencoder_x.fit_transform(X[:,0]) +onehotencoder = OneHotEncoder(categorical_features=[0]) +X = onehotencoder.fit_transform(X).toarray() + +labelencoder_y = LabelEncoder() +y = labelencoder_y.fit_transform(y) + +# Splitting the dataset into the Training set and Test set +from sklearn.model_selection import train_test_split +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) + +# Feature Scaling +from sklearn.preprocessing import StandardScaler +sc_X = StandardScaler() +X_train = sc_X.fit_transform(X_train) +X_test = sc_X.transform(X_test) +sc_y = StandardScaler() +y_train = sc_y.fit_transform(y_train) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/data_preprocessing_template.R b/Part 1 - Data Preprocessing/data_preprocessing_template.R new file mode 100644 index 0000000..73fd269 --- /dev/null +++ b/Part 1 - Data Preprocessing/data_preprocessing_template.R @@ -0,0 +1,16 @@ +# Data Preprocessing Template + +# Importing the dataset +dataset = read.csv('Data.csv') + +# Splitting the dataset into the Training set and Test set +# install.packages('caTools') +library(caTools) +set.seed(123) +split = sample.split(dataset$DependentVariable, SplitRatio = 0.8) +training_set = subset(dataset, split == TRUE) +test_set = subset(dataset, split == FALSE) + +# Feature Scaling +# training_set = scale(training_set) +# test_set = scale(test_set) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/missing_data.R b/Part 1 - Data Preprocessing/missing_data.R new file mode 100644 index 0000000..21baccb --- /dev/null +++ b/Part 1 - Data Preprocessing/missing_data.R @@ -0,0 +1,12 @@ +# Data Preprocessing + +# Importing the dataset +dataset = read.csv('Data.csv') + +# Taking care of missing data +dataset$Age = ifelse(is.na(dataset$Age), + ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)), + dataset$Age) +dataset$Salary = ifelse(is.na(dataset$Salary), + ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)), + dataset$Salary) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/missing_data.py b/Part 1 - Data Preprocessing/missing_data.py new file mode 100644 index 0000000..ac2dd8a --- /dev/null +++ b/Part 1 - Data Preprocessing/missing_data.py @@ -0,0 +1,17 @@ +# Data Preprocessing + +# Importing the libraries +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 3].values + +# Taking care of missing data +from sklearn.preprocessing import Imputer +imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0) +imputer = imputer.fit(X[:, 1:3]) +X[:, 1:3] = imputer.transform(X[:, 1:3]) \ No newline at end of file diff --git a/Part 2 - Regression/Section 4 - Simple Linear Regression/Salary_Data.csv b/Part 2 - Regression/Section 4 - Simple Linear Regression/Salary_Data.csv new file mode 100644 index 0000000..a6863aa --- /dev/null +++ b/Part 2 - Regression/Section 4 - Simple Linear Regression/Salary_Data.csv @@ -0,0 +1,31 @@ +YearsExperience,Salary +1.1,39343.00 +1.3,46205.00 +1.5,37731.00 +2.0,43525.00 +2.2,39891.00 +2.9,56642.00 +3.0,60150.00 +3.2,54445.00 +3.2,64445.00 +3.7,57189.00 +3.9,63218.00 +4.0,55794.00 +4.0,56957.00 +4.1,57081.00 +4.5,61111.00 +4.9,67938.00 +5.1,66029.00 +5.3,83088.00 +5.9,81363.00 +6.0,93940.00 +6.8,91738.00 +7.1,98273.00 +7.9,101302.00 +8.2,113812.00 +8.7,109431.00 +9.0,105582.00 +9.5,116969.00 +9.6,112635.00 +10.3,122391.00 +10.5,121872.00 diff --git a/Part 2 - Regression/Section 4 - Simple Linear Regression/simple_linear_regression.py b/Part 2 - Regression/Section 4 - Simple Linear Regression/simple_linear_regression.py new file mode 100644 index 0000000..bfcdbde --- /dev/null +++ b/Part 2 - Regression/Section 4 - Simple Linear Regression/simple_linear_regression.py @@ -0,0 +1,28 @@ +#Simple Linear Regression +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Salary_Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 1].values + +from sklearn.model_selection import train_test_split +X_train,X_test, y_train, y_test=train_test_split(X,y,test_size=1/3, random_state=0) + +#Fittin Simple Linear Regression +from sklearn.linear_model import LinearRegression +regressor = LinearRegression() +regressor.fit(X_train, y_train) + +#Predicting the results +y_pred = regressor.predict(X_test) + +#Visualizing the predictions +plt.scatter(X_train,y_train, color='red') +plt.plot(X_train, regressor.predict(X_train), color='blue') +plt.title('Salary vs Experience (Training set)') +plt.xlabel('Years of Experience') +plt.ylabel('Salary') +plt.show() \ No newline at end of file diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv b/Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv new file mode 100644 index 0000000..14ffb86 --- /dev/null +++ b/Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv @@ -0,0 +1,51 @@ +R&D Spend,Administration,Marketing Spend,State,Profit +165349.2,136897.8,471784.1,New York,192261.83 +162597.7,151377.59,443898.53,California,191792.06 +153441.51,101145.55,407934.54,Florida,191050.39 +144372.41,118671.85,383199.62,New York,182901.99 +142107.34,91391.77,366168.42,Florida,166187.94 +131876.9,99814.71,362861.36,New York,156991.12 +134615.46,147198.87,127716.82,California,156122.51 +130298.13,145530.06,323876.68,Florida,155752.6 +120542.52,148718.95,311613.29,New York,152211.77 +123334.88,108679.17,304981.62,California,149759.96 +101913.08,110594.11,229160.95,Florida,146121.95 +100671.96,91790.61,249744.55,California,144259.4 +93863.75,127320.38,249839.44,Florida,141585.52 +91992.39,135495.07,252664.93,California,134307.35 +119943.24,156547.42,256512.92,Florida,132602.65 +114523.61,122616.84,261776.23,New York,129917.04 +78013.11,121597.55,264346.06,California,126992.93 +94657.16,145077.58,282574.31,New York,125370.37 +91749.16,114175.79,294919.57,Florida,124266.9 +86419.7,153514.11,0,New York,122776.86 +76253.86,113867.3,298664.47,California,118474.03 +78389.47,153773.43,299737.29,New York,111313.02 +73994.56,122782.75,303319.26,Florida,110352.25 +67532.53,105751.03,304768.73,Florida,108733.99 +77044.01,99281.34,140574.81,New York,108552.04 +64664.71,139553.16,137962.62,California,107404.34 +75328.87,144135.98,134050.07,Florida,105733.54 +72107.6,127864.55,353183.81,New York,105008.31 +66051.52,182645.56,118148.2,Florida,103282.38 +65605.48,153032.06,107138.38,New York,101004.64 +61994.48,115641.28,91131.24,Florida,99937.59 +61136.38,152701.92,88218.23,New York,97483.56 +63408.86,129219.61,46085.25,California,97427.84 +55493.95,103057.49,214634.81,Florida,96778.92 +46426.07,157693.92,210797.67,California,96712.8 +46014.02,85047.44,205517.64,New York,96479.51 +28663.76,127056.21,201126.82,Florida,90708.19 +44069.95,51283.14,197029.42,California,89949.14 +20229.59,65947.93,185265.1,New York,81229.06 +38558.51,82982.09,174999.3,California,81005.76 +28754.33,118546.05,172795.67,California,78239.91 +27892.92,84710.77,164470.71,Florida,77798.83 +23640.93,96189.63,148001.11,California,71498.49 +15505.73,127382.3,35534.17,New York,69758.98 +22177.74,154806.14,28334.72,California,65200.33 +1000.23,124153.04,1903.93,New York,64926.08 +1315.46,115816.21,297114.46,Florida,49490.75 +0,135426.92,0,California,42559.73 +542.05,51743.15,0,New York,35673.41 +0,116983.8,45173.06,California,14681.4 \ No newline at end of file diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression.zip b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression.zip new file mode 100644 index 0000000..c0d35d2 Binary files /dev/null and b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression.zip differ diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/.DS_Store b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/.DS_Store differ diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/data_preprocessing_template.R b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/data_preprocessing_template.R new file mode 100644 index 0000000..73fd269 --- /dev/null +++ b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/data_preprocessing_template.R @@ -0,0 +1,16 @@ +# Data Preprocessing Template + +# Importing the dataset +dataset = read.csv('Data.csv') + +# Splitting the dataset into the Training set and Test set +# install.packages('caTools') +library(caTools) +set.seed(123) +split = sample.split(dataset$DependentVariable, SplitRatio = 0.8) +training_set = subset(dataset, split == TRUE) +test_set = subset(dataset, split == FALSE) + +# Feature Scaling +# training_set = scale(training_set) +# test_set = scale(test_set) \ No newline at end of file diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/data_preprocessing_template.py b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/data_preprocessing_template.py new file mode 100644 index 0000000..c9a8494 --- /dev/null +++ b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/data_preprocessing_template.py @@ -0,0 +1,23 @@ +# Data Preprocessing Template + +# Importing the libraries +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 3].values + +# Splitting the dataset into the Training set and Test set +from sklearn.cross_validation import train_test_split +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) + +# Feature Scaling +"""from sklearn.preprocessing import StandardScaler +sc_X = StandardScaler() +X_train = sc_X.fit_transform(X_train) +X_test = sc_X.transform(X_test) +sc_y = StandardScaler() +y_train = sc_y.fit_transform(y_train)""" \ No newline at end of file diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/multiple_linear_regression.R b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/multiple_linear_regression.R new file mode 100644 index 0000000..a07753f --- /dev/null +++ b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/multiple_linear_regression.R @@ -0,0 +1,28 @@ +# Multiple Linear Regression + +# Importing the dataset +dataset = read.csv('50_Startups.csv') + +# Encoding categorical data +dataset$State = factor(dataset$State, + levels = c('New York', 'California', 'Florida'), + labels = c(1, 2, 3)) + +# Splitting the dataset into the Training set and Test set +# install.packages('caTools') +library(caTools) +set.seed(123) +split = sample.split(dataset$Profit, SplitRatio = 0.8) +training_set = subset(dataset, split == TRUE) +test_set = subset(dataset, split == FALSE) + +# Feature Scaling +# training_set = scale(training_set) +# test_set = scale(test_set) + +# Fitting Multiple Linear Regression to the Training set +regressor = lm(formula = Profit ~ ., + data = training_set) + +# Predicting the Test set results +y_pred = predict(regressor, newdata = test_set) \ No newline at end of file diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/multiple_linear_regression.py b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/multiple_linear_regression.py new file mode 100644 index 0000000..6db7c77 --- /dev/null +++ b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/multiple_linear_regression.py @@ -0,0 +1,41 @@ +# Multiple Linear Regression + +# Importing the libraries +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('50_Startups.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 4].values + +# Encoding categorical data +from sklearn.preprocessing import LabelEncoder, OneHotEncoder +labelencoder = LabelEncoder() +X[:, 3] = labelencoder.fit_transform(X[:, 3]) +onehotencoder = OneHotEncoder(categorical_features = [3]) +X = onehotencoder.fit_transform(X).toarray() + +# Avoiding the Dummy Variable Trap +X = X[:, 1:] + +# Splitting the dataset into the Training set and Test set +from sklearn.cross_validation import train_test_split +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) + +# Feature Scaling +"""from sklearn.preprocessing import StandardScaler +sc_X = StandardScaler() +X_train = sc_X.fit_transform(X_train) +X_test = sc_X.transform(X_test) +sc_y = StandardScaler() +y_train = sc_y.fit_transform(y_train)""" + +# Fitting Multiple Linear Regression to the Training set +from sklearn.linear_model import LinearRegression +regressor = LinearRegression() +regressor.fit(X_train, y_train) + +# Predicting the Test set results +y_pred = regressor.predict(X_test) \ No newline at end of file diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._.DS_Store b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._.DS_Store new file mode 100644 index 0000000..09fa6bd Binary files /dev/null and b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._.DS_Store differ diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._50_Startups.csv b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._50_Startups.csv new file mode 100644 index 0000000..be931da Binary files /dev/null and b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._50_Startups.csv differ diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_linear_regression.py b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_linear_regression.py new file mode 100644 index 0000000..b467d1a --- /dev/null +++ b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_linear_regression.py @@ -0,0 +1,100 @@ +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('50_Startups.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 4].values + + +from sklearn.preprocessing import LabelEncoder, OneHotEncoder +labelencoder_X = LabelEncoder() +X[:, 3] = labelencoder_X.fit_transform(X[:, 3]) +onehotencoder = OneHotEncoder(categorical_features = [3]) +X = onehotencoder.fit_transform(X).toarray() + +#Avoid Dummy Variable trap +X = X[:,1:] + +from sklearn.model_selection import train_test_split +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) + +#Fitting Multiple Linear regression +from sklearn.linear_model import LinearRegression +regressor = LinearRegression() +regressor.fit(X_train, y_train) + +y_pred = regressor.predict(X_test) + +#Building for Backward Elimination +import statsmodels.formula.api as sm +X = np.append(arr=np.ones((50,1)).astype(int), values = X, axis=1) +X_opt = X[:, [0,1,2,3,4,5]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +X_opt = X[:, [0,1,3,4,5]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +X_opt = X[:, [0,3,4,5]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +X_opt = X[:, [0,3,5]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +X_opt = X[:, [0,3]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +#Backward Elimination with p-values only: +import statsmodels.formula.api as sm +def backwardElimination(x, sl): + numVars = len(x[0]) + for i in range(0, numVars): + regressor_OLS = sm.OLS(y, x).fit() + maxVar = max(regressor_OLS.pvalues).astype(float) + if maxVar > sl: + for j in range(0, numVars - i): + if (regressor_OLS.pvalues[j].astype(float) == maxVar): + x = np.delete(x, j, 1) + regressor_OLS.summary() + return x + +SL = 0.05 +X_opt = X[:, [0, 1, 2, 3, 4, 5]] +X_Modeled = backwardElimination(X_opt, SL) + + +#Backward Elimination with p-values and Adjusted R Squared: +import statsmodels.formula.api as sm +def backwardElimination(x, SL): + numVars = len(x[0]) + temp = np.zeros((50,6)).astype(int) + for i in range(0, numVars): + regressor_OLS = sm.OLS(y, x).fit() + maxVar = max(regressor_OLS.pvalues).astype(float) + adjR_before = regressor_OLS.rsquared_adj.astype(float) + if maxVar > SL: + for j in range(0, numVars - i): + if (regressor_OLS.pvalues[j].astype(float) == maxVar): + temp[:,j] = x[:, j] + x = np.delete(x, j, 1) + tmp_regressor = sm.OLS(y, x).fit() + adjR_after = tmp_regressor.rsquared_adj.astype(float) + if (adjR_before >= adjR_after): + x_rollback = np.hstack((x, temp[:,[0,j]])) + x_rollback = np.delete(x_rollback, j, 1) + print (regressor_OLS.summary()) + return x_rollback + else: + continue + regressor_OLS.summary() + return x + +SL = 0.05 +X_opt = X[:, [0, 1, 2, 3, 4, 5]] +X_Modeled = backwardElimination(X_opt, SL)