From 6ce25d14dcab3b7ef2c90a869da338667aa511c4 Mon Sep 17 00:00:00 2001 From: Tanuj Vishnoi Date: Mon, 27 Aug 2018 04:13:07 +0530 Subject: [PATCH] first commit --- .DS_Store | Bin 0 -> 6148 bytes Part 1 - Data Preprocessing/.DS_Store | Bin 0 -> 6148 bytes Part 1 - Data Preprocessing/Data.csv | 11 ++ .../categorical_data.R | 20 ++++ .../categorical_data.py | 28 +++++ .../data_preprocessing.py | 40 +++++++ .../data_preprocessing_template.R | 16 +++ Part 1 - Data Preprocessing/missing_data.R | 12 +++ Part 1 - Data Preprocessing/missing_data.py | 17 +++ .../Salary_Data.csv | 31 ++++++ .../simple_linear_regression.py | 28 +++++ .../50_Startups.csv | 51 +++++++++ .../Multiple_Linear_Regression.zip | Bin 0 -> 5501 bytes .../Multiple_Linear_Regression/.DS_Store | Bin 0 -> 6148 bytes .../data_preprocessing_template.R | 16 +++ .../data_preprocessing_template.py | 23 ++++ .../multiple_linear_regression.R | 28 +++++ .../multiple_linear_regression.py | 41 +++++++ .../Multiple_Linear_Regression/._.DS_Store | Bin 0 -> 120 bytes .../._50_Startups.csv | Bin 0 -> 576 bytes .../Multiple_linear_regression.py | 100 ++++++++++++++++++ 21 files changed, 462 insertions(+) create mode 100644 .DS_Store create mode 100644 Part 1 - Data Preprocessing/.DS_Store create mode 100644 Part 1 - Data Preprocessing/Data.csv create mode 100644 Part 1 - Data Preprocessing/categorical_data.R create mode 100644 Part 1 - Data Preprocessing/categorical_data.py create mode 100644 Part 1 - Data Preprocessing/data_preprocessing.py create mode 100644 Part 1 - Data Preprocessing/data_preprocessing_template.R create mode 100644 Part 1 - Data Preprocessing/missing_data.R create mode 100644 Part 1 - Data Preprocessing/missing_data.py create mode 100644 Part 2 - Regression/Section 4 - Simple Linear Regression/Salary_Data.csv create mode 100644 Part 2 - Regression/Section 4 - Simple Linear Regression/simple_linear_regression.py create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression.zip create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/.DS_Store create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/data_preprocessing_template.R create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/data_preprocessing_template.py create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/multiple_linear_regression.R create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/multiple_linear_regression.py create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._.DS_Store create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._50_Startups.csv create mode 100644 Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_linear_regression.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..4edea803f6bad5812eb9875282ba9f731e01c174 GIT binary patch literal 6148 zcmeHK!AiqG5PhRP6ucC?2wwK+MX-KBEb*owdJsHm(-4K2NE`3@bl=Rbm}1TnWVYFms8_CalK1Y98Cobz!c(G*Qu`x)-=f-O4M z{ZoI58Uh#MJ_7OFU+AW;u9|A^* K3uoXD82AEOA2PE5 literal 0 HcmV?d00001 diff --git a/Part 1 - Data Preprocessing/.DS_Store b/Part 1 - Data Preprocessing/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9117c32c9bdb0e593c37137866f459cf4ad4c428 GIT binary patch literal 6148 zcmeHKJ#PXr41I1Hkh(B3<_|E?Ul5_|3=)5!LH+eemc*wJDj+I2|;C_oaZEV z;(O91X8gVD$`7zhS}fnea# z8So2b`1a_DMt;FSFmQJU^n6HE#q3xN>(RlcmH@;7hpW)mT0&!zV|FZtoS}$?5-n7` z#1ISTcyhn&SPU&3;>Cyf&VR*=)ZMv$s_u|%7;P{R40IWIweCpo{||mLy+wXEBznO> zF!0Y9kU_I-W_&5XTd!VE@7hHDq^hW2QG-Ie^Ao_1o+INrY4b^K`enyrsH^CB4JXEr NKqVxaVBii6Tma`{BaQ$7 literal 0 HcmV?d00001 diff --git a/Part 1 - Data Preprocessing/Data.csv b/Part 1 - Data Preprocessing/Data.csv new file mode 100644 index 0000000..564b65b --- /dev/null +++ b/Part 1 - Data Preprocessing/Data.csv @@ -0,0 +1,11 @@ +Country,Age,Salary,Purchased +France,44,72000,No +Spain,27,48000,Yes +Germany,30,54000,No +Spain,38,61000,No +Germany,40,,Yes +France,35,58000,Yes +Spain,,52000,No +France,48,79000,Yes +Germany,50,83000,No +France,37,67000,Yes \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/categorical_data.R b/Part 1 - Data Preprocessing/categorical_data.R new file mode 100644 index 0000000..84614db --- /dev/null +++ b/Part 1 - Data Preprocessing/categorical_data.R @@ -0,0 +1,20 @@ +# Data Preprocessing + +# Importing the dataset +dataset = read.csv('Data.csv') + +# Taking care of missing data +dataset$Age = ifelse(is.na(dataset$Age), + ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)), + dataset$Age) +dataset$Salary = ifelse(is.na(dataset$Salary), + ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)), + dataset$Salary) + +# Encoding categorical data +dataset$Country = factor(dataset$Country, + levels = c('France', 'Spain', 'Germany'), + labels = c(1, 2, 3)) +dataset$Purchased = factor(dataset$Purchased, + levels = c('No', 'Yes'), + labels = c(0, 1)) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/categorical_data.py b/Part 1 - Data Preprocessing/categorical_data.py new file mode 100644 index 0000000..35c9b37 --- /dev/null +++ b/Part 1 - Data Preprocessing/categorical_data.py @@ -0,0 +1,28 @@ +# Data Preprocessing + +# Importing the libraries +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 3].values + +# Taking care of missing data +from sklearn.preprocessing import Imputer +imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0) +imputer = imputer.fit(X[:, 1:3]) +X[:, 1:3] = imputer.transform(X[:, 1:3]) + +# Encoding categorical data +# Encoding the Independent Variable +from sklearn.preprocessing import LabelEncoder, OneHotEncoder +labelencoder_X = LabelEncoder() +X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) +onehotencoder = OneHotEncoder(categorical_features = [0]) +X = onehotencoder.fit_transform(X).toarray() +# Encoding the Dependent Variable +labelencoder_y = LabelEncoder() +y = labelencoder_y.fit_transform(y) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/data_preprocessing.py b/Part 1 - Data Preprocessing/data_preprocessing.py new file mode 100644 index 0000000..96e8bdd --- /dev/null +++ b/Part 1 - Data Preprocessing/data_preprocessing.py @@ -0,0 +1,40 @@ +# Data Preprocessing Template + +# Importing the libraries +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 3].values + +#Take care of misssing data +from sklearn.preprocessing import Imputer +imputer=Imputer(missing_values="NaN", strategy="mean", axis=0) +imputer = imputer.fit(X[:, 1:3]) +X[:,1:3]=imputer.transform(X[:,1:3]) + + +#Encoding Categorical data +from sklearn.preprocessing import OneHotEncoder, LabelEncoder +labelencoder_x = LabelEncoder() +X[:,0] = labelencoder_x.fit_transform(X[:,0]) +onehotencoder = OneHotEncoder(categorical_features=[0]) +X = onehotencoder.fit_transform(X).toarray() + +labelencoder_y = LabelEncoder() +y = labelencoder_y.fit_transform(y) + +# Splitting the dataset into the Training set and Test set +from sklearn.model_selection import train_test_split +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) + +# Feature Scaling +from sklearn.preprocessing import StandardScaler +sc_X = StandardScaler() +X_train = sc_X.fit_transform(X_train) +X_test = sc_X.transform(X_test) +sc_y = StandardScaler() +y_train = sc_y.fit_transform(y_train) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/data_preprocessing_template.R b/Part 1 - Data Preprocessing/data_preprocessing_template.R new file mode 100644 index 0000000..73fd269 --- /dev/null +++ b/Part 1 - Data Preprocessing/data_preprocessing_template.R @@ -0,0 +1,16 @@ +# Data Preprocessing Template + +# Importing the dataset +dataset = read.csv('Data.csv') + +# Splitting the dataset into the Training set and Test set +# install.packages('caTools') +library(caTools) +set.seed(123) +split = sample.split(dataset$DependentVariable, SplitRatio = 0.8) +training_set = subset(dataset, split == TRUE) +test_set = subset(dataset, split == FALSE) + +# Feature Scaling +# training_set = scale(training_set) +# test_set = scale(test_set) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/missing_data.R b/Part 1 - Data Preprocessing/missing_data.R new file mode 100644 index 0000000..21baccb --- /dev/null +++ b/Part 1 - Data Preprocessing/missing_data.R @@ -0,0 +1,12 @@ +# Data Preprocessing + +# Importing the dataset +dataset = read.csv('Data.csv') + +# Taking care of missing data +dataset$Age = ifelse(is.na(dataset$Age), + ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)), + dataset$Age) +dataset$Salary = ifelse(is.na(dataset$Salary), + ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)), + dataset$Salary) \ No newline at end of file diff --git a/Part 1 - Data Preprocessing/missing_data.py b/Part 1 - Data Preprocessing/missing_data.py new file mode 100644 index 0000000..ac2dd8a --- /dev/null +++ b/Part 1 - Data Preprocessing/missing_data.py @@ -0,0 +1,17 @@ +# Data Preprocessing + +# Importing the libraries +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 3].values + +# Taking care of missing data +from sklearn.preprocessing import Imputer +imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0) +imputer = imputer.fit(X[:, 1:3]) +X[:, 1:3] = imputer.transform(X[:, 1:3]) \ No newline at end of file diff --git a/Part 2 - Regression/Section 4 - Simple Linear Regression/Salary_Data.csv b/Part 2 - Regression/Section 4 - Simple Linear Regression/Salary_Data.csv new file mode 100644 index 0000000..a6863aa --- /dev/null +++ b/Part 2 - Regression/Section 4 - Simple Linear Regression/Salary_Data.csv @@ -0,0 +1,31 @@ +YearsExperience,Salary +1.1,39343.00 +1.3,46205.00 +1.5,37731.00 +2.0,43525.00 +2.2,39891.00 +2.9,56642.00 +3.0,60150.00 +3.2,54445.00 +3.2,64445.00 +3.7,57189.00 +3.9,63218.00 +4.0,55794.00 +4.0,56957.00 +4.1,57081.00 +4.5,61111.00 +4.9,67938.00 +5.1,66029.00 +5.3,83088.00 +5.9,81363.00 +6.0,93940.00 +6.8,91738.00 +7.1,98273.00 +7.9,101302.00 +8.2,113812.00 +8.7,109431.00 +9.0,105582.00 +9.5,116969.00 +9.6,112635.00 +10.3,122391.00 +10.5,121872.00 diff --git a/Part 2 - Regression/Section 4 - Simple Linear Regression/simple_linear_regression.py b/Part 2 - Regression/Section 4 - Simple Linear Regression/simple_linear_regression.py new file mode 100644 index 0000000..bfcdbde --- /dev/null +++ b/Part 2 - Regression/Section 4 - Simple Linear Regression/simple_linear_regression.py @@ -0,0 +1,28 @@ +#Simple Linear Regression +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('Salary_Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 1].values + +from sklearn.model_selection import train_test_split +X_train,X_test, y_train, y_test=train_test_split(X,y,test_size=1/3, random_state=0) + +#Fittin Simple Linear Regression +from sklearn.linear_model import LinearRegression +regressor = LinearRegression() +regressor.fit(X_train, y_train) + +#Predicting the results +y_pred = regressor.predict(X_test) + +#Visualizing the predictions +plt.scatter(X_train,y_train, color='red') +plt.plot(X_train, regressor.predict(X_train), color='blue') +plt.title('Salary vs Experience (Training set)') +plt.xlabel('Years of Experience') +plt.ylabel('Salary') +plt.show() \ No newline at end of file diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv b/Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv new file mode 100644 index 0000000..14ffb86 --- /dev/null +++ b/Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv @@ -0,0 +1,51 @@ +R&D Spend,Administration,Marketing Spend,State,Profit +165349.2,136897.8,471784.1,New York,192261.83 +162597.7,151377.59,443898.53,California,191792.06 +153441.51,101145.55,407934.54,Florida,191050.39 +144372.41,118671.85,383199.62,New York,182901.99 +142107.34,91391.77,366168.42,Florida,166187.94 +131876.9,99814.71,362861.36,New York,156991.12 +134615.46,147198.87,127716.82,California,156122.51 +130298.13,145530.06,323876.68,Florida,155752.6 +120542.52,148718.95,311613.29,New York,152211.77 +123334.88,108679.17,304981.62,California,149759.96 +101913.08,110594.11,229160.95,Florida,146121.95 +100671.96,91790.61,249744.55,California,144259.4 +93863.75,127320.38,249839.44,Florida,141585.52 +91992.39,135495.07,252664.93,California,134307.35 +119943.24,156547.42,256512.92,Florida,132602.65 +114523.61,122616.84,261776.23,New York,129917.04 +78013.11,121597.55,264346.06,California,126992.93 +94657.16,145077.58,282574.31,New York,125370.37 +91749.16,114175.79,294919.57,Florida,124266.9 +86419.7,153514.11,0,New York,122776.86 +76253.86,113867.3,298664.47,California,118474.03 +78389.47,153773.43,299737.29,New York,111313.02 +73994.56,122782.75,303319.26,Florida,110352.25 +67532.53,105751.03,304768.73,Florida,108733.99 +77044.01,99281.34,140574.81,New York,108552.04 +64664.71,139553.16,137962.62,California,107404.34 +75328.87,144135.98,134050.07,Florida,105733.54 +72107.6,127864.55,353183.81,New York,105008.31 +66051.52,182645.56,118148.2,Florida,103282.38 +65605.48,153032.06,107138.38,New York,101004.64 +61994.48,115641.28,91131.24,Florida,99937.59 +61136.38,152701.92,88218.23,New York,97483.56 +63408.86,129219.61,46085.25,California,97427.84 +55493.95,103057.49,214634.81,Florida,96778.92 +46426.07,157693.92,210797.67,California,96712.8 +46014.02,85047.44,205517.64,New York,96479.51 +28663.76,127056.21,201126.82,Florida,90708.19 +44069.95,51283.14,197029.42,California,89949.14 +20229.59,65947.93,185265.1,New York,81229.06 +38558.51,82982.09,174999.3,California,81005.76 +28754.33,118546.05,172795.67,California,78239.91 +27892.92,84710.77,164470.71,Florida,77798.83 +23640.93,96189.63,148001.11,California,71498.49 +15505.73,127382.3,35534.17,New York,69758.98 +22177.74,154806.14,28334.72,California,65200.33 +1000.23,124153.04,1903.93,New York,64926.08 +1315.46,115816.21,297114.46,Florida,49490.75 +0,135426.92,0,California,42559.73 +542.05,51743.15,0,New York,35673.41 +0,116983.8,45173.06,California,14681.4 \ No newline at end of file diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression.zip b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression.zip new file mode 100644 index 0000000000000000000000000000000000000000..c0d35d229fbc625ac31204ed609df2327f187924 GIT binary patch literal 5501 zcmbVQWmr_}79L7QLE$!%2!tswkKPUl20A&w*cN=GW zOSqDaqot`U9A;_lYU$=?lJW6eAnTCc9qrRtC&sIQz+$kf8Wb033kY>6K~acJ z1~}&A2f5XMA@@9r6F6L1LP}Lb_n1n56piTTCzb#GH8hpKUi$~<{1x%PJBQ<6BC^B( zr=QGCz~oSeRNwiq+edaxueI6aGw0e*DAJcOpy^cT}IYFH+%T( zY4yABx0SkyH`Mw}M{oezS#H;>$%UUMa@oBPb%7-%Dv=fmjMq#hE#>ESDNZOFpg{~37OakcB8!dJ5aUuNe6Cg$m_I^KW%|ZLy8kBpI6=R?xQM@bN)!q)*qzo?K zzGRUAf5S^wNT;L@eWOqfG5DfnccV{Y!)v9(s=WS7%flzUr7|%ClP2qydR+UqOi&KB z+~k@fV^Vw{;!8)0qlZGy!1u|=Q?(XX?ku3|Ag7^BWuRc_)=WIN%OT%oc@Ze2W z=TBO?;6t)zTI_t(J)|X;4n=rRC`Rt-3p;BQV9?KfTQ0F0Yc8HI`Zin{mf?VPq_;3k zd~cp5Ud|vlNTPG9r;}x8uuseXtu$8-opnS$-GiB`E7w>n>ye558z0S=BatdfwcDkI zNwi>08f8u@Uq0@I8NqGaAdhqHIJc>@vX(cjn>L$pi1Tq(&6?_MugP4xJvab5=OD}x z7|X1yIQr5>J*N+A6nqZ1)L_9(Q1l3l z^a>~Fq1W>4HLy!ThGc2^94$o9S}k5Yc(K?WG|_D>&tHoSjT&RMJ^z9hR`C86|43>r z{6;meIDIWn_%?E}*4el4oi1iX^L5HfxP?%ChZPsfdRd6uy@}^s>JYIB;)BzdA zZgX`*`IvdR$#7-0xRcqX&^uNLQjF1hUvl0L_mStR1YQ4AIR}p)}&o&FoAxdT}hgj&8r=Nc>Cmb692t0pQ%H#dlIr$&5-)r@5 zmA4S0rX@~%}6qs zMFraTKkqTcI+^Ua6I%GOZ+|VX-M^ZGY{e%CTmpP8ZcBg6@j+*JerHWAk8K(Z8Qu45 zLmJwPZ+yxLjkS;nC!S3(d6>81nAd2-gMU+Tg6Cm^AJpke1%Mg8J4Er^tG$si$fg8H zE;p|b!(2J%w-KnN72ib7`rTz2uP8P5y23T4>#1=!;&Vnd$tR2Mv{D=BOz#PMk^wQul>Q{ ztl}T8BOy{=?SYTDn?38qAwv4OK)QS@edEfP$XEKeZoO%gmFtGctJvjOz#l=CGX)Zj z)bTUYSP{r~Alyn>Ji1oS%cPSSX#XX2KGRpm001DEC;f+SL4T@b37%cDEKJ=^;m)p> z&aO`8Xffqz4R^P6aJDyfw`6zrMprPjrmD#9IIdt0(W-WhocPUahoE#(o_k%(z4=9B zWXpUv3GZ?FKNd)OVoqtm=9O{>mKc$>bggGiR6%u{A{oTu^*~5^YB4TC_>D1P1Is(f zmme*GVplZw-|U-egU-rX@7>drX@1u;DjBSCb1pdhb@xg|Eiy z4hV^(ohVMTfvJLWV*V*7X8*;B0{_1gVLyF1^8AMngVdDKKCGS70Sbj;F)mM+KnVlN zI7du*kOfi-($=3xTVS|*JFz5$iX6gvK0c~z?W`l|v^7j&*CLwi4~Tt)WI`mhUi;5! z;45&mq+rL!JjrC?u%{(fS!tWf?g@9*;@5jGV?C>nI}A;@CziJ5ov6MU>r*l#n>G9@ z)y<*l3tnVXHgz#!L$MbX3HBpekU@J` zM@&=;#?E_B*|JwFeN{Qn+%0m9(xzIUq27<4%;Dg;VHFDH2 zYm>*llmV-L+p4B?iR3m}GFN!BB{cSjCGF$UibWfZH=l=A3PcnJWBOJl_g4fJ55$oj zL*sQZIS<+A6%*s4sr>C|54m~32H#U}JA+k&D;kYfXN;d{LV_^s47Qp-I(Ol>YTl*2 zs9~#8qSyMA3MaefYpS|Q6d$$W&9C=Ld!A|a62QZ{g=R@UeoAF1o1uwL7Km-V(P>5y8OdVQMNXzC^4{7l_S5fO2v-s^nh63B_qtMat05% z6?+;3!rDTi3kcPEEBQayynu&~m{GuMYdf0FG5VFRxb^Mq^B@{|(|`!0WZS0<>x5B6l^mi)*J z4Z~&%d##^L6i-Uyb^a-pm&?`?u)Ek=ON%uo)87HJ(Oy1N{?~JJ_n~Z4&m&0oj*D@X zh@d(HDhEuX3i9vBWYuCl6`A~J=YcaK^EeH3qgvz=vY{V=2~)NkgAmm&>EJV@f`QS5 z7iwqJ!YIs-S+7%#tflExc~jkgh^*Cr4c?p(ZsKv%?P%(0nck#mhrRUCgx9v zE(H-pC&`FcbX}TzK@7_f)xU3V+~9#hBc4p%&a9=UahS=xOZ6@7W!nD3$rXCskV<_Y zhY>S9-_?L>fTmGCYeC3HG@E;Vz8GNx@R6%E(g;K@uP2C=v~8dzeD zdC1R`a3-p}I@s?GAUtQZ?sZwu=%9Z-IkHxct)>4B{N^N^(3CQH;f77%t?XgsVDQ|* z!rC()O_t>HByLUQ18Fx~W2w~VNJQPxaXpY*h`AJt%1*La=s3W@1Y`ZLfe1b-j~!o- z6$$kBnH@Or*LdUy=Q06o!y1AmQ5PA>WHV}{@82NFi8P5zo5 zXz2n0KSu;-L7ZIRfkfO%h%;9Dcl6)eKiT(loZ9~9e~Er-81Yvoei%ZJ5l*UYmQ#j8 zP{;m^)&EUa1Ah$_PWlpU&}?ljsEg z5}hY^h!#aB3%I~3lSim?a>mBaGKtEKKP;jp(n*VGVQ?}rpSI|L3g0b?sa}J#`@KCods@@P1qyPN}@}|2b literal 0 HcmV?d00001 diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/.DS_Store b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/Multiple_Linear_Regression/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0$Vqox1Ojhs@R)|o50+1L3ClDI}u^SMB_!U6R09PRZ;$Z`hAt6CfagYwM HE_8JOM2!ci literal 0 HcmV?d00001 diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._50_Startups.csv b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_Linear_Regression/__MACOSX/Multiple_Linear_Regression/._50_Startups.csv new file mode 100644 index 0000000000000000000000000000000000000000..be931daf95f3c13c53fc8494dfff9926ebd65bf7 GIT binary patch literal 576 zcmcJL%}&BV6oqeVj2ew$qiZ&@E1j19PzVtU7LgDWA^aE*kTQf`1aX>s4H!#kMV7tTrA0rEeQwUrWSXP|do&<~QQ4D~nKbIsLC2by zx0A?pr$}~6rmtT4zgJU>_w=8Q>$Os2}?jfA5WrUDY)lJth6l-2>f5r5)RZ%nGk&`Ma fSvN+hi6mRG%95rvRLL@QMY55jwGgs3&351ub@7Wo literal 0 HcmV?d00001 diff --git a/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_linear_regression.py b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_linear_regression.py new file mode 100644 index 0000000..b467d1a --- /dev/null +++ b/Part 2 - Regression/Section 5 - Multiple Linear Regression/Multiple_linear_regression.py @@ -0,0 +1,100 @@ +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +# Importing the dataset +dataset = pd.read_csv('50_Startups.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 4].values + + +from sklearn.preprocessing import LabelEncoder, OneHotEncoder +labelencoder_X = LabelEncoder() +X[:, 3] = labelencoder_X.fit_transform(X[:, 3]) +onehotencoder = OneHotEncoder(categorical_features = [3]) +X = onehotencoder.fit_transform(X).toarray() + +#Avoid Dummy Variable trap +X = X[:,1:] + +from sklearn.model_selection import train_test_split +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) + +#Fitting Multiple Linear regression +from sklearn.linear_model import LinearRegression +regressor = LinearRegression() +regressor.fit(X_train, y_train) + +y_pred = regressor.predict(X_test) + +#Building for Backward Elimination +import statsmodels.formula.api as sm +X = np.append(arr=np.ones((50,1)).astype(int), values = X, axis=1) +X_opt = X[:, [0,1,2,3,4,5]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +X_opt = X[:, [0,1,3,4,5]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +X_opt = X[:, [0,3,4,5]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +X_opt = X[:, [0,3,5]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +X_opt = X[:, [0,3]] +regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() +regressor_OLS.summary() + +#Backward Elimination with p-values only: +import statsmodels.formula.api as sm +def backwardElimination(x, sl): + numVars = len(x[0]) + for i in range(0, numVars): + regressor_OLS = sm.OLS(y, x).fit() + maxVar = max(regressor_OLS.pvalues).astype(float) + if maxVar > sl: + for j in range(0, numVars - i): + if (regressor_OLS.pvalues[j].astype(float) == maxVar): + x = np.delete(x, j, 1) + regressor_OLS.summary() + return x + +SL = 0.05 +X_opt = X[:, [0, 1, 2, 3, 4, 5]] +X_Modeled = backwardElimination(X_opt, SL) + + +#Backward Elimination with p-values and Adjusted R Squared: +import statsmodels.formula.api as sm +def backwardElimination(x, SL): + numVars = len(x[0]) + temp = np.zeros((50,6)).astype(int) + for i in range(0, numVars): + regressor_OLS = sm.OLS(y, x).fit() + maxVar = max(regressor_OLS.pvalues).astype(float) + adjR_before = regressor_OLS.rsquared_adj.astype(float) + if maxVar > SL: + for j in range(0, numVars - i): + if (regressor_OLS.pvalues[j].astype(float) == maxVar): + temp[:,j] = x[:, j] + x = np.delete(x, j, 1) + tmp_regressor = sm.OLS(y, x).fit() + adjR_after = tmp_regressor.rsquared_adj.astype(float) + if (adjR_before >= adjR_after): + x_rollback = np.hstack((x, temp[:,[0,j]])) + x_rollback = np.delete(x_rollback, j, 1) + print (regressor_OLS.summary()) + return x_rollback + else: + continue + regressor_OLS.summary() + return x + +SL = 0.05 +X_opt = X[:, [0, 1, 2, 3, 4, 5]] +X_Modeled = backwardElimination(X_opt, SL)