Você está na página 1de 19

> #Question 1

> getwd()
[1] "G:/TERMTHREE/BUSINESSANALYTICS/CLASSWORK/new"
> setwd("G:/TERMTHREE/BUSINESSANALYTICS/CLASSWORK/new")
> bData = read.csv(file="Cereals.csv", header = TRUE, sep = ",")
> head(bData)
                       name mfr type calories protein fat sodium fiber carbo sugars potass vitamins shelf weight cups   rating
1                 100%_Bran   N    C       70       4   1    130  10.0   5.0      6    280       25     3      1 0.33 68.40297
2         100%_Natural_Bran   Q    C      120       3   5     15   2.0   8.0      8    135        0     3      1 1.00 33.98368
3                  All-Bran   K    C       70       4   1    260   9.0   7.0      5    320       25     3      1 0.33 59.42551
4 All-Bran_with_Extra_Fiber   K    C       50       4   0    140  14.0   8.0      0    330       25     3      1 0.50 93.70491
5            Almond_Delight   R    C      110       2   2    200   1.0  14.0      8     NA       25     3      1 0.75 34.38484
6   Apple_Cinnamon_Cheerios   G    C      110       2   2    180   1.5  10.5     10     70       25     1      1 0.75 29.50954
> attach(bData)
The following objects are masked from bData (pos = 6):
calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,
sugars, type, vitamins, weight
>
> #Checking Multicollinearity
>
> require("usdm")
> head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25
5
Almond_Delight R
C
110
2 2
200 1.0 14.0
8
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
2 2
180 1.5 10.5
10
70
25
shelf weight cups rating
1
3
1 0.33 68.40297
2
3
1 1.00 33.98368
3
3
1 0.33 59.42551
4
3
1 0.50 93.70491

5
3
1 0.75 34.38484
6
1
1 0.75 29.50954
> vif(bData[,c(5:12)])
Variables
VIF
1 protein 1.870357
2
fat 1.461836
3
sodium 1.279833
4
fiber 9.884092
5
carbo 2.426581
6
sugars 2.180586
7
potass 9.670269
8 vitamins 1.218836
> head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25
5
Almond_Delight R
C
110
2 2
200 1.0 14.0
8
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
2 2
180 1.5 10.5
10
70
25
shelf weight cups rating
1
3
1 0.33 68.40297
2
3
1 1.00 33.98368
3
3
1 0.33 59.42551
4
3
1 0.50 93.70491
5
3
1 0.75 34.38484
6
1
1 0.75 29.50954
>
> #Solution Q1 part1
> #Since the VIF values of all the independent variables are less than 10, there
> #is no severe multicollinearity (fiber, 9.88, and potass, 9.67, are borderline).
>
> #Solution Q1 part2
> mreg1 <- lm(calories~protein+fat+sodium+fiber+carbo+sugars+potass+vitamins, da
ta=bData)
> summary(mreg1)
Call:
lm(formula = calories ~ protein + fat + sodium + fiber + carbo +
sugars + potass + vitamins, data = bData)
Residuals:
    Min      1Q  Median      3Q     Max
-12.759  -3.412  -0.084   2.676  15.914

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.988711   4.606167  -0.649    0.519
protein      4.208274   0.755114   5.573 5.17e-07 ***
fat          9.142877   0.713309  12.818  < 2e-16 ***
sodium      -0.001236   0.008119  -0.152    0.879
fiber        1.269414   0.770597   1.647    0.104
carbo        4.242214   0.237763  17.842  < 2e-16 ***
sugars       4.105063   0.201220  20.401  < 2e-16 ***
potass      -0.037836   0.026061  -1.452    0.151
vitamins    -0.007032   0.029414  -0.239    0.812
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.075 on 65 degrees of freedom


(3 observations deleted due to missingness)
Multiple R-squared: 0.9418,
Adjusted R-squared: 0.9346
F-statistic: 131.4 on 8 and 65 DF, p-value: < 2.2e-16
> plot(mreg1)
Hit <Return> to see next plot: #Summary interpretation
Hit <Return> to see next plot: #Model significance: adjusted R-squared = 93.46%, so
it is a good model; the model p value (< 2.2e-16) is less than 0.001, so it is a
highly significant model. Among the variables, protein, fat, carbo and sugars are
significant; the others are insignificant.
Hit <Return> to see next plot: #plot interpretation
Hit <Return> to see next plot: #From plot 1 (residuals vs fitted) we see that there
is no dependency and no heteroskedasticity. In plot 4, all data points are within
a Cook's distance of 0.5.
>
> #Backward Elimination Method
> #Dropping sodium as p value is 0.8794 and is max
> mreg2 <- lm(calories~protein+fat+fiber+carbo+sugars+potass+vitamins, data=bDat
a)
> summary(mreg2)
Call:
lm(formula = calories ~ protein + fat + fiber + carbo + sugars +
potass + vitamins, data = bData)
Residuals:
     Min       1Q   Median       3Q      Max
-12.7975  -3.4168  -0.0952   2.7673  15.9076

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.919296   4.549512  -0.642    0.523
protein      4.206780   0.749442   5.613 4.27e-07 ***
fat          9.133479   0.705356  12.949  < 2e-16 ***
fiber        1.257055   0.760619   1.653    0.103
carbo        4.229816   0.221730  19.076  < 2e-16 ***
sugars       4.099017   0.195799  20.935  < 2e-16 ***
potass      -0.037603   0.025822  -1.456    0.150
vitamins    -0.007978   0.028537  -0.280    0.781
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.037 on 66 degrees of freedom


(3 observations deleted due to missingness)
Multiple R-squared: 0.9417,
Adjusted R-squared: 0.9356
F-statistic: 152.4 on 7 and 66 DF, p-value: < 2.2e-16
> #Dropping vitamins as p value= 0.781
> mreg3 <- lm(calories~protein+fat+fiber+carbo+sugars+potass, data=bData)
> summary(mreg3)
Call:
lm(formula = calories ~ protein + fat + fiber + carbo + sugars +

potass, data = bData)


Residuals:
Min
1Q Median
-12.7620 -3.2794 -0.0728

3Q
Max
2.7873 15.9560

Coefficients:
Estimate Std. Error t value
(Intercept) -2.67847
4.43638 -0.604
protein
4.18666
0.74083 5.651
fat
9.13427
0.70048 13.040
fiber
1.23970
0.75285 1.647
carbo
4.20865
0.20696 20.335
sugars
4.08456
0.18754 21.779
potass
-0.03730
0.02562 -1.456
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.548
3.54e-07
< 2e-16
0.104
< 2e-16
< 2e-16
0.150
*

***
***
***
***

0.05 .

0.1

Residual standard error: 5.003 on 67 degrees of freedom


(3 observations deleted due to missingness)
Multiple R-squared: 0.9417,
Adjusted R-squared: 0.9364
F-statistic: 180.3 on 6 and 67 DF, p-value: < 2.2e-16
> #dropping potass as pvalue = 0.150
> mreg4 <- lm(calories~protein+fat+fiber+carbo+sugars, data=bData)
> summary(mreg4)
Call:
lm(formula = calories ~ protein + fat + fiber + carbo + sugars,
data = bData)
Residuals:
Min
1Q Median
-12.1891 -3.0826 -0.9633

3Q
Max
2.5433 16.4884

Coefficients:
Estimate Std. Error t value
(Intercept) -1.2589
4.2917 -0.293
protein
3.8860
0.6963 5.581
fat
8.6983
0.6512 13.356
fiber
0.2501
0.3277 0.763
carbo
4.1446
0.2005 20.675
sugars
3.9681
0.1692 23.449
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.770
4.23e-07
< 2e-16
0.448
< 2e-16
< 2e-16
*

***
***
***
***

0.05 .

0.1

Residual standard error: 5.002 on 70 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.9392,
Adjusted R-squared: 0.9348
F-statistic: 216.2 on 5 and 70 DF, p-value: < 2.2e-16
> #dropping fiber as only insignificant left
> mreg5 <- lm(calories~protein+fat+carbo+sugars, data=bData)
> summary(mreg5)
Call:
lm(formula = calories ~ protein + fat + carbo + sugars, data = bData)
Residuals:
Min

1Q

Median

3Q

Max

-12.2166 -3.0147 -0.5182

2.5555 16.5380

Coefficients:
Estimate Std. Error t value
(Intercept) 0.04175
3.92730 0.011
protein
4.15389
0.59962 6.928
fat
8.59671
0.63562 13.525
carbo
4.06656
0.17194 23.651
sugars
3.94232
0.16534 23.844
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.992
1.59e-09
< 2e-16
< 2e-16
< 2e-16
*

***
***
***
***

0.05 .

0.1

Residual standard error: 4.987 on 71 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.9387,
Adjusted R-squared: 0.9352
F-statistic: 271.7 on 4 and 71 DF, p-value: < 2.2e-16
> plot(mreg5)
Hit <Return> to see next plot: #Solution Question 1
Hit <Return> to see next plot: #Summary interpretation
Hit <Return> to see next plot: #Model significance: adjusted R-squared = 93.52%, so
it is a good model; the model p value (< 2.2e-16) is less than 0.001, so it is a
highly significant model. All the remaining variables (protein, fat, carbo and
sugars) are highly significant; the insignificant ones have been dropped.
Hit <Return> to see next plot: #plot interpretation
> #From plot 1 (residuals vs fitted) we see that there is no dependency and no
heteroskedasticity. In plot 4, all data points are within a Cook's distance of 0.5.
>
>
> #Question 2
> #Checking for best 5 independent variables
> require("usdm")
> head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25
5
Almond_Delight R
C
110
2 2
200 1.0 14.0
8
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
2 2
180 1.5 10.5
10
70
25
shelf weight cups rating
1
3
1 0.33 68.40297
2
3
1 1.00 33.98368
3
3
1 0.33 59.42551
4
3
1 0.50 93.70491
5
3
1 0.75 34.38484
6
1
1 0.75 29.50954
> vif(bData[,c(5:12)])
Variables
VIF
1 protein 1.870357
2
fat 1.461836
3
sodium 1.279833

4
fiber 9.884092
5
carbo 2.426581
6
sugars 2.180586
7
potass 9.670269
8 vitamins 1.218836
> head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25
5
Almond_Delight R
C
110
2 2
200 1.0 14.0
8
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
2 2
180 1.5 10.5
10
70
25
shelf weight cups rating
1
3
1 0.33 68.40297
2
3
1 1.00 33.98368
3
3
1 0.33 59.42551
4
3
1 0.50 93.70491
5
3
1 0.75 34.38484
6
1
1 0.75 29.50954
>
> #Correlation
>
> cor(bData[,c(4:12)],use="complete.obs")
calories
protein
fat
sodium
fiber
c
arbo
sugars
calories 1.00000000 0.03399166 0.5073732397 0.2962474981 -0.29521183 0.2706
0605 0.569120535
protein 0.03399166 1.00000000 0.2023533963 0.0115588913 0.51400610 -0.0367
4326 -0.286583967
fat
0.50737324 0.20235340 1.0000000000 0.0008219036 0.01403587 -0.2849
3369 0.287152487
sodium
0.29624750 0.01155889 0.0008219036 1.0000000000 -0.07073492 0.3284
0919 0.037058961
fiber
-0.29521183 0.51400610 0.0140358654 -0.0707349230 1.00000000 -0.3790
8370 -0.150948502
carbo
0.27060605 -0.03674326 -0.2849336855 0.3284091857 -0.37908370 1.0000
0000 -0.452069189
sugars
0.56912054 -0.28658397 0.2871524866 0.0370589612 -0.15094850 -0.4520
6919 1.000000000
potass -0.07136125 0.57874284 0.1996367171 -0.0394380876 0.91150392 -0.3650
0293 0.001413982
vitamins 0.25984556 0.05479952 -0.0305139099 0.3315759640 -0.03871734 0.2535
7897 0.072954382
potass
vitamins
calories -0.071361247 0.25984556
protein 0.578742837 0.05479952
fat
0.199636717 -0.03051391
sodium -0.039438088 0.33157596
fiber
0.911503921 -0.03871734
carbo
-0.365002934 0.25357897
sugars
0.001413982 0.07295438
potass
1.000000000 -0.00263583

vitamins -0.002635830 1.00000000


>
>
> #Since we need to choose the top 5 among the 8 IVs, we selected them on the basis
of their correlation with calories: we took the five with the highest absolute
(mod) values and discarded the ones with lower values.
> #Selected IVs are fiber, sodium, fat, carbo, sugars
> #Round One
> est11 <- lm(calories~fiber, data=bData)
> est12 <- lm(calories~sodium, data=bData)
> est13 <- lm(calories~fat, data=bData)
> est14 <- lm(calories~carbo, data=bData)
> est15 <- lm(calories~sugars, data=bData)
>
> #Round One Summary
> summary(est11)$adj.r.squared
[1] 0.07390559
> summary(est12)$adj.r.squared
[1] 0.07826182
> summary(est13)$adj.r.squared
[1] 0.2385932
> summary(est14)$adj.r.squared
[1] 0.05376073
> summary(est15)$adj.r.squared
[1] 0.3117838
> summary(est15)
Call:
lm(formula = calories ~ sugars, data = bData)
Residuals:
Min
1Q Median
3Q
Max
-39.158 -9.585 0.486 11.441 37.879
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 89.1578
3.5429 25.165 < 2e-16 ***
sugars
2.5356
0.4287 5.914 9.58e-08 ***
--Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1
Residual standard error: 16.26 on 74 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.321,
Adjusted R-squared: 0.3118
F-statistic: 34.98 on 1 and 74 DF, p-value: 9.581e-08
>
> #Round Two
> est21 <- lm(calories~fiber+sugars, data=bData)
> est22 <- lm(calories~sodium+sugars, data=bData)
> est23 <- lm(calories~fat+sugars, data=bData)
> est24 <- lm(calories~carbo+sugars, data=bData)
>
>
> #Round Two Summary
> summary(est21)$adj.r.squared
[1] 0.3503595
> summary(est22)$adj.r.squared
[1] 0.3752477
> summary(est23)$adj.r.squared

[1] 0.4296726
> summary(est24)$adj.r.squared
[1] 0.6657591
> summary(est24)
Call:
lm(formula = calories ~ carbo + sugars, data = bData)
Residuals:
Min
1Q Median
-25.670 -7.790 -1.998

3Q
Max
5.136 32.178

Coefficients:
Estimate Std. Error t value
(Intercept) 29.1064
7.1786 4.055
carbo
3.3819
0.3796 8.909
sugars
3.9575
0.3387 11.683
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.000124 ***
2.76e-13 ***
< 2e-16 ***
*

0.05 .

0.1

Residual standard error: 11.33 on 73 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.6747,
Adjusted R-squared: 0.6658
F-statistic: 75.69 on 2 and 73 DF, p-value: < 2.2e-16
>
> #Round Three
> est31 <- lm(calories~fiber+sugars+carbo, data=bData)
> est32 <- lm(calories~sodium+sugars+carbo, data=bData)
> est33 <- lm(calories~fat+sugars+carbo, data=bData)
>
> #Round Three Summary
> summary(est31)$adj.r.squared
[1] 0.6718587
> summary(est32)$adj.r.squared
[1] 0.6637315
> summary(est33)$adj.r.squared
[1] 0.8929605
> summary(est33)
Call:
lm(formula = calories ~ fat + sugars + carbo, data = bData)
Residuals:
Min
1Q Median
-15.6209 -3.4242 -0.9963

3Q
Max
3.3473 22.2276

Coefficients:
Estimate Std. Error t value
(Intercept) 15.0069
4.2163 3.559
fat
9.8095
0.7855 12.488
sugars
3.4904
0.1953 17.871
carbo
3.8934
0.2187 17.803
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.000663
< 2e-16
< 2e-16
< 2e-16
*

***
***
***
***

0.05 .

0.1

Residual standard error: 6.412 on 72 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.8972,
Adjusted R-squared: 0.893
F-statistic: 209.6 on 3 and 72 DF, p-value: < 2.2e-16

>
> #Round Four
> est41 <- lm(calories~fiber+sugars+carbo+fat, data=bData)
> est42 <- lm(calories~sodium+sugars+carbo+fat, data=bData)
>
>
> #Round Four Summary
> summary(est41)$adj.r.squared
[1] 0.9071853
> summary(est42)$adj.r.squared
[1] 0.8916343
> summary(est41)
Call:
lm(formula = calories ~ fiber + sugars + carbo + fat, data = bData)
Residuals:
Min
1Q Median
-16.1526 -3.8290 -0.9644

3Q
Max
2.9200 24.2138

Coefficients:
Estimate Std. Error t value
(Intercept) 4.3915
4.9778 0.882
fiber
1.1719
0.3378 3.469
sugars
3.7476
0.1964 19.082
carbo
4.3113
0.2366 18.222
fat
9.9192
0.7321 13.548
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.380639
0.000892
< 2e-16
< 2e-16
< 2e-16
*

***
***
***
***

0.05 .

0.1

Residual standard error: 5.97 on 71 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.9121,
Adjusted R-squared: 0.9072
F-statistic: 184.3 on 4 and 71 DF, p-value: < 2.2e-16
>
> #Round Five
> est51 <- lm(calories~fiber+sugars+carbo+fat+sodium, data=bData)
>
>
>
> #Round Five Summary
> summary(est51)$adj.r.squared
[1] 0.9059162
> summary(est51)
Call:
lm(formula = calories ~ fiber + sugars + carbo + fat + sodium,
data = bData)
Residuals:
Min
1Q Median
-16.0947 -3.7209 -0.9654

3Q
Max
2.8776 24.3893

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.262571 5.050786 0.844 0.40158
fiber
1.183730 0.344923 3.432 0.00101 **
sugars
3.758738 0.205047 18.331 < 2e-16 ***

carbo
4.332839 0.260324 16.644 < 2e-16 ***
fat
9.932379 0.739920 13.424 < 2e-16 ***
sodium
-0.001901 0.009246 -0.206 0.83770
--Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1

Residual standard error: 6.011 on 70 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.9122,
Adjusted R-squared: 0.9059
F-statistic: 145.4 on 5 and 70 DF, p-value: < 2.2e-16
> #Interpretation question 2 - sodium is insignificant, i.e. its p value (0.8377) is
greater than 0.05, and adding it lowers the adjusted R-squared to 0.9059 (from
0.9072 for est41).
> #so the best model is est41
>
> # Doing ANOVA to CHECK FOR BEST MODEL
> anova(est15,est24,est33,est41)
Analysis of Variance Table
Model 1: calories ~ sugars
Model 2: calories ~ carbo + sugars
Model 3: calories ~ fat + sugars + carbo
Model 4: calories ~ fiber + sugars + carbo + fat
Res.Df
RSS Df Sum of Sq
F
Pr(>F)
1
74 19559.0
2
73 9370.7 1 10188.3 285.821 < 2.2e-16 ***
3
72 2959.8 1
6410.9 179.850 < 2.2e-16 ***
4
71 2530.8 1
429.0 12.035 0.0008916 ***
--Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1
1
> #As we can see from the anova output, each added variable gives a significant
improvement, including the 4th model over the 3rd (p = 0.0008916), so est41 is the
final chosen model.
>
> #Final Result
> summary(est41)
Call:
lm(formula = calories ~ fiber + sugars + carbo + fat, data = bData)
Residuals:
Min
1Q Median
-16.1526 -3.8290 -0.9644

3Q
Max
2.9200 24.2138

Coefficients:
Estimate Std. Error t value
(Intercept) 4.3915
4.9778 0.882
fiber
1.1719
0.3378 3.469
sugars
3.7476
0.1964 19.082
carbo
4.3113
0.2366 18.222
fat
9.9192
0.7321 13.548
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.380639
0.000892
< 2e-16
< 2e-16
< 2e-16
*

***
***
***
***

0.05 .

0.1

Residual standard error: 5.97 on 71 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.9121,
Adjusted R-squared: 0.9072
F-statistic: 184.3 on 4 and 71 DF, p-value: < 2.2e-16
> plot(est41)
Hit <Return> to see next plot: # for the final model, that is est41, we have an
adjusted R2 of 90.72% and a model p value < 2.2e-16, so the model is good.


Hit <Return> to see next plot: #All the 4 independent variables are significant.
Hit <Return> to see next plot: #plot interpretation
Hit <Return> to see next plot: #From plot 1 (residuals vs fitted) we see that there
is no dependency and no heteroskedasticity. In plot 4, all data points are within
a Cook's distance of 0.5.

> #Question 3,4,5


> getwd()
[1] "G:/TERMTHREE/BUSINESSANALYTICS/CLASSWORK/new"
> setwd("D:/PGPM16-17/Term 3/Business Analytics/GA")
Error in setwd("D:/PGPM16-17/Term 3/Business Analytics/GA") :
cannot change working directory
> bData= read.csv(file="Cereals.csv", header = TRUE, sep = ",")
> head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25
5
Almond_Delight R
C
110
2 2
200 1.0 14.0
8
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
2 2
180 1.5 10.5
10
70
25
shelf weight cups rating
1
3
1 0.33 68.40297
2
3
1 1.00 33.98368
3
3
1 0.33 59.42551
4
3
1 0.50 93.70491
5
3
1 0.75 34.38484
6
1
1 0.75 29.50954
> attach(bData)
The following objects are masked from bData (pos = 3):
calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,
sugars, type, vitamins, weight
The following objects are masked from bData (pos = 7):
calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,
sugars, type, vitamins, weight
>
>
>
>
>

#Question 3
#Regression model-1
mreg1 <- lm(rating~sugars+fat+sodium+fiber, data=bData)
summary(mreg1)

Call:
lm(formula = rating ~ sugars + fat + sodium + fiber, data = bData)

Residuals:
Min
1Q Median
3Q
Max
-4.3394 -1.3669 -0.2298 1.1915 7.3055
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.535230 0.818157 76.43 <2e-16 ***
sugars
-1.953304 0.066835 -29.23 <2e-16 ***
fat
-3.325458 0.287630 -11.56 <2e-16 ***
sodium
-0.055642 0.003358 -16.57 <2e-16 ***
fiber
2.832353 0.116396 24.33 <2e-16 ***
--Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1

Residual standard error: 2.386 on 71 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.9729,
Adjusted R-squared: 0.9714
F-statistic: 638 on 4 and 71 DF, p-value: < 2.2e-16
> plot(mreg1)
Hit <Return> to see next plot: #All the variables are significant (p < 0.001, i.e.
beyond the 99% confidence level) and adj R2 = 97.14%
Hit <Return> to see next plot: #Residual vs fitted plot - No heteroskedasticity,
data is independent
Hit <Return> to see next plot: #Residual vs Leverage plot - No influential
outliers
Hit <Return> to see next plot:
> #Regression model-2
> tab1 <- table(mfr)
> tab1
mfr
A G K N P Q R
1 22 23 6 9 8 8
> bData$mfr_dummyG <- as.numeric(mfr == "G")
> bData$mfr_dummyK <- as.numeric( mfr== "K")
> table(bData$mfr_dummyG)
0 1
55 22
> table(bData$mfr_dummyK)
0 1
54 23
> head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25
5
Almond_Delight R
C
110
2 2
200 1.0 14.0
8
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
2 2
180 1.5 10.5
10
70
25
shelf weight cups rating mfr_dummyG mfr_dummyK

1
3
1 0.33 68.40297
0
0
2
3
1 1.00 33.98368
0
0
3
3
1 0.33 59.42551
0
1
4
3
1 0.50 93.70491
0
1
5
3
1 0.75 34.38484
0
0
6
1
1 0.75 29.50954
1
0
>
> mreg2 <- lm(rating~sugars+fat+sodium+fiber+mfr_dummyK+mfr_dummyG, data=bData)
> summary(mreg2)
Call:
lm(formula = rating ~ sugars + fat + sodium + fiber + mfr_dummyK +
mfr_dummyG, data = bData)
Residuals:
Min
1Q Median
3Q
Max
-4.7066 -1.5615 -0.3118 1.4276 7.6004
Coefficients:
Estimate Std. Error t value
(Intercept) 62.519046 0.826690 75.626
sugars
-1.955599 0.069076 -28.311
fat
-3.231652 0.306129 -10.557
sodium
-0.054722 0.003664 -14.936
fiber
2.798131 0.121134 23.099
mfr_dummyK 0.181009 0.717970 0.252
mfr_dummyG -0.662525 0.747075 -0.887
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
< 2e-16
< 2e-16
4.77e-16
< 2e-16
< 2e-16
0.802
0.378
*

***
***
***
***
***

0.05 .

0.1

Residual standard error: 2.398 on 69 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.9734,
Adjusted R-squared: 0.9711
F-statistic: 421.2 on 6 and 69 DF, p-value: < 2.2e-16
> plot(mreg2)
Hit <Return> to see next plot: #Dummy variables are not significant, the rest are
significant and adj R2 = 97.11%
Hit <Return> to see next plot: #Residual vs fitted plot - No heteroskedasticity,
data is independent
Hit <Return> to see next plot: #Residual vs Leverage plot - No influential
outliers
Hit <Return> to see next plot:
> #Question 4A
>
>
>
>

bData$weight_c = weight - mean(weight)


bData$sugars_c = sugars - mean(sugars, na.rm = TRUE)
bData$interaction1 = bData$weight_c*bData$sugars_c
head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25

Almond_Delight R
C
110
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
10
70
25
shelf weight cups rating mfr_dummyG mfr_dummyK
ction1
1
3
1 0.33 68.40297
0
0
038961
2
3
1 1.00 33.98368
0
0
883117
3
3
1 0.33 59.42551
0
1
000000
4
3
1 0.50 93.70491
0
1
805195
5
3
1 0.75 34.38484
0
0
883117
6
1
1 0.75 29.50954
1
0
805195
>
> detach(bData)
> attach(bData)
The following objects are masked from bData (pos =

200

1.0 14.0

180

1.5 10.5

weight_c

sugars_c intera

-0.02961039 -1.0263158

0.03

-0.02961039 0.9736842 -0.02


-0.02961039 -2.0263158

0.06

-0.02961039 -7.0263158

0.20

-0.02961039 0.9736842 -0.02


-0.02961039 2.9736842 -0.08

3):

calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,
sugars, type, vitamins, weight
The following objects are masked from bData (pos = 7):
calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,
sugars, type, vitamins, weight
>
> mreg3 <- lm(calories~sugars+weight+interaction1, data=bData)
> summary(mreg3)
Call:
lm(formula = calories ~ sugars + weight + interaction1, data = bData)
Residuals:
Min
1Q Median
-45.695 -5.617 0.038

3Q
Max
8.994 38.660

Coefficients:
Estimate Std. Error t value
(Intercept) 22.6174
10.6016 2.133
sugars
1.3303
0.3867 3.440
weight
73.7232
11.1794 6.595
interaction1 -3.1044
1.8042 -1.721
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.036301
0.000971
6.13e-09
0.089617
*

*
***
***
.

0.05 .

0.1

Residual standard error: 12.96 on 72 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.58,
Adjusted R-squared: 0.5625
F-statistic: 33.15 on 3 and 72 DF, p-value: 1.435e-13
> plot(mreg3)
Hit <Return> to see next plot: #Combined effect (interaction term) of weight and
sugars is not significant at the 5% level (p = 0.0896),
Hit <Return> to see next plot: #but individually weight and sugars are significant.
And adj R2 = 56.25%
Hit <Return> to see next plot: #Data is independent and no heteroskedasticity. And
no influential outliers.
Hit <Return> to see next plot:
> #Question 4B
> mean(fat)
[1] 1.012987
> bData$fat_c = fat - mean(fat)
> bData$interaction2 = bData$weight_c*bData$fat_c
> head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25
5
Almond_Delight R
C
110
2 2
200 1.0 14.0
8
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
2 2
180 1.5 10.5
10
70
25
shelf weight cups rating mfr_dummyG mfr_dummyK
weight_c sugars_c intera
ction1
fat_c
1
3
1 0.33 68.40297
0
0 -0.02961039 -1.0263158 0.03
038961 -0.01298701
2
3
1 1.00 33.98368
0
0 -0.02961039 0.9736842 -0.02
883117 3.98701299
3
3
1 0.33 59.42551
0
1 -0.02961039 -2.0263158 0.06
000000 -0.01298701
4
3
1 0.50 93.70491
0
1 -0.02961039 -7.0263158 0.20
805195 -1.01298701
5
3
1 0.75 34.38484
0
0 -0.02961039 0.9736842 -0.02
883117 0.98701299
6
1
1 0.75 29.50954
1
0 -0.02961039 2.9736842 -0.08
805195 0.98701299
interaction2
1 0.0003845505
2 -0.1180570079
3 0.0003845505
4 0.0299949401
5 -0.0292258391
6 -0.0292258391
>
> detach(bData)
> attach(bData)
The following objects are masked from bData (pos = 3):
calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,
sugars, type, vitamins, weight
The following objects are masked from bData (pos = 7):

calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,
sugars, type, vitamins, weight
>
> mreg4 <- lm(calories~fat+weight+interaction2, data=bData)
> summary(mreg4)
Call:
lm(formula = calories ~ fat + weight + interaction2, data = bData)
Residuals:
Min
1Q Median
-47.485 -5.782 0.988

3Q
Max
5.339 30.988

Coefficients:
Estimate Std. Error t value
(Intercept)
18.416
9.976 1.846
fat
6.974
1.454 4.797
weight
79.273
9.709 8.165
interaction2 -6.797
11.623 -0.585
--Signif. codes: 0 *** 0.001 ** 0.01

Pr(>|t|)
0.0689 .
8.28e-06 ***
6.90e-12 ***
0.5605
*

0.05 .

0.1

Residual standard error: 12.35 on 73 degrees of freedom


Multiple R-squared: 0.6142,
Adjusted R-squared: 0.5983
F-statistic: 38.74 on 3 and 73 DF, p-value: 4.35e-15
> plot(mreg4)
Hit <Return> to see next plot: #Combined effect (interaction term) of weight and
fat is not significant,
Hit <Return> to see next plot: #but individually weight and fat are significant.
And adj R2 = 59.83%
Hit <Return> to see next plot: #Data is independent and no heteroskedasticity. And
no influential outliers.
Hit <Return> to see next plot:

> #Question 5
>
>
>
>
>

#Testing curvilinear effects


rg1 <- lm(calories~weight, data=bData)
rg2 <- lm(calories~sugars, data=bData)
rg3 <- lm(calories~carbo, data=bData)
summary(rg1)

Call:
lm(formula = calories ~ weight, data = bData)
Residuals:
Min
1Q Median
-54.214 -4.214 5.786

3Q
Max
5.786 45.786

Coefficients:
Estimate Std. Error t
(Intercept)
14.08
11.17
weight
90.13
10.73
--Signif. codes: 0 *** 0.001 **

value Pr(>|t|)
1.261
0.211
8.397 2.1e-12 ***
0.01

0.05 .

0.1

Residual standard error: 14.08 on 75 degrees of freedom

Multiple R-squared: 0.4845,


Adjusted R-squared: 0.4777
F-statistic: 70.5 on 1 and 75 DF, p-value: 2.098e-12
> summary(rg2)
Call:
lm(formula = calories ~ sugars, data = bData)
Residuals:
Min
1Q Median
3Q
Max
-39.158 -9.585 0.486 11.441 37.879
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 89.1578
3.5429 25.165 < 2e-16 ***
sugars
2.5356
0.4287 5.914 9.58e-08 ***
--Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1

Residual standard error: 16.26 on 74 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.321,
Adjusted R-squared: 0.3118
F-statistic: 34.98 on 1 and 74 DF, p-value: 9.581e-08
> summary(rg3)
Call:
lm(formula = calories ~ carbo, data = bData)
Residuals:
Min
1Q Median
-54.644 -8.844 0.187

3Q
Max
9.555 50.187

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 87.8459
8.6211 10.190 9.78e-16 ***
carbo
1.2922
0.5634 2.294 0.0246 *
--Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1

Residual standard error: 19.06 on 74 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.06638, Adjusted R-squared: 0.05376
F-statistic: 5.261 on 1 and 74 DF, p-value: 0.02465
> plot(rg1)
Hit <Return> to see next plot: plot(rg2)
Hit <Return> to see next plot: plot(rg3)
Hit <Return> to see next plot:
Hit <Return> to see next plot:
> bData$carbo_c = carbo - mean(carbo, na.rm = TRUE)
> bData$weight2 = weight_c*weight_c
> bData$sugars2 = sugars_c*sugars_c
> bData$carbo2 = bData$carbo_c*bData$carbo_c
> detach(bData)
> attach(bData)
The following objects are masked from bData (pos = 3):
calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,

sugars, type, vitamins, weight


The following objects are masked from bData (pos = 7):
calories, carbo, cups, fat, fiber, mfr, name, potass, protein, rating, shelf
, sodium,
sugars, type, vitamins, weight
> head(bData)
name mfr type calories protein fat sodium fiber carbo sug
ars potass vitamins
1
100%_Bran N
C
70
4 1
130 10.0 5.0
6
280
25
2
100%_Natural_Bran Q
C
120
3 5
15 2.0 8.0
8
135
0
3
All-Bran K
C
70
4 1
260 9.0 7.0
5
320
25
4 All-Bran_with_Extra_Fiber K
C
50
4 0
140 14.0 8.0
0
330
25
5
Almond_Delight R
C
110
2 2
200 1.0 14.0
8
NA
25
6 Apple_Cinnamon_Cheerios G
C
110
2 2
180 1.5 10.5
10
70
25
shelf weight cups rating mfr_dummyG mfr_dummyK
weight_c sugars_c intera
ction1
fat_c
1
3
1 0.33 68.40297
0
0 -0.02961039 -1.0263158 0.03
038961 -0.01298701
2
3
1 1.00 33.98368
0
0 -0.02961039 0.9736842 -0.02
883117 3.98701299
3
3
1 0.33 59.42551
0
1 -0.02961039 -2.0263158 0.06
000000 -0.01298701
4
3
1 0.50 93.70491
0
1 -0.02961039 -7.0263158 0.20
805195 -1.01298701
5
3
1 0.75 34.38484
0
0 -0.02961039 0.9736842 -0.02
883117 0.98701299
6
1
1 0.75 29.50954
1
0 -0.02961039 2.9736842 -0.08
805195 0.98701299
interaction2
carbo_c
weight2
sugars2
carbo2
1 0.0003845505 -9.8026316 0.0008767752 1.0533241 96.0915859
2 -0.1180570079 -6.8026316 0.0008767752 0.9480609 46.2757964
3 0.0003845505 -7.8026316 0.0008767752 4.1059557 60.8810596
4 0.0299949401 -6.8026316 0.0008767752 49.3691136 46.2757964
5 -0.0292258391 -0.8026316 0.0008767752 0.9480609 0.6442175
6 -0.0292258391 -4.3026316 0.0008767752 8.8427978 18.5126385
> #running curvilinear regression analysis
> regcur1 <- lm(calories~weight+weight2, data=bData)
> summary(regcur1)
Call:
lm(formula = calories ~ weight + weight2, data = bData)
Residuals:
Min
1Q Median
-54.838 -4.838 5.162

3Q
Max
5.162 45.162

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept)
13.33
11.17 1.193
0.237
weight
91.54
10.78 8.488 1.55e-12 ***
weight2
-31.00
27.27 -1.137
0.259

--Signif. codes: 0

***

0.001

** 0.01

0.05 .

0.1

Residual standard error: 14.05 on 74 degrees of freedom


Multiple R-squared: 0.4934,
Adjusted R-squared: 0.4797
F-statistic: 36.03 on 2 and 74 DF, p-value: 1.183e-11
> #Since weight2 is not significant (p = 0.259), the relationship is not curvilinear.
> #This is also evident from the residuals vs fitted plot, as we are getting an
almost straight line.
> plot(regcur1)
Hit <Return> to see next plot: regcur2 <- lm(calories~sugars+sugars2, data=bData
)
Hit <Return> to see next plot: summary(regcur2)
Hit <Return> to see next plot: #since sugars2 is significant at the 5% level (95%
confidence), the relationship is curvilinear.
Hit <Return> to see next plot: #Residuals vs fitted plot shows a slight dent in the
straight line.
> plot(regcur2)
Error in plot(regcur2) : object 'regcur2' not found
> regcur3 <- lm(calories~carbo+carbo2, data=bData)
> summary(regcur3)
Call:
lm(formula = calories ~ carbo + carbo2, data = bData)
Residuals:
Min
1Q Median
3Q
Max
-57.601 -11.135 0.425 10.076 47.194
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 90.1798
8.4120 10.720 <2e-16 ***
carbo
1.4070
0.5481 2.567 0.0123 *
carbo2
-0.2677
0.1114 -2.403 0.0188 *
--Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1

Residual standard error: 18.48 on 73 degrees of freedom


(1 observation deleted due to missingness)
Multiple R-squared: 0.1348,
Adjusted R-squared: 0.1111
F-statistic: 5.688 on 2 and 73 DF, p-value: 0.005063
> #Since carbo2 is significant at the 5% level (p = 0.0188), the relationship is curvilinear.
> #Residuals vs fitted plot shows a slight curve in the straight line.
> plot(regcur3)
Hit <Return> to see next plot: #Data is independent and no heteroskedasticity. And
no influential outliers for all plots.