Você está na página 1de 11

Statistics 110 Autumn 2007–2008

B. Srinivasan Out: Oct. 5, 2007


Handout #5

Homework #1 Solutions
1. MB 1.1
This dataset is also available as the houseprices data frame in the DAAG package. The
linear-scale histogram emphasizes outliers, while the log-scale histogram conceals them,
because the log transform compresses the upper tail of the distribution.

> rm(list = ls())


> library(DAAG)
> data(houseprices)
> attach(houseprices)
> layout(matrix(1:4, 2, 2, byrow = TRUE))
> plot(sale.price, area)
> hist(sale.price)
> plot(log(sale.price), area)
> hist(log(sale.price))
> detach(houseprices)

Histogram of sale.price


6
5


Frequency
900 1100

4
area

● ●

2

● ●

1

●●

700

● ● ●

0

150 200 250 300 350 100 200 300 400

sale.price sale.price

Histogram of log(sale.price)


6
5


Frequency
900 1100

4
area

●●

2

● ●

1

●●

700

● ● ●

0

4.8 5.2 5.6 4.6 5.0 5.4 5.8

log(sale.price) log(sale.price)

1
2. MB 1.3
It is usually good practice to explicitly clear the workspace between problems. The
code below shows how to quickly inspect columns with str() and how to determine
automatically which rows and columns contain at least one missing value.
Note that the nesting of invocations here shows how to use R as a functional programming
language, in which each result is handed off to the enclosing function call.

> rm(list = ls())


> library(DAAG)
> data(possum)
> for (nn in 1:ncol(possum)) {
+ print(colnames(possum)[nn])
+ str(possum[, nn])
+ }

[1] "case"
num [1:104] 1 2 3 4 5 6 7 8 9 10 ...
[1] "site"
num [1:104] 1 1 1 1 1 1 1 1 1 1 ...
[1] "Pop"
Factor w/ 2 levels "Vic","other": 1 1 1 1 1 1 1 1 1 1 ...
[1] "sex"
Factor w/ 2 levels "f","m": 2 1 1 1 1 1 2 1 1 1 ...
[1] "age"
num [1:104] 8 6 6 6 2 1 2 6 9 6 ...
[1] "hdlngth"
num [1:104] 94.1 92.5 94 93.2 91.5 93.1 95.3 94.8 93.4 91.8 ...
[1] "skullw"
num [1:104] 60.4 57.6 60 57.1 56.3 54.8 58.2 57.6 56.3 58 ...
[1] "totlngth"
num [1:104] 89 91.5 95.5 92 85.5 90.5 89.5 91 91.5 89.5 ...
[1] "taill"
num [1:104] 36 36.5 39 38 36 35.5 36 37 37 37.5 ...
[1] "footlgth"
num [1:104] 74.5 72.5 75.4 76.1 71 73.2 71.5 72.7 72.4 70.9 ...
[1] "earconch"
num [1:104] 54.5 51.2 51.9 52.2 53.2 53.6 52 53.9 52.9 53.4 ...
[1] "eye"
num [1:104] 15.2 16 15.5 15.2 15.1 14.2 14.2 14.5 15.5 14.4 ...
[1] "chest"
num [1:104] 28 28.5 30 28 28.5 30 30 29 28 27.5 ...
[1] "belly"
num [1:104] 36 33 34 34 33 32 34.5 34 33 32 ...

> missing.inds = which(!complete.cases(possum))


> print(missing.inds)

[1] 41 44 46

2
> print(possum[missing.inds, ])

case site Pop sex age hdlngth skullw totlngth taill footlgth earconch eye
BB36 41 2 Vic f 5 88.4 57.0 83 36.5 NA 40.3 15.9
BB41 44 2 Vic m NA 85.1 51.5 76 35.5 70.3 52.6 14.4
BB45 46 2 Vic m NA 91.4 54.4 84 35.0 72.8 51.2 14.4
chest belly
BB36 27.0 30.5
BB41 23.0 27.0
BB45 24.5 35.0

> print(which(apply(is.na(possum[missing.inds, ]), 2, sum) > 0))

age footlgth
5 10

3. MB 1.4
This shows a different way to figure out which columns contain missing values: transpose
the ais data frame and then run the complete.cases command on the result. We see that no
columns have missing values, and that Gym, Netball, T_Sprnt, and W_Polo are sex-imbalanced
by a ratio exceeding 2:1 in one direction or the other.

> rm(list = ls())


> library(DAAG)
> data(ais)
> for (nn in 1:ncol(ais)) {
+ print(colnames(ais)[nn])
+ str(ais[, nn])
+ }

[1] "rcc"
num [1:202] 3.96 4.41 4.14 4.11 4.45 4.1 4.31 4.42 4.3 4.51 ...
[1] "wcc"
num [1:202] 7.5 8.3 5 5.3 6.8 4.4 5.3 5.7 8.9 4.4 ...
[1] "hc"
num [1:202] 37.5 38.2 36.4 37.3 41.5 37.4 39.6 39.9 41.1 41.6 ...
[1] "hg"
num [1:202] 12.3 12.7 11.6 12.6 14 12.5 12.8 13.2 13.5 12.7 ...
[1] "ferr"
num [1:202] 60 68 21 69 29 42 73 44 41 44 ...
[1] "bmi"
num [1:202] 20.6 20.7 21.9 21.9 19.0 ...
[1] "ssf"
num [1:202] 109.1 102.8 104.6 126.4 80.3 ...
[1] "pcBfat"
num [1:202] 19.8 21.3 19.9 23.7 17.6 ...
[1] "lbm"
num [1:202] 63.3 58.5 55.4 57.2 53.2 ...

3
[1] "ht"
num [1:202] 196 190 178 185 185 ...
[1] "wt"
num [1:202] 78.9 74.4 69.1 74.9 64.6 63.7 75.2 62.3 66.5 62.9 ...
[1] "sex"
Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 1 1 ...
[1] "sport"
Factor w/ 10 levels "B_Ball","Field",..: 1 1 1 1 1 1 1 1 1 1 ...

> missing.cols = which(!complete.cases(t(ais)))


> print(paste("Number of missing columns = ", length(missing.cols)))

[1] "Number of missing columns = 0"

> sex.vs.sport = table(ais$sex, ais$sport)


> sex.sport.ratio = sex.vs.sport[1, ]/sex.vs.sport[2, ]
> which(sex.sport.ratio > 2 | sex.sport.ratio < 0.5)

Gym Netball T_Sprnt W_Polo


3 4 8 10

4. MB 1.6

(a) For the first plot, we explicitly use a log transform in the plot invocation.
> rm(list = ls())
> data(Manitoba.lakes)
> attach(Manitoba.lakes)
> plot(log2(area) ~ elevation, pch = 16, xlim = c(170, 280), ylab = "Log of Area, square km",
+ xlab = "Elevation (meters above sea level)")
> text(log2(area) ~ elevation, labels = row.names(Manitoba.lakes),
+ pos = 4)
> text(log2(area) ~ elevation, labels = area, pos = 2)
> title("Manitoba's Largest Lakes (Area in square.km near points)")

4
Manitoba's Largest Lakes (Area in square.km near points)

14 24387 ● Winnipeg
Log of Area, square km

13

5374 ● Winnipegosis
4624 ● Manitoba
12

2247 ● SouthernIndian
11

1353 ● Cedar
1151 ● Gods 1223 ● Island
10

755 ●Cross
657 ● Playgreen

180 200 220 240 260 280

Elevation (meters above sea level)

(b) For the second plot, we invoke plot with untransformed data but specify log-scale
(i.e. unevenly spaced) ticks on the y-axis.
> plot(area ~ elevation, pch = 16, xlim = c(170, 280), ylab = "Area, square km",
+ xlab = "Elevation (meters above sea level)", log = "y")
> text(area ~ elevation, labels = row.names(Manitoba.lakes), pos = 4)
> text(area ~ elevation, labels = area, pos = 2)
> title("Manitoba's Largest Lakes (Area in square.km near points)")
> detach(Manitoba.lakes)

5
Manitoba's Largest Lakes (Area in square.km near points)

20000
10000 24387 ● Winnipeg
Area, square km

5000

5374 ● Winnipegosis
4624 ● Manitoba

2247 SouthernIndian
2000

1353 ● Cedar
1151 ● Gods 1223 ● Island
1000

755 ●Cross
657 ● Playgreen

180 200 220 240 260 280

Elevation (meters above sea level)

5. MB 1.7
This is fairly simple: just two invocations of dotchart. A dotchart is similar to a boxplot
in that a continuous variable is plotted against a categorical variable. The main
difference is that a dotchart keeps the individual dots and does not create a box. In this
case we have a one-to-one relationship between the categorical and continuous variables,
but in general this need not be the case; for example, we might have multiple noisy
measurements of the area for each lake.

> rm(list = ls())


> library(DAAG)
> data(Manitoba.lakes)
> layout(1:2)
> dotchart(Manitoba.lakes$area, rownames(Manitoba.lakes), main = "Manitoba's largest lakes",
+ xlab = "Area (km^2)")
> dotchart(log(Manitoba.lakes$area), rownames(Manitoba.lakes),
+ main = "Manitoba's largest lakes", xlab = "Log(Area) (km^2)")

6
Manitoba's largest lakes

Playgreen ●
Cross ●
Gods ●
Island ●
Cedar ●
SouthernIndian ●
Manitoba ●
Winnipegosis ●
Winnipeg ●

0 5000 10000 15000 20000 25000

Area (km^2)

Manitoba's largest lakes

Playgreen ●
Cross ●
Gods ●
Island ●
Cedar ●
SouthernIndian ●
Manitoba ●
Winnipegosis ●
Winnipeg ●

7 8 9 10

Log(Area) (km^2)

6. MB 1.13
The correlation between brain and body size is most apparent from the log-log plot. By
contrast, the linear plot makes it appear as if brain and body size are not related. The
square root and 0.1 power plots are successively closer to the log plot, because within
this range these transformations (especially the 0.1 power) are close to the log. This can be
confirmed by executing plot(log(body),body^{0.1}) while Animals is still attached, which
will show that the two transformations are close-to-linearly related (though their
absolute values differ).

> par(mfrow = c(2, 2))


> library(MASS)
> attach(Animals)
> plot(body, brain)
> plot(sqrt(body), sqrt(brain))
> plot(body^0.1, brain^0.1)
> plot(log(body), log(brain))
> detach(Animals)
> par(mfrow = c(1, 1))

7
● ●

60

4000

sqrt(brain)
brain

40

2000

20
●●●

●●●
● ●●● ●●

● ●
● ●

● ●● ● ●

0

0
0 20000 60000 0 50 100 200 300

body sqrt(body)

● ● ●

8


2.0

● ●
● ●●●
6
log(brain)


brain^0.1

● ●
● ● ● ●● ● ●
● ●●
● ●● ● ● ●
4

● ●
● ●●
1.5

● ●
● ● ●
● ●
2

● ●
● ●
● ●


1.0


0


● ●

1.0 1.5 2.0 2.5 3.0 0 5 10

body^0.1 log(body)

7. MB 1.16
You would first execute the code below:

> rm(list = ls())


> library(DAAG)
> data(socsupport)
> attach(socsupport)
> gender1 = abbreviate(gender, 1)
> table(gender1)

gender1
f m
71 24

> country3 = abbreviate(country, 3)


> table(country3)

country3
ast oth
85 10

8
> num = seq(along = gender)
> lab = paste(gender1, country3, num, sep = ":")
> plot(BDI ~ age)
> detach(socsupport)



40



30
BDI

20


10
0

18−20 21−24 25−30 31−40 40+

age

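You would then run the command identify(BDI ~ age,labels=lab) to locate the
rows with indexes 8 12 36 59 68 95 as outliers; these would be automatically labeled
as you clicked on them. (Run it while socsupport is still attached, i.e. before the
detach(socsupport) call above, so that BDI and age can be found.) The final plot would
look like this: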

9
8. MB 1.17
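When x is empty (for example, x = NULL), seq(along=x) returns integer(0), which is another
way of describing a vector of length 0. This is better than seq(1,length(x)), which in that
case returns c(1,0) due to the definition of the seq function (see ?seq for why), so a loop
over it would run twice instead of not at all. For a non-empty x the two calls give the
same result, as the first two outputs below show.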

> x = c(8, 54, 534, 1630, 6611)


> seq(1, length(x))

[1] 1 2 3 4 5

> seq(along = x)

[1] 1 2 3 4 5

> x = NULL
> seq(1, length(x))

[1] 1 0

> seq(along = x)

10
integer(0)

11

Você também pode gostar