# Math 322 Biostatistics # Homework 1, due January 23, 2009 # Geir Arne Hjelle (hjelle@math.wustl.edu) # # Problem 1a) # # Print out a heading for the problem. We could use print(), but with cat() # R does not print out the [1] and the "s that print() does. \n means new line cat("Problem 1a)\n\n") # Define geometric mean geometricmean <- function(v) { return( exp(mean(log(v))) ) } # Define harmonic mean harmonicmean <- function(v) { return( 1 / mean(1 / v) ) } # # Problem 1b) # cat("Problem 1b)\n\n") # Read in the heights data set from the first lecture # heights <- read.table("http://www.math.wustl.edu/~hjelle/m322/r/heights.txt", header = T) attach(heights) # Calculate the means for the whole sample mean(height) geometricmean(height) harmonicmean(height) # To calculate means grouped on sex, we can use [] notation mean(height[sex == "m"]) mean(height[sex == "w"]) # Another option is to use tapply() tapply(height, sex, geometricmean) tapply(height, sex, harmonicmean) # Clean up our workspace detach(heights) # # Problem 2 # cat("Problem 2\n\n") # We first need to enter the data. One way of entering data given by a # frequency table is to use the rep() function p2.midvalues <- c(0.075, 0.25, 0.45, 0.8, 1.55, 2.55, 3.55, 4.55, 5.55) p2.freq <- c( 458, 268, 151, 79, 44, 19, 9, 3, 2) p2.astigmatism <- rep(p2.midvalues, p2.freq) # Arithmetic mean mean(p2.astigmatism) # Standard deviation sd(p2.astigmatism) # Bar graph barplot(table(p2.astigmatism)) # Names for the horizontal axis p2.intervals <- c("0.0 - 0.2", "0.2 - 0.3", "0.4 - 0.5", "0.6 - 1.0", "1.1 - 2.0", "2.1 - 3.0", "3.1 - 4.0", "4.1 - 5.0", "5.1 - 6.0") barplot(table(p2.astigmatism), names.arg=p2.intervals) # Since we already have the frequencies, we do not need to use table() barplot(p2.freq, names.arg = p2.intervals) # # Problem 3 # cat("Problem 3\n\n") # To read in the data set in LEAD.DAT is not quite straight forward. The # problem is that each record spans two lines, which confuses read.table(). # Instead we can use the more flexible scan() function. scan() will just # read in all the numbers without caring about formatting. Then we need # to do the formatting after the scanning is finished. # # I have placed the data file in a subdirectory of my working directory # called rosner/ From the description of the data set in LEAD.DOC, we # need to note that each record consists of 41 columns (fields). p3.rawdata <- scan("rosner/LEAD.DAT") p3.datamatrix <- matrix(p3.rawdata, ncol = 41, byrow = T) # # Problem 3a) # # We will analyze age and gender so we pick out those columns, together # with the factor describing the group (exposed or control) # # In the data set, the exposed group is actually further subdivided. We # start by joining group 2 and 3. These are categorical data (the # numerical value is irrelevant). We let R know this by using factor() p3.group <- p3.datamatrix[,21] p3.group[p3.group == 3] <- 2 p3.group <- factor(p3.group, labels = c("Control", "Exposed")) # The ages are given in a strange format (e.g. 1011 = 10 years 11 months) # We convert this to actual numbers so we can do math on them p3.age <- p3.datamatrix[,4] p3.years <- trunc(p3.age / 100) # Years p3.months <- p3.age - p3.years * 100 # Months p3.age <- p3.years + p3.months / 12 # The gender should also be converted into categorical data p3.gender <- factor(p3.datamatrix[,5], labels = c("Male", "Female")) # Finally, we assemble our vectors into a data frame. p3.data.a <- data.frame(group = p3.group, age = p3.age, gender = p3.gender) attach(p3.data.a) # We can now do some summary statistics on our data summary(p3.data.a) table(group, gender) table(group, trunc(age)) # Group ages on years (do not care about months) prop.table(table(group, trunc(age)), 1) # Proportional values, instead of # absolute counts tapply(age, group, summary) barplot(table(group, trunc(age)), beside = TRUE, legend = TRUE) barplot(prop.table(table(group, trunc(age)), 1) / 2, beside = TRUE, legend = TRUE) stem(age[group == "Control"], scale = 2) stem(age[group == "Exposed"], scale = 2) boxplot(age ~ group) detach(p3.data.a) # Clean up # # Problem 3b) # # Now we need to analyse verbal and performance IQ. These are columns 17 # and 18, respectively p3.vIQ <- p3.datamatrix[,17] p3.pIQ <- p3.datamatrix[,18] # Assemble the data frame p3.data.b <- data.frame(group = p3.group, vIQ = p3.vIQ, pIQ = p3.pIQ) attach(p3.data.b) summary(p3.data.b) tapply(vIQ, group, summary) tapply(pIQ, group, summary) table(cut(vIQ, breaks=seq(50, 150, 5), right=F)) # Group IQ into bins of table(cut(pIQ, breaks=seq(50, 150, 5), right=F)) # width 5. par(mfrow = c(2, 1)) barplot(prop.table(table(group, cut(vIQ, breaks=seq(50, 150, 5), right=F)), 1), beside = TRUE, legend = TRUE, main = "Verbal IQ") barplot(prop.table(table(group, cut(pIQ, breaks=seq(50, 150, 5), right=F)), 1), beside = TRUE, legend = TRUE, main = "Performance IQ") par(mfrow = c(1, 2)) boxplot(vIQ ~ group, ylim = c(50, 150), main = "Verbal IQ") boxplot(pIQ ~ group, ylim = c(50, 150), main = "Performance IQ") par(mfrow = c(1, 1)) detach(p3.data.b) # Clean up # # Problem 4a) # cat("Problem 4a)\n\n") attach(airquality) # Stem-and-Leaf plot of Ozone levels stem(Ozone) # Bar graph on temperatures grouped on months barplot(table(Temp, Month)) # To make this a little nicer, let us group the temperatures into 10 degree # bins, and name the months p4.temp <- cut(Temp, seq(50, 100, 10), right=F) p4.month <- factor(Month, labels = month.name[5:9])# month.name is built into R barplot(table(p4.temp, p4.month), legend = TRUE, beside = TRUE) # Another way to do this, which may be more informative is to plot the # temperatures of each day, and use color to group the months. barplot(Temp, col = Month) # To make the colors different shades of gray (so they print nicely), we # can use the gray function barplot(Temp, col = gray((Month - 4) / 5)) # Finally, we can make a quick'n'dirty names vector to print the month names p4.mthname <- rep(NA, length(Temp)) p4.mthname[(0:4) * 31 + 15] = month.name[5:9] barplot(Temp, col = gray((Month - 4) / 5), names.arg = p4.mthname) detach(airquality) # Clean up # # Problem 4b) # cat("Problem 4b)\n\n") attach(heights) # To get an impression of how close our data are to being normally distributed # we can overlay a normal curve on the histogram. dnorm(x) is the density # function of the normal distribution. hist(height, breaks = 58.5:74.5, freq = FALSE, col = "gray") curve(dnorm(x, mean = mean(height), sd = sd(height)), add = TRUE) # It may be better to do this grouped on sex par(mfrow = c(2, 1)) hist(height[sex == "m"], breaks = 58.5:74.5, freq = FALSE, col = "gray") curve(dnorm(x, mean = mean(height[sex == "m"]), sd = sd(height[sex == "m"])), add = TRUE) hist(height[sex == "w"], breaks = 58.5:74.5, freq = FALSE, col = "gray") curve(dnorm(x, mean = mean(height[sex == "w"]), sd = sd(height[sex == "w"])), add = TRUE) par(mfrow = c(1, 1)) # Another way to check normality is through QQ-plots. qqnorm(height) # The spineplot is an alternative to histograms or barplots spineplot(sex ~ height, breaks = 58.5:74.5)