#
# Programming in R part 1 solutions.R
#
# Ben Haller, ben.haller@mail.mcgill.ca
# For the BGSA R/Stats Workshop at McGill University
# 6 February 2012
#
# Copyright 2012 Ben Haller, all rights reserved.
#
# This material is based upon work supported by the National Science
# Foundation Graduate Research Fellowship under Grant No. 1038597.
#
# Please note: there are a million ways to solve any problem, especially
# in R. The "solutions" given here are by no means the only possible
# solutions, nor are they likely to be the best solutions. In many cases
# I have deliberately done things in a less-than-perfect way because it
# better illustrates the point I am trying to teach. So try to use these
# solutions to learn, but don't take them as gospel!
#
#####################################################################
#
# Exercise 1: Flow control
#
#####################################################################
# Exercise 1: write some simple code that uses flow control statements to
# do something with the built-in dataset USArrests. Then show your code to
# the person sitting next to you, and see if they can figure out what your
# code does.
# First, let's look at the dataset a little bit. These are some useful
# functions for examining objects.
class(USArrests) # what kind of object it is
head(USArrests) # the first rows or elements of an object
str(USArrests) # the internal structure of an object
row.names(USArrests) # the names of rows
names(USArrests) # the names of columns
NROW(USArrests) # how many rows?
NCOL(USArrests) # how many columns?
# -------------------------------------------------------------------
# Solution 1: print all murder rates, one by one
for (index in 1:NROW(USArrests))
print(USArrests$Murder[index])
# or a bit prettier with cat() (?cat):
for (index in 1:NROW(USArrests))
cat(USArrests$Murder[index], "\n", sep="")
# -------------------------------------------------------------------
# Solution 2: add an "if" to print out only murder rates greater than 15
for (index in 1:NROW(USArrests))
{
rate <- USArrests$Murder[index]
if (rate > 15)
cat(rate, "\n", sep="")
}
# -------------------------------------------------------------------
# Solution 3: print the state names that go with those entries
for (index in 1:NROW(USArrests))
{
rate <- USArrests$Murder[index]
name <- row.names(USArrests)[index]
if (rate > 15)
cat(name, ": ", rate, "\n", sep="")
}
# -------------------------------------------------------------------
# Solution 4: use a loop to find and print the average murder rate
total <- 0
for (index in 1:NROW(USArrests))
total <- total + USArrests$Murder[index]
total / NROW(USArrests) # compare to mean(USArrests$Murder)
# -------------------------------------------------------------------
# Solution 5: find the state with the highest murder rate, and print it
best <- 0
best_index <- 0
for (index in 1:NROW(USArrests))
{
rate <- USArrests$Murder[index]
if (rate > best)
{
best <- rate
best_index <- index
}
}
best_index # compare to which.max(USArrests$Murder)
best # compare to max(USArrests$Murder)
cat(row.names(USArrests)[best_index], "has the highest murder rate:", best)
#####################################################################
#
# Exercise 2: Subsetting and sorting
#
#####################################################################
# Exercise 2: write some simple code that uses subsetting and/or sorting to
# analyze the built-in dataset USArrests. Then show your code to the
# person sitting next to you, and see if they can figure out what your
# code does.
# -------------------------------------------------------------------
# Solution 1: use subsetting to "select" states with high assault rates
USArrests$Assault > 250
USArrests[USArrests$Assault > 250, ] # using a logical vector
which(USArrests$Assault > 250)
USArrests[which(USArrests$Assault > 250), ] # using an index vector
# -------------------------------------------------------------------
# Solution 2: use order() to print out a list of states sorted by assault rates
assault_order <- order(USArrests$Assault, decreasing=TRUE)
assault_order
USArrests[assault_order, ]
# or you can do it in one line without the temporary variable:
USArrests[order(USArrests$Assault, decreasing=TRUE), ]
# -------------------------------------------------------------------
# Solution 3: print a "top ten" list of states with the highest assault rates, sorted
assault_order <- order(USArrests$Assault, decreasing=TRUE)
sorted_df <- USArrests[assault_order, ]
sorted_df[1:10, ]
# or you can "chain" subsets to do this in one line:
USArrests[order(USArrests$Assault, decreasing=TRUE), ][1:10, ]
# -------------------------------------------------------------------
# Solution 4: use & to find states that are high in both murder and assault
high_in_murder <- (USArrests$Murder > 15)
high_in_murder
high_in_assault <- (USArrests$Assault > 250)
high_in_assault
high_in_murder & high_in_assault
USArrests[high_in_murder & high_in_assault, ]
# or in one compact line:
USArrests[(USArrests$Murder > 15) & (USArrests$Assault > 250), ]
# -------------------------------------------------------------------
# Solution 5: use intersect() to find states that are high in both murder and assault
high_in_murder <- which(USArrests$Murder > 15)
high_in_murder
high_in_assault <- which(USArrests$Assault > 250)
high_in_assault
intersect(high_in_murder, high_in_assault)
USArrests[intersect(high_in_murder, high_in_assault), ]
# or in one compact line:
USArrests[intersect(which(USArrests$Murder > 15), which(USArrests$Assault > 250)), ]
#####################################################################
#
# Exercise 3: Functions
#
#####################################################################
# Exercise 3: The function below is a "mystery function". Work with the person
# next to you to figure out what it does and how it does it. If you find
# this too easy, then for "extra credit" modify this function to do something
# different and interesting.
sol <- function()
{
objs <- objects(name=.GlobalEnv)
osizes <- sapply(objs, function(x) { object.size(get(x)) } )
oclasses <- sapply(objs, function(x) { class(get(x))[1] } )
o <- data.frame(obj=as.character(objs), osize=as.numeric(osizes), oclass=as.factor(oclasses), stringsAsFactors=FALSE)
o <- o[o$oclass != "function",]
top_objs <- head(o[order(o$osize, decreasing=TRUE),], 20)
row.names(top_objs) <- NULL
top_objs
}
# -------------------------------------------------------------------
# Solution: figure out what the mystery function does and how it works.
sol()
# hmm; seems to be variable names that we have been using, sorted by "osize"
# let's look at each line to better understand what it's doing
objs <- objects(name=.GlobalEnv)
objs
# This gets the names of all defined objects from the global environment;
# there is lots to read at ?objects at ?environment
osizes <- sapply(objs, function(x) { object.size(get(x)) } )
osizes
# Gives us a named vector that lists the size of each object in our objs
# vector. More to read at ?object.size. The key point here is the use
# of sapply() with an anonymous function. This means: for each element of
# objs, call the anonymous function; that function then 1) gets the named
# object from the environment, and 2) passes it to object.size to get its
# size. sapply() gloms the results of all this together into a named vector.
oclasses <- sapply(objs, function(x) { class(get(x))[1] } )
oclasses
# Exactly the same idea as the preceding line, but now we get the class of
# each object. The [1] is there because some objects have more than one
# class; this code just takes the first class, in that case.
o <- data.frame(obj=as.character(objs), osize=as.numeric(osizes), oclass=as.factor(oclasses), stringsAsFactors=FALSE)
o
# Assemble a dataframe from the vectors we've created. The calls to
# as.character, as.numeric and as.factor guarantee that each column is
# of the type we want it to be -- generally good practice. The argument
# stringsAsFactors=FALSE prevents R from turning the names of the objects
# into a factor.
o <- o[o$oclass != "function",]
o
# This uses subsetting to select only rows for objects that are not
# functions; the sol() function only wants to list data objects, not
# functions (which are also objects, in R).
top_objs <- head(o[order(o$osize, decreasing=TRUE),], 20)
top_objs
# With complex statements like this, it is often useful to parse them
# "inside-out". The first thing that gets done is the most "inner":
# order(o$osize, decreasing=TRUE). That gives us an order vector,
# with the indices of the rows that have decreasing values of $osize.
# Next, from inner to outer, is o[...,]: this uses the order vector
# to actually reorder the dataframe. Next is head(..., 20): this
# selects the first 20 rows of the reordered dataframe. Finally, we
# have top_objs <- ...: this puts the result into the variable top_objs.
row.names(top_objs) <- NULL
# If you look at the output for top_objs above, you see that each object
# name is repeated twice. This is because the object names are both a
# column in the dataframe (top_objs$obj) and the names of the rows
# (row.names(top_objs)). Either would be fine, but both together is silly.
# This line just strips away the row names by assigning NULL into them.
top_objs
# Functions return the value of the last evaluated expression; so this
# makes it so that the return value of sol() is the dataframe top_objs.
# This can be done more explicitly with return() (?return) from anywhere
# inside a function, too.
# As a whole, this function illustrates sapply(), anonymous functions,
# subsetting, sorting, etc. It also illustrates some of R's "introspection"
# functions, such as objects(), get(), object.size() and class(), which
# let you write code that explores the objects and functions defined
# in the R environment itself. This can sometimes be extremely useful,
# although it's a pretty advanced topic; keep it in mind for later.
#####################################################################
#
# Exercise 4: Functions
#
#####################################################################
# Exercise 4: write a simple function to analyze or plot one column of the
# built-in dataset USArrests (pass the data for the column as a parameter
# to your function). Then call your function for each of the four columns
# of the dataset. Show your code to the person sitting next to you, and
# see if they can figure out what your code does.
# First, I use rm() here to restore the standard USArrests dataset,
# since we did some playing with it earlier. If you don't do this,
# your results from the code below may be different from mine...
rm(USArrests)
# -------------------------------------------------------------------
# Solution 1: adapt your earlier code into a function, to save work
# Let's work with solution 3 from exercise 2:
assault_order <- order(USArrests$Assault, decreasing=TRUE)
sorted_df <- USArrests[assault_order,]
sorted_df[1:10,]
# And turn it into a function. We will design it to take a one-column
# dataframe, although there are other ways to do it. To keep the names
# "stuck" to the data when we sort and subset, we need drop=FALSE.
# This fix is somewhat obscure, but drop is documented in ?"[";
# basically, I use it here so that the result of the subsetting operation
# is still a dataframe, and thus has names, instead of being "dropped"
# down to a vector, in which case the names would be lost.
top_ten <- function(data)
{
stopifnot(is.data.frame(data)) # safety check
data_order <- order(data, decreasing=TRUE)
sorted_data <- data[data_order, , drop=FALSE]
sorted_data[1:10, , drop=FALSE]
}
# And now we can call it with each of the columns of our dataframe.
# USArrests["Murder"] is the same as USArrests[, "Murder", drop=FALSE].
# Note that USArrests$Murder and USArrests[, "Murder"] would both
# drop the state names from the data, which is not what we want.
top_ten(USArrests["Murder"])
top_ten(USArrests["Assault"])
top_ten(USArrests["UrbanPop"])
top_ten(USArrests["Rape"])
# -------------------------------------------------------------------
# Solution 2: add a parameter or two to your function
top_ten <- function(data, topX=10)
{
stopifnot(is.data.frame(data)) # safety check
data_order <- order(data, decreasing=TRUE)
sorted_data <- data[data_order, , drop=FALSE]
sorted_data[1:topX, , drop=FALSE]
}
# top 10, using the default value
top_ten(USArrests["Murder"])
top_ten(USArrests["Assault"])
top_ten(USArrests["UrbanPop"])
top_ten(USArrests["Rape"])
# top 1, passing a value for topX
top_ten(USArrests["Murder"], topX=1)
top_ten(USArrests["Assault"], topX=1)
top_ten(USArrests["UrbanPop"], topX=1)
top_ten(USArrests["Rape"], topX=1)
# -------------------------------------------------------------------
# Solution 3: automate the calls to your function with a loop over the columns
for (colname in names(USArrests))
{
print(top_ten(USArrests[colname], topX=5))
cat("\n") # blank line separating each block of output
}
# -------------------------------------------------------------------
# Solution 4: write an uber-function that loops over the columns of any dataframe
do_for_columns <- function(data, FUN=top_ten, ...)
{
for (colname in names(data))
{
print(FUN(data[colname], ...))
cat("\n") # blank line separating each block of output
}
}
# A vanilla call to our new uber-function
do_for_columns(USArrests)
# But we can supply different column-summary functions to it:
do_for_columns(USArrests, FUN=mean) # print means of each column
do_for_columns(USArrests, FUN=sd) # print standard deviations of each column
# Parameters can be passed through to the column-summary function,
# because of the way we used ... in our function:
do_for_columns(USArrests, FUN=top_ten, topX=3)
# We can even supply a new column-summary function anonymously:
do_for_columns(USArrests, FUN=function(x) { data.frame(max=max(x), min=min(x), row.names=names(x)) } )
# Now perhaps you have some idea of how a function like sapply() is
# constructed internally. Not so hard, eh?
# Notice that in the end, we made a very powerful, reuseable function,
# with only a little more work and thought than just doing our analysis
# the brute-force way. This is the power of programming in R!