#
# Programming in R part 2 solutions.R
#
# Ben Haller, ben.haller@mail.mcgill.ca
# For the BGSA R/Stats Workshop at McGill University
# 13 February 2012
#
# Copyright 2012 Ben Haller, all rights reserved.
#
# This material is based upon work supported by the National Science
# Foundation Graduate Research Fellowship under Grant No. 1038597.
#
# Please note: there are a million ways to solve any problem, especially
# in R. The "solutions" given here are by no means the only possible
# solutions, nor are they likely to be the best solutions. In many cases
# I have deliberately done things in a less-than-perfect way because it
# better illustrates the point I am trying to teach. So try to use these
# solutions to learn, but don't take them as gospel!
#
#####################################################################
#
# Exercise 1: Optimization
#
#####################################################################
# Exercise 1: optimize the following function. Use system.time() or Rprof()
# to quantify the speedup you obtain. Compare your work with your neighbor.
# -------------------------------------------------------------------
# The original code, timed and saving results: 5.1 seconds.
xx <- matrix(rnorm(2000000), ncol=100)
row_means <- function(x)
{
means <- numeric(0)
for (i in 1:NROW(x))
{
total <- 0
for (j in 1:NCOL(x))
total <- total + x[i, j]
means <- c(means, total / NCOL(x))
}
invisible(means)
}
system.time(result_original <- row_means(xx))
# -------------------------------------------------------------------
# Solution 1: The quick and easy way. Google "R row means" and the first hit
# points you to the built-in rowMeans() function: 0.007 seconds.
system.time(result_optimized <- rowMeans(xx))
all.equal(result_original, result_optimized) # check that the result is the same
# -------------------------------------------------------------------
# Solution 2: Assuming that function doesn't exist, though. Let's consider the
# fact that we know the purpose of the inner loop: to compute the mean of row i.
# We can vectorize this operation using the built-in function mean(): 1.5 seconds.
row_means_opt1 <- function(x)
{
means <- numeric(0)
for (i in 1:NROW(x))
means <- c(means, mean(x[i, ]))
invisible(means)
}
system.time(result_optimized <- row_means_opt1(xx))
all.equal(result_original, result_optimized) # check that the result is the same
# -------------------------------------------------------------------
# Solution 3: So that's an improvement, but it's still pretty slow. We can profile
# that function to see why it's slow:
Rprof(tmp <- tempfile()) # start profiling
row_means_opt1(xx) # run the code to be profiled
Rprof() # turn off profiling
summaryRprof(tmp) # print a summary of the profiling output
file.remove(tmp) # delete the profiling temporary file
# We see that we are spending almost 94% of our time in c(), building our
# result vector. As with rbind(), this is a very slow way to do things.
# Let's try using apply instead: 0.56 seconds.
row_means_opt2 <- function(x)
{
means <- apply(x, 1, mean)
invisible(means)
}
system.time(result_optimized <- row_means_opt2(xx))
all.equal(result_original, result_optimized) # check that the result is the same
# Well, 0.56 seconds is not bad. But the built-in function rowSums did it in
# 0.007 seconds! That's still almost 100 times faster! How did they do it?
# Let's look at it and find out:
rowMeans
# Aha! Note the core of the function is this call:
#
# .Internal(rowMeans(x, prod(dn), p, na.rm))
#
# That is one way of calling out to an external (probably C) function; in this
# case, a function built into R itself. See ?.Internal; or see ?.External for
# the equivalent calls you would use to call code not built into R itself.
# So this illustrates how much faster compiled languages are, compared to R.
# It also illustrates why using R's built-in functions, such as rowMeans(), is
# worthwhile; such built-ins are generally much, much faster than you could
# achieve on your own without a huge amount of work!
#####################################################################
#
# Exercise 2:
#
#####################################################################
# Exercise 2: Here's a function with three bugs in it. Find and fix the bugs;
# talk to your neighbor or raise your hand if you get stuck!
# -------------------------------------------------------------------
# The original function
myPlot <- function(fuzz, hiod, eh)
{
cx <- sapply(0:fuzz, FUN=function(x) { c(cos(x), sin(x)) })
pts <- matrix(data=cm, ncol=2, byrow=TRUE)
plot(x=pts[,1], y=pts[2], pch=19, cex=1.5)
points(x=c(hiod, -hoid), y=c(eh, eh), pch=21, bg="yellow", cex=10, lwd=3)
segments(x0=-0.5, y0=-0.5, x1=0.5, y1=-0.3, lwd=10)
}
myPlot(43, 0.3, 0.4)
# -------------------------------------------------------------------
# Solution 1: fixing the first error found, which is:
#
# Error in as.vector(x, mode) :
# cannot coerce type 'builtin' to vector of type 'any'
# The function doesn't have a call to as.vector, so the first question is:
# where is the error actually occurring? Use traceback() to find out:
traceback()
# That gives:
#
# 3: as.vector(data)
# 2: matrix(data = cm, ncol = 2, byrow = TRUE)
# 1: myPlot(43, 0.3, 0.4)
#
# So it's the call to matrix(). But what's wrong with it? Let's trace into it
# using debug():
debug(myPlot)
myPlot(43, 0.3, 0.4)
n
n
# Now if we did another 'n' we would hit the error, and we'd break out of the
# debugger (the debugger terminates when an error occurs, oddly). So let's
# print some stuff and see if we can predict why the matrix() call fails:
cm
# Hmm, that's odd; why are we passing a function called cm as data to matrix()?
# Ah, of course; we must mean to pass cx, since we just made that object. Let's
# make that change and see where we stand.
Q
myPlot <- function(fuzz, hiod, eh)
{
cx <- sapply(0:fuzz, FUN=function(x) { c(cos(x), sin(x)) })
pts <- matrix(data=cx, ncol=2, byrow=TRUE)
plot(x=pts[,1], y=pts[2], pch=19, cex=1.5)
points(x=c(hiod, -hoid), y=c(eh, eh), pch=21, bg="yellow", cex=10, lwd=3)
segments(x0=-0.5, y0=-0.5, x1=0.5, y1=-0.3, lwd=10)
}
myPlot(43, 0.3, 0.4)
# -------------------------------------------------------------------
# Solution 2: fixing the second error found, which is:
#
# Error in xy.coords(x, y, xlabel, ylabel, log) :
# 'x' and 'y' lengths differ
# So again, we have no call to xy.coords in our code, so let's use traceback:
traceback()
# 5: stop("'x' and 'y' lengths differ")
# 4: xy.coords(x, y, xlabel, ylabel, log)
# 3: plot.default(x = pts[, 1], y = pts[2], pch = 19, cex = 1.5)
# 2: plot(x = pts[, 1], y = pts[2], pch = 19, cex = 1.5)
# 1: myPlot(43, 0.3, 0.4)
# The offending call in our code is the call to plot(). Let's debug.
debug(myPlot)
myPlot(43, 0.3, 0.4)
n
n
n
# Now we're at the point where the error would occur. Again, let's check what
# we're passing to the plot() command:
pts[, 1]
# Well, OK, 44 random-looking numbers. Whatever.
pts[2]
# Hmm. Just one number. That makes no sense; we need the same number of x and y
# coordinates to make a plot. That must be what the error message is telling us.
# The x coordinates are the first column of the matrix 'pts'. Ah, the code must
# intend to use the second column of 'pts' as the y coordinates, but the comma got
# left out. Pesky commas.
Q
myPlot <- function(fuzz, hiod, eh)
{
cx <- sapply(0:fuzz, FUN=function(x) { c(cos(x), sin(x)) })
pts <- matrix(data=cx, ncol=2, byrow=TRUE)
plot(x=pts[,1], y=pts[,2], pch=19, cex=1.5)
points(x=c(hiod, -hoid), y=c(eh, eh), pch=21, bg="yellow", cex=10, lwd=3)
segments(x0=-0.5, y0=-0.5, x1=0.5, y1=-0.3, lwd=10)
}
myPlot(43, 0.3, 0.4)
# Hey, we got... a circle! How did that happen?
# -------------------------------------------------------------------
# Solution 2: fixing the third error found, which is:
#
# Error in points(x = c(hiod, -hoid), y = c(eh, eh), pch = 21, bg = "yellow", :
# object 'hoid' not found
# OK, this one is easy. Search the code for "hoid", which is obviously a
# typo for "hiod" (whatever that is). Make the change and run it again:
myPlot <- function(fuzz, hiod, eh)
{
cx <- sapply(0:fuzz, FUN=function(x) { c(cos(x), sin(x)) })
pts <- matrix(data=cx, ncol=2, byrow=TRUE)
plot(x=pts[,1], y=pts[,2], pch=19, cex=1.5)
points(x=c(hiod, -hiod), y=c(eh, eh), pch=21, bg="yellow", cex=10, lwd=3)
segments(x0=-0.5, y0=-0.5, x1=0.5, y1=-0.3, lwd=10)
}
myPlot(43, 0.3, 0.4)
# Boo! Hey, it's Sparky, the Ghost of Bugs Past! Hi Sparky!
# (The management would like to apologize for this unfortunate turn of
# events; the person responsible has been sacked. Sorry.)
# 'hiod', by the way, stands for "half intra-ocular distance", obviously.
# Try to avoid using cryptic variable names like this in your code;
# it only makes it harder to debug later, when you've forgotten what all
# your variable names mean!
#####################################################################
#
# Exercise 3: Generating warnings and errors
#
#####################################################################
# Exercise 3: Write a function named safe_power that takes two parameters,
# base and exponent, and uses the ^ operator to raise the base to
# the exponent. You know that you will be calling this function only
# with base >= 0, and only with single numeric values for both arguments
# (not with multi-value vectors). You wish to produce a warning before
# returning NULL, NA, NaN, or Inf (infinity). Test your function
# against the given test suite below.
# -------------------------------------------------------------------
# Solution 1: this is pretty straightforward, so here it is. :->
safe_power <- function(base, exponent)
{
stopifnot(is.numeric(base) && is.numeric(exponent))
stopifnot((length(base) == 1) && (length(exponent) == 1))
stopifnot(base >= 0)
r <- base ^ exponent
if (is.na(r) || is.nan(r) || is.infinite(r) || is.null(r))
warning("safe_power returning an unexpected value: ", r)
r
}
#####################################################################
#
# Bonus code: plotting the R matrix
#
#####################################################################
par(mar=c(0,0,0,0))
plot(-1:1, -1:1, type="n", bty="n", xaxt="n", yaxt="n", xlab="", ylab="")
points(0, 0, cex=92, pch=16, lwd=2)
set.seed(4)
for (i in 1:300)
{
x <- runif(1, -1, 1)
y <- runif(1, -1, 1)
if (x^2 + y^2 < 1)
text(x=x, y=y, labels="R", cex=rnorm(1, 2.0, 0.8), col=colors()[sample(c(47:51, 85, 88, 89, 254:258), 1)], srt=runif(1, 0, 360))
}