## PLEASE GO THROUGH THIS PRIOR TO CLASS
# Start this lab from a fresh R session (in RStudio: Session > Restart R)
# instead of calling rm(list = ls()). rm() only deletes the objects in your
# workspace -- it leaves loaded packages and changed options in place -- and it
# silently destroys anything else you were working on in this session.
# Load the libraries used for this lab
library(ggplot2)
library(ggthemes)
library(reshape2)
library(dplyr)
library(magrittr)
library(tidyr)
# This lab covers how to plot data. Although it may not feel as precise, visual
# inspection of the data can reveal a lot about the structure of the data. The
# classic example for that is Anscombe's quartet. R includes Anscombe's quartet
# as a built-in dataset. We will begin by plotting those data.
# anscombe ships with R as a built-in dataset, so the data() command is all we
# need to load it.
data("anscombe")
anscombe # notice the four pairs of columns: x1/y1, x2/y2, x3/y3, x4/y4
# Base R draws a scatterplot with plot(): hand it the x values and the y
# values, and (optionally) the text to print along each axis.
plot(
  x = anscombe$x1,
  y = anscombe$y1,
  xlab = "x1",
  ylab = "y1"
) # it works, but it's ugly
# There are a variety of different ways to modify a plot in base R. Instead of
# learning them, we will focus on learning a package from the Hadley-verse,
# ggplot2. ggplot2 builds on the idea of a "grammar of graphics" (hence gg)
# allowing us to build our plot from its constituent components. It also has
# more attractive defaults.
# ggplot2 used to offer a shortcut called qplot() that worked much like the
# plot command in base R. qplot() is deprecated as of ggplot2 3.4.0, so we
# write the equivalent full call instead: the data, which columns go on which
# axis, and a layer of points. (Each piece is explained step by step below.)
ggplot(data = anscombe, aes(x = x1, y = y1)) + geom_point() # prettier, right?
# The cool thing about ggplot is that it allows us to save our plot, and then
# to update pieces of it.
(p <- ggplot(data = anscombe, aes(x = x1, y = y1)) +
  geom_point()) # saves the plot to the object p
# again, the () wrapper around a <- statement prints whatever was assigned
# on the left hand side of the "<-" to the screen.
# Now we can change what values are plotted:
p + aes(x = x2, y = y2) # don't worry about what aes() means, we'll come to that
p + aes(x = x3, y = y3)
p + aes(x = x4, y = y4)
# That's all well and good, but what if we want to plot all of those on the same
# page? To do that, we'll have to use the "grammar" for graphics.
# The basic idea behind the grammar is that you build up a plot piece by piece.
# First, you add the data, and you decide which data values get mapped to which
# dimensions on the plot (x axis, y axis, size, color, shape, etc.). Hadley
# calls these mappings "aesthetics" and abbreviates them "aes".
# Step 1 -- data and aesthetics: x1 goes on the x axis, y1 on the y axis. No
# geometry has been added at this point.
p1 <- ggplot(anscombe, mapping = aes(x = x1, y = y1))
# Step 2 -- geometry: how should each observation be drawn? As individual
# points (a scatterplot)...
p1 + geom_point()
# ...or joined up into a line graph?
p1 + geom_line()
# The data are identical in both; only the drawing method changed. We'll keep
# the points, so save that layer back into p1.
p1 <- p1 + geom_point()
# Step 3 -- scales: how should the axes run? Here both axes are forced to start
# at 0; NA as the upper limit asks ggplot to work that end out for itself.
# (Not saved.)
p1 + xlim(0, NA) + ylim(0, NA)
# A log-scaled y axis is just another piece to add. (Not saved either.)
p1 + scale_y_log10()
# Step 4 -- labels: the title and both axis labels, set in one call and saved.
(p1 <- p1 + labs(title = "Anscombe's quartet", x = "X", y = "Y"))
# Step 5 (optional) -- theme: the overall look of the plot.
# Two of the themes bundled with ggplot2:
p1 + theme_bw()
p1 + theme_classic()
# And several from the ggthemes package:
p1 + theme_economist() # looks like The Economist magazine
p1 + theme_fivethirtyeight() # looks like plots from 538
p1 + theme_stata() # if you really, really miss Stata plots
p1 + theme_excel() # please don't do this. It's so ugly.
# Plots can be customized however you like, and a saved, customized plot
# carries its theme along when you point it at different data.
p2 <- p1 + theme_economist()
p2 + aes(x = x2, y = y2) # See? Same theme, different variables.
# One of the other powerful things that ggplot2 can do for you is create small
# multiples -- several plots that show the same thing. Hadley calls this
# faceting.
# Faceting needs the data rearranged into columns first. The tool for that is
# another Hadley-verse package, tidyr. It is not covered in depth here; the
# package's own introduction is:
vignette("tidy-data")
# gather() stacks the whole data frame into two long columns: "key" records
# which variable each number came from, and "value" holds the number itself.
gather(anscombe)
# Not quite what we want yet. separate() gets us closer by splitting "key" into
# one column saying x or y, and another giving the series (1, 2, 3, or 4):
anscombe %>%
  gather(key = "key", value = "value") %>%
  separate(col = key, into = c("x.or.y", "series"), sep = 1) # sep = 1: split after the first character
# The series number could also have been pulled out with extract_numeric():
# mutate(series = extract_numeric(key))
# (NOTE: extract_numeric() is deprecated in later tidyr releases, which point
# to readr::parse_number() instead -- check your installed version.)
# ASIDE: substr is another way to build such a column.
# substr is short for sub-string: give it a string (variable), a start
# position, and a stop position, and it returns the characters in between.
# Take the string "catinthehat". To get just "cat", start at the first
# character (c) and stop at the third (t):
substr("catinthehat", start = 1, stop = 3)
# Still not quite what we're looking for. We want a column for x values,
# a column for y values, and a column for series. The spread command takes care
# of this. The call below is EXPECTED to fail, so it is wrapped in try(): the
# error message is still printed, but the rest of the script keeps running if
# you execute the whole file at once (e.g. with source()).
try(
  anscombe %>%
    gather() %>%
    separate(key, c("x.or.y", "series"), sep = 1) %>%
    spread(x.or.y, value)
)
# Whoops! That gives us an error. What went wrong?
# The error says "duplicate identifiers for rows" (newer versions of tidyr
# phrase it as rows not being identified by a unique combination of keys --
# the exact wording depends on your version). That must mean that R
# decided it was going to try to combine all of the rows where series == 1, and
# got confused. Let's help it out by giving it an unduplicated identifier. In
# this case, we'll have to use row numbers.
# Same pipeline as before, with a row identifier added first:
#  * mutate() numbers the rows; sid stands for subject ID.
#  * gather() again creates two new columns named key and value. The -sid
#    means gather every column EXCEPT sid, which stays as its own column.
#  * separate() splits key exactly as it did earlier.
anscombe %>%
  mutate(sid = row_number()) %>%
  gather(key = "key", value = "value", -sid) %>%
  separate(col = key, into = c("x.or.y", "series"), sep = 1)
# With sid in place spread works fine, so this time we keep the result.
(anscombe.ggplot <- anscombe %>%
  mutate(sid = row_number()) %>%
  gather(key = "key", value = "value", -sid) %>%
  separate(col = key, into = c("x.or.y", "series"), sep = 1) %>%
  spread(key = x.or.y, value = value))
# sid has done its job, so drop it, and order the rows by series so the table
# is easier to read. The result overwrites anscombe.ggplot and is printed.
(anscombe.ggplot <- anscombe.ggplot %>%
  select(-sid) %>%
  arrange(series))
# The data now have three columns: x, y, and series (which of the four sets a
# row belongs to). Each column can be mapped to its own dimension of the plot.
# Keep x and y on the axes as before, and let point color show the series.
(p3 <- ggplot(anscombe.ggplot, aes(x = x, y = y, color = series)) +
  geom_point())
# Cool, but hard to read. A different palette is one thing to try:
p3 + scale_color_brewer() # uses colors from colorbrewer2.org -- check it out!
# Still hard to see. What we really want is one small plot per series -- the
# facets mentioned earlier.
(p4 <- p3 +
  aes(color = NULL) + # drops the color mapping from before
  facet_wrap(~ series)) # one panel per series; facet_grid(row_var ~ col_var) lays panels out by two variables
# That's more like it! The names etc. could now be cleaned up like we did
# before. To draw just one of the series, swap in a subset of the data:
p4 %+% filter(anscombe.ggplot, series == "1")
# Last topic for this script: calculated layers. To overlay the regression
# line on each panel, add one more "geometry":
p4 + geom_smooth(method = "lm") # lm means "linear model" -- forces the line to be straight
# (Because this is Anscombe's quartet, all of the lines are the same.)
# There's a whole book about ggplot. It's available online from:
# http://search.library.duke.edu/search?id=DUKE004969277
# The easiest way to learn is to start thinking of what you want to present, and
# then search for it on the internet or in the help (or come talk to me!).
# ggplot has a gallery of the different "geoms" on its main page:
# http://docs.ggplot2.org/current/