## PLEASE GO THROUGH THIS PRIOR TO CLASS

# Clear the workspace. Note that this only removes objects; it does not
# detach packages or reset options, so restarting R is the only way to get a
# truly clean session.
rm(list = ls())

# Load the libraries used for this lab
library(ggplot2)
library(ggthemes)
library(reshape2)
library(dplyr)
library(magrittr)
library(tidyr)

# This lab covers how to plot data. Although it may not feel as precise,
# visual inspection of the data can reveal a lot about the structure of the
# data. The classic example for that is Anscombe's quartet. R includes
# Anscombe's quartet as a built-in dataset. We will begin by plotting those
# data.

# Load the data. Remember this is a built-in dataset, so we can use the data()
# command.
data(anscombe)
anscombe # notice that it's divided into 4 sets of x & y variables.

# Plotting in base R is straightforward -- we just use the plot() command
with(anscombe, plot(x = x1, y = y1)) # it works, but it's ugly

# There are a variety of different ways to modify a plot in base R. Instead of
# learning them, we will focus on learning a package from the Hadley-verse,
# ggplot2. ggplot2 builds on the idea of a "grammar of graphics" (hence gg)
# allowing us to build our plot from its constituent components. It also has
# more attractive defaults.

# The ggplot2 counterpart of the base plot above: give ggplot() the data, say
# which column goes on which axis, and add a layer of points. (Older material
# uses the qplot() shortcut for this; it is deprecated as of ggplot2 3.4.0 and
# now raises a warning, so we spell the plot out instead.)
ggplot(data = anscombe, aes(x = x1, y = y1)) +
  geom_point() # prettier, right?

# The cool thing about ggplot is that it allows us to save our plot, and then
# to update pieces of it.
(p <- ggplot(data = anscombe, aes(x = x1, y = y1)) +
  geom_point()) # saves the plot to the object p
# again, the () wrapper around a <- statement prints whatever was assigned
# on the left hand side of the "<-" to the screen.

# Now we can change what values are plotted:
p + aes(x = x2, y = y2) # don't worry about what aes() means, we'll come to that
p + aes(x = x3, y = y3)
p + aes(x = x4, y = y4)

# That's all well and good, but what if we want to plot all of those on the
# same page? To do that, we'll have to use the "grammar" for graphics.
# The basic idea behind the grammar is that you build up a plot piece by piece.
# First, you add the data, and you decide which data values get mapped to which
# dimensions on the plot (x axis, y axis, size, color, shape, etc.). Hadley
# calls these mappings "aesthetics" and abbreviates them "aes".
p1 <- ggplot(data = anscombe, aes(x = x1, y = y1))

# Next you decide what "geometry" is going to be used for your plot. Are you
# going to just do a scatterplot, where there are just points?
p1 + geom_point()

# Or are you going to connect those points in a line graph?
p1 + geom_line()

# It's the same data, we've just changed how we're drawing the points. Let's
# stick with points.
p1 <- p1 + geom_point()

# Next, you decide how to scale the axes. Should everything start from 0?
# NA in this case means calculate the limit automatically.
# (I'm not going to save this.)
p1 + xlim(c(0, NA)) + ylim(c(0, NA))

# Or, if I wanted to change the y-axis to be on a log scale, I can just add it:
p1 + scale_y_log10() # (I'm not going to save this either.)

# Next you label the axes, and give the graph a title:
(p1 <- p1 +
  ggtitle("Anscombe's quartet") +
  xlab("X") +
  ylab("Y"))

# Finally, change the color scheme, if you want.
# Some themes that come with ggplot2:
p1 + theme_bw()
p1 + theme_classic()

# A few from ggthemes
p1 + theme_economist() # looks like The Economist magazine
p1 + theme_fivethirtyeight() # looks like plots from 538
p1 + theme_stata() # if you really, really miss Stata plots
p1 + theme_excel() # please don't do this. It's so ugly.

# You can customize plots to look however you want. And, if you save a
# customized plot, you can reuse the theme with different data.
p2 <- p1 + theme_economist()
p2 + aes(x = x2, y = y2) # See? Same theme, different variables.

# One of the other powerful things that ggplot2 can do for you is create small
# multiples -- several plots that show the same thing. Hadley calls this
# faceting.
# To make a plot with facets, first we have to rearrange the data into columns.
# To do that, we use a new package from the Hadley-verse, tidyr. I won't cover
# it in depth; for an intro, see:
vignette("tidy-data")

# The gather function strings the data out into two long columns. One column
# is an indicator of what the variable name was (key), and the other is
# an indicator of what the value of that variable was (value).
anscombe %>%
  gather()

# This isn't quite what we want. The separate command takes us a step closer by
# creating a column for x or y, and a column for series (1, 2, 3, or 4):
anscombe %>%
  gather() %>%
  separate(key, c("x.or.y", "series"), sep = 1)
# sep = 1 means separate after the first character

# Equivalently, we could have used extract_numeric()
# mutate(series = extract_numeric(key))
# (extract_numeric() has since been deprecated in tidyr; readr::parse_number()
# is the suggested replacement.)

# ASIDE: we could have created a new column using substr.
# substr stands for sub-string -- it takes a string (variable), and takes
# characters starting at 1 and ending at 1. For example, say I have a string
# "catinthehat". If I just want the word "cat", I would start at the first
# character (c), and stop at the third character (t). This is how that would
# look
substr("catinthehat", 1, 3)

# Still not quite what we're looking for. We want a column for x values,
# a column for y values, and a column for series. The spread command takes care
# of this. The call is wrapped in try() because it is *expected* to fail:
# try() still prints the error, but lets the rest of the script keep running
# if the file is sourced in one go.
try(
  anscombe %>%
    gather() %>%
    separate(key, c("x.or.y", "series"), sep = 1) %>%
    spread(x.or.y, value)
)

# Whoops! That gives us an error. What went wrong?
# The error says "duplicate identifiers for rows" (newer versions of tidyr
# phrase it as rows not being identified by a unique combination of keys).
# That must mean that R decided it was going to try to combine all of the rows
# where series == 1, and got confused. Let's help it out by giving it an
# unduplicated identifier. In this case, we'll have to use row numbers.
anscombe %>%
  mutate(sid = row_number()) %>% # sid stands for subject ID
  gather(key = key, value = value, everything(), -sid) %>%
  # key = key and value = value means create two new columns named key and
  # value like we did before.
  # everything() means gather all the columns, like we did before.
  # the -sid means do NOT gather sid, but leave it as its own column
  separate(key, c("x.or.y", "series"), sep = 1) # same as before

# Now spread works fine. Let's save the result.
(anscombe.ggplot <- anscombe %>%
  mutate(sid = row_number()) %>%
  gather(key = key, value = value, everything(), -sid) %>%
  separate(key, c("x.or.y", "series"), sep = 1) %>%
  spread(x.or.y, value))

# Let's get rid of the sid column, and let's sort by series, just to make this
# cleaner to look at. (An explicit assignment is used instead of magrittr's
# %<>% so that it is obvious anscombe.ggplot is being overwritten.)
(anscombe.ggplot <- anscombe.ggplot %>%
  select(-sid) %>%
  arrange(series))

# Now the data have one column for x, one column for y, and one column
# indicating which series the data are from. We can map each one of these
# columns to a dimension in the plot. For example, let's make x and y the
# x and y dimensions, as before, but let's let the color of the points
# represent the series.
(p3 <- ggplot(data = anscombe.ggplot, aes(x = x, y = y, color = series)) +
  geom_point())

# Well that's cool, but it's pretty hard to see. We could have changed the
# color of the points
p3 + scale_color_brewer() # uses colors from colorbrewer2.org -- check it out!

# Still hard to see. Really, we want these in separate plots. Again, those
# are called facets in ggplot
(p4 <- p3 +
  facet_wrap(~ series) +
  # with two faceting variables you could lay the panels out as a grid
  # instead: facet_grid(var1 ~ var2)
  aes(color = NULL)) # removes the colored points from before

# That's more like it! Now we could clean up the names, etc., like we did
# before. If we just wanted to plot one, we would have to change the data.
# separate() left series as a character column, so compare against "1".
p4 %+% filter(anscombe.ggplot, series == "1")

# The last thing we'll do in this script is show you how to add calculated
# layers.
# Here I want to add a layer that shows the regression line for each
# of these. This is fairly easy -- you just add another "geometry":
# lm means "linear model" -- forces the line to be straight
p4 + geom_smooth(method = "lm")
# (Because this is Anscombe's quartet, all of the lines are the same.)

# There's a whole book about ggplot. It's available online from:
# http://search.library.duke.edu/search?id=DUKE004969277

# The easiest way to learn is to start thinking of what you want to present,
# and then search for it on the internet or in the help (or come talk to me!).
# ggplot has a gallery of the different "geoms" on its reference page:
# https://ggplot2.tidyverse.org/reference/