# Managing network data in R # Author: Jake Fisher # This lab will introduce using network data with R. As the introduction to R # lab described, one of the main benefits to learning R is its ability to handle # non-traditional data structures. Key among these, for our purposes, is a # network. # This file will describe how to: # 1. Create a network object from raw network data # 2. Save and recall network (incl. vertex and edge) attributes # 3. Calculate individual-level and global network statistics # 4. Perform operations on a list of networks # There are two main network packages in R. I will focus primarily on statnet # today. The statnet suite of packages is mostly used for statistical models # of networks. The other common package is igraph, which is often used for # computer science-style applications, like community detection and calculations # on large graphs. You can convert between them using the intergraph package. # Following best practices, we will begin by setting up the workspace: rm(list = ls()) setwd("C:/Users/fishe/Dropbox/SN&H Files/r_scripts") library(statnet) # lots of messages... ##### Creating a network object ##### # Statnet can create a network object from any of the common network data # structures: edgelists, adjacency lists, and adjacency matrices. # Let's start with an example of an adjacency matrix. Here I'll define a # network where a vertex is a member of my immediate family, and an edge # indicates "parent": people <- c("mom", "dad", "sister", "me") (adj.mat <- matrix(0, nrow = 4, ncol = 4)) # create empty matrix rownames(adj.mat) <- people colnames(adj.mat) <- people adj.mat["mom", "sister"] <- 1 adj.mat["mom", "me"] <- 1 adj.mat["dad", "sister"] <- 1 adj.mat["dad", "me"] <- 1 adj.mat # completed adjacency matrix # We can create a network object using the network() function (family <- network(adj.mat, matrix.type = "adjacency")) # Network objects have a number of special functions for them. We will cover # these in the next section. For now, let's plot the network plot(family) # not terribly informative # Note that we could have expressed the information above more compactly if we # just listed the cell values that should be 1 (i.e., mom -> sister, mom -> me, # etc.) and assumed everything else should be 0. That's called an edgelist. # We can generate a network from an edgelist, too: (edges <- data.frame(from = c(rep("mom", 2), rep("dad", 2)), to = rep(c("sister", "me"), 2))) family.2 <- network(edges, matrix.type = "edgelist") # We can show the adjacency matrix from the network object using as.matrix as.matrix(family.2) # order of the rows is different, but network is the same # And finally, we could compress this even more by holding one the sender in # one column, and the receiver in subsequent columns. This is called an # adjacency list. (adj.list <- data.frame(parent = c("mom", "dad"), child1 = rep("sister", 2), child2 = rep("me", 2))) # Statnet doesn't handle adjacency lists well, but they are the most common type # of raw survey network data. Fortunately, we can convert from an adjacency # list to an edgelist easily, with the tidyr package library(tidyr) library(dplyr) # for the pipe operator library(magrittr) # for the %<>% operator adj.list %>% gather(key = "nomination.number", value = "alter", -parent) # Gather strings your data out into a long column. We can select which # variables to string out -- in this case, we wanted to keep the column for # parent separate, and to repeat the values for each child. # If you drop the middle column, you have an edgelist... adj.list %>% gather(key = "nomination.number", value = "alter", -parent) %>% select(-nomination.number) # ...which you can pass to network! adj.list %>% # note that we didn't save the output gather(key = "nomination.number", value = "alter", -parent) %>% select(-nomination.number) %>% network(., matrix.type = "edgelist") # That's great, but how do we do it with a real dataset? Let's try with the # faux Add Health data. dat <- read.csv("C:/Users/fishe/Dropbox/SN&H Files/SampleData/ahs_wpvar.csv") # So what's in here, anyway? names(dat) # First of all, there are several communities in these data. Let's pick the # first one: cmty1 <- dat %>% filter(commcnt == 1) # Notice that the "friend" variables contain "fnid", and no other variables do. # We can use that to create an edgelist: el <- cmty1 %>% select(ego_nid, contains("fnid")) %>% gather(key = "nomination", value = "alter", -ego_nid) head(el) # hmmmm.... contains 99999 and NA values. Let's standardize those el %>% arrange(ego_nid) %>% head el %<>% mutate(alter = ifelse(alter == 99999, NA, alter)) # Now you would think this would work as a network... el %>% select(-nomination) %>% network(., matrix.type = "edgelist") # but statnet also can't really deal with missing values in the edgelist. So # the best way to do this is to construct an empty network first, and then # add the edges from the edgelist. This has the advantage of including isolates # too. # To do that, first we get a list of the people who are involved in the network: # Get unique list of people in the network (ah.people <- with(el, unique(c(ego_nid, alter)))) # this would be all the people, including those out of school # ah.people <- unique(dat$ego_nid) # this is only people who took the survey el %<>% filter(!is.na(alter)) # drop missing cases # We're not going to do that, because it includes a lot of people who didn't # take the survey. # Now create an empty network add.health <- network.initialize(n = length(ah.people)) # statnet adds edges by index values, not by names of variables. So, to make # this work, we have to create an index value of 1, 2, ..., N, where N is the # number of people in the network el %<>% mutate(ego.idx = match(ego_nid, ah.people), alter.idx = match(alter, ah.people)) # Finally, add the edges from the edgelist add.edges(add.health, el$ego.idx, el$alter.idx) # Now, as a quick gut check, let's plot it: set.seed(919) plot(add.health) # (As an aside, note that you could create a list of all the networks using # lapply...) ##### Save and recall network attributes ##### # One of the benefits of statnet is that it's possible to save attributes for # different pieces of the network. We can save vertex attributes, like whether # the person is a smoker, edge attributes, like whether the ties is homophilous, # or network attributes, like the name of the school, or the survey wave. The # procedure for this is simple; let's illustrate a few. # First, network attributes are characteristics of the network as whole. Set # them with %n% add.health %n% "cmty" <- 1 add.health # note "cmty" in there # Second, vertex attributes are characteristics of each vertex. Set them with # %v% add.health %v% "sex" <- cmty1$sex # we can do this because vertices and data # are sorted in the same way # A special case of vertex attributes is vertex names, which you set with # network.vertex.names() network.vertex.names(add.health) <- cmty1$ego_nid # All of these methods can also be used to retrive network attributes add.health %v% "sex" # Third, edge attributes are characteristics of a pair of vertices. You can set # them with %e%, but the indexing is more complicated, and I will not cover it # here. # These are useful for plotting and advanced statistical methods. plot(add.health, vertex.col = "sex") ##### Calculate individual-level and global network statistics ##### # statnet has functions for virtually every network statistic. They come in two # flavors, individual-level network statistics, and global network statistics. # Individual level statistics are calculated for each person. These include # things like centrality. degree(add.health, cmode = "indegree") degree(add.health, cmode = "outdegree") # Typically, these are most useful in a dataset. We can construct that with the # data.frame command: (ind.net.stats <- data.frame( ego = network.vertex.names(add.health), indegree = degree(add.health, cmode = "indegree"), outdegree = degree(add.health, cmode = "outdegree") )) # Then you would merge these on to a regular dataset, and run a regression. # Global network statistics are characteristics of the network as a whole. # statnet has a number of commands for these as well: grecip(add.health) # reciprocity gtrans(add.health) # transitivity gden(add.health) # density # Often you would calculate these on a single network, and include them as # text in a plot, or somewhere in the text of a paper. As we increasingly # work with several networks, however, you might have a list of network objects, # which you could summarize using the network level statistics for each. The # next section will explain how to do that. ##### Performing operations with lists of network data ##### # The faux Add Health data contains many communities. (This is increasingly # common with network data.) The easiest way to perform a calculation on each # of these networks is to create a list of network objects. # To save each of the networks in the faux Add Health data, we will perform the # operations above for each of the networks. The easiest way to do this is to # construct a list, and loop through it with lapply. # We begin by splitting up the data into a list of data.frames, one for each # community. cmties <- split(dat, dat[, c("commcnt", "sdummy")], drop = T) # note that this is now a list of data.frames, which are split up by unique # combinations of commcnt and sdummy sapply(cmties, class) # Now, for each one of these, we want to run the same set of commands that we # ran earlier. This is an ideal case for writing your own function! createNetwork <- function(indat) { # Creates a network object from an adjacency list for the faux Add Health data # # Args: # indat: a data.frame with columns for ego_nid, and at least one column with # a name containing "fnid" # # Returns: # A statnet network object # Save edgelist without missings el <- indat %>% select(ego_nid, contains("fnid")) %>% gather(key = "nomination", value = "alter", -ego_nid) %>% mutate(alter = ifelse(alter == 99999, NA, alter)) # Get list of unique people in edgelist # people <- unique(with(el, c(ego_nid, alter))) # everyone involved people <- unique(el$ego_nid) # only people who completed survey # Create an empty network out <- network.initialize(n = length(people)) # Save the index numbers to the edgelist el %<>% filter(!is.na(alter)) %>% mutate(ego.idx = match(ego_nid, people), alter.idx = match(alter, people)) # Add those edges to the edgelist add.edges(out, tail = el$ego.idx, head = el$alter.idx) # Save the names of the people as vertex names network.vertex.names(out) <- people # Return the network object return(out) } # Now we can loop through all of data sets and create a network for each one... nets <- lapply(cmties, createNetwork) # And then we can create a data.frame of descriptive network statistics for each # of those: net.stats <- data.frame( net.id = names(nets), # Network-level statistics reciprocity = sapply(nets, grecip, USE.NAMES = FALSE), transitivity = sapply(nets, gtrans, USE.NAMES = FALSE), density = sapply(nets, gden, USE.NAMES = FALSE), size = sapply(nets, network.size, USE.NAMES = FALSE), # Aggregate individual-level statistics n.isolates = sapply(nets, isolates, USE.NAMES = FALSE) %>% sapply(., length, USE.NAMES = FALSE), mean.degree = sapply(nets, degree, cmode = "outdegree", USE.NAMES = FALSE) %>% sapply(., mean, USE.NAMES = FALSE) ) %>% # Function from tidyr to separate values of the form community.school into # separate columns separate(col = net.id, into = c("commcnt", "sdummy")) # Or, equivalently, write a function to output a data.frame calculateNetStats <- function(net) { # Calculates a variety of global and aggregate individual network statistics # # Args: # net: a statnet network object # # Returns: # a data.frame with columns for reciprocity, transitivity, density, size, # number of isolates, and mean degree return(data.frame(reciprocity = grecip(net), transitivity = gtrans(net), density = gden(net), size = network.size(net), n.isolates = length(isolates(nets)), mean.degree = mean(degree(net, cmode = "outdegree")))) } # This crashed my computer... # net.stats.equiv <- lapply(nets, calculateNetStats) %>% # bind_rows %>% # mutate(net.id = names(nets)) %>% # separate(col = net.id, into = c("commcnt", "sdummy")) # Finally, we could also save individual level statistics from each of the # networks, and merge them on to the main data. calculateIndNetStats <- function(net) { # Calculates isolation and degree from a network # # Args: # net: a statnet network object # # Returns: # a data.frame with columns for ego_nid (the vertex names), isolate (T/F), # indegree, and outdegree return(data.frame(ego_nid = network.vertex.names(net), isolate = is.isolate(net, ego = seq(network.size(net))), indegree = degree(net, cmode = "indegree"), outdegree = degree(net, cmode = "outdegree"))) } ind.net.stats <- lapply(nets, calculateIndNetStats) %>% bind_rows(., .id = "sch") %>% # save a column indicating which list it's from separate(sch, into = c("commcnt", "sdummy")) %>% # Convert them to integers, so that they can be merged mutate_each(funs(as.integer(as.character(.))), commcnt, sdummy) # This can then be merged onto the original data by ego_nid, commcnt, and sdummy # e.g., dat %<>% left_join(., ind.net.stats) ##### Practice! ##### # Now let's practice doing this ourselves! Tom Valente collected similar data # as part of the Empirical Networks Project. Let's download it: # http://www-hsc.usc.edu/~tvalente/enp/enp_individual.xls # Read in the data using readxl # install.packages("readxl") library(readxl) (enp <- read_excel("enp_individual.xls")) # Now, using the steps above: # 1. Create a network object # 2. Calculate a network statistic from the network object # 3. Find the correlation of that network object with the "outcome" column # 4. (advanced) Create a list of network objects, and create a data.frame # containing a network statistic for each