# Randomization and generating random graphs
# Author: Jake Fisher

# Set up workspace
# NOTE(review): rm(list = ls()) and setwd() to an absolute path are generally
# discouraged in shared scripts; retained here because this is an interactive
# teaching script that assumes this working directory.
rm(list = ls())
setwd("C:/Users/fishe/Dropbox/SN&H Files/r_scripts")
library(statnet)
library(dplyr)

# Often in statistics we are interested in a reference distribution that our
# sample, or case, is drawn from. For networks, those reference distributions
# are usually given by a distribution of random graphs. We will explore some
# simple random graph distributions today, and tomorrow Jeff will discuss more
# complicated random graph distributions with ERGMs.

# This lab will cover:
# 1. Generating random graphs
# 2. QAP tests
# 3. Running latent space models with latentnet

##### Generating random graphs #####

### Erdos-Renyi random graphs ###
# The simplest random graph distribution is the Erdos-Renyi random graph. We
# construct that by flipping a coin for every edge. statnet has a command for
# generating random graphs:
(er1 <- rgraph(
  n = 50,      # 50 people in the graph
  m = 1,       # generate 1 graph -- if greater than 1, returns a stack of matrices
  tprob = .05  # probability of a tie (density of the graph)
))

# This is not magic -- we could have done the same thing by generating a matrix
# of random 1's and 0's with a given probability:
rbinom(
  size = 1,    # 1 binomial trial -- a Bernoulli trial
  n = 50 ^ 2,  # number of 0's or 1's to generate
  prob = .05   # probability of a 1 (spelled out: "p =" relied on partial matching)
) %>%
  matrix(., nrow = 50, ncol = 50)  # structure 0's and 1's as a matrix

# Often, we'll use different random graphs to recreate a t-statistic of sorts --
# we want to see how unlikely the statistic that we observed is under a given
# null model. Let's try that for the reciprocity of the Add Health network.

# Load the Add Health data:
load("add_health_cmty1.Rdata")  # saved R file from yesterday

# Store the density and network size, which we will use for the probability of
# a tie and the size of the network in the random graph, respectively.
(prob <- gden(add.health))
(n <- network.size(add.health))

# Store the reciprocity of the network, which we will use for comparison later
(ah.recip <- grecip(add.health, measure = "dyadic.nonnull"))

# Generate 100 random graphs, convert them to network objects, and calculate
# the reciprocity for each. We use the stored network size, n, rather than
# hard-coding the value 72, so this still works if the network changes.
er.draws <- rgraph(n = n, m = 100, tprob = prob) %>%
  apply(., 1, network, matrix.type = "adjacency") %>%
  sapply(., grecip, measure = "dyadic.nonnull")

# (As an aside, note apply loops over an index of an array. rgraph creates a
# 100 x n x n array, so we loop over the first index.)

# Now plot a histogram of the reciprocities of the random graph, and add a line
# showing how the observed value compares:
plot(hist(er.draws), xlim = c(0, .5))
abline(v = ah.recip, col = "red")  # *way* higher than the random graphs!

# Aside: ggplot version:
# library(ggplot2)
# data.frame(recip = er.draws) %>%
#   qplot(data = ., x = recip, geom = "histogram") +
#   geom_vline(xintercept = ah.recip, color = "red")
#
# note the + in ggplot is very similar to %>% in dplyr. In an interview, Hadley
# said that if he had known about the %>% when he was developing ggplot2, he
# wouldn't have used the +

### Random graphs from a mixing matrix ###
# We rarely believe that a completely random graph, with no structure at all, is
# the right comparison. (If we did, studying networks wouldn't tell us much!)
# So, instead, we usually use a more sophisticated null model.

# As a next step, we consider models where people mix randomly in proportion to
# how much they mix in the observed data. The rates at which different groups
# mix are given by the mixing matrix.
(mix.mat <- mixingmatrix(add.health, "grade"))

# We can use this to get the probability that a given tie will go from one
# person to a person of another grade -- it's just the row percentages
mix.mat$matrix / rowSums(mix.mat$matrix)
(tie.probs <- prop.table(mix.mat$matrix, margin = 1))  # equivalent

# And now, we have to draw ties randomly with that probability. We'll make a
# matrix of probabilities, and then will flip a coin (draw a Bernoulli trial)
# for each possible tie.

# First, construct an empty matrix:
prob.matrix <- matrix(NA, ncol = network.size(add.health),
                      nrow = network.size(add.health))

# Next, fill in the values based on the mixing matrix:
grades <- add.health %v% "grade"
grade.values <- sort(unique(grades))

# The general idea is to do this...
prob.matrix[which(grades == 7), which(grades == 7)] <- tie.probs[1, 1]

# ...but that's going to require 36 lines of code. Let's write a loop instead.
# (seq_len() is used rather than 1:nrow(), which misbehaves on zero rows.)
for (i in seq_len(nrow(tie.probs))) {
  for (j in seq_len(ncol(tie.probs))) {
    prob.matrix[which(grades == grade.values[i]),
                which(grades == grade.values[j])] <- tie.probs[i, j]
  }
}

# And now we can flip a coin for each element of that matrix:
rg <- rgraph(n = network.size(add.health), tprob = prob.matrix)

# Ordinarily, we would do this procedure repeatedly, calculate the network
# statistic, and compare the randomly generated network statistics with the
# one from the raw data. Let's try that again with reciprocity.
mix.recip <- rgraph(n = network.size(add.health), m = 100,
                    tprob = prob.matrix) %>%
  # Convert them all to a list of network objects
  apply(., 1, network, matrix.type = "adjacency") %>%
  # calculate the reciprocity for each one
  sapply(., grecip, measure = "dyadic.nonnull")

# And now plot it against the true value:
plot(hist(mix.recip), xlim = c(0, .5))
abline(v = ah.recip, col = "red")

### Random graphs from a fixed degree distribution ###
# As a next step, we consider null models where the degree distribution matches
# the degree distribution in the observed data.
# Fortunately, this is implemented in igraph:

# Calculate the degree distribution from the original network:
(indegree <- degree(add.health, cmode = "indegree"))

# We actually want a table indicating the count of how many times each degree is
# observed in the real data. We use table() to do that.
(indegree <- table(indegree))  # 4 isolates, 4 people with one nomination, etc.

# Same thing for outdegree:
outdegree <- table(degree(add.health, cmode = "outdegree"))

# To generate random directed graphs with a given indegree and outdegree
# sequence, there have to be the same number of indegree values as outdegree
# values, and they must sum to the same thing. The latter happens necessarily,
# but the former does not. So we pad the smaller of the two with 0 values.
length(indegree)   # 13
length(outdegree)  # 11
# Pad whichever table is shorter (the original hard-coded outdegree[12:13] <- 0,
# which breaks if the observed degree ranges change).
if (length(outdegree) < length(indegree)) {
  outdegree[(length(outdegree) + 1):length(indegree)] <- 0
} else if (length(indegree) < length(outdegree)) {
  indegree[(length(indegree) + 1):length(outdegree)] <- 0
}

# Now we pass both of those to the function in igraph
# NOTE(review): per the igraph docs, sample_degseq() expects one degree value
# per vertex, not a table of degree counts -- confirm that passing the count
# tables here is intentional before relying on these draws.
igraph::sample_degseq(out.deg = outdegree, in.deg = indegree,
                      method = "simple")

# Aside: converting between igraph and statnet
# igraph and statnet do very similar things, but occasionally there is something
# that you can do in one package and not in the other (like generating random
# graphs with a given degree sequence). In those cases, it's helpful to be able
# to convert from one to the other. The package to do this is called
# intergraph.
# intergraph::asNetwork()  # converts to statnet from igraph
# intergraph::asIgraph()   # converts to igraph from statnet

# Let's use this approach again to compare the reciprocity of the observed
# network versus the reciprocity of some simulated networks matching the same
# degree distribution:
degseqRecip <- function(indeg, outdeg) {
  # Generates a random network with a given degree sequence, and returns the
  # reciprocity of that network
  #
  # Args:
  #   indeg: a numeric vector of counts of people with degree 0, 1, 2...
  #   outdeg: a numeric vector of counts of people with degree 0, 1, 2...
  #     must be same length as indeg, and sum to the same value
  #
  # Returns:
  #   a numeric value, the percentage of reciprocated ties
  #
  # BUG FIX: the original body ignored indeg/outdeg and read the global
  # indegree/outdegree objects instead, so the arguments had no effect.
  igraph::sample_degseq(out.deg = outdeg, in.deg = indeg,
                        method = "simple") %>%
    igraph::reciprocity(.)
}

# replicate allows you to repeat the same command repeatedly. It's useful for
# generating random numbers
more.reciprocities <- replicate(n = 100, degseqRecip(indegree, outdegree))
plot(hist(more.reciprocities), xlim = c(0, .7))
# NOTE(review): igraph::reciprocity() uses an edgewise definition of
# reciprocity, while ah.recip used grecip(measure = "dyadic.nonnull"); the two
# measures are not identical, so this comparison is approximate.
abline(v = ah.recip, col = "red")

##### QAP tests #####
# Another common null model is the permutation test. If we randomly shuffle
# the rows and columns of the adjacency matrix, how often would a correlation
# as large as the one we observed occur?

# For a simple example, let's look at whether there is a significant correlation
# between smoking behavior in the faux Add Health network and friendship.

# To begin, we construct a square matrix where the (i, j) cell is a dummy
# variable indicating whether i and j have the same smoking behavior (i.e.,
# they both smoke, or they both don't smoke.) To construct that, we use the
# outer command (like an outer product)
smoking <- outer(
  add.health %v% "PSMOKES",
  add.health %v% "PSMOKES",
  FUN = "=="
)

# We compare that with the matrix of friendships:
friends <- as.matrix(add.health)

# The command for QAP tests is netlm or netlogit (depending on whether you use
# a linear or logistic regression). Here we use logistic:
(qap.out <- netlogit(y = smoking, x = friends, nullhyp = "qap", reps = 100))

# You can add additional covariates by entering them as an array. Let's
# add same-grade:
grade <- outer(
  add.health %v% "grade",
  add.health %v% "grade",
  FUN = "=="
)

# Make a 3 dimensional array -- the abind command from the abind package also
# does this.
# Stack the two covariate matrices into a (2, n, n) array; be explicit about
# the NA fill value (the original used array(, ...) with an implicit empty
# first argument).
X <- array(NA, dim = c(2, nrow(grade), ncol(grade)))
X[1, , ] <- friends
X[2, , ] <- grade
(qap2 <- netlogit(y = smoking, x = X, nullhyp = "qap", reps = 100))

##### Optional: latent space models #####
# Finally, another way to control for the underlying network structure is the
# latent space model. Simply put, the idea behind a latent space model is that
# we place people in an underlying latent "space", and then treat the ties
# between them as independent, given their positions in the space. (You can
# think of the space as the positions in a force-directed layout in a network
# picture.) This allows us to test for homophily effects, for example, or to
# generate random networks that match an observed network with very little
# complexity in model fitting.

# The most user-friendly latent space model software is in the latentnet package
# (more recent models are provided by the amen package).
library(latentnet)
latent.fit <- ergmm(add.health ~ euclidean(d = 2))

# Can add additional dimensions...
latent.fit.3d <- ergmm(add.health ~ euclidean(d = 3))

# ... latent groups ...
# (distinct names below: the original reassigned latent.fit.3d for each model,
# silently clobbering the 3-dimensional fit)
latent.fit.groups <- ergmm(add.health ~ euclidean(d = 2, G = 5))

# ... or homophily effects
latent.fit.homophily <- ergmm(add.health ~ nodematch("grade") + euclidean(d = 2))