# Randomization and generating random graphs
# Author: Jake Fisher
# Set up workspace
rm(list = ls())
setwd("C:/Users/fishe/Dropbox/SN&H Files/r_scripts")
library(statnet)
library(dplyr)
# Often in statistics we are interested in a reference distribution that our
# sample, or case, is drawn from. For networks, those reference distributions
# are usually given by a distribution of random graphs. We will explore some
# simple random graph distributions today, and tomorrow Jeff will discuss more
# complicated random graph distributions with ERGMs.
# This lab will cover:
# 1. Generating random graphs
# 2. QAP tests
# 3. Running latent space models with latentnet
##### Generating random graphs #####
### Erdos-Renyi random graphs ###
# The simplest random graph distribution is the Erdos-Renyi random graph. We
# construct that by flipping a coin for every edge. statnet has a command for
# generating random graphs:
(er1 <- rgraph(
n = 50, # 50 people in the graph
m = 1, # generate 1 graph -- if greater than 1, it returns a list of matrices
tprob = .05 # probability of a tie (density of the graph)
))
# This is not magic -- we could have done the same thing by generating a matrix
# of random 1's and 0's with a given probability:
rbinom(
size = 1, # 1 binomial trial -- a Bernoulli trial
n = 50 ^ 2, # number of 0's or 1's to generate
p = .05
) %>%
matrix(., nrow = 50, ncol = 50) # structure 0's and 1's as a matrix
# Often, we'll use different random graphs to recreate a t-statistic of sorts --
# we want to see how unlikely the statistic that we observed is under a given
# null model. Let's try that for the reciprocity of the Add Health network.
# Load the Add Health data:
load("add_health_cmty1.Rdata") # saved R file from yesterday
# Store the density and network size, which we will use for the probability of
# a tie and the size of the in the random graph, respectively.
(prob <- gden(add.health))
(n <- network.size(add.health))
# Store the reciprocity of the network, which we will use for comparison later
(ah.recip <- grecip(add.health, measure = "dyadic.nonnull"))
# Generate 100 random graphs, convert them to network objects, and calculate
# the reciprocity for each
er.draws <- rgraph(n = 72, m = 100, tprob = prob) %>%
apply(., 1, network, matrix.type = "adjacency") %>%
sapply(., grecip, measure = "dyadic.nonnull")
# (As an aside, note apply loops over an index of an array. rgraph creates a
# 100 x 72 x 72 matrix, so we loop over the first index.)
# Now plot a histogram of the reciprocities of the random graph, and add a line
# showing how the observed value compares:
plot(hist(er.draws), xlim = c(0, .5))
abline(v = ah.recip, col = "red") # *way* higher than the random graphs!
# Aside: ggplot version:
# library(ggplot2)
# data.frame(recip = er.draws) %>%
# qplot(data = ., x = recip, geom = "histogram") +
# geom_vline(xintercept = ah.recip, color = "red")
#
# note the + in ggplot is very similar to %>% in dplyr. In an interview, Hadley
# said that if he had known about the %>% when he was developing ggplot2, he
# wouldn't have used the +
### Random graphs from a mixing matrix ###
# We rarely believe that a completely random graph, with no structure at all, is
# the right comparison. (If we did, studying networks wouldn't tell us much!)
# So, instead, we usually use a more sophisticated null model.
# As a next step, we consider models where people mix randomly in proportion to
# how much they mix in the observed data. The rates at which different groups
# mix are given by the mixing matrix.
(mix.mat <- mixingmatrix(add.health, "grade"))
# We can use this to get the probability that a given tie will go from one
# person to a person of another grade -- it's just the row percentages
mix.mat$matrix / rowSums(mix.mat$matrix)
(tie.probs <- prop.table(mix.mat$matrix, margin = 1)) # equivalent
# And now, we have to draw ties randomly with that probability. We'll make a
# matrix of probabilities, and then will flip a coin (draw a Bernoulli trial)
# for each possible tie.
# First, construct an empty matrix:
prob.matrix <- matrix(NA, ncol = network.size(add.health),
nrow = network.size(add.health))
# Next, fill in the values based on the mixing matrix:
grades <- add.health %v% "grade"
grade.values <- sort(unique(grades))
# The general idea is to do this...
prob.matrix[which(grades == 7), which(grades == 7)] <- tie.probs[1, 1]
# ...but that's going to require 36 lines of code. Let's write a loop instead.
for (i in 1:nrow(tie.probs)) {
for (j in 1:ncol(tie.probs)) {
prob.matrix[which(grades == grade.values[i]),
which(grades == grade.values[j])] <- tie.probs[i, j]
}
}
# And now we can flip a coin for each element of that matrix:
rg <- rgraph(n = network.size(add.health), tprob = prob.matrix)
# Ordinarily, we would do this procedure repeatedly, calculate the network
# statistic, and compare the randomly generated network statistics with the
# one from the raw data. Let's try that again with reciprocity.
mix.recip <- rgraph(n = network.size(add.health), m = 100,
tprob = prob.matrix) %>%
# Convert them all to a list of network objects
apply(., 1, network, matrix.type = "adjacency") %>%
# calculate the reciprocity for each one
sapply(., grecip, measure = "dyadic.nonnull")
# And now plot it against the true value:
plot(hist(mix.recip), xlim = c(0, .5))
abline(v = ah.recip, col = "red")
### Random graphs from a fixed degree distribution ###
# As a next step, we consider null models where the degree distribution matches
# the degree distribution in the observed data. Fortunately, this is
# implemented in igraph:
# Calculate the degree distribution from the original network:
(indegree <- degree(add.health, cmode = "indegree"))
# We actually want a table indicating the count of how many times each degree is
# observed in the real data. We use table() to do that.
(indegree <- table(indegree)) # 4 isolates, 4 people with one nomination, etc.
# Same thing for outdegree:
outdegree <- table(degree(add.health, cmode = "outdegree"))
# To generate random directed graphs with a given indegree and outdegree
# sequence, there have to be the same number of indegree values as outdegree
# values, and they must sum to the same thing. The latter happens necessarily,
# but the former does not. So we pad the smaller of the two with 0 values.
length(indegree) # 13
length(outdegree) # 11
outdegree[12:13] <- 0 # pad the smaller list with 0's
# Now we pass both of those to the function in igraph
igraph::sample_degseq(out.deg = outdegree, in.deg = indegree, method = "simple")
# Aside: converting between igraph and statnet
# igraph and statnet do very similar things, but occasionally there is something
# that you can do in one package and not in the other (like generating random
# graphs with a given degree sequence). In those cases, it's helpful to be able
# to convert from one to the other. The package to do this is called
# intergraph.
# intergraph::asNetwork() # converts to statnet from igraph
# intergraph::asIgraph() # converts to igraph from statnet
# Let's use this approach again to compare the reciprocity of the observed
# network versus the reciprocity of some simulated networks matching the same
# degree distribution:
degseqRecip <- function(indeg, outdeg) {
# Generates a random network with a given degree sequence, and returns the
# reciprocity of that network
#
# Args:
# indeg: a numeric vector of counts of people with degree 0, 1, 2...
# outdeg: a numeric vector of counts of people with degree 0, 1, 2...
# must be same length as indeg, and sum to the same value
#
# Returns:
# a numeric value, the percentage of reciprocated ties
igraph::sample_degseq(out.deg = outdegree, in.deg = indegree,
method = "simple") %>%
igraph::reciprocity(.) %>%
return(.)
}
# replicate allows you to repeat the same command repeatedly. It's useful for
# generating random numbers
more.reciprocities <- replicate(n = 100, degseqRecip(indegree, outdegree))
plot(hist(more.reciprocities), xlim = c(0, .7))
abline(v = ah.recip, col = "red")
##### QAP tests #####
# Another common null model is the permutation test. If we randomly shuffle
# the rows and columns of the adjacency matrix, how often would a correlation
# as large as the one we observed occur?
# For a simple example, let's look at whether there is a significant correlation
# between smoking behavior in the faux Add Health network and friendship.
# To begin, we construct a square matrix where the (i, j) cell is a dummy
# variable indicating whether i and j have the same smoking behavior (i.e.,
# they both smoke, or the they both don't smoke.) To construct that, we use the
# outer command (like an outer product)
smoking <- outer(
add.health %v% "PSMOKES",
add.health %v% "PSMOKES",
FUN = "=="
)
# We compare that with the matrix of friendships:
friends <- as.matrix(add.health)
# The command for QAP tests is netlm or netlogit (depending on whether you use
# a linear or logistic regression). Here we use logistic:
(qap.out <- netlogit(y = smoking, x = friends, nullhyp = "qap", reps = 100))
# You can add additional covariates by entering them as an array. Let's
# add same-grade:
grade <- outer(
add.health %v% "grade",
add.health %v% "grade",
FUN = "=="
)
# Make a 3 dimensional array -- the abind command from the abind package also
# does this.
X <- array(, dim = c(2, nrow(grade), ncol(grade)))
X[1, , ] <- friends
X[2, , ] <- grade
(qap2 <- netlogit(y = smoking, x = X, nullhyp = "qap", reps = 100))
##### Optional: latent space models #####
# Finally, another way to control for the underlying network structure is the
# latent space model. Simply put, the idea behind a latent space model is that
# we place pepole in an underlying latent "space", and then treat the ties
# between them as independent, given their positions in the space. (You can
# think of the space as the positions in a force-directed layout in a network
# picture.) This allows us to test for homophily effects, for example, or to
# generate random networks that match an observed network with very little
# complexity in model fitting.
# The most user-friendly latent space model software is in the latentnet package
# (more recent models are provided by the amen package).
library(latentnet)
latent.fit <- ergmm(add.health ~ euclidean(d = 2))
# Can add additional dimensions...
latent.fit.3d <- ergmm(add.health ~ euclidean(d = 3))
# ... latent groups ...
latent.fit.3d <- ergmm(add.health ~ euclidean(d = 2, G = 5))
# ... or homophily effects
latent.fit.3d <- ergmm(add.health ~ nodematch("grade") + euclidean(d = 2))