#An ERGM tutorial using R for the Social Networks and Health
#workshop at Duke University on May 19, 2016
#The examples are based on a network and dataset called schoolnet1.Rdata
#which is on the dropbox page
#this the first add health example network
#In order for the code to work this file must be saved on your computer
#You must update the code (see below) to indicate where these files are saved
#Additionally, the code also requires the ergm, network, coda and sna packages
#these packages can be installed in the normal way:
#i.e. install.packages("sna")
#The tutorial is broken down as follows:
#Part I:Data Management-using the school network
#Part II: Statistics and Plots-using the school network
#Part III: ERGM- using the school network
#Part IV: ERGM- using the Florentine network
#Part I:Data Management
#getting the data into R in a format we can use
#loading neccessary packages
library(ergm); library(sna);library(network);library(coda)
#let's set the directory where our data is stored so that we can
#read it in more easily
setwd("/Users/jsmith77/classes/workshops/social_networks_health_duke/data/")
#example 1, add health school network
#friendship nominations-up to 10
#first reading in data, saved as schoolnet1
load("schoolnet1.Rdata")
#note there are characteristics for race, gnd and smoking behavior
#and grade in school
##################################################################
#Part II: Descrpitive Statistics and Plots
#getting descriptive statistics and plots so we can run an informed ergm
plot(schoolnet1) #a simple plot
plot(schoolnet1,vertex.col="gnd") #looking at homophily by gender
plot(schoolnet1,vertex.col="grade") #homophily on grade
#scaling nodes by indegree
#odegree=rowSums(as.matrix(schoolnet1)) #first getting outdegree and indegree for nodes in network
idegree=colSums(as.matrix(schoolnet1))
#plots with in and out degree
plot(schoolnet1,vertex.col="grade",vertex.cex=idegree/1.5)
#scaling node size by indegree and coloring by grade
plot(schoolnet1,vertex.col="smoke",vertex.cex=idegree/1.5)
#scaling node size by indegree and coloring by smoke behavior
#now thinking about more structural features, still using descriptive statistics
gden(schoolnet1) #density
cug.test(schoolnet1,"gden",cmode="size") #compare desnity to random network of right size
#is there clustering, reciprocity?
grecip(schoolnet1); #reciprocity first
grecip(schoolnet1,measure="dyadic.nonnull"); #not including null dyads
dyad.census(schoolnet1)#full dyad census
#comparing reciprocity to the amount expected by chance of network with right size and density
cug.test(schoolnet1,"grecip",FUN.args=list(measure="dyadic.nonnull"),cmode="edges")
gtrans(schoolnet1)#now transitivity
cug.test(schoolnet1,"gtrans",cmode="dyad.census")#compare transitivity to
#the amount expected by network with correct dyad census
########################################
#before we actually fit any ergms, let's take a look at some of the help files
?ergm
?'ergm-terms'
?mcmc.diagnostics
?gof
############################################
#Part III: ERGM on a School Network
#now, running some ergms on our school network
mod.rand<-ergm(schoolnet1~edges)
#an initial model-just the edge parameter,bernoulli random graph model
summary(mod.rand) #taking a look at the model results
#interpretation of model coefficients:
#the log-odds of any tie existing is -2.7275
#probability: exp(-2.7275)/(1+exp(-2.7275))=.0613
#which is the just the density...
gden(schoolnet1)
#we know homophily matters from the plots, let's add some terms
#for gender ("gnd") and grade ("grd") and smoking behavior
#still dyadic independent
summary(mod.homoph<-ergm(schoolnet1~edges+nodematch("gnd")+nodematch("grade")
+nodematch('smoke')))
#note that here we are doing the model and summary at the same time
#only using simple match/no match terms for now
#the basic terms here are nodematch with the name of the characteristic
#in quotes
#interpretation of coefficient for nodematch on gender:
#for a tie between two nodes that match on grade and mismatch on smoking and gender
#the conditional log-odds is:
#-4.5787+3.0493=-1.5294
#
#the probability is: exp(-1.5294)/(1+exp(-1.5294))=.178
#
#for a tie between two nodes that mismatch on gender and mismatch on smoking and grade
#the conditional log-odds is:
#-4.5787=-4.5787
#
#the probability is: exp(-4.5787)/(1+exp(-4.5787))=.01
#adding reciprocity to the model
summary(mod.homoph.mutual1<-ergm(schoolnet1~edges+nodematch("gnd")+nodematch("grade")
+nodematch('smoke')+mutual))
#lets also add a term for reciprocity (mutual) as we know the level of reciprocity is high
#note that it is now being estimated using MCMC sampling
mcmc.diagnostics(mod.homoph.mutual1); #looking at the model diagnostics
#-the MCMC sample of network statistics
#can also use output directly from ergm to asses convergence
plot(mod.homoph.mutual1$sample)
#seems to have converged. hasn't gone off dramatically in one direction
#and is not erractic
#now looking at goodness of fit
#does the model reproduce the global features of the network?
#we can first ask if the model is reproducing the terms in the model
#this is a useful check, but hardly a good test of model fit
x=gof(mod.homoph.mutual1~model) #here use a gof function to test for
#goodness of fit
plot(x)
#now let's do the same thing but see if the model can produce the features
#of the network not in the model, like distance and shared partners
par(mfrow=c(1,2)) #setting up the plot
x=gof(mod.homoph.mutual1~distance+espartners)
plot(x)
#not a great fit, especially on shared partner
#perhaps allow for differential homophily on grade?
summary(mod.homoph.mutual2<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual))
#same model as above but differential homophily for grade, set with diff=T
#note there are now 6 terms associated with grade
#in-group bias for grade 12 seems higher than grade 7
#mcmc.diagnostics(mod.homoph.mutual2,density=F);#diagnostics
#plot(mod.homoph.mutual2$sample) #looking at diagnostics
#gof
par(mfrow=c(1,2))
x=gof(mod.homoph.mutual2~distance+espartners)
plot(x)
#okay (although far from perfect) on distance but missing clustering badly
#maybe we need to include terms that deal with differential degree
#where some actors have higher in/out degree than others
summary(mod.homoph.mutual3<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodefactor("smoke")))
#same model as above but include nodal factor for degree
#for smoking behavior
#here not differentiating between in and out degree
plot(mod.homoph.mutual3$sample) #looking at diagnostics
#gof
par(mfrow=c(1,2))
x=gof(mod.homoph.mutual3~distance+espartners)
plot(x)
#now differentiating between in and out degree for effect of smoke
summary(mod.homoph.mutual4<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")))
plot(mod.homoph.mutual4$sample) #looking at diagnostics
#gof
par(mfrow=c(1,2))
x=gof(mod.homoph.mutual4~distance+espartners)
plot(x)
#how about the degree distribution, are we getting that right?
x=gof(mod.homoph.mutual4~idegree+odegree)
plot(x)
#deal with fact that way under counting number of people who send zero ties?
summary(mod.homoph.mutual5<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)))
x=gof(mod.homoph.mutual5~idegree+odegree)
plot(x)
x=gof(mod.homoph.mutual5~distance+espartners)
#fit is getting a bit better but still clearly off!
############
#we have so far fit a number of models, learned a bit but have not
#captured the clustering in the network
#so, maybe we need a clustering term...
##here we will try and include the number of triangles
summary(mod.triangle<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+triangle))
#clearly didn't work
#let's run the same thing again but cut the
#algorithm short, just to see what's happening
summary(mod.triangle<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+triangle,control=control.ergm(MCMLE.maxit= 1)))
#an example of degeneracy...
plot(mod.triangle$sample)
#another quick way of judging if the sample of MCMC stats are okay is plotting the last network from the MCMC sample
plot(mod.triangle$newnetwork)
#In general, there are two reasons we could have a degenerate model:
#1. A poorly specified model. So the terms in the model do not reflect
#the actual process that generated the network. When this happens the simulated
#network, used to estimate the coefficients, do not yield realistic networks
#making it difficult to estimate the paramaters very well
#2.The inputs that control the algorithm (like burnin and interval between samples
#are not sufficient for the model to converge
#We can try and change both aspects to get the model to a proper specification
#Let's try something else, here include a gwesp term
summary(mod.homoph.mutual.gwesp1<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=1,fixed=T),control=control.ergm(MCMLE.maxit= 5)))
#setting the max iterations low as this model takes a bit too run
mcmc.diagnostics(mod.homoph.mutual.gwesp1);
plot(mod.homoph.mutual.gwesp1$sample)
#another quick way of judging if the sample of MCMC stats are okay is plotting the last network from the MCMC sample
plot(mod.homoph.mutual.gwesp1$newnetwork)
#clearly a bad model, the MCMC sample seemed not to have converged
#maybe we should allow the algorithm to run longer?
#we can also decrease the alpha paramater for the gwesp term
#alpha governs how unlikley 3,4,5,etc triangles are relative to 1 and 2 triangles
#when alpha is high the probabilty of tie is greatly increased by adding another shared partner
#even if the pair already have many shared partners.
#when alpha is low adding more shared partners (or more triangles) does not greatly increase the probabilty of a tie
#if the pair already have a few shared partners
summary(mod.homoph.mutual.gwesp2<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=.5,fixed=T),
control=control.ergm(MCMLE.maxit= 10,MCMC.interval=2000,MCMC.samplesize=2000)))
#diagnostics
mcmc.diagnostics(mod.homoph.mutual.gwesp2,density=F);
plot(mod.homoph.mutual.gwesp2$newnetwork)
par(mfrow=c(1,2))
x=gof(mod.homoph.mutual.gwesp2~distance+espartners)
plot(x)
#clearly a better model in terms of fit
#let's try and make sure the model has converged, up the number of iterations here
#looks like we can also drop the outdegree term if we wanted to
summary(mod.homoph.mutual.gwesp3<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=.5,fixed=T),
control=control.ergm(MCMLE.maxit= 15,MCMC.interval=2000,MCMC.samplesize=2000)))
plot(mod.homoph.mutual.gwesp3$sample)
par(mfrow=c(1,2))
x=gof(mod.homoph.mutual.gwesp3~distance+espartners)
plot(x)
#simulating a network to see how it looks
sim.schoolnet<-simulate(mod.homoph.mutual.gwesp2,nsim=1)
plot(sim.schoolnet,vertex.col="grade") #looks reasonably close to the empirical network
#Of course we could run more models, tweak the inputs, etc. looking for a better
#fit. but in the end the key is being able to answer our questions with a
#model that has converged.
#########################################################################################################3
#Part IV: ERGM using the Florentine Network
#a second ergm example, this time we are going to use some data from within R
#also we won't go through the detailed data analysis we did with the school network
#we are going to use the florentine data from within the ergm package
#there is data on marriage and business relationships between families in florentine some time ago
#the first part of the code comes from the ergm help files
data(flo)
#
# attach the sociomatrix for the Florentine marriage data
#
# Let's make a network from the matrix
#
flomarriage <- network(flo,directed=FALSE)
flomarriage
#
#
# create a vector indicating the wealth of each family (in thousands of lira)
# and add it as a covariate to the network object
#
wealth=c(10,36,27,146,55,44,20,8,42,103,48,49,10,48,32,3)
flomarriage %v% "wealth" <- wealth
#
# create a plot of the social network
#
plot(flomarriage)
#
# now make the vertex size proportional to their wealth
#
plot(flomarriage, vertex.cex=wealth/10, main="Marriage Ties")
# Load a network object of the Florentine data
#
data(florentine)
#
# Fit a model where the propensity to form ties between
# families depends on the absolute difference in wealth
#
flo.model1 <- ergm(flomarriage ~ edges + absdiff("wealth"))
summary(flo.model1)
x=gof(flo.model1~distance)
plot(x)
#add triangle to model to see if there is clustering in the network
flo.model2 <- ergm(flomarriage ~edges+absdiff("wealth") + triangle)
summary(flo.model2)
#diagnostics
mcmc.diagnostics(flo.model2) #triangle seems a bit off
#what happens if we use gwesp instead of triangle
flo.model3 <- ergm(flomarriage ~edges+absdiff("wealth") + gwesp(.5,fixed=T))
summary(flo.model3)
mcmc.diagnostics(flo.model3)
#similar convergence
flo.model4 <- ergm(flomarriage ~edges+absdiff("wealth") + gwesp(.5,fixed=T))
summary(flo.model4)
mcmc.diagnostics(flo.model4)
#probably not worth having a triangle or gwesp term
#now adding an edgecovariate-one can add an edge covariate in the form of a matrix or a network
#here we put in a network of business relations between the families
#do families mix marriage with business?
flo.model5 <- ergm(flomarriage ~edges+absdiff("wealth") +edgecov(flobusiness))
summary(flo.model5) #adding edgecov
#gof
par(mfrow=c(1,2))
x=gof(flo.model5~distance+espartners)
plot(x)
#a very nice fit but do we need node covariates?
flo.model6 <- ergm(flomarriage ~edges+absdiff("wealth") +edgecov(flobusiness)+nodecov("wealth"))
summary(flo.model6)
#how about k star terms?
flo.model7 <- ergm(flomarriage ~edges+absdiff("wealth") +edgecov(flobusiness)+nodecov("wealth")+kstar(2:3))
summary(flo.model7)
mcmc.diagnostics(flo.model7)
#looking at gof for distance, shared partners, and degree distribution
par(mfrow=c(1,3))
x=gof(flo.model7~distance+espartners+degree,nsim=40)
plot(x)
#simulating an example network
sim.flo=simulate(flo.model7)
#dev.off()
plot(sim.flo)
########################################################################################################
#A very simple simulation example
#here we will try to simulate a network
#like the school network in the first example
#with the same size and density, but here with higehr
#gender homophily
net=network.initialize(71,directed=T)
#first initialize network of right size and directed
net%v% "gnd"=get.vertex.attribute(schoolnet1,"gnd")
#here seeding each case with characteristic (doing this randomly)
#based on the distribution of gender in the original network
mod=ergm(net~edges+nodematch("gnd"), target.stats=c(305,250))
#here getting parameters to use in simulation for network
#of right size, with network with 305 edges and 250 of those
#go to people of same gender.
#note that 305 is the same number of edges as in the true network
#but 250 same gender ties is much higher than in the true network
sim.mod=simulate(mod)
#here we simulate network based on the estimated model above
summary(sim.mod~edges+nodematch('gnd'))
#success!
plot(sim.mod, vertex.col="gnd")