AN ERGM TUTORIAL USING R

Download R source file


              #An ERGM tutorial using R for the Social Networks and Health 
#workshop at Duke University on May 19, 2016


#The examples are based on a network and dataset called schoolnet1.Rdata
#which is on the dropbox page
#this the first add health example network
#In order for the code to work this file must be saved on your computer
#You must update the code (see below) to indicate where these files are saved

#Additionally, the code also requires the ergm, network, coda and sna packages
#these packages can be installed in the normal way:
#i.e. install.packages("sna")

#The tutorial is broken down as follows:


#Part I:Data Management-using the school network
#Part II: Statistics and Plots-using the school network
#Part III: ERGM- using the school network 
#Part IV: ERGM- using the Florentine network 


#Part I:Data Management
#getting the data into R in a format we can use

#loading neccessary packages
library(ergm); library(sna);library(network);library(coda)


#let's set the directory where our data is stored so that we can
#read it in more easily
setwd("/Users/jsmith77/classes/workshops/social_networks_health_duke/data/")

#example 1, add health school network 

#friendship nominations-up to 10 

#first reading in data, saved as schoolnet1
load("schoolnet1.Rdata")
#note there are characteristics for race, gnd and smoking behavior
#and grade in school

##################################################################

#Part II: Descrpitive Statistics and Plots

#getting descriptive statistics and plots so we can run an informed ergm

plot(schoolnet1)  #a simple plot


plot(schoolnet1,vertex.col="gnd")  #looking at homophily by gender

plot(schoolnet1,vertex.col="grade") #homophily on grade


#scaling nodes by indegree 

#odegree=rowSums(as.matrix(schoolnet1)) #first getting outdegree and indegree for nodes in network
idegree=colSums(as.matrix(schoolnet1))

#plots with in and out degree

plot(schoolnet1,vertex.col="grade",vertex.cex=idegree/1.5) 
#scaling node size by indegree and coloring by grade

plot(schoolnet1,vertex.col="smoke",vertex.cex=idegree/1.5) 
#scaling node size by indegree and coloring by smoke behavior
 

#now thinking about more structural features, still using descriptive statistics


gden(schoolnet1) #density 
cug.test(schoolnet1,"gden",cmode="size") #compare desnity to random network of right size


#is there clustering, reciprocity?

grecip(schoolnet1); #reciprocity first
grecip(schoolnet1,measure="dyadic.nonnull"); #not including null dyads
dyad.census(schoolnet1)#full dyad census


#comparing reciprocity to the amount expected by chance of network with right size and density 
cug.test(schoolnet1,"grecip",FUN.args=list(measure="dyadic.nonnull"),cmode="edges")


gtrans(schoolnet1)#now transitivity 

cug.test(schoolnet1,"gtrans",cmode="dyad.census")#compare transitivity to 
#the amount expected by network with correct dyad census




########################################

#before we actually fit any ergms, let's take a look at some of the help files

?ergm
?'ergm-terms'
?mcmc.diagnostics
?gof

############################################
#Part III: ERGM on a School Network 


#now, running some ergms on our school network

mod.rand<-ergm(schoolnet1~edges)  
#an initial model-just the edge parameter,bernoulli random graph model


summary(mod.rand)  #taking a look at the model results
#interpretation of model coefficients: 

#the log-odds of any tie existing is -2.7275
#probability: exp(-2.7275)/(1+exp(-2.7275))=.0613
#which is the just the density... 
gden(schoolnet1)



#we know homophily matters from the plots, let's add some terms
#for gender ("gnd") and grade ("grd") and smoking behavior
#still dyadic independent 

summary(mod.homoph<-ergm(schoolnet1~edges+nodematch("gnd")+nodematch("grade")
+nodematch('smoke')))  

#note that here we are doing the model and summary at the same time
#only using simple match/no match terms for now
#the basic terms here are nodematch with the name of the characteristic
#in quotes
          
#interpretation of coefficient for nodematch on gender: 
#for a tie between two nodes that match on grade and mismatch on smoking and gender
#the conditional log-odds  is: 
#-4.5787+3.0493=-1.5294
#
#the probability is: exp(-1.5294)/(1+exp(-1.5294))=.178
#
#for a tie between two nodes that mismatch on gender and mismatch on smoking and grade
#the conditional log-odds  is: 

#-4.5787=-4.5787
#
#the probability is: exp(-4.5787)/(1+exp(-4.5787))=.01
                                      
                                      
#adding reciprocity to the model
summary(mod.homoph.mutual1<-ergm(schoolnet1~edges+nodematch("gnd")+nodematch("grade")
+nodematch('smoke')+mutual))  
 #lets also add a term for reciprocity (mutual) as we know the level of reciprocity is high 

#note that it is now being estimated using MCMC sampling

mcmc.diagnostics(mod.homoph.mutual1); #looking at the model diagnostics
#-the MCMC sample of network statistics

#can also use output directly from ergm to asses convergence
plot(mod.homoph.mutual1$sample)

#seems to have converged. hasn't gone off dramatically in one direction
#and is not erractic


#now looking at goodness of fit
#does the model reproduce the global features of the network?

#we can first ask if the model is reproducing the terms in the model
#this is a useful check, but hardly a good test of model fit

x=gof(mod.homoph.mutual1~model) #here use a gof function to test for
#goodness of fit
plot(x)

#now let's do the same thing but see if the model can produce the features 
#of the network not in the model, like distance and shared partners

par(mfrow=c(1,2)) #setting up the plot
x=gof(mod.homoph.mutual1~distance+espartners)
plot(x)

#not a great fit, especially on shared partner
#perhaps allow for differential homophily on grade?

summary(mod.homoph.mutual2<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual))  
#same model as above but differential homophily for grade, set with diff=T
#note there are now 6 terms associated with grade
#in-group bias for grade 12 seems higher than grade 7

#mcmc.diagnostics(mod.homoph.mutual2,density=F);#diagnostics
#plot(mod.homoph.mutual2$sample) #looking at diagnostics


#gof
par(mfrow=c(1,2))
 x=gof(mod.homoph.mutual2~distance+espartners)

plot(x)
#okay (although far from perfect) on distance but missing clustering badly

#maybe we need to include terms that deal with differential degree
#where some actors have higher in/out degree than others

summary(mod.homoph.mutual3<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodefactor("smoke")))

  #same model as above but include nodal factor for degree
  #for smoking behavior 
#here not differentiating between in and out degree

plot(mod.homoph.mutual3$sample) #looking at diagnostics

#gof
par(mfrow=c(1,2))
 x=gof(mod.homoph.mutual3~distance+espartners)

plot(x)

#now differentiating between in and out degree for effect of smoke
summary(mod.homoph.mutual4<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")))


plot(mod.homoph.mutual4$sample) #looking at diagnostics

#gof
par(mfrow=c(1,2))
 x=gof(mod.homoph.mutual4~distance+espartners)

plot(x)


#how about the degree distribution, are we getting that right? 
 x=gof(mod.homoph.mutual4~idegree+odegree)
plot(x)


#deal with fact that way under counting number of people who send zero ties?

summary(mod.homoph.mutual5<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)))




 x=gof(mod.homoph.mutual5~idegree+odegree)
plot(x)

 x=gof(mod.homoph.mutual5~distance+espartners)
#fit is getting a bit better but still clearly off!


############
#we have so far fit a number of models, learned a bit but have not 
#captured the clustering in the network
#so, maybe we need a clustering term...

##here we will try and include the number of triangles
summary(mod.triangle<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+triangle))

#clearly didn't work
#let's run the same thing again but cut the 
#algorithm short, just to see what's happening
summary(mod.triangle<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+triangle,control=control.ergm(MCMLE.maxit= 1)))


#an example of degeneracy...
plot(mod.triangle$sample)

#another quick way of judging if the sample of MCMC stats are okay is plotting the last network from the MCMC sample
plot(mod.triangle$newnetwork)

#In general, there are two reasons we could have a degenerate model:
#1. A poorly specified model. So the terms in the model do not reflect
#the actual process that generated the network. When this happens the simulated
#network, used to estimate the coefficients, do not yield realistic networks
#making it difficult to estimate the paramaters very well
#2.The inputs that control the algorithm (like burnin and interval between samples
#are not sufficient for the model to converge


#We can try and change both aspects to get the model to a proper specification

#Let's try something else, here include a gwesp term

summary(mod.homoph.mutual.gwesp1<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=1,fixed=T),control=control.ergm(MCMLE.maxit= 5)))
#setting the max iterations low as this model takes a bit too run

mcmc.diagnostics(mod.homoph.mutual.gwesp1);

plot(mod.homoph.mutual.gwesp1$sample)

#another quick way of judging if the sample of MCMC stats are okay is plotting the last network from the MCMC sample
plot(mod.homoph.mutual.gwesp1$newnetwork)

#clearly a bad model, the MCMC sample seemed not to have converged


#maybe we should allow the algorithm to run longer? 
#we can also decrease the alpha paramater for the gwesp term 
#alpha governs how unlikley 3,4,5,etc triangles are relative to 1 and 2 triangles
#when alpha is high the probabilty of tie is greatly increased by adding another shared partner
#even if the pair already have many shared partners.
#when alpha is low adding more shared partners (or more triangles) does not greatly increase the probabilty of a tie
#if the pair already have a few shared partners 


summary(mod.homoph.mutual.gwesp2<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=.5,fixed=T),
control=control.ergm(MCMLE.maxit= 10,MCMC.interval=2000,MCMC.samplesize=2000)))
                    

#diagnostics
mcmc.diagnostics(mod.homoph.mutual.gwesp2,density=F);
plot(mod.homoph.mutual.gwesp2$newnetwork)


par(mfrow=c(1,2))
x=gof(mod.homoph.mutual.gwesp2~distance+espartners)
plot(x)

#clearly a better model in terms of fit
#let's try and make sure the model has converged, up the number of iterations here
#looks like we can also drop the outdegree term if we wanted to

summary(mod.homoph.mutual.gwesp3<-ergm(schoolnet1~edges+nodematch("gnd")+
nodematch("grade",diff=T)+nodematch('smoke')+mutual+nodeifactor("smoke")
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=.5,fixed=T),
control=control.ergm(MCMLE.maxit= 15,MCMC.interval=2000,MCMC.samplesize=2000)))

plot(mod.homoph.mutual.gwesp3$sample)

par(mfrow=c(1,2))
x=gof(mod.homoph.mutual.gwesp3~distance+espartners)
plot(x)


#simulating a network to see how it looks 
sim.schoolnet<-simulate(mod.homoph.mutual.gwesp2,nsim=1)
plot(sim.schoolnet,vertex.col="grade") #looks reasonably close to the empirical network


#Of course we could run more models, tweak the inputs, etc. looking for a better
#fit. but in the end the key is being able to answer our questions with a 
#model that has converged. 

#########################################################################################################3


#Part IV: ERGM using the Florentine Network

#a second ergm example, this time we are going to use some data from within R
#also we won't go through the detailed data analysis we did with the school network

#we are going to use the florentine data from within the ergm package
#there is data on marriage and business relationships between families in florentine some time ago

#the first part of the code comes from the ergm help files
  data(flo)
     #
     # attach the sociomatrix for the Florentine marriage data
     #  
     # Let's make a network from the matrix
     #
     flomarriage <- network(flo,directed=FALSE)
     flomarriage
     #
     #
     # create a vector indicating the wealth of each family (in thousands of lira) 
     # and add it as a covariate to the network object
     #
wealth=c(10,36,27,146,55,44,20,8,42,103,48,49,10,48,32,3)
 
     flomarriage %v% "wealth" <- wealth
     #
     # create a plot of the social network
     #
     plot(flomarriage)
     #
     # now make the vertex size proportional to their wealth
     #
     plot(flomarriage, vertex.cex=wealth/10, main="Marriage Ties")


     # Load a network object of the Florentine data
     #
     data(florentine)

     #
     # Fit a model where the propensity to form ties between
     # families depends on the absolute difference in wealth
     #
     flo.model1 <- ergm(flomarriage ~ edges + absdiff("wealth"))
     summary(flo.model1)

 x=gof(flo.model1~distance)
plot(x)



#add triangle to model to see if there is clustering in the network

     flo.model2 <- ergm(flomarriage ~edges+absdiff("wealth") + triangle)
     summary(flo.model2)

#diagnostics
mcmc.diagnostics(flo.model2) #triangle seems a bit off


#what happens if we use gwesp instead of triangle

     flo.model3 <- ergm(flomarriage ~edges+absdiff("wealth") + gwesp(.5,fixed=T))
     summary(flo.model3)

mcmc.diagnostics(flo.model3)


#similar convergence
     flo.model4 <- ergm(flomarriage ~edges+absdiff("wealth") + gwesp(.5,fixed=T))
     summary(flo.model4)

mcmc.diagnostics(flo.model4)


#probably not worth having a triangle or gwesp term 


#now adding an edgecovariate-one can add an edge covariate in the form of a matrix or a network
#here we put in a network of business relations between the families
#do families mix marriage with business?

     flo.model5 <- ergm(flomarriage ~edges+absdiff("wealth") +edgecov(flobusiness))
     summary(flo.model5)  #adding edgecov

#gof
par(mfrow=c(1,2))
 x=gof(flo.model5~distance+espartners)
plot(x)

#a very nice fit but do we need node covariates?


     flo.model6 <- ergm(flomarriage ~edges+absdiff("wealth") +edgecov(flobusiness)+nodecov("wealth"))
     summary(flo.model6)

#how about k star terms?
     flo.model7 <- ergm(flomarriage ~edges+absdiff("wealth") +edgecov(flobusiness)+nodecov("wealth")+kstar(2:3))
     summary(flo.model7)

mcmc.diagnostics(flo.model7)

#looking at gof for distance, shared partners, and degree distribution 
par(mfrow=c(1,3))
 x=gof(flo.model7~distance+espartners+degree,nsim=40)
plot(x)


#simulating an example network


sim.flo=simulate(flo.model7)

#dev.off()
plot(sim.flo)



########################################################################################################

#A very simple simulation example
#here we will try to simulate a network
#like the school network in the first example
#with the same size and density, but here with higehr
#gender homophily

net=network.initialize(71,directed=T)
#first initialize network of right size and directed

net%v% "gnd"=get.vertex.attribute(schoolnet1,"gnd")
#here seeding each case with characteristic (doing this randomly)
#based on the distribution of gender in the original network

mod=ergm(net~edges+nodematch("gnd"), target.stats=c(305,250))
#here getting parameters to use in simulation for network
#of right size, with network with 305 edges and 250 of those 
#go to people of same gender. 
#note that 305 is the same number of edges as in the true network
#but 250 same gender ties is much higher than in the true network

sim.mod=simulate(mod)
#here we simulate network based on the estimated model above
summary(sim.mod~edges+nodematch('gnd'))
#success!


plot(sim.mod, vertex.col="gnd")