## AN ERGM TUTORIAL USING R

``````
#An ERGM tutorial using R for the Social Networks and Health
#workshop at Duke University on May 19, 2016

#The examples are based on a network and dataset called schoolnet1.Rdata
#which is on the dropbox page
#this the first add health example network
#In order for the code to work this file must be saved on your computer
#You must update the code (see below) to indicate where these files are saved

#Additionally, the code also requires the ergm, network, coda and sna packages
#these packages can be installed in the normal way:
#i.e. install.packages("sna")

#The tutorial is broken down as follows:

#Part I:Data Management-using the school network
#Part II: Statistics and Plots-using the school network
#Part III: ERGM- using the school network
#Part IV: ERGM- using the Florentine network

#Part I:Data Management
#getting the data into R in a format we can use

library(ergm); library(sna);library(network);library(coda)

#let's set the directory where our data is stored so that we can
setwd("/Users/jsmith77/classes/workshops/social_networks_health_duke/data/")

#example 1, add health school network

#friendship nominations-up to 10

#first reading in data, saved as schoolnet1
#note there are characteristics for race, gnd and smoking behavior

##################################################################

#Part II: Descrpitive Statistics and Plots

#getting descriptive statistics and plots so we can run an informed ergm

plot(schoolnet1)  #a simple plot

plot(schoolnet1,vertex.col="gnd")  #looking at homophily by gender

#scaling nodes by indegree

#odegree=rowSums(as.matrix(schoolnet1)) #first getting outdegree and indegree for nodes in network
idegree=colSums(as.matrix(schoolnet1))

#plots with in and out degree

#scaling node size by indegree and coloring by grade

plot(schoolnet1,vertex.col="smoke",vertex.cex=idegree/1.5)
#scaling node size by indegree and coloring by smoke behavior

#now thinking about more structural features, still using descriptive statistics

gden(schoolnet1) #density
cug.test(schoolnet1,"gden",cmode="size") #compare desnity to random network of right size

#is there clustering, reciprocity?

grecip(schoolnet1); #reciprocity first

#comparing reciprocity to the amount expected by chance of network with right size and density

gtrans(schoolnet1)#now transitivity

#the amount expected by network with correct dyad census

########################################

#before we actually fit any ergms, let's take a look at some of the help files

?ergm
?'ergm-terms'
?mcmc.diagnostics
?gof

############################################
#Part III: ERGM on a School Network

#now, running some ergms on our school network

mod.rand<-ergm(schoolnet1~edges)
#an initial model-just the edge parameter,bernoulli random graph model

summary(mod.rand)  #taking a look at the model results
#interpretation of model coefficients:

#the log-odds of any tie existing is -2.7275
#probability: exp(-2.7275)/(1+exp(-2.7275))=.0613
#which is the just the density...
gden(schoolnet1)

#we know homophily matters from the plots, let's add some terms
#for gender ("gnd") and grade ("grd") and smoking behavior

+nodematch('smoke')))

#note that here we are doing the model and summary at the same time
#only using simple match/no match terms for now
#the basic terms here are nodematch with the name of the characteristic
#in quotes

#interpretation of coefficient for nodematch on gender:
#for a tie between two nodes that match on grade and mismatch on smoking and gender
#the conditional log-odds  is:
#-4.5787+3.0493=-1.5294
#
#the probability is: exp(-1.5294)/(1+exp(-1.5294))=.178
#
#for a tie between two nodes that mismatch on gender and mismatch on smoking and grade
#the conditional log-odds  is:

#-4.5787=-4.5787
#
#the probability is: exp(-4.5787)/(1+exp(-4.5787))=.01

+nodematch('smoke')+mutual))
#lets also add a term for reciprocity (mutual) as we know the level of reciprocity is high

#note that it is now being estimated using MCMC sampling

mcmc.diagnostics(mod.homoph.mutual1); #looking at the model diagnostics
#-the MCMC sample of network statistics

#can also use output directly from ergm to asses convergence
plot(mod.homoph.mutual1\$sample)

#seems to have converged. hasn't gone off dramatically in one direction
#and is not erractic

#now looking at goodness of fit
#does the model reproduce the global features of the network?

#we can first ask if the model is reproducing the terms in the model
#this is a useful check, but hardly a good test of model fit

x=gof(mod.homoph.mutual1~model) #here use a gof function to test for
#goodness of fit
plot(x)

#now let's do the same thing but see if the model can produce the features
#of the network not in the model, like distance and shared partners

par(mfrow=c(1,2)) #setting up the plot
x=gof(mod.homoph.mutual1~distance+espartners)
plot(x)

#not a great fit, especially on shared partner
#perhaps allow for differential homophily on grade?

summary(mod.homoph.mutual2<-ergm(schoolnet1~edges+nodematch("gnd")+
#same model as above but differential homophily for grade, set with diff=T
#note there are now 6 terms associated with grade

#mcmc.diagnostics(mod.homoph.mutual2,density=F);#diagnostics
#plot(mod.homoph.mutual2\$sample) #looking at diagnostics

#gof
par(mfrow=c(1,2))
x=gof(mod.homoph.mutual2~distance+espartners)

plot(x)
#okay (although far from perfect) on distance but missing clustering badly

#maybe we need to include terms that deal with differential degree
#where some actors have higher in/out degree than others

summary(mod.homoph.mutual3<-ergm(schoolnet1~edges+nodematch("gnd")+

#same model as above but include nodal factor for degree
#for smoking behavior
#here not differentiating between in and out degree

plot(mod.homoph.mutual3\$sample) #looking at diagnostics

#gof
par(mfrow=c(1,2))
x=gof(mod.homoph.mutual3~distance+espartners)

plot(x)

#now differentiating between in and out degree for effect of smoke
summary(mod.homoph.mutual4<-ergm(schoolnet1~edges+nodematch("gnd")+
+nodeofactor("smoke")))

plot(mod.homoph.mutual4\$sample) #looking at diagnostics

#gof
par(mfrow=c(1,2))
x=gof(mod.homoph.mutual4~distance+espartners)

plot(x)

#how about the degree distribution, are we getting that right?
x=gof(mod.homoph.mutual4~idegree+odegree)
plot(x)

#deal with fact that way under counting number of people who send zero ties?

summary(mod.homoph.mutual5<-ergm(schoolnet1~edges+nodematch("gnd")+
+nodeofactor("smoke")+odegree(0)))

x=gof(mod.homoph.mutual5~idegree+odegree)
plot(x)

x=gof(mod.homoph.mutual5~distance+espartners)
#fit is getting a bit better but still clearly off!

############
#we have so far fit a number of models, learned a bit but have not
#captured the clustering in the network
#so, maybe we need a clustering term...

##here we will try and include the number of triangles
summary(mod.triangle<-ergm(schoolnet1~edges+nodematch("gnd")+
+nodeofactor("smoke")+odegree(0)+triangle))

#clearly didn't work
#let's run the same thing again but cut the
#algorithm short, just to see what's happening
summary(mod.triangle<-ergm(schoolnet1~edges+nodematch("gnd")+
+nodeofactor("smoke")+odegree(0)+triangle,control=control.ergm(MCMLE.maxit= 1)))

#an example of degeneracy...
plot(mod.triangle\$sample)

#another quick way of judging if the sample of MCMC stats are okay is plotting the last network from the MCMC sample
plot(mod.triangle\$newnetwork)

#In general, there are two reasons we could have a degenerate model:
#1. A poorly specified model. So the terms in the model do not reflect
#the actual process that generated the network. When this happens the simulated
#network, used to estimate the coefficients, do not yield realistic networks
#making it difficult to estimate the paramaters very well
#2.The inputs that control the algorithm (like burnin and interval between samples
#are not sufficient for the model to converge

#We can try and change both aspects to get the model to a proper specification

#Let's try something else, here include a gwesp term

summary(mod.homoph.mutual.gwesp1<-ergm(schoolnet1~edges+nodematch("gnd")+
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=1,fixed=T),control=control.ergm(MCMLE.maxit= 5)))
#setting the max iterations low as this model takes a bit too run

mcmc.diagnostics(mod.homoph.mutual.gwesp1);

plot(mod.homoph.mutual.gwesp1\$sample)

#another quick way of judging if the sample of MCMC stats are okay is plotting the last network from the MCMC sample
plot(mod.homoph.mutual.gwesp1\$newnetwork)

#clearly a bad model, the MCMC sample seemed not to have converged

#maybe we should allow the algorithm to run longer?
#we can also decrease the alpha paramater for the gwesp term
#alpha governs how unlikley 3,4,5,etc triangles are relative to 1 and 2 triangles
#when alpha is high the probabilty of tie is greatly increased by adding another shared partner
#even if the pair already have many shared partners.
#when alpha is low adding more shared partners (or more triangles) does not greatly increase the probabilty of a tie
#if the pair already have a few shared partners

summary(mod.homoph.mutual.gwesp2<-ergm(schoolnet1~edges+nodematch("gnd")+
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=.5,fixed=T),
control=control.ergm(MCMLE.maxit= 10,MCMC.interval=2000,MCMC.samplesize=2000)))

#diagnostics
mcmc.diagnostics(mod.homoph.mutual.gwesp2,density=F);
plot(mod.homoph.mutual.gwesp2\$newnetwork)

par(mfrow=c(1,2))
x=gof(mod.homoph.mutual.gwesp2~distance+espartners)
plot(x)

#clearly a better model in terms of fit
#let's try and make sure the model has converged, up the number of iterations here
#looks like we can also drop the outdegree term if we wanted to

summary(mod.homoph.mutual.gwesp3<-ergm(schoolnet1~edges+nodematch("gnd")+
+nodeofactor("smoke")+odegree(0)+gwesp(alpha=.5,fixed=T),
control=control.ergm(MCMLE.maxit= 15,MCMC.interval=2000,MCMC.samplesize=2000)))

plot(mod.homoph.mutual.gwesp3\$sample)

par(mfrow=c(1,2))
x=gof(mod.homoph.mutual.gwesp3~distance+espartners)
plot(x)

#simulating a network to see how it looks
sim.schoolnet<-simulate(mod.homoph.mutual.gwesp2,nsim=1)
plot(sim.schoolnet,vertex.col="grade") #looks reasonably close to the empirical network

#Of course we could run more models, tweak the inputs, etc. looking for a better
#fit. but in the end the key is being able to answer our questions with a
#model that has converged.

#########################################################################################################3

#Part IV: ERGM using the Florentine Network

#a second ergm example, this time we are going to use some data from within R
#also we won't go through the detailed data analysis we did with the school network

#we are going to use the florentine data from within the ergm package
#there is data on marriage and business relationships between families in florentine some time ago

#the first part of the code comes from the ergm help files
data(flo)
#
# attach the sociomatrix for the Florentine marriage data
#
# Let's make a network from the matrix
#
flomarriage <- network(flo,directed=FALSE)
flomarriage
#
#
# create a vector indicating the wealth of each family (in thousands of lira)
# and add it as a covariate to the network object
#
wealth=c(10,36,27,146,55,44,20,8,42,103,48,49,10,48,32,3)

flomarriage %v% "wealth" <- wealth
#
# create a plot of the social network
#
plot(flomarriage)
#
# now make the vertex size proportional to their wealth
#
plot(flomarriage, vertex.cex=wealth/10, main="Marriage Ties")

# Load a network object of the Florentine data
#
data(florentine)

#
# Fit a model where the propensity to form ties between
# families depends on the absolute difference in wealth
#
flo.model1 <- ergm(flomarriage ~ edges + absdiff("wealth"))
summary(flo.model1)

x=gof(flo.model1~distance)
plot(x)

#add triangle to model to see if there is clustering in the network

flo.model2 <- ergm(flomarriage ~edges+absdiff("wealth") + triangle)
summary(flo.model2)

#diagnostics
mcmc.diagnostics(flo.model2) #triangle seems a bit off

#what happens if we use gwesp instead of triangle

flo.model3 <- ergm(flomarriage ~edges+absdiff("wealth") + gwesp(.5,fixed=T))
summary(flo.model3)

mcmc.diagnostics(flo.model3)

#similar convergence
flo.model4 <- ergm(flomarriage ~edges+absdiff("wealth") + gwesp(.5,fixed=T))
summary(flo.model4)

mcmc.diagnostics(flo.model4)

#probably not worth having a triangle or gwesp term

#now adding an edgecovariate-one can add an edge covariate in the form of a matrix or a network
#here we put in a network of business relations between the families
#do families mix marriage with business?

#gof
par(mfrow=c(1,2))
x=gof(flo.model5~distance+espartners)
plot(x)

#a very nice fit but do we need node covariates?

summary(flo.model6)

summary(flo.model7)

mcmc.diagnostics(flo.model7)

#looking at gof for distance, shared partners, and degree distribution
par(mfrow=c(1,3))
x=gof(flo.model7~distance+espartners+degree,nsim=40)
plot(x)

#simulating an example network

sim.flo=simulate(flo.model7)

#dev.off()
plot(sim.flo)

########################################################################################################

#A very simple simulation example
#here we will try to simulate a network
#like the school network in the first example
#with the same size and density, but here with higehr
#gender homophily

net=network.initialize(71,directed=T)
#first initialize network of right size and directed

net%v% "gnd"=get.vertex.attribute(schoolnet1,"gnd")
#here seeding each case with characteristic (doing this randomly)
#based on the distribution of gender in the original network

mod=ergm(net~edges+nodematch("gnd"), target.stats=c(305,250))
#here getting parameters to use in simulation for network
#of right size, with network with 305 edges and 250 of those
#go to people of same gender.
#note that 305 is the same number of edges as in the true network
#but 250 same gender ties is much higher than in the true network

sim.mod=simulate(mod)
#here we simulate network based on the estimated model above
summary(sim.mod~edges+nodematch('gnd'))
#success!

plot(sim.mod, vertex.col="gnd")

``````