#Jonathan H. Morgan
#SN&H Example Data
#12 May 2016
#LOADING PACKAGES
library (tabplot)
library(corrgram)
library (corrplot)
library (ggplot2)
library(VIM)
library (plyr)
library (dplyr)
library (readr)
library (haven)
library (tidyr)
library (igraph)
############################
# IMPORTING AddHealth DATA #
############################
# The CSV lives in a different folder on each machine this script is run on.
# Calling read.csv() on every path unconditionally aborts the script on any
# machine where the first path is missing, so only paths that exist are read.
# When several exist, the last one is used (the original script's later
# read.csv() call overwrote the earlier one).
AHS_WPVAR <- local({
  candidate_paths <- c(
    'C:/Users/jhm18/Desktop/Duke Projects/DNAC/SN&H/ahs_wpvar.csv',
    'C:/Users/Jonathan H Morgan/Desktop/Duke Projects/SN&H/ahs_wpvar.csv'
  )
  found <- candidate_paths[file.exists(candidate_paths)]
  if (length(found) == 0) {
    stop(
      "ahs_wpvar.csv was not found in any of the expected locations:\n  ",
      paste(candidate_paths, collapse = "\n  "),
      call. = FALSE
    )
  }
  read.csv(found[length(found)])
})
#AHS_WPVAR
##############################
# Diagnostic Visualizations #
##############################
# Simple visualization: weighted in-degree (IDG) against the weighted number
# of sexually active friends (FRNDSEX), first with base graphics and then
# with ggplot2 (scatter plot plus a smoothed trend line).
plot(
  x = AHS_WPVAR$IDG,
  y = AHS_WPVAR$FRNDSEX,
  main = "AddHealth",
  xlab = "Weighted In-Degree",
  ylab = "Weighted Number of Sexually Active Friends"
)
ggplot(data = AHS_WPVAR, mapping = aes(x = IDG, y = FRNDSEX)) +
  geom_point() +
  geom_smooth() +
  xlab("Weighted In-Degree") +
  ylab("Weighted Number of Sexually Active Friends")

# Visualizations of multiple variables.
# Columns 29, 37 and 38 are kept; the calls below address them by name as
# IDG, FRNDSEX and FRNDSMOKE (in-degree, peer sexual activity, peer smoking).
AHS_FRIEND_BEHAVIORS <- AHS_WPVAR[c(29, 37, 38)]

# Lower-triangle plot of the correlation matrix of the three variables.
corrplot(
  cor(AHS_FRIEND_BEHAVIORS[, c("IDG", "FRNDSEX", "FRNDSMOKE")]),
  type = "lower"
)

# Table plot of the same subset, sorted on IDG.
tableplot(
  AHS_FRIEND_BEHAVIORS,
  sortCol = (IDG),
  num_scale = "lin",
  legend.line = 8,
  fontsize = 12,
  numPals = c(IDG = "Oranges")
)

# Detecting missing data: VIM's aggregation plot over the full data set.
aggr(AHS_WPVAR)
##########################################
#     Creating School 7's Edgelist       #
##########################################
# Step 1: Subsetting AHS_WPVAR to isolate community (school) 7.
# The first column is used as the community identifier.
# AHS_Community1 <- subset(AHS_WPVAR, AHS_WPVAR[, 1] == 1)
AHS_Community7 <- subset(AHS_WPVAR, AHS_WPVAR[, 1] == 7)

# Step 2: Creating male- and female-friend nomination subsets for community 7.
# Column 3 is the ego identifier; columns 4:8 and 14:18 are the nomination
# columns that Step 3 addresses as mfnid_1:mfnid_5 and ffnid_1:ffnid_5.
# NOTE(review): positions are hard-coded -- confirm against the CSV layout.
AHS_M <- AHS_Community7[c(3, 4:8)]
AHS_F <- AHS_Community7[c(3, 14:18)]

# Step 3: Converting from wide to long format with tidyr.
# Result columns: ego id, nomination slot (M_ID / F_ID), nominated id (value).
# na.rm = TRUE drops nominations that are NA.
AHS_Male <- AHS_M %>%
  gather(M_ID, value, mfnid_1:mfnid_5, na.rm = TRUE)
AHS_Female <- AHS_F %>%
  gather(F_ID, value, ffnid_1:ffnid_5, na.rm = TRUE)

# Step 4: Cleaning both long tables in the same way:
#   - drop rows whose nominated id is 99999 (the original note said "9999",
#     but the code has always filtered 99999 -- TODO confirm the missing-data
#     code against the codebook),
#   - flag whether the nomination came from the female-friend columns,
#   - delete the nomination-slot column (M_ID / F_ID),
#   - rename the ego id to Sender and the nominated id to Target.
AHS_Male <- subset(AHS_Male, AHS_Male[, 3] != 99999)
AHS_Male[, "Female"] <- 0
AHS_Male[, 2] <- NULL
names(AHS_Male)[1:2] <- c("Sender", "Target")

AHS_Female <- subset(AHS_Female, AHS_Female[, 3] != 99999)
AHS_Female[, "Female"] <- 1
AHS_Female[, 2] <- NULL
names(AHS_Female)[1:2] <- c("Sender", "Target")

# Step 5: Combining the male and female edge lists, giving every tie a
# weight of 1, and sorting by Sender.
# The sorted result has to be assigned back; previously the arrange() result
# was only printed and the edge list stayed unsorted.
AHS_EdgeList <- rbind(AHS_Male, AHS_Female)
AHS_EdgeList[, "weight"] <- 1
AHS_EdgeList <- AHS_EdgeList %>% arrange(Sender)

# Step 6: Creating an igraph-style edge list (from, to, edge attribute);
# the Female flag is dropped.
AHS_EdgeList <- AHS_EdgeList[c("Sender", "Target", "weight")]

# Step 7: Removing the intermediate data sets.
rm(AHS_F, AHS_Female, AHS_M, AHS_Male)
####################################
# Community Detection Using iGraph #
####################################
# For more information: http://igraph.org/r/doc/communities.html

# Graph object used by the subsequent analyses. The first two edge-list
# columns are the tie endpoints; the remaining column (weight) becomes an
# edge attribute.
AHS_Graph <- graph_from_data_frame(AHS_EdgeList)

# Diagnostic measures
# Normalized Shannon entropy (diversity) score for every vertex.
diversity(AHS_Graph, weights = NULL, vids = V(AHS_Graph))
# Pairwise Jaccard similarity between vertices.
similarity(AHS_Graph, method = "jaccard")

# Initial network visualization using a Kamada-Kawai layout.
# Set the seed to make the layout reproducible.
set.seed(3952)
E(AHS_Graph)$color <- "grey"
# Vertices with an in-degree above 10 are highlighted in yellow, all other
# vertices are grey.
V(AHS_Graph)$color <- ifelse(
  degree(AHS_Graph, mode = "in") > 10,
  "yellow",
  "grey"
)
# seq(0.5, 5, length.out = 0.5) and seq(10, 60, length.out = 0.5) each
# evaluate to a single value (length.out is rounded up to 1), so the same
# constants are written out directly.
V(AHS_Graph)$label.cex <- 0.5 # text size
V(AHS_Graph)$size <- 10 # circle size
layout1 <- layout_with_kk(AHS_Graph)
# tkplot opens an interactive window in which the layout can be adjusted by hand.
tkplot(AHS_Graph, layout = layout1)
# Finding strongly connected components.
# Definition: a directed graph is strongly connected if there is a path in
# each direction between every pair of its vertices.
# Type scc in the console to have the components reported. The result holds:
#   membership: for each node ID (top row), the component it belongs to (bottom row)
#   csize:      the size of each component, i.e. its number of nodes
#   no:         the number of components in the graph
# The original analysis of this school reported 7 components: 1 central
# component and 6 nodes that receive but do not send ties.
# NOTE(review): the earlier note said to type v_idx to get this report, but
# no object of that name is created in this script; scc is the object to use.
# In graphs with multiple components it can be helpful to distinguish
# connected components from isolates.
scc <- components(AHS_Graph, mode = "strong")
#Community Detection Algorithms in iGraph: Approaches Supported by iGraph
#Detecting communities by iteratively calculating edge betweenness (e.g., Girvan & Newman 2001)
#Detecting communities by using eigenvector matrices (e.g., Newman 2006)
#Detecting communities by iteratively optimizing for modularity (e.g., Blondel, Guillaume, Lambiotte, & Lefebvre 2008)
#Detecting communities using random walk methods (e.g., Pons & Latapy 2005; Reichardt & Bornholdt 2006)
#Detecting communities using label propagation techniques (e.g., Raghavan, Albert, & Kumara 2007)
# Edge-Betweenness: Girvan-Newman (2001)
GNC <- cluster_edge_betweenness(AHS_Graph, weights = NULL)

# Plot settings: vertices are coloured by community membership.
V(AHS_Graph)$color <- membership(GNC)
# diverging_pal() is one of igraph's three built-in palettes and supports at
# most 11 colours; with more communities fall back to rainbow() so the
# script does not stop here.
AHS_Graph$palette <- if (length(GNC) <= 11) {
  diverging_pal(length(GNC))
} else {
  rainbow(length(GNC))
}
# Same values the former seq(..., length.out = 0.5) calls evaluated to.
V(AHS_Graph)$label.cex <- 0.5 # text size
V(AHS_Graph)$size <- 10 # circle size
# The Kamada-Kawai layout is now actually handed to plot(); before, it was
# computed but never used.
layout1 <- layout_with_kk(AHS_Graph)
plot(AHS_Graph, layout = layout1, edge.arrow.size = .5, edge.arrow.width = .5)
plot_dendrogram(GNC)
modularity(GNC)
membership(GNC)
# Looking at which nodes were assigned to the first 21 communities (the
# original note reports 23 communities in all -- verify with length(GNC)).
head(GNC, n = 21)

# Merging the community ID into the community data set.
str(membership(GNC)) # displays the structure of the membership vector
members <- membership(GNC)
# names(members) are the vertex names, i.e. the node IDs as character.
# They are converted back to their natural type with type.convert().
# Passing the data frame through data.matrix() would replace each ID by its
# factor level code, and the merge would then match the wrong respondents.
GNC_ID <- data.frame(
  GNC_ID = as.numeric(members),
  ego_nid = type.convert(names(members), as.is = TRUE)
)
# all = TRUE keeps respondents without a community as well as graph nodes
# that have no row of their own in AHS_Community7.
AHS_Community7 <- merge(AHS_Community7, GNC_ID, by = 'ego_nid', all = TRUE)
rm(GNC_ID) # no longer needed after the merge
# Leading Eigenvector: Newman (2006)
# The leading-eigenvector method in igraph assumes an undirected graph, so
# the graph is rebuilt without edge directions.
AHS_Graph <- graph_from_data_frame(AHS_EdgeList, directed = FALSE)

# Newman
GC <- cluster_leading_eigen(AHS_Graph, weights = NULL)

# Plot settings: vertices are coloured by community membership.
V(AHS_Graph)$color <- membership(GC)
# diverging_pal() supports at most 11 colours; with more communities fall
# back to rainbow() so the script does not stop here.
AHS_Graph$palette <- if (length(GC) <= 11) {
  diverging_pal(length(GC))
} else {
  rainbow(length(GC))
}
# Same values the former seq(..., length.out = 0.5) calls evaluated to.
V(AHS_Graph)$label.cex <- 0.5 # text size
V(AHS_Graph)$size <- 10 # circle size
# The Kamada-Kawai layout is now actually handed to plot(); before, it was
# computed but never used.
layout1 <- layout_with_kk(AHS_Graph)
plot(AHS_Graph, layout = layout1, edge.arrow.size = .5, edge.arrow.width = .5)
modularity(GC)
membership(GC)
# Looking at which nodes were assigned to the first 12 communities (use
# length(GC) for the total number of communities).
head(GC, n = 12)

# Merging the community ID into the community data set.
str(membership(GC)) # displays the structure of the membership vector
members <- membership(GC)
# names(members) are the vertex names, i.e. the node IDs as character.
# They are converted back to their natural type with type.convert().
# Passing the data frame through data.matrix() would replace each ID by its
# factor level code, and the merge would then match the wrong respondents.
GC_ID <- data.frame(
  GC_ID = as.numeric(members),
  ego_nid = type.convert(names(members), as.is = TRUE)
)
AHS_Community7 <- merge(AHS_Community7, GC_ID, by = 'ego_nid', all = TRUE)
rm(GC_ID) # no longer needed after the merge
# Multilevel Techniques: Louvain
# Louvain in igraph assumes an undirected graph, so an undirected igraph
# object is created to demonstrate this method.
AHS_Graph <- graph_from_data_frame(AHS_EdgeList, directed = FALSE)

# Louvain
# With a resolution parameter of 1 the procedure is the standard Louvain
# method. Higher resolutions produce a larger number of clusters, lower
# resolutions a smaller number.
# The igraph version this script was written for does not support changing
# the resolution parameter, but it can be important.
# NOTE(review): newer igraph releases appear to accept a `resolution`
# argument in cluster_louvain() -- check the installed version.
# Where the groups appear too coarse or too fine, we suggest trying Pajek,
# which is also publicly available: http://mrvar.fdv.uni-lj.si/pajek/pajekman.pdf
LC <- cluster_louvain(AHS_Graph, weights = NULL)

# Plot settings: vertices are coloured by community membership.
V(AHS_Graph)$color <- membership(LC)
# diverging_pal() supports at most 11 colours; with more communities fall
# back to rainbow() so the script does not stop here.
AHS_Graph$palette <- if (length(LC) <= 11) {
  diverging_pal(length(LC))
} else {
  rainbow(length(LC))
}
# Same values the former seq(..., length.out = 0.5) calls evaluated to.
V(AHS_Graph)$label.cex <- 0.5 # text size
V(AHS_Graph)$size <- 10 # circle size
# The Kamada-Kawai layout is now actually handed to plot(); before, it was
# computed but never used.
layout1 <- layout_with_kk(AHS_Graph)
plot(AHS_Graph, layout = layout1, edge.arrow.size = .5, edge.arrow.width = .5)
modularity(LC)
membership(LC)
head(LC, n = 12)

# Merging the community ID into the community data set.
str(membership(LC)) # displays the structure of the membership vector
members <- membership(LC)
# names(members) are the vertex names, i.e. the node IDs as character.
# They are converted back to their natural type with type.convert().
# Passing the data frame through data.matrix() would replace each ID by its
# factor level code, and the merge would then match the wrong respondents.
LC_ID <- data.frame(
  LC_ID = as.numeric(members),
  ego_nid = type.convert(names(members), as.is = TRUE)
)
AHS_Community7 <- merge(AHS_Community7, LC_ID, by = 'ego_nid', all = TRUE)
rm(LC_ID) # no longer needed after the merge
# Random Walk Methods: Walktrap and Spinglass
# Both are run on a directed version of the graph.
AHS_Graph <- graph_from_data_frame(AHS_EdgeList, directed = TRUE)

# Walktrap (Pons & Latapy 2005):
WC <- cluster_walktrap(AHS_Graph)

# Plot settings: vertices are coloured by community membership.
V(AHS_Graph)$color <- membership(WC)
# diverging_pal() supports at most 11 colours; with more communities fall
# back to rainbow() so the script does not stop here.
AHS_Graph$palette <- if (length(WC) <= 11) {
  diverging_pal(length(WC))
} else {
  rainbow(length(WC))
}
# Same values the former seq(..., length.out = 0.5) calls evaluated to.
V(AHS_Graph)$label.cex <- 0.5 # text size
V(AHS_Graph)$size <- 10 # circle size
# The Kamada-Kawai layout is now actually handed to plot(); before, it was
# computed but never used.
layout1 <- layout_with_kk(AHS_Graph)
plot(AHS_Graph, layout = layout1, edge.arrow.size = .5, edge.arrow.width = .5)
plot_dendrogram(WC)
modularity(WC)
membership(WC)
head(WC, n = 30)

# Merging the community ID into the community data set.
str(membership(WC)) # displays the structure of the membership vector
members <- membership(WC)
# names(members) are the vertex names, i.e. the node IDs as character.
# They are converted back to their natural type with type.convert().
# Passing the data frame through data.matrix() would replace each ID by its
# factor level code, and the merge would then match the wrong respondents.
WC_ID <- data.frame(
  WC_ID = as.numeric(members),
  ego_nid = type.convert(names(members), as.is = TRUE)
)
AHS_Community7 <- merge(AHS_Community7, WC_ID, by = 'ego_nid', all = TRUE)
rm(WC_ID) # no longer needed after the merge
# Spinglass (Reichardt & Bornholdt 2006)
# This section used to call cluster_walktrap() a second time, so SPG_ID was
# just a copy of the walktrap solution. It now runs the spinglass algorithm.
# cluster_spin_glass() only works on a connected graph, so the analysis is
# restricted to the largest weakly connected component (when the graph is
# already connected this is the whole graph). Nodes outside that component
# receive NA for SPG_ID in the merge below.
# NOTE(review): spinglass is a stochastic method; call set.seed() first if
# reproducible community labels are needed.
spg_comp <- components(AHS_Graph, mode = "weak")
SPG_Graph <- induced_subgraph(
  AHS_Graph,
  which(spg_comp$membership == which.max(spg_comp$csize))
)
SPG <- cluster_spin_glass(SPG_Graph)

# Plot settings: vertices are coloured by community membership.
V(SPG_Graph)$color <- membership(SPG)
# diverging_pal() supports at most 11 colours; with more communities fall
# back to rainbow() so the script does not stop here.
SPG_Graph$palette <- if (length(SPG) <= 11) {
  diverging_pal(length(SPG))
} else {
  rainbow(length(SPG))
}
# Same values the former seq(..., length.out = 0.5) calls evaluated to.
V(SPG_Graph)$label.cex <- 0.5 # text size
V(SPG_Graph)$size <- 10 # circle size
# The Kamada-Kawai layout is now actually handed to plot(); before, it was
# computed but never used.
layout1 <- layout_with_kk(SPG_Graph)
plot(SPG_Graph, layout = layout1, edge.arrow.size = .5, edge.arrow.width = .5)
# No plot_dendrogram() here: spinglass does not produce a hierarchical
# (merge-based) solution, so there is no dendrogram to draw.
modularity(SPG)
membership(SPG)
head(SPG, n = 30)

# Merging the community ID into the community data set.
str(membership(SPG)) # displays the structure of the membership vector
members <- membership(SPG)
# names(members) are the vertex names, i.e. the node IDs as character.
# They are converted back to their natural type with type.convert().
# Passing the data frame through data.matrix() would replace each ID by its
# factor level code, and the merge would then match the wrong respondents.
SPG_ID <- data.frame(
  SPG_ID = as.numeric(members),
  ego_nid = type.convert(names(members), as.is = TRUE)
)
AHS_Community7 <- merge(AHS_Community7, SPG_ID, by = 'ego_nid', all = TRUE)
rm(SPG_ID, spg_comp) # no longer needed after the merge
# Label Propagation Techniques (Raghavan, Albert, & Kumara 2007)
# Label propagation in igraph assumes an undirected graph, so the graph is
# rebuilt without edge directions.
AHS_Graph <- graph_from_data_frame(AHS_EdgeList, directed = FALSE)

# Label Propagation
LP <- cluster_label_prop(AHS_Graph)

# Plot settings: vertices are coloured by community membership.
V(AHS_Graph)$color <- membership(LP)
# diverging_pal() supports at most 11 colours; with more communities fall
# back to rainbow() so the script does not stop here.
AHS_Graph$palette <- if (length(LP) <= 11) {
  diverging_pal(length(LP))
} else {
  rainbow(length(LP))
}
# Same values the former seq(..., length.out = 0.5) calls evaluated to.
V(AHS_Graph)$label.cex <- 0.5 # text size
V(AHS_Graph)$size <- 10 # circle size
# The Kamada-Kawai layout is now actually handed to plot(); before, it was
# computed but never used.
layout1 <- layout_with_kk(AHS_Graph)
plot(AHS_Graph, layout = layout1, edge.arrow.size = .5, edge.arrow.width = .5)
modularity(LP)
membership(LP)
head(LP, n = 32)

# Merging the community ID into the community data set.
str(membership(LP)) # displays the structure of the membership vector
members <- membership(LP)
# names(members) are the vertex names, i.e. the node IDs as character.
# They are converted back to their natural type with type.convert().
# Passing the data frame through data.matrix() would replace each ID by its
# factor level code, and the merge would then match the wrong respondents.
LP_ID <- data.frame(
  LP_ID = as.numeric(members),
  ego_nid = type.convert(names(members), as.is = TRUE)
)
AHS_Community7 <- merge(AHS_Community7, LP_ID, by = 'ego_nid', all = TRUE)
rm(LP_ID) # no longer needed after the merge
# ADDITIONAL METHODS
# Fast-Greedy Techniques: (Clauset & Moore 2004)
# Detects communities using hierarchical agglomeration algorithms that
# consider the vertices (nodes), edges, and the depth of the dendrogram
# (hierarchical structure).
# The method only operates on graphs of unique node-edge pairs (no multiple
# edges). The undirected version of this edge list can contain the same pair
# more than once (e.g. when two respondents nominate each other), which makes
# cluster_fast_greedy() stop with an error. simplify() therefore collapses
# parallel edges (and drops self-loops) first.
# NOTE(review): under igraph's default attribute-combination option the
# weights of collapsed edges are summed, so a mutual tie gets weight 2 --
# confirm that this is the weighting wanted here.
AHS_Graph <- simplify(graph_from_data_frame(AHS_EdgeList, directed = FALSE))

# Fast-Greedy
FG <- cluster_fast_greedy(AHS_Graph, weights = NULL)

# Plot settings: vertices are coloured by community membership.
V(AHS_Graph)$color <- membership(FG)
# diverging_pal() supports at most 11 colours; with more communities fall
# back to rainbow() so the script does not stop here.
AHS_Graph$palette <- if (length(FG) <= 11) {
  diverging_pal(length(FG))
} else {
  rainbow(length(FG))
}
# InfoMAP (Rosvall, Axelsson, Bergstrom 2009)
# Uses the map equation, which models a network as a system of flows.
# http://www.tp.umu.se/~rosvall/livemod/mapequation/
AHS_Graph <- graph_from_data_frame(AHS_EdgeList, directed = TRUE)

# InfoMAP
IMP <- cluster_infomap(AHS_Graph)

# Plot settings: vertices are coloured by community membership.
V(AHS_Graph)$color <- membership(IMP)
# diverging_pal() supports at most 11 colours; with more communities fall
# back to rainbow() so the script does not stop here.
AHS_Graph$palette <- if (length(IMP) <= 11) {
  diverging_pal(length(IMP))
} else {
  rainbow(length(IMP))
}
# Same values the former seq(..., length.out = 0.5) calls evaluated to.
V(AHS_Graph)$label.cex <- 0.5 # text size
V(AHS_Graph)$size <- 10 # circle size
# The Kamada-Kawai layout is now actually handed to plot(); before, it was
# computed but never used.
layout1 <- layout_with_kk(AHS_Graph)
plot(AHS_Graph, layout = layout1, edge.arrow.size = .5, edge.arrow.width = .5)
modularity(IMP)
membership(IMP)
head(IMP, n = 48)

# Merging the community ID into the community data set.
str(membership(IMP)) # displays the structure of the membership vector
members <- membership(IMP)
# names(members) are the vertex names, i.e. the node IDs as character.
# They are converted back to their natural type with type.convert().
# Passing the data frame through data.matrix() would replace each ID by its
# factor level code, and the merge would then match the wrong respondents.
IMP_ID <- data.frame(
  IMP_ID = as.numeric(members),
  ego_nid = type.convert(names(members), as.is = TRUE)
)
AHS_Community7 <- merge(AHS_Community7, IMP_ID, by = 'ego_nid', all = TRUE)
rm(IMP_ID) # no longer needed after the merge
# Optimal Modularity (Good, de Montjoye, and Clauset 2010)
# http://tuvalu.santafe.edu/~aaronc/modularity/
# NOTE(review): the original note describes this as a simulated annealing
# method that repeatedly samples the network; igraph's cluster_optimal()
# is documented as an exact modularity maximisation -- confirm which
# description applies.
# WARNING: This routine is computationally intensive!!!
AHS_Graph <- graph_from_data_frame(AHS_EdgeList, directed = FALSE)

# Optimal Modularity
OM <- cluster_optimal(AHS_Graph)

# Plot settings: vertices are coloured by community membership.
V(AHS_Graph)$color <- membership(OM)
# diverging_pal() supports at most 11 colours; with more communities fall
# back to rainbow() so the script does not stop here.
AHS_Graph$palette <- if (length(OM) <= 11) {
  diverging_pal(length(OM))
} else {
  rainbow(length(OM))
}
# Same values the former seq(..., length.out = 0.5) calls evaluated to.
V(AHS_Graph)$label.cex <- 0.5 # text size
V(AHS_Graph)$size <- 10 # circle size
# The Kamada-Kawai layout is now actually handed to plot(); before, it was
# computed but never used.
layout1 <- layout_with_kk(AHS_Graph)
plot(AHS_Graph, layout = layout1, edge.arrow.size = .5, edge.arrow.width = .5)
modularity(OM)
membership(OM)
head(OM, n = 6)

# Merging the community ID into the community data set.
str(membership(OM)) # displays the structure of the membership vector
members <- membership(OM)
# names(members) are the vertex names, i.e. the node IDs as character.
# They are converted back to their natural type with type.convert().
# Passing the data frame through data.matrix() would replace each ID by its
# factor level code, and the merge would then match the wrong respondents.
OM_ID <- data.frame(
  OM_ID = as.numeric(members),
  ego_nid = type.convert(names(members), as.is = TRUE)
)
AHS_Community7 <- merge(AHS_Community7, OM_ID, by = 'ego_nid', all = TRUE)
rm(OM_ID) # no longer needed after the merge
#Measures that Can be Helpful for Evaluating Community Fit of Social Networks
#Shannon Entropy Scores with respect to race and gender; the expectation is that, in a school setting, a better community solution will have lower entropy scores
#Modularity
#Within-group distance
#Proportion of within-group triads against total triads