#Jonathan H. Morgan #SN&H Example Data #12 May 2016 #LOADING PACKAGES library (tabplot) library(corrgram) library (corrplot) library (ggplot2) library(VIM) library (plyr) library (dplyr) library (readr) library (haven) library (tidyr) library (igraph) ########################### # IMPORTING ADHealth DATA # ########################### AHS_WPVAR <- read.csv('C:/Users/jhm18/Desktop/Duke Projects/DNAC/SN&H/ahs_wpvar.csv') AHS_WPVAR <- read.csv('C:/Users/Jonathan H Morgan/Desktop/Duke Projects/SN&H/ahs_wpvar.csv') #AHS_WPVAR ############################## # Diagnostic Visualizations # ############################## #SIMPLE VISUALIZATION: IN-DEGREE BY NUMBER OF FRIENDS WHO HAVE SEX plot(AHS_WPVAR$IDG,AHS_WPVAR$FRNDSEX,xlab="Weighted In-Degree",ylab="Weighted Number of Sexually Active Friends",main="AddHealth") ggplot (AHS_WPVAR, aes (x=IDG, y=FRNDSEX)) + geom_point() + geom_smooth() + xlab("Weighted In-Degree") + ylab("Weighted Number of Sexually Active Friends") #VISUALIZATIONS Of MULTIPLE VARAIBLES AHS_FRIEND_BEHAVIORS <- AHS_WPVAR[c(29, 37:38)] #Sub-setting the data to look at the relationship between in-dgree, peer smoking, and peer sexual activity corrplot(cor(AHS_FRIEND_BEHAVIORS[,c("IDG","FRNDSEX","FRNDSMOKE")]), type = 'lower') tableplot(AHS_FRIEND_BEHAVIORS, sortCol = (IDG), num_scale = "lin", legend.line = 8, fontsize = 12, numPals = c(IDG = "Oranges")) #DETECTING MISSING DATA aggr(AHS_WPVAR) ########################################## # Creating School 7's Edgelist # ########################################## #Step 1: Subsetting AHS_WPVAR to Isolate Community 1 #AHS_Community1 <- subset(AHS_WPVAR, AHS_WPVAR[,1] == c(1)) AHS_Community7 <- subset(AHS_WPVAR, AHS_WPVAR[,1] == c(7)) #Step 2: Creating Male and Female Data Subsets for Community 1 AHS_M <- AHS_Community7[c(3, 4:8)] AHS_F <- AHS_Community7[c(3, 14:18)] #Step 3: Converting from wide to long data set format using tidyr package AHS_Male <- AHS_M %>% gather(M_ID, value, mfnid_1:mfnid_5, na.rm = TRUE) AHS_Female <- AHS_F %>% gather(F_ID, value, ffnid_1:ffnid_5, na.rm = TRUE) #Step 4: Deleting 9999 values from the data subsets; the gather statements have eliminated the other missing values. #Deleting M_ID and F_ID #Renaming ego_nid to Sender #Renaming Value to Target AHS_Male <- subset(AHS_Male, AHS_Male[,3] != c(99999)) AHS_Male [,"Female"] <- c(0) AHS_Male [, 2] <- NULL names(AHS_Male)[1] <- "Sender" names(AHS_Male)[2] <- "Target" AHS_Female <- subset(AHS_Female, AHS_Female[,3] != c(99999)) AHS_Female [,"Female"] <- c(1) AHS_Female [, 2] <- NULL names(AHS_Female)[1] <- "Sender" names(AHS_Female)[2] <- "Target" #Step 5: Combining Male and Female Edgelists AHS_EdgeList <- rbind(AHS_Male, AHS_Female) AHS_EdgeList [, "weight"] <- c(1) AHS_EdgeList %>% arrange(Sender) #Step 6: Creating an iGraph Style Edge List AHS_EdgeList <- AHS_EdgeList[c(1, 2, 4)] #Step 7: Removing Non-essential data sets rm(AHS_F, AHS_Female, AHS_M, AHS_Male) #################################### # Community Detection Using iGraph # #################################### #For more information: http://igraph.org/r/doc/communities.html #Creating a Graph Obeject for Subsequent Analyses AHS_Graph=graph.data.frame(AHS_EdgeList) #Diagnostic Measures #Normalized Shannon Entropy Score graph.diversity(AHS_Graph, weights = NULL, vids = V(AHS_Graph)) #Jaccard Similarity similarity(AHS_Graph, method = "jaccard") #Initial Network Visualization using Fructerman.Reingold Layout # set seed to make the layout reproducible set.seed(3952) E(AHS_Graph)$color <- "grey" V(AHS_Graph)$color <- "grey" V(AHS_Graph)[degree(AHS_Graph, mode="in")>10]$color <- "yellow" #Destinguishing High Degree Nodes as yellow V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) tkplot(AHS_Graph, layout=layout1) #tkplot allows us to manually manipulte the visualization if we want #Finding Strongly Connected Components #Strongly Connected Component Definition: #A directed graph is called strongly connected #if there is a path in each direction between each pair of vertices of the graph. scc <- clusters(AHS_Graph, "strong") #Type scc in the console to have the strongly connected components reported #Membership: Top number indicates the node ID, the bottom the component to which it belongs #csize: Component Size, the number of nodes in the component. #no: no indicates the number of components in the graph. We have 7 components, 1 central component and 6 nodes that recieve but do not send ties. #Type v_idx in the console to get this report #In graphs with multiple components, it can be helpful to distinguish connected components from isolates. #Community Detection Algorithms in iGraph: Approaches Supported by iGraph #Detecting communitities by iteratively calculating edge betweeness (e.g., Girvan & Newman 2001) #Detecting communities by using eigenvector matrices (e.g., Newman 2006) #Detecting communities by iteratively optimizing for modularity (e.g., Blondel, Guillaume, Lambiotte, & Lefebvre 2008) #Detecting communities using random walk methods (e.g, Pons & Latapy 2005; Reichardt & Bornholdt 2006) #Detecting communities using label propogation techniques (e.g., Ragavan, Albert, & Kumara 2007) #Edge-Betweeness: Girvan-Newman (2001) GNC <- cluster_edge_betweenness(AHS_Graph, weights = NULL) V(AHS_Graph)$color <-membership(GNC) #Plot setting specifying the coloring of vertices by community AHS_Graph$palette <- diverging_pal(length(GNC)) #Plot setting specifying the color pallette I am using (iGraph supports 3) V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) plot_dendrogram(GNC) modularity(GNC) membership(GNC) head(GNC, n=21) #Looking at what nodes got assigned to the communities (23 communities in all) #Transforming iGraph's Community Object GNC AND Merging the Community ID Variable into the Community Dataset str(membership(GNC)) #Creating a string from iGraph's Community Object members <- membership(GNC) GNC_ID <- data.frame(GNC_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame GNC_ID <- data.matrix(GNC_ID) #Converting GNC_ID from a Factor Variable to a Interger GNC_ID <- data.frame(GNC_ID) #Converting the data set back into a dataframe AHS_Community7 <- merge(AHS_Community7, GNC_ID, by= 'ego_nid', all=TRUE) #Merging the data sets. rm(GNC_ID) #Leading Eigenvector: Newman (2006) #Newman in iGraph assumes an undirected graph AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE) #Newman GC <- cluster_leading_eigen(AHS_Graph, weights = NULL) V(AHS_Graph)$color <-membership(GC) AHS_Graph$palette <- diverging_pal(length(GC)) V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) modularity(GC) membership(GC) head(GC, n=12) #Looking at what nodes got assigned to the communities (23 communities in all) #Transforming iGraph's Community Object GC AND Merging the Community ID Variable into the Community Dataset str(membership(GC)) #Creating a string from iGraph's Community Object members <- membership(GC) GC_ID <- data.frame(GC_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame GC_ID <- data.matrix(GC_ID) #Converting GNC_ID from a Factor Variable to a Interger GC_ID <- data.frame(GC_ID) #Converting the data set back into a dataframe AHS_Community7 <- merge(AHS_Community7, GC_ID, by= 'ego_nid', all=TRUE) #Merging the data sets. rm(GC_ID) #Removing GC_ID, as it is now unessential #Multilevel Techniques: Louvain #Louvain in iGraph assumes an undirected graph #Consequently, we are created an undirected iGraph object to demonstrate this method. AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE) #Louvain #Resolution parameter set to 1 #When the resolution parameter is 1 the standard Louvain method. #Higher resolutions produce larger numbe of clusters, while lower resolutions produce lower number of clusters. #iGraph does not support changing the resolution parameter, but it can be important. #In instances where the group appear too coarse or too fine, we suggest trying Pajek which is also publicly avaialable: http://mrvar.fdv.uni-lj.si/pajek/pajekman.pdf LC <- cluster_louvain(AHS_Graph, weights = NULL) V(AHS_Graph)$color <-membership(LC) AHS_Graph$palette <- diverging_pal(length(LC)) V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) modularity(LC) membership(LC) head(LC, n=12) #Transforming iGraph's Community Object GC AND Merging the Community ID Variable into the Community Dataset str(membership(LC)) #Creating a string from iGraph's Community Object members <- membership(LC) LC_ID <- data.frame(LC_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame LC_ID <- data.matrix(LC_ID) #Converting GNC_ID from a Factor Variable to a Interger LC_ID <- data.frame(LC_ID) #Converting the data set back into a dataframe AHS_Community7 <- merge(AHS_Community7, LC_ID, by= 'ego_nid', all=TRUE) #Merging the data sets. rm(LC_ID) #Removing GC_ID, as it is now unessential #Random Walk Methods: Walktrap and Spinglass AHS_Graph=graph.data.frame(AHS_EdgeList, directed=TRUE) #Walktrap (Pons & Latapy 2005): WC <- cluster_walktrap(AHS_Graph) V(AHS_Graph)$color <-membership(WC) AHS_Graph$palette <- diverging_pal(length(WC)) V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) plot_dendrogram(WC) modularity(WC) membership(WC) head(WC, n=30) #Transforming iGraph's Community Object WC AND Merging the Community ID Variable into the Community Dataset str(membership(WC)) #Creating a string from iGraph's Community Object members <- membership(WC) WC_ID <- data.frame(WC_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame WC_ID <- data.matrix(WC_ID) #Converting GNC_ID from a Factor Variable to a Interger WC_ID <- data.frame(WC_ID) #Converting the data set back into a dataframe AHS_Community7 <- merge(AHS_Community7, WC_ID, by= 'ego_nid', all=TRUE) #Merging the data sets. rm(WC_ID) #Removing GC_ID, as it is now unessential #Spinglass (Reichardt & Bornholdt 2006) SPG <- cluster_walktrap(AHS_Graph) V(AHS_Graph)$color <-membership(SPG) AHS_Graph$palette <- diverging_pal(length(SPG)) V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) plot_dendrogram(SPG) modularity(SPG) membership(SPG) head(WC, n=30) #Transforming iGraph's Community Object SPG AND Merging the Community ID Variable into the Community Dataset str(membership(SPG)) #Creating a string from iGraph's Community Object members <- membership(SPG) SPG_ID <- data.frame(SPG_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame SPG_ID <- data.matrix(SPG_ID) #Converting GNC_ID from a Factor Variable to a Interger SPG_ID <- data.frame(SPG_ID) #Converting the data set back into a dataframe AHS_Community7 <- merge(AHS_Community7, SPG_ID, by= 'ego_nid', all=TRUE) #Merging the data sets. rm(SPG_ID) #Removing GC_ID, as it is now unessential #Label Propogation Techniques (Ragavan, Albert, & Kumara 2007) #Label_Prop in iGraph assumes an undirected graph AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE) #Label Propogation LP <- cluster_label_prop(AHS_Graph) V(AHS_Graph)$color <-membership(LP) AHS_Graph$palette <- diverging_pal(length(LP)) V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) plot(AHS_Graph,edge.arrow.size=.5, edge.arrow.width=.5) modularity(LP) membership(LP) head(LP, n=32) #Transforming iGraph's Community Object LP AND Merging the Community ID Variable into the Community Dataset str(membership(LP)) #Creating a string from iGraph's Community Object members <- membership(LP) LP_ID <- data.frame(LP_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame LP_ID <- data.matrix(LP_ID) #Converting GNC_ID from a Factor Variable to a Interger LP_ID <- data.frame(LP_ID) #Converting the data set back into a dataframe AHS_Community7 <- merge(AHS_Community7, LP_ID, by= 'ego_nid', all=TRUE) #Merging the data sets. rm(LP_ID) #Removing GC_ID, as it is now unessential #ADDITIONAL METHODS #Fast-Greedy Techniques: (Clauset & Moore 2004) #DetectS communities using hierarchical agglomeration alorithms that consider the vertices (nodes), edges, and the depth of the dendogram (hierarchical structure) #Operable on large data sets of unique node-edge pairs, not applicable here. AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE) #Fast-Greedy FG <- cluster_fast_greedy(AHS_Graph, weights = NULL) V(AHS_Graph)$color <-membership(FG) AHS_Graph$palette <- diverging_pal(length(FG)) #InfoMAP (Rosvall, Axelsson, Berstrom 2009) #Using a map algorithm that models a network work as a system of flows. #http://www.tp.umu.se/~rosvall/livemod/mapequation/ AHS_Graph=graph.data.frame(AHS_EdgeList, directed=TRUE) #InfoMAP IMP <- cluster_infomap(AHS_Graph) V(AHS_Graph)$color <-membership(IMP) AHS_Graph$palette <- diverging_pal(length(IMP)) V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) modularity(IMP) membership(IMP) head(IMP, n=48) #Transforming iGraph's Community Object LP AND Merging the Community ID Variable into the Community Dataset str(membership(IMP)) #Creating a string from iGraph's Community Object members <- membership(IMP) IMP_ID <- data.frame(IMP_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame IMP_ID <- data.matrix(IMP_ID) #Converting GNC_ID from a Factor Variable to a Interger IMP_ID <- data.frame(IMP_ID) #Converting the data set back into a dataframe AHS_Community7 <- merge(AHS_Community7, IMP_ID, by= 'ego_nid', all=TRUE) #Merging the data sets. rm(IMP_ID) #Removing GC_ID, as it is now unessential #Optimal Modularity (Good, Yva de Montjoye, and Clauset 2010) #Simulated annealing method that repeatedly samples the network #http://tuvalu.santafe.edu/~aaronc/modularity/ #WARNING: This routine is computationally intensive!!! AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE) #Optimal Modularity OM <- cluster_optimal(AHS_Graph) V(AHS_Graph)$color <-membership(OM) AHS_Graph$palette <- diverging_pal(length(OM)) V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5) ## text size V(AHS_Graph)$size <- seq(10,60,length.out=0.5) ## circle size proportional to text size layout1 <- layout.kamada.kawai(AHS_Graph) plot(AHS_Graph,edge.arrow.size=.5, edge.arrow.width=.5) modularity(OM) membership(OM) head(OM, n=6) #Transforming iGraph's Community Object LP AND Merging the Community ID Variable into the Community Dataset str(membership(OM)) #Creating a string from iGraph's Community Object members <- membership(OM) OM_ID <- data.frame(OM_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame OM_ID <- data.matrix(OM_ID) #Converting GNC_ID from a Factor Variable to a Interger OM_ID <- data.frame(OM_ID) #Converting the data set back into a dataframe AHS_Community7 <- merge(AHS_Community7, OM_ID, by= 'ego_nid', all=TRUE) #Merging the data sets. rm(OM_ID) #Removing GC_ID, as it is now unessential #Measures that Can be Helpful for Evalauting Community Fit of Social Networks #Shanon Entropy Scores with respect to race and gender, with the expectation being that in a school setting a better community solution will have lower entropy scores #Modularity #Within-group distance #Proportion of within-group triads against total triads