SN&H COMMUNITY DETECTION

Download R source file


              #Jonathan H. Morgan
#SN&H Example Data
#12 May 2016

#LOADING PACKAGES
library (tabplot)
library(corrgram)
library (corrplot)
library (ggplot2)
library(VIM) 
library (plyr)
library (dplyr)
library (readr)
library (haven)
library (tidyr)
library (igraph)

###########################
# IMPORTING ADHealth DATA #
###########################

AHS_WPVAR <- read.csv('C:/Users/jhm18/Desktop/Duke Projects/DNAC/SN&H/ahs_wpvar.csv')

AHS_WPVAR <- read.csv('C:/Users/Jonathan H Morgan/Desktop/Duke Projects/SN&H/ahs_wpvar.csv')
#AHS_WPVAR

##############################
#  Diagnostic Visualizations #
##############################

#SIMPLE VISUALIZATION: IN-DEGREE BY NUMBER OF FRIENDS WHO HAVE SEX
plot(AHS_WPVAR$IDG,AHS_WPVAR$FRNDSEX,xlab="Weighted In-Degree",ylab="Weighted Number of Sexually Active Friends",main="AddHealth")

ggplot (AHS_WPVAR, aes (x=IDG, y=FRNDSEX)) + geom_point() + geom_smooth() + xlab("Weighted In-Degree") + ylab("Weighted Number of Sexually Active Friends")

#VISUALIZATIONS Of MULTIPLE VARAIBLES
AHS_FRIEND_BEHAVIORS <- AHS_WPVAR[c(29, 37:38)] #Sub-setting the data to look at the relationship between in-dgree, peer smoking, and peer sexual activity

corrplot(cor(AHS_FRIEND_BEHAVIORS[,c("IDG","FRNDSEX","FRNDSMOKE")]), type = 'lower')
tableplot(AHS_FRIEND_BEHAVIORS, sortCol = (IDG), num_scale = "lin", legend.line = 8, fontsize = 12, numPals = c(IDG = "Oranges"))

#DETECTING MISSING DATA
aggr(AHS_WPVAR)

##########################################
#   Creating School 7's Edgelist         #
##########################################

#Step 1: Subsetting AHS_WPVAR to Isolate Community 1
#AHS_Community1 <- subset(AHS_WPVAR, AHS_WPVAR[,1] == c(1))
AHS_Community7 <- subset(AHS_WPVAR, AHS_WPVAR[,1] == c(7))

#Step 2: Creating Male and Female Data Subsets for Community 1
AHS_M <- AHS_Community7[c(3, 4:8)]
AHS_F <- AHS_Community7[c(3, 14:18)]

#Step 3: Converting from wide to long data set format using tidyr package 
AHS_Male <- AHS_M %>%
  gather(M_ID, value, mfnid_1:mfnid_5, na.rm = TRUE)

AHS_Female <- AHS_F %>%
  gather(F_ID, value, ffnid_1:ffnid_5, na.rm = TRUE)


#Step 4: Deleting 9999 values from the data subsets; the gather statements have eliminated the other missing values.
  #Deleting M_ID and F_ID
  #Renaming ego_nid to Sender
  #Renaming Value to Target

AHS_Male <- subset(AHS_Male, AHS_Male[,3] != c(99999))
AHS_Male [,"Female"] <- c(0)
AHS_Male [, 2] <- NULL
names(AHS_Male)[1] <- "Sender"
names(AHS_Male)[2] <- "Target"

AHS_Female <- subset(AHS_Female, AHS_Female[,3] != c(99999))
AHS_Female [,"Female"] <- c(1)
AHS_Female [, 2] <- NULL
names(AHS_Female)[1] <- "Sender"
names(AHS_Female)[2] <- "Target"


#Step 5: Combining Male and Female Edgelists
AHS_EdgeList <- rbind(AHS_Male, AHS_Female)
AHS_EdgeList [, "weight"] <- c(1)
AHS_EdgeList %>% arrange(Sender)

#Step 6: Creating an iGraph Style Edge List
AHS_EdgeList <- AHS_EdgeList[c(1, 2, 4)]

#Step 7: Removing Non-essential data sets
rm(AHS_F, AHS_Female, AHS_M, AHS_Male)

####################################
# Community Detection Using iGraph #
####################################

#For more information: http://igraph.org/r/doc/communities.html

#Creating a Graph Obeject for Subsequent Analyses
AHS_Graph=graph.data.frame(AHS_EdgeList)

#Diagnostic Measures

#Normalized Shannon Entropy Score
graph.diversity(AHS_Graph, weights = NULL, vids = V(AHS_Graph))

#Jaccard Similarity
similarity(AHS_Graph, method = "jaccard")

#Initial Network Visualization using Fructerman.Reingold Layout

# set seed to make the layout reproducible
set.seed(3952)
E(AHS_Graph)$color <- "grey"
V(AHS_Graph)$color <- "grey"
V(AHS_Graph)[degree(AHS_Graph, mode="in")>10]$color <- "yellow"  #Destinguishing High Degree Nodes as yellow
V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
layout1 <- layout.kamada.kawai(AHS_Graph)
tkplot(AHS_Graph, layout=layout1)   #tkplot allows us to manually manipulte the visualization if we want

#Finding Strongly Connected Components

#Strongly Connected Component Definition:
#A directed graph is called strongly connected 
#if there is a path in each direction between each pair of vertices of the graph.

scc <- clusters(AHS_Graph, "strong")     #Type scc in the console to have the strongly connected components reported
  #Membership: Top number indicates the node ID, the bottom the component to which it belongs
  #csize: Component Size, the number of nodes in the component. 
  #no: no indicates the number of components in the graph. We have 7 components, 1 central component and 6 nodes that recieve but do not send ties. 

 #Type v_idx in the console to get this report
  #In graphs with multiple components, it can be helpful to distinguish connected components from isolates.


#Community Detection Algorithms in iGraph: Approaches Supported by iGraph
  #Detecting communitities by iteratively calculating edge betweeness (e.g., Girvan & Newman 2001)
  #Detecting communities by using eigenvector matrices (e.g., Newman 2006)
  #Detecting communities by iteratively optimizing for modularity (e.g., Blondel, Guillaume, Lambiotte, & Lefebvre 2008)
  #Detecting communities using random walk methods (e.g, Pons & Latapy 2005; Reichardt & Bornholdt 2006)
  #Detecting communities using label propogation techniques (e.g., Ragavan, Albert, & Kumara 2007)

#Edge-Betweeness: Girvan-Newman (2001)

  GNC <- cluster_edge_betweenness(AHS_Graph, weights = NULL)
    V(AHS_Graph)$color <-membership(GNC)              #Plot setting specifying the coloring of vertices by community
    AHS_Graph$palette <- diverging_pal(length(GNC))   #Plot setting specifying the color pallette I am using (iGraph supports 3)
    V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
    V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
    layout1 <- layout.kamada.kawai(AHS_Graph)
    plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5)
    plot_dendrogram(GNC)
    
    modularity(GNC)
    membership(GNC)
    head(GNC, n=21)    #Looking at what nodes got assigned to the communities (23 communities in all)

#Transforming iGraph's Community Object GNC AND Merging the Community ID Variable into the Community Dataset
str(membership(GNC))            #Creating a string from iGraph's Community Object
members <- membership(GNC)      
GNC_ID <- data.frame(GNC_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame
GNC_ID <- data.matrix(GNC_ID)  #Converting GNC_ID from a Factor Variable to a Interger
GNC_ID <- data.frame(GNC_ID)   #Converting the data set back into a dataframe
AHS_Community7 <- merge(AHS_Community7, GNC_ID, by= 'ego_nid', all=TRUE) #Merging the data sets.
rm(GNC_ID)
    
#Leading Eigenvector: Newman (2006)
    #Newman in iGraph assumes an undirected graph

AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE)
    
    #Newman
    GC <- cluster_leading_eigen(AHS_Graph, weights = NULL)
    V(AHS_Graph)$color <-membership(GC)
    AHS_Graph$palette <- diverging_pal(length(GC))
    V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
    V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
    layout1 <- layout.kamada.kawai(AHS_Graph)
    plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5)
    
    modularity(GC)
    membership(GC)
    head(GC, n=12)   #Looking at what nodes got assigned to the communities (23 communities in all)
    
#Transforming iGraph's Community Object GC AND Merging the Community ID Variable into the Community Dataset
str(membership(GC))            #Creating a string from iGraph's Community Object
members <- membership(GC)      
GC_ID <- data.frame(GC_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame
GC_ID <- data.matrix(GC_ID)  #Converting GNC_ID from a Factor Variable to a Interger
GC_ID <- data.frame(GC_ID)   #Converting the data set back into a dataframe
AHS_Community7 <- merge(AHS_Community7, GC_ID, by= 'ego_nid', all=TRUE) #Merging the data sets.
rm(GC_ID)  #Removing GC_ID, as it is now unessential
    
#Multilevel Techniques: Louvain
  #Louvain in iGraph assumes an undirected graph
  #Consequently, we are created an undirected iGraph object to demonstrate this method.

AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE)

  #Louvain
  #Resolution parameter set to 1
  #When the resolution parameter is 1 the standard Louvain method.
  #Higher resolutions produce larger numbe of clusters, while lower resolutions produce lower number of clusters.
  #iGraph does not support changing the resolution parameter, but it can be important.
  #In instances where the group appear too coarse or too fine, we suggest trying Pajek which is also publicly avaialable: http://mrvar.fdv.uni-lj.si/pajek/pajekman.pdf

  LC <- cluster_louvain(AHS_Graph, weights = NULL)
  V(AHS_Graph)$color <-membership(LC)
  AHS_Graph$palette <- diverging_pal(length(LC))
  V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
  V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
  layout1 <- layout.kamada.kawai(AHS_Graph)
  plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5)
  
  modularity(LC)
  membership(LC)
  head(LC, n=12) 
  
#Transforming iGraph's Community Object GC AND Merging the Community ID Variable into the Community Dataset
str(membership(LC))            #Creating a string from iGraph's Community Object
members <- membership(LC)      
LC_ID <- data.frame(LC_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame
LC_ID <- data.matrix(LC_ID)  #Converting GNC_ID from a Factor Variable to a Interger
LC_ID <- data.frame(LC_ID)   #Converting the data set back into a dataframe
AHS_Community7 <- merge(AHS_Community7, LC_ID, by= 'ego_nid', all=TRUE) #Merging the data sets.
rm(LC_ID)  #Removing GC_ID, as it is now unessential

#Random Walk Methods: Walktrap and Spinglass

AHS_Graph=graph.data.frame(AHS_EdgeList, directed=TRUE)

  #Walktrap (Pons & Latapy 2005): 
  WC <- cluster_walktrap(AHS_Graph)
  V(AHS_Graph)$color <-membership(WC)
  AHS_Graph$palette <- diverging_pal(length(WC))
  V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
  V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
  layout1 <- layout.kamada.kawai(AHS_Graph)
  plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) 
  plot_dendrogram(WC)
  
  modularity(WC)
  membership(WC)
  head(WC, n=30) 
  
#Transforming iGraph's Community Object WC AND Merging the Community ID Variable into the Community Dataset
str(membership(WC))            #Creating a string from iGraph's Community Object
members <- membership(WC)      
WC_ID <- data.frame(WC_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame
WC_ID <- data.matrix(WC_ID)  #Converting GNC_ID from a Factor Variable to a Interger
WC_ID <- data.frame(WC_ID)   #Converting the data set back into a dataframe
AHS_Community7 <- merge(AHS_Community7, WC_ID, by= 'ego_nid', all=TRUE) #Merging the data sets.
rm(WC_ID)  #Removing GC_ID, as it is now unessential
  
  
  #Spinglass (Reichardt & Bornholdt 2006)
  SPG <- cluster_walktrap(AHS_Graph)
  V(AHS_Graph)$color <-membership(SPG)
  AHS_Graph$palette <- diverging_pal(length(SPG))
  V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
  V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
  layout1 <- layout.kamada.kawai(AHS_Graph)
  plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) 
  plot_dendrogram(SPG)
  
  modularity(SPG)
  membership(SPG)
  head(WC, n=30) 

#Transforming iGraph's Community Object SPG AND Merging the Community ID Variable into the Community Dataset
str(membership(SPG))            #Creating a string from iGraph's Community Object
members <- membership(SPG)      
SPG_ID <- data.frame(SPG_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame
SPG_ID <- data.matrix(SPG_ID)  #Converting GNC_ID from a Factor Variable to a Interger
SPG_ID <- data.frame(SPG_ID)   #Converting the data set back into a dataframe
AHS_Community7 <- merge(AHS_Community7, SPG_ID, by= 'ego_nid', all=TRUE) #Merging the data sets.
rm(SPG_ID)  #Removing GC_ID, as it is now unessential
  
#Label Propogation Techniques (Ragavan, Albert, & Kumara 2007)
  #Label_Prop in iGraph assumes an undirected graph
  
AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE)

  #Label Propogation
  LP <- cluster_label_prop(AHS_Graph)
  V(AHS_Graph)$color <-membership(LP)
  AHS_Graph$palette <- diverging_pal(length(LP))
  V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
  V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
  layout1 <- layout.kamada.kawai(AHS_Graph)
  plot(AHS_Graph,edge.arrow.size=.5, edge.arrow.width=.5) 
  
  modularity(LP)
  membership(LP)
  head(LP, n=32) 
  
#Transforming iGraph's Community Object LP AND Merging the Community ID Variable into the Community Dataset
str(membership(LP))            #Creating a string from iGraph's Community Object
members <- membership(LP)      
LP_ID <- data.frame(LP_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame
LP_ID <- data.matrix(LP_ID)  #Converting GNC_ID from a Factor Variable to a Interger
LP_ID <- data.frame(LP_ID)   #Converting the data set back into a dataframe
AHS_Community7 <- merge(AHS_Community7, LP_ID, by= 'ego_nid', all=TRUE) #Merging the data sets.
rm(LP_ID)  #Removing GC_ID, as it is now unessential
  
#ADDITIONAL METHODS
  
  #Fast-Greedy Techniques:  (Clauset & Moore 2004)
  #DetectS communities using hierarchical agglomeration alorithms that consider the vertices (nodes), edges, and the depth of the dendogram (hierarchical structure) 
  #Operable on large data sets of unique node-edge pairs, not applicable here.
  
  AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE)
  
    #Fast-Greedy
    FG <- cluster_fast_greedy(AHS_Graph, weights = NULL)
    V(AHS_Graph)$color <-membership(FG)
    AHS_Graph$palette <- diverging_pal(length(FG))
  
  #InfoMAP (Rosvall, Axelsson, Berstrom 2009)
  #Using a map algorithm that models a network work as a system of flows.
  #http://www.tp.umu.se/~rosvall/livemod/mapequation/
  
  AHS_Graph=graph.data.frame(AHS_EdgeList, directed=TRUE)
  
    #InfoMAP
    IMP <- cluster_infomap(AHS_Graph)
    V(AHS_Graph)$color <-membership(IMP)
   AHS_Graph$palette <- diverging_pal(length(IMP))
    V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
    V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
    layout1 <- layout.kamada.kawai(AHS_Graph)
    plot(AHS_Graph, edge.arrow.size=.5, edge.arrow.width=.5) 
    
    modularity(IMP)
    membership(IMP)
    head(IMP, n=48) 
    
#Transforming iGraph's Community Object LP AND Merging the Community ID Variable into the Community Dataset
str(membership(IMP))            #Creating a string from iGraph's Community Object
members <- membership(IMP)      
IMP_ID <- data.frame(IMP_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame
IMP_ID <- data.matrix(IMP_ID)  #Converting GNC_ID from a Factor Variable to a Interger
IMP_ID <- data.frame(IMP_ID)   #Converting the data set back into a dataframe
AHS_Community7 <- merge(AHS_Community7, IMP_ID, by= 'ego_nid', all=TRUE) #Merging the data sets.
rm(IMP_ID)  #Removing GC_ID, as it is now unessential
  
  #Optimal Modularity (Good, Yva de Montjoye, and Clauset 2010)
  #Simulated annealing method that repeatedly samples the network
  #http://tuvalu.santafe.edu/~aaronc/modularity/

  #WARNING: This routine is computationally intensive!!! 
  
  AHS_Graph=graph.data.frame(AHS_EdgeList, directed=FALSE)
  
    #Optimal Modularity
    OM <- cluster_optimal(AHS_Graph)
    V(AHS_Graph)$color <-membership(OM)
    AHS_Graph$palette <- diverging_pal(length(OM))
    V(AHS_Graph)$label.cex <- seq(0.5,5,length.out=0.5)         ## text size
    V(AHS_Graph)$size      <- seq(10,60,length.out=0.5)         ## circle size proportional to text size
    layout1 <- layout.kamada.kawai(AHS_Graph)
    plot(AHS_Graph,edge.arrow.size=.5, edge.arrow.width=.5) 
    
    modularity(OM)
    membership(OM)
    head(OM, n=6) 
    
#Transforming iGraph's Community Object LP AND Merging the Community ID Variable into the Community Dataset
str(membership(OM))            #Creating a string from iGraph's Community Object
members <- membership(OM)      
OM_ID <- data.frame(OM_ID = as.numeric(members), ego_nid = names(members)) #Creating a new data frame
OM_ID <- data.matrix(OM_ID)  #Converting GNC_ID from a Factor Variable to a Interger
OM_ID <- data.frame(OM_ID)   #Converting the data set back into a dataframe
AHS_Community7 <- merge(AHS_Community7, OM_ID, by= 'ego_nid', all=TRUE) #Merging the data sets.
rm(OM_ID)  #Removing GC_ID, as it is now unessential
    
#Measures that Can be Helpful for Evalauting Community Fit of Social Networks
  #Shanon Entropy Scores with respect to race and gender, with the expectation being that in a school setting a better community solution will have lower entropy scores
  #Modularity
  #Within-group distance
  #Proportion of within-group triads against total triads