#R Orientation #Author: Jonathan H. Morgan #Based in Part on Jake Fisher's introduction to R: https://dnac.ssri.duke.edu/r-labs/2017/01_data_management.php #9 May 2018 ################################### # STARTING FROM A CLEAR SLATE # ################################### #Remove: Removes objects from memory rm(list = ls()) #Garbage Collection: Frees up memory, but preserves variables created in previous steps gc() ####################################### # INSTALLING AND LOADING PACKAGES # ####################################### #R is a modular language. #The intuition is that you have in memory only what you need to perform the analysis tasks specified in the script. #Consequently, we need to load the packages we will be using during our analyses each time we run a new instance of R. #Installing a Package #Installing a package multiple times can result in R being unable to read the package files. #Consequently, if you are uncertain whether a particular package is installed on you machine, #use the search window in the Packages tab to check. install.packages("readr") #Reading a Package library(readr) #Import csv and other delimited files library(haven) #Import SPSS, SAS, or Stata files library(magrittr) #Supports pipe (%>%) commands that allow you to perform multiple operations with one statement library(dplyr) #Manipulate data library(tidyr) #Additional functions for manipulating data library(ggplot2) #Visualizing data library(statnet) #Network Analysis Software library(ggnetwork) #Network Visualization ################################## # DATA AND OBJECT TYPES IN R # ################################## # R is an object-oriented language, which puts it somewhere between statistical # computing languages such as SAS, Stata, or SPSS, and object-oriented programming # languages such as Python or Java. # We will focus mainly on using R for statistical computing, but # I will demonstrate one instance where writing a simple function is quite useful for importing large data sets. #DATA TYPES #Logical: TRUE or FALSE #Integer: 1, 2, 3, 4, ... #Numeric: 1.2, 3.5, 5.5, 0, -1 #Complex: 1 + 2i (imaginary numbers) #Character: "Jon", "1.2" #Raw: A mixture of types in the same cell: "Jon" 1 2 2.5 -1 #Function: Essentially, conditional statements or transformations you want to apply to multiple cases #that utilize operations from Base R or packages that you load. #When reading in data, R will, by default, treat columns with different data types as different types of objects. #There are a few instances where this can be problematic. #For example, R tends to treat a column consisting of character variables as a factor, #essentially treating it as a categorical varaible when you may simply want a list of names. #We can avoid these problems if we are mindful of the data types in our data, #and specify the data type when importing our data in R. #We can also "coerce" or transform a variable from one type to another. #We discuss both methods in this orientation. #OBJECT TYPES: Vectors, Lists, and Factors Oh My! #Vectors #Lists #Factors #Arrays #Matrices #Data Frames #Functions #Vectors: A vector is a sequence of data elements of the same basic type. c(2, 3, 5) c(TRUE, FALSE, TRUE, FALSE, FALSE) c("aa", "bb", "cc", "dd", "ee") #Lists: A list is a generic vector containing other objects. #For example, the following variable x is a list containing copies of three members n, c, l, and a numeric value 3. n = c(2, 3, 5) c = c("aa", "bb", "cc", "dd", "ee") l = c(TRUE, FALSE, TRUE, FALSE, FALSE) x = list(n, c, l, 3) # x contains copies of n, s, b and the number 3 #Slicing a list: We retrieve a list slice with the single square bracket "[]" operator. #The following is a slice containing the second member of x, which is a copy of c. x[2] #Modifying a list x[[2]][1] = "ta" #We are manipulating the list directly, indicated by the double brackets around the 2, #the 1 first element of the second member of the list. x[2] #Factors: A vector of integer values with a corresponding set of character values to use when the factor is displayed. #Factors are R's way of representing categorical variables. #Creating an example factor data = c(1,2,2,3,1,2,3,3,1,2,3,3,1) factor = factor(data) #We are specifying that 1, 2, and 3 correspond to levels, similar to SAS's class statement. factor #Creating Labels for Levels: 1 2 3 factor = factor(factor,labels=c("I","II","III")) factor #When importing data, R will specify varibles that it thinks have levels as factors. #This is problematic because R is now treating the variable as a catgorical variable, #and thus will not perform many basic operations. #Arrays: A multidimensional rectangular data object. #"Rectangular" refers to the fact that each row is the same length, and likewise for each column. Three_D_Array <- array( 1:24, #24 rows for each dimension dim = c(4, 3, 2), #3 dimensions consisting of 4, 3, and 2 objects dimnames = list( c("one", "two", "three", "four"), c("ein", "zwei", "drei"), c("un", "deux") ) ) Three_D_Array #Enlish Numbers by German Numbers by French Numbers #Matrix: A collection of data elements arranged in a two-dimensional rectangular layout. #A matrix is a special case of an array, the 2D version. Matrix <- matrix( 1:12, #Creating cell values nrow = 4, #Specifying the number of rows, ncol = 3 works the same dimnames = list( c("one", "two", "three", "four"), #Specifying the rows and columns c("ein", "zwei", "drei") ) ) Matrix #Data Frame: A list of vectors of equal length. #A data frame is a special case of a matrix, #one where we have specified that the data elments in each column are the same type. #Data frames are R's counterpart to a classic statistical package's data set. #The top line of the table is a header, and contains the column names. #Each horizontal line after the header denotes a data row, which begins with the name of the row, #and then followed by the actual data. #Each data member of a row is called a cell. n = c(2, 3, 5) c = c("aa", "bb", "cc") l = c(TRUE, FALSE, TRUE) data_frame = data.frame(n, c, l) #In R forums, df is often used to refer to a data frame. data_frame ###################### # IMPORTING DATA # ###################### #Getting and Setting Your Work Directory #It's important to know where you are saving the data. #By default, R will save your data to the highest level of your user directory. #You can determine where R is saving your data by using the following command: getwd() #We can set a working directory which is quite useful because we, then, do not have specify #the file location of eah our data sets when we import them. #You can even synchronize your work directory with an online directory. #We can set our work directory by using the following command: setwd("C:/Users/Jonathan H Morgan/Desktop/SN&H 2018") # Note: forward slashes #Importing data into R #There are numerous functions and packages for importing data into R. I am going to priamrily discuss "readr" #because this package is capable of importing multiple data types, and is capable of importing large data #sets (e.g., 87 GB). #For importing SPSS, SAS, and Stata files directly, we recommend using the "haven" package. #Documentaion for Haven: https://cran.r-project.org/web/packages/haven/haven.pdf #R does provide a GUI based option, but this is not optimal for large data sets AHS_Base=read.csv(file.choose(),header=TRUE) #Reading the CSV where readr is inferring the data type based on the first 1000 rows of data AHS_Base <- read_csv ('C:/Users/Jonathan H Morgan/Desktop/SN&H 2018/ahs_wpvar.csv', col_names = TRUE) #Useful functionality when importing very large data sets by subsets f <- function(x, pos) subset(x, x[[27]] == 2) #Subsetting by gender to isolate female respondents #I am using column's index number because this notation #works whether the file has a header or not. AHS_Base <- read_csv_chunked("C:/Users/Jonathan H Morgan/Desktop/SN&H 2018/ahs_wpvar.csv", col_names = TRUE, DataFrameCallback$new(f), chunk_size = 10000, progress=TRUE ) #Transforming Grade and Sex into factors #Specifying a vector that specifies which variables I want to transform cols <- c("sex", "grade") AHS_Base %<>% mutate_each_(funs(factor(.)),cols) #Confirming that sex and grade now have levels str(AHS_Base) ############################## # DATA MANAGEMENT BASICS # ############################## #The Basic Grammar of Data Management in R #Selecting #Arranging #Mutating #Filtering #Renaming #Gathering #Summarizing #Separating #Making Distinct #Joining #Selecting: "Selecting" always refers to selecting the columns you want. AHS_Edges <- AHS_Base %>% select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade) #Arranging: "Arranging" reorder rows with respect to columns. AHS_Edges <- AHS_Base %>% select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>% #Using a second pipe to chain commands arrange(ego_nid, sex) #Arraning the rows with respect to ego ID and gender #Mutating: "Mutating" refers to creating a new variable based on operations peformed on another variable. #Mutating is admittedly the strangest function name in the R Tidyverse, but it refers to the idea that #a new variable is the result of a transformation of an old one. AHS_Edges <- AHS_Base %>% select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>% arrange(ego_nid, sex) %>% mutate(Female = ifelse(sex == 2, 1, ifelse(sex != 2, 0, 0))) #Filtering: "Filtering" refers to filtering by rows (e.g., choosing only 7th grade girls in this case). AHS_Edges <- AHS_Base %>% select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>% arrange(ego_nid, sex) %>% mutate(Female = ifelse(sex == 2, 1, ifelse(sex != 2, 0, 0))) %>% filter (grade == "7" & Female == 1) #Double == comes from set notation if and only if #Renaming: "Renaming" refers to relabeling column names. AHS_Edges <- AHS_Base %>% select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>% arrange(ego_nid, sex) %>% mutate(Female = ifelse(sex == 2, 1, ifelse(sex != 2, 0, 0))) %>% filter (grade == "7" & Female == 1) %>% rename( id = `ego_nid`, gender = `sex`) #Gathering: "Gathering" refers to gathering columns to transform a wide data set into a long one. AHS_Edges <- AHS_Base %>% select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>% arrange(ego_nid, sex) %>% mutate(Female = ifelse(sex == 2, 1, ifelse(sex != 2, 0, 0))) %>% filter (grade == "7" & Female == 1) %>% rename( id = `ego_nid`, gender = `sex`) %>% gather(Alter_Label, Target, mfnid_1:mfnid_5, ffnid_1:ffnid_5, na.rm = TRUE) #Summarizing: "Summarizing" refers to generating summary statitics for a given variable. #In this case, we are going to calculate the average number of friends boys and girls have gc() #Reading in the data to calcualte separate gender means AHS_Base <- read_csv ('C:/Users/Jonathan H Morgan/Desktop/SN&H 2018/ahs_wpvar.csv', col_names = TRUE) AHS_Edges <- AHS_Base %>% select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, commcnt, sex) %>% gather(Alter_Label, Target, mfnid_1:mfnid_5, ffnid_1:ffnid_5, na.rm = TRUE) %>% arrange(ego_nid, sex) %>% filter (Target != 99999) #Eliminating 99999 values #Generating Summary Statistics Gender_Mean <- group_by(AHS_Edges, ego_nid, sex, commcnt) %>% #Group by ego ID to create a count of alters filter (commcnt == 7 & sex != 0) %>% #Examining community 7's school network, and dropping 0s summarise(count = n()) %>% #Creating a count of each students alters group_by (sex) %>% #Grouping by gender to generate seaparate averages summarise (Gender_Mean = mean(count)) #Generating male and female averages gc() #Separating: "Seperating" refers to splitting delimited values in one column into multiple columns #Separating is very useful when dealing with delimited items in text data. #For example, Qulatrics output for questions where respondents can makes multiple responses #has each response to the given question separated by commas in one column. #The separate function combined with gather can be quite useful to splitting responses, #and then grouping them by each responsdent. #Simulating data where the output is a string ID = c("Jim", "Molly", "Jaemin") Male_Friends = c("Jon Jaemin Joe Jim", "Jim Mudit Marcus", "Jim Peter Chris Marcus") Female_Friends = c("MC Molly Liann", "Crystal Molly Liann", "MC Molly Crystal") data_frame = data.frame(ID, Male_Friends, Female_Friends) #In R forums, df is often used to refer to a data frame. data_frame #Converting varaibles into character variables to avoid potential problems with gathering #and spearating data. data_frame %<>% mutate_if(is.factor,as.character) #This data is a mess, lets fix it Edges <- data_frame %>% select (ID, Male_Friends, Female_Friends) %>% separate(Male_Friends, c("MF_1", "MF_2", "MF_3", "MF_4"), " ") %>% #Separating each element separated by #a space in the male friends into its #own column separate(Female_Friends, c("FF_1", "FF_2", "FF_3"), " ") %>% #Repeating this step for female friends gather(Alter_Label, Target, MF_1:MF_4, FF_1:FF_3, na.rm = TRUE) %>% #Gathering all the variables to create #an edgelist select (ID, Target) #Dropping Alter_Label #We have got the data into something we can use, but character IDs can be problematic #Let make unique numeric IDs for all the nodes #Distinct: Eliminates all duplicate values Nodes <- Edges %>% gather(Variable_Label, Sender, ID, Target, na.rm = TRUE)%>% #Gathering ID and Target into one list mutate(ID = Sender) %>% #Creating Node Labels for later select (ID) %>% #Dropping the other variables distinct(ID) %>% #Isolating unique cases (add_rownames) %>% #Getting the rownames to create sequential IDs rename (Sender_ID = rowname)%>% #Renaming rowname to Sender mutate(Sender_ID = as.numeric(Sender_ID)) #Converting rowname into a numeric variable #Joing: "Joing" refers to merging data sets using key variable. #There are several kinds of joins. We are going to do left and right joins in this case. #To learn more about joins see: ttp://www.rpubs.com/williamsurles/293454 #We now want to merge our numeric IDs, Sender, with our edgelist with the ID variable Edges <- Edges %>% left_join(Nodes, by = c("ID")) #Renaming to merge Nodes with Target to get Taret_ID Nodes <- Nodes %>% rename( Target_ID = `Sender_ID`, Target = `ID`) #Merging Numeric IDs for the alters or targets Edges <- Edges %>% right_join(Nodes, by = c("Target")) #Final Formatting Edges <- Edges %>% select(Sender_ID, Target_ID) %>% rename ( Target = `Target_ID`, Sender = `Sender_ID`) ################################################################ # VISUALIZING OUR SIMULATED NETWORK: PREPARATION FOR DAY 2 # ################################################################ #Step 1: Formatting Sender and Target Variables to Construct a Statnet Network Object Edges [,1]=as.character(Edges[,1]) Edges [,2]=as.character(Edges[,2]) #Step 2: Creating a Network Object #Note, this is a directed graph. So, we specify that in the network object now. #The specification of the graph as either directed or undirected is important because it impacts fundamentally how we interpret the relationships described by the graph. AHS_Network=network(Edges,matrix.type="edgelist",directed=TRUE) #Creating a label vertex to assign to the network Label <- as.vector(Nodes$Target) #Step 3: Assigning Attributes to Vertices from our nodelist set.vertex.attribute(AHS_Network,"Label",Label) #Step 5: Visualizing the Network AHS_Network summary(AHS_Network) #Get numerical summaries of the network set.seed(12345) ggnetwork(AHS_Network) %>% ggplot(aes(x = x, y = y, xend = xend, yend = yend)) + geom_edges(color = "lightgray") + geom_nodelabel_repel (label = Label) + #For networks with fewer nodes, we might want to label theme_blank() + geom_density_2d()