# Diffusion simulation models in R # Author: Jake Fisher # This lab will focus on how to simulate a diffusion process in R. We will # cover: # - Setting up a diffusion simulation # - Changing different parameter settings # - Presenting the results # Following best practices, we begin by setting up the workspace rm(list = ls()) setwd("C:/Users/fishe/Box Sync/Home Folder jcf26/SNH2017/Instructor dropbox") library(statnet) library(tidyverse) library(magrittr) library(ggnetwork) # Dedicated diffusion packages library(EpiModel) library(netdiffuseR) # Typically, the purpose of a diffusion simulation is to illustrate the effects # of a new diffusion process, or a new type of network structure, on the # ultimate results of a diffusion process. As such diffusion simulations are # highly variable -- after all, the point for each one is to show something new! # However, despite the variability, most diffusion simulations build on a # common set of building blocks: # - an underlying network structure, # - a transmission process, and # - a running timer. # In essence, we are defining an algorithm, or a set of rules, for how people # get "infected" with a social contagion, and looking at how those rules perform # under differnt conditions. # To illustrate this, I will show a few examples of diffusion models. These are # just an introduction to what you can do; the sky is the limit! ##### Example 1: Disease epidemics #### # The most straightforward and common use for a diffusion simulation is to # simulate an epidemic spreading through a population. Some examples of this # include: # Moody, James. 2002. "The Importance of Relationship Timing for Diffusion." # Social Forces 81(1):25-56. # Merli, M.Giovanna, James Moody, Joshua Mendelsohn, and Robin Gauthier. 2015. # "Sexual Mixing in Shanghai: Are Heterosexual Contact Patterns Compatible # With an HIV/AIDS Epidemic?" Demography 52(3):919-42. # In the example here, we will simulate an epidemic spreading through a random # network. This roughly approximates a "random mixing" scenario, where people # interact randomly. # To simulate random mixing, we construct the following rules: # 1. Set up a world with a fixed number of people and a small fraction of # initial infections, arranged randomly as a network # 2. Pick an edge at random # 3. If the edge is discordant (a susceptible-infected connection), flip a coin # to determine whether the non-infected person gets infected # 4. Repeat steps 1 and 2 until everyone is infected, or until a certain number # of steps has passed # First, in our simple example, let's set our parameters ahead of time n.people <- 100 p.infection <- 0.5 pct.starting.infected <- 0.01 max.time <- 5000 contact.rate <- 0.05 / 2 # prob. of contact between any 2 people in the network ### Step 1: Set up world ### # Create a random graph, where edges are placed randomly. This is called a # Bernoulli or an Erdos-Renyi random graph set.seed(919) net <- rgraph(n = n.people, tprob = contact.rate) %>% symmetrize %>% # Make symmetric -- doesn't matter who is connected to who network(matrix.type = "adjacency") # convert to statnet # Chose a subset of people who are infected initially infected <- sample( x = c(T, F), # people can be infected (T) or susceptible (F) size = n.people, # create a vector that is n.people long replace = T, # with replacement, so that >1 person can be infected prob = c(pct.starting.infected, 1 - pct.starting.infected) ) ### Step 2: Choose an edge ### # For each step, we're going to choose an edge at random, and then, if the edge # is discordant, flip a coin to determine whether the susceptible person gets # infected. # First, create an edgelist... el <- as.edgelist(net) %>% as.data.frame %>% set_names(value = c("from", "to")) %>% # ... attach the values of infected... mutate(from.infected = infected[from], to.infected = infected[to], # ... and create a discordant variable discordant = (from.infected != to.infected)) # Next, choose an edge at random random.edge <- sample(nrow(el), size = 1) # Check if the edge is discordant el[random.edge, "discordant"] # it's not, so we do nothing. # For the example, I'm going to speed this up by choosing a discordant edge discordant.edge <- sample(which(el$discordant), size = 1) ### Step 3: Flip a coin to see if the person gets infected ### # Now, flip a coin to see if the uninfected person gets infected el[discordant.edge, ] # Person 62 is the suceptible, but we will want to be # able to determine that without looking manually # A little tricky indexing to pull out the ID of the susceptible person... who.susceptible <- with( el[discordant.edge, ], c(from, to)[!c(from.infected, to.infected)] ) # Flip the coin to determine if infection spreads (it does) (infected[who.susceptible] <- sample(c(T, F), size = 1, prob = c(p.infection, 1 - p.infection))) ### Step 4: Repeat ### # To repeat this process, we actually embed steps 1 and 2 in a loop. # Set up a list with the output infections <- vector(length = max.time, mode = "list") # Save what we already did as the first iteration infections[[1]] <- infected # Quick aside -- what did we create? head(infections) # Drop the "from.infected", "to.infected", and "discordant" columns from el, # because they'll actually change with every iteration el %<>% select(-from.infected, -to.infected, -discordant) # Next, run the loop set.seed(27708) for (t in 2:max.time) { infections[[t]] <- infections[[t - 1]] # Pick an edge at random random.edge <- sample(nrow(el), size = 1) # If the edge is discordant, flip a coin to decide if the infection spreads if (with(el[random.edge, ], infections[[t]][from] != infections[[t]][to])) { who.susceptible <- with( el[random.edge, ], c(from, to)[!c(infections[[t]][from], infections[[t]][to])] ) infections[[t]][who.susceptible] <- sample( c(T, F), size = 1, prob = c(p.infection, 1 - p.infection) ) } } # Now we have a list of who was infected at what time point. Let's combine # that into a data.frame, so we can work with it more easily (results <- infections %>% lapply(FUN = as.data.frame) %>% lapply(FUN = set_names, value = "infected") %>% bind_rows(.id = "t") %>% mutate(id = rep(1:network.size(net), times = max.time), t = as.integer(t)) %>% tbl_df) # This dataset is the raw results of our simulation, but it's usually easier to # look at a summary. Let's look at the number of people infected over time infections.by.time <- results %>% group_by(t) %>% summarize(n.infections = sum(infected)) %>% mutate_each(funs(as.numeric), t, n.infections) # Aside: in R, there are many ways to solve the same problem. We could have # gone directly from the raw data to the summaries using the apply functions: # infections.by.time <- data.frame( # t = 1:max.time, # n.infected = sapply(infections, sum) # ) # Plotting this relationship shows the qplot(data = infections.by.time, x = t, y = n.infections, geom = "line") # Or, alternatively, we could look at whether people who are more central # are infected sooner time.to.infection <- results %>% group_by(id) %>% summarize(time.infected = min(t[infected])) %>% arrange(id) %>% mutate(indegc = degree(net)) ggplot(time.to.infection, aes(x = indegc, y = time.infected)) + geom_point() + geom_smooth(method = "lm") # For fun, let's plot a few slices of the network in time set.seed(330) net.layout <- ggnetwork(net) %>% rename(id = vertex.names) net.layout.by.time <- split(results, f = results$t) %>% lapply(FUN = right_join, y = net.layout, by = "id") %>% bind_rows net.layout.by.time %>% filter(t %in% c(1, 100, 250, 500, 750, 1000)) %>% ggplot(aes(xend = xend, yend = yend, x = x, y = y)) + geom_edges(color = "lightgray") + geom_nodes(aes(color = infected)) + facet_wrap(~ t) + theme_blank() # We could animate this with gganimate... but it doesn't work on R v. 3.4. See # https://raw.githubusercontent.com/jalapic/rmeetup_examples/master/ggnetwork_gganimate.R # for example code. # The statnet team developed a new package, called EpiModel, to simplify running # epidemic diffusion models. Tutorials are online here: # http://www.epimodel.org/, and we'll walk through one to illustrate. # Set the parameters ahead of time param <- param.icm( inf.prob = 0.2, # probability of contracting an infection in a step act.rate = 0.25 # speed with which people interact ) init <- init.icm( s.num = 500, # number of people who are infected initially i.num = 1 # number of people who are susceptible initially ) control <- control.icm( type = "SI", # type of model, vs., e.g., SIR or SIS nsims = 10, # number of simulations to run nsteps = 300 # number of steps per simulation ) # Run the model. ICM means individual contact model (vs. a deterministic model, # where the results are determined by a set of differential equations) set.seed(919) mod <- icm(param, init, control) # Now we can look at some summaries of the model # Look at the state of the model at a particular time slice summary(mod, at = 125) # Look at a dataset with the state of the model at a given time mod %>% as.data.frame(mod = "mean") %>% tbl_df # Get a plot of the number of infections by time plot(mod) # We can customize the plot, showing the individual simulations, for example plot(mod, y = "i.num", sim.lines = TRUE, mean.smooth = FALSE, qnts.smooth = FALSE) # The package is quite flexible. It can incorporate demographic changes, # different network structures, and even custom functions for each of the # components of the model (like different mortality schedules). Check the # website for more examples of how to do this. ##### Example 2: Diffusion of innovations ##### # The second example is the diffusion of innovations. The process by which # innovations spread is less well-understood. We usually simulate it with a # process that's very similar to diseases diffusing. # In this case, we will focus on a "complex contagion", meaning something you # have to hear about from multiple people before you will adopt it. # Setup # 1. Construct a small world network # 2. Choose one person at random, and set him/her and his/her neighbors as # initial adopters # 3. Infect all the people who have more than tau neighbors infected # 4. Set the people who were infected at the previous round as "recovered" # 5. Repeat steps ### Set parameters in advance ### n <- 50 tau <- 2 / 6 max.time <- 50 ### Step 1: Construct network ### # Construct a small world network set.seed(919) sw.net <- igraph::sample_smallworld(1, n, 2, p = .2) %>% intergraph::asNetwork(.) ### Step 2: Choose a person & neighbors at random as an initial adopter ### # Set up vector to indicate adoption adopters <- rep(F, n) # Choose a person at random initial.adopter <- sample(seq_len(n), size = 1) # Get the list of people they're attached to initial.neighbors <- get.neighborhood(sw.net, initial.adopter) # Set them all as "adopters" adopters[c(initial.adopter, initial.neighbors)] <- T ### Step 3: infect all people who have more than tau neighbors infected ### # Let's look at one person, person 18, to see how this works ego.extract(sw.net, ego = 20, neighborhood = "out") # Person 18 hasn't adopted adopters[20] # About half of their neighbors have adopted adopters[c(18, 19, 22, 25)] mean(adopters[c(18, 19, 22, 25)]) # To decide whether the person adopts, we test whether the fraction of # adopters is greater than tau mean(adopters[c(18, 19, 22, 25)]) >= tau # adoption! # We can update everyone simultaneously using matrix multiplication adj.mat <- sw.net[, ] diag(adj.mat) <- 0 # set the diagonal to 0, b/c people don't weight themselves # We could take the sum... adj.mat %*% adopters #... or the percentage (adj.mat.rn <- adj.mat / rowSums(adj.mat)) adj.mat.rn %*% adopters # And then we calculate the people who are above our threshold (adj.mat.rn %*% adopters) >= tau # Note that this actually allows people to abandon the innovation if enough of # of their neighbors are not adopters. For now we don't want that to happen, # so we'll only test people who are not yet adopters ifelse(adopters, TRUE, ((adj.mat.rn %*% adopters) >= tau)) ### Step 4: Repeat ### # Again, we can take care of this by wrapping it in a loop adopt <- vector(mode = "list", length = max.time) adopt[[1]] <- adopters for (t in 2:max.time) { adopt[[t]] <- ifelse(adopters, TRUE, ((adj.mat.rn %*% adopt[[t - 1]]) >= tau)) } # Note that again we get the characteristic S-shaped curve: data.frame( t = 1:max.time, n.adopt = sapply(adopt, sum) ) %>% ggplot(aes(x = t, y = n.adopt)) + geom_line() # Let's plot a few frames set.seed(330) sw.net.layout <- ggnetwork(sw.net) %>% rename(id = vertex.names) sw.net.layout.by.time <- adopt %>% lapply(FUN = as.data.frame) %>% lapply(FUN = set_names, value = "adopter") %>% lapply(FUN = mutate, id = 1:n) %>% lapply(FUN = right_join, y = sw.net.layout, by = "id") %>% bind_rows(.id = "t") %>% mutate(t = as.integer(t)) sw.net.layout.by.time %>% filter(t < 10) %>% ggplot(aes(xend = xend, yend = yend, x = x, y = y)) + geom_edges(color = "lightgray") + geom_nodes(aes(color = adopter)) + facet_wrap(~ t) + theme_blank() # There's a new package for analyzing the diffusion of innovations. It focuses # mainly on creating survival models for diffusion data (and has a different # data structure) so we won't cover it here. But it's well worth checking out # the vignette: vignette("analyzing-medical-innovation-data") ##### Example 3: Averaging ideas ##### # A third common approach use for diffusion simulation is to suggest how # people's beliefs change through interaction with others. A longstanding # assumption is that people take the weighted average of their friends. # Set the parameters in advance self.weight <- 0.2 max.time <- 5 # Load the data and save a few values for convenience load("faux_add_health_with_faux_attitude.Rdata") n.people <- network.size(add.health) attitude <- (add.health %v% "faux.attitude") adj.mat <- add.health[, ] # save the adjacency matrix to its own object # Plotting the network shows that attitude values are randomly distributed in # this network set.seed(27708) net.layout <- ggnetwork(add.health) p <- ggplot(net.layout, aes(x = x, y = y, xend = xend, yend = yend)) + geom_edges(color = "lightgray", arrow = arrow(length = unit(6, "pt"), type = "closed")) + theme_blank() + scale_color_distiller(palette = "Spectral") p + geom_nodes(aes(color = faux.attitude), size = 4) # In general, we assume that people update their attitude using the average of # their neighbors' opinions, and their own opinion. For example, person 3 names # 4 friends: people 6, 29, 34, and 40 ego.extract(add.health, ego = 3, neighborhood = "out") # Those people have 4, 2, 3, and 2 as their attitudes attitude[c(6, 29, 34, 40)] # Their mean value is 2.75 mean(attitude[c(6, 29, 34, 40)]) # Then we assume that people take the weighted average of their own attitudes # and their friends' attitudes self.weight * attitude[3] + (1 - self.weight) * mean(attitude[c(6, 29, 34, 40)]) # We could repeat this for everyone individually, but this can actually be done # by matrix multiplication, which makes it much easier to program. # First, we have to row-normalize (set rows so that they sum to 1 by dividing # each row by its sum) the adjacency matrix diag(adj.mat) <- 0 # set the diagonal to 0 (adj.mat.rn <- adj.mat / rowSums(adj.mat)) # note the missing values! # Isolates => dividing by 0 # NaN does not play nicely with other values, so swap it for 0 in the row- # normalized matrix. adj.mat.rn[which(is.nan(adj.mat.rn), arr.ind = T)] <- 0 # In general, we assume that isolates do not change under this model. # Mechanically, we can either remove them from the analysis or we can set their # self-weights to 1. # Identify outdegree isolates isos <- (degree(add.health, cmode == "outdegree") == 0) # Set self-weights to 1: self.weight.vec <- rep(self.weight, times = n.people) self.weight.vec[isos] <- 1 # Remove isolates self.weight.no.iso <- self.weight.vec[!isos] adj.mat.no.iso <- adj.mat.rn[!isos, !isos] attitude.no.iso <- attitude[!isos] # Now run the simulation... # ... with self-weights of isolates = 1 I <- diag(n.people) # identity matrix SW <- diag(self.weight.vec) # diagonal matrix with self-weights on diagonal (adj.mat.rn %*% (I - SW) + SW) %*% attitude # ... removing the isolates I.no.iso <- diag(sum(!isos)) SW.no.iso <- diag(self.weight.vec[!isos]) (adj.mat.no.iso %*% (I.no.iso - SW.no.iso) + SW.no.iso) %*% attitude.no.iso # Similarly to the other simulations, we can repeatedly perform this procedure # using a for loop: sim.attitude <- vector(mode = "list", length = max.time) sim.attitude[[1]] <- attitude for (t in 2:max.time) { sim.attitude[[t]] <- (adj.mat.rn %*% (I - SW) + SW) %*% sim.attitude[[t - 1]] } # Now let's plot the values by time, to show how the diffusion process unfolds # Convert each list element to a data frame that ggplot can use, & combine them layout.with.attitudes <- sim.attitude %>% lapply(FUN = as.data.frame) %>% lapply(FUN = set_names, value = "attitude.value") %>% lapply(FUN = mutate, vertex.names = seq_len(n.people)) %>% lapply(FUN = right_join, y = net.layout, by = "vertex.names") %>% bind_rows(.id = "iteration") # Plot it, with facets to show how this changes over each iteration p %+% layout.with.attitudes + geom_nodes(aes(color = attitude.value), size = 4) + facet_wrap(~ iteration) # A package that's under development, latentnetDiffusion, takes care of the # matrix multiplication and reshaping of the data for you devtools::install_github("jcfisher/latentnetDiffusion") library(latentnetDiffusion) library(latentnet) # For example, to run the same simulation as above, use: (sim.attitudes.2 <- degroot(W = add.health, Y = as.matrix(attitude), all.iter = T, self.weight = 0.8) %>% tidyDegroot(id = seq_len(n.people)) ) # Or, we can fit a latent space model, and simulate the diffusion process over # the latent space: # fit <- latentnet::ergmm(add.health ~ euclidean(d = 2)) # takes a minute! # save(fit, file = "ergmm_2d_fit.Rdata") load("ergmm_2d_fit.Rdata") (ls.attitudes <- ergmmDegroot(fit, Y = as.matrix(attitude), simulate = F, all.iter = T) %>% tidyDegrootList(id = seq_len(n.people))) # Plot it, to show the difference layout.with.ls.attitudes <- ls.attitudes %>% rename(vertex.names = id) %>% {split(., f = .$iteration)} %>% lapply(FUN = right_join, y = net.layout, by = "vertex.names") %>% bind_rows(.id = "iteration") p %+% layout.with.ls.attitudes + geom_nodes(aes(color = pred.value), size = 4) + facet_wrap(~ iteration) # The same model, simulated: set.seed(919) (ls.attitudes.sim <- ergmmDegroot(fit, Y = as.matrix(attitude), simulate = T, all.iter = T) %>% tidyDegrootList(id = seq_len(n.people))) # Plotted again layout.with.ls.attitudes.sim <- ls.attitudes.sim %>% rename(vertex.names = id) %>% {split(., f = .$iteration)} %>% lapply(FUN = right_join, y = net.layout, by = "vertex.names") %>% bind_rows(.id = "iteration") p %+% layout.with.ls.attitudes.sim + geom_nodes(aes(color = pred.value), size = 4) + facet_wrap(~ iteration)