@Ryan_Fox_Squire_SafeGraph Thank you for making this data available, and for the incredible Python scripts. I wrote a piece of R code for some basic analysis and for creating a dataset that suits my needs. I thought I would share it with others in case anyone can put it to use or make it better.
##I wrote a for loop to arrange the data the way I wanted: it turns the JSON fields into columns instead of rows and does some basic analysis on them.
##Looking for ways to make this run faster, or to hear from anyone who has faced similar issues.
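##For illustration (made-up numbers): a visitor_home_cbgs JSON like {"481130001001": 25, "481130001002": 10}
##gets summarized into columns such as tot_home_visitors and tot_non_home_visitors, and a list column
##like visits_by_day ("[10,12,9,11,14,20,18]") becomes the columns Day-1 through Day-7.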
library(SafeGraphR) #Used for expand_cat_json function
library(stringr)
library(pracma)
options(scipen=999)
zz1=gzfile('patterns-part1.csv.gz','rt') #link to the latest data from SG AWS
zz2=gzfile('patterns-part2.csv.gz','rt')
zz3=gzfile('patterns-part3.csv.gz','rt')
zz4=gzfile('patterns-part4.csv.gz','rt')
dat1=read.csv(zz1,colClasses = list(poi_cbg = "character")) #data.table might be faster. But .5 GB is barely going to make you sweat.
dat2=read.csv(zz2,colClasses = list(poi_cbg = "character"))
dat3=read.csv(zz3,colClasses = list(poi_cbg = "character"))
dat4=read.csv(zz4,colClasses = list(poi_cbg = "character"))
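#A possible alternative (untested sketch): data.table::fread reads .csv.gz files directly
#(it needs the R.utils package for that) and is usually much faster than read.csv, e.g.
#dat1 = data.table::fread('patterns-part1.csv.gz', colClasses = list(character = 'poi_cbg'))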
dat = rbind(dat1,dat2,dat3,dat4) #combine the four parts into one data frame
rm(dat1,dat2,dat3,dat4,zz1,zz2,zz3,zz4) #frees some memory
dat[,"nchar"] = nchar(dat[,"poi_cbg"]) #I find some blank POI_CBG fields
nrow(dat[dat[,"nchar"]==0,]) #check if any SGPIDs have a blank POI_CBG
dat<-dat[dat[,"nchar"]>0,] #delete any rows with blank POI_CBG
###Match and Merge
dat1 = merge(dat, filter_data, by = "safegraph_place_id") #I have a filtered set of SG Place IDs that I will use for some analysis.
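##The loop below guards every expand_cat_json call against empty '{}' JSON with the same if/else pattern.
##A small helper like this could remove that repetition (safe_expand is a hypothetical name; a sketch, not tested):
safe_expand <- function(row, col, index) {
  if (nchar(row[[col]]) < 3) { #"{}" is only 2 characters, so anything shorter than 3 is empty
    out = data.frame(matrix(0,1,3))
    colnames(out) <- c(index, "safegraph_place_id", col)
    out
  } else {
    expand_cat_json(row, expand = col, index = index, by = 'safegraph_place_id')
  }
}
##e.g. home_cbg_data_df = safe_expand(dat2, "visitor_home_cbgs", "origin_cbg")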
##Begin For Loop##
t1 = Sys.time()
final = NULL #accumulates the processed rows
for (i in 1:nrow(dat1)){ ##This for loop will sit inside another loop that goes over all 100+ week-level files.
dat2 = dat1[i,]
poi = as.numeric(dat2[,"poi_cbg"]) #the POI's own CBG, as numeric for comparison with origin_cbg below
if (nchar(dat2$visitor_home_cbgs)<3) { #nchar is used for those POIs with empty visitor_home_cbgs values. I found <30 cases in one dataset.
home_cbg_data_df = data.frame(matrix(0,1,3)) # if the visitor_home_cbgs is {}, expand json function fails. Circumventing that through the if condition
colnames(home_cbg_data_df) <- c("origin_cbg","safegraph_place_id","visitor_home_cbgs")
} else {
home_cbg_data_df <- expand_cat_json(dat2,
expand = 'visitor_home_cbgs',
index = 'origin_cbg',
by = 'safegraph_place_id')
}
#Interested in how many visitors come from the same CBG as the POI. The JSON columns hold too much
#information to use directly, so I classify visitors into home CBG vs. neighboring CBGs.
home_visitors = as.data.frame(home_cbg_data_df[which(as.numeric(unlist(home_cbg_data_df[,"origin_cbg"])) == poi),])
non_home_visitors = as.data.frame(home_cbg_data_df[which(as.numeric(unlist(home_cbg_data_df[,"origin_cbg"])) != poi),])
tot_home_visitors = sum(home_visitors[,"visitor_home_cbgs"])
tot_non_home_visitors = sum(non_home_visitors[,"visitor_home_cbgs"]) #visitors from neighboring CBGs
dat2["tot_home_visitors"] = tot_home_visitors
dat2["tot_non_home_visitors"] = tot_non_home_visitors
if (nchar(dat2$visitor_daytime_cbgs)<3) { #same empty-JSON guard as above
daytime_cbg_data_df = data.frame(matrix(0,1,3))
colnames(daytime_cbg_data_df) <- c("origin_cbg","safegraph_place_id","visitor_daytime_cbgs")
} else {
daytime_cbg_data_df <- expand_cat_json(dat2,
expand = 'visitor_daytime_cbgs',
index = 'origin_cbg',
by = 'safegraph_place_id')
}
home_daytime_visitors = as.data.frame(daytime_cbg_data_df[which(as.numeric(unlist(daytime_cbg_data_df[,"origin_cbg"])) == poi),])
non_home_daytime_visitors = as.data.frame(daytime_cbg_data_df[which(as.numeric(unlist(daytime_cbg_data_df[,"origin_cbg"])) != poi),])
tot_home_daytime_visitors = sum(home_daytime_visitors[,"visitor_daytime_cbgs"])
tot_non_home_daytime_visitors = sum(non_home_daytime_visitors[,"visitor_daytime_cbgs"])
dat2["tot_home_daytime_visits"] = tot_home_daytime_visitors
dat2["tot_non_home_daytime_visits"] = tot_non_home_daytime_visitors
device <- expand_cat_json(dat2,
expand = 'device_type',
index = 'origin_cbg', ##origin_cbg is just the name given to the JSON keys (android/ios) here; I found no issues with it
by = 'safegraph_place_id')
dat2["android"] = device[1,"device_type"] #assumes android comes first in the device_type JSON
dat2["ios"] = device[2,"device_type"] #and ios second
dwell_times <- expand_cat_json(dat2,
expand = 'bucketed_dwell_times',
index = 'origin_cbg',
by = 'safegraph_place_id')
dwell_times = t(dwell_times[,c("bucketed_dwell_times")])
colnames(dwell_times) <- c("<5" , "5-20", "21-60" ,"61-240", ">240")
dat2["<5 Min"] = dwell_times[,1]
dat2["5-20 Min"] = dwell_times[,2]
dat2["21-60 Min"] = dwell_times[,3]
dat2["61-240 Min"] = dwell_times[,4]
dat2["Over 240 Min"] = dwell_times[,5]
if (nchar(dat2$visitor_country_of_origin)<3) {
visitor_country_df = data.frame(matrix(0,1,3))
colnames(visitor_country_df) <- c("Country","safegraph_place_id","visitor_country_of_origin")
} else {
visitor_country_df <- expand_cat_json(dat2,
expand = 'visitor_country_of_origin',
index = 'Country',
by = 'safegraph_place_id')
}
US_visitors = as.data.frame(visitor_country_df[which(visitor_country_df[,"Country"] == "US"),])
non_US_visitors = as.data.frame(visitor_country_df[which(visitor_country_df[,"Country"] != "US"),])
tot_US_visitors = sum(US_visitors[,"visitor_country_of_origin"])
tot_non_US_visitors = sum(non_US_visitors[,"visitor_country_of_origin"])
dat2["US_Visitors"] = tot_US_visitors
dat2["Non_US_Visitors"] = tot_non_US_visitors
#For the columns that hold values in a list, I strip the square brackets and expand the comma-separated values into individual columns.
#This is useful for classifying visits by hour and day.
dat2["visits_by_each_hour"]=gsub('[','', dat2["visits_by_each_hour"],fixed=TRUE)
dat2["visits_by_each_hour"]=gsub(']','', dat2["visits_by_each_hour"],fixed=TRUE)
visits_by_hour <- ((strsplit(as.character(dat2["visits_by_each_hour"]), split = ",")))
visits_by_hour <- as.numeric(unlist(visits_by_hour))
visits_by_hour <- t(as.data.frame(visits_by_hour))
##Name the 168 columns in Day-Hour format##
colnames(visits_by_hour) <- paste0("Day-", rep(1:7, each = 24), "-Hour-", rep(1:24, times = 7))
###End Parsing visits by each hour###
###Begin Parsing visits by each day###
dat2[,"visits_by_day"]=gsub('[','', dat2[,"visits_by_day"],fixed=TRUE)
dat2[,"visits_by_day"]=gsub(']','', dat2[,"visits_by_day"],fixed=TRUE)
visits_by_day <- strsplit(as.character(dat2[,"visits_by_day"]), split = ",")
visits_by_day <- as.numeric(unlist(visits_by_day))
visits_by_day <- t(as.data.frame(visits_by_day))
colnames(visits_by_day) <- c("Day-1", "Day-2","Day-3","Day-4", "Day-5","Day-6","Day-7")
###End Parsing visits by each day###
dat2 = cbind(dat2,visits_by_hour,visits_by_day)
drop_cols <- c("nchar","visits_by_day","visits_by_each_hour","poi_cbg",
"visitor_home_cbgs","visitor_daytime_cbgs","visitor_country_of_origin","bucketed_dwell_times")
dat2 = dat2[ , !(names(dat2) %in% drop_cols)]
final = rbind(final,dat2) #comes out with 207 columns with safegraph_place_id as the key. Growing final with rbind() each iteration is the main bottleneck; see the sketch after the loop.
}
print(Sys.time()-t1)
##End For Loop##
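##One likely speed-up (a sketch, not tested on the full data): growing final with rbind() copies
##everything accumulated so far on every iteration, which is quadratic in the number of rows.
##Collecting the one-row results in a pre-allocated list and binding once at the end avoids that:
#library(data.table)
#rows = vector("list", nrow(dat1)) #one slot per POI
#for (i in 1:nrow(dat1)) {
#  ##...same loop body as above, ending with the finished one-row dat2...
#  rows[[i]] = dat2
#}
#final = data.table::rbindlist(rows, use.names = TRUE, fill = TRUE)
##Expanding each JSON column once for the whole table with expand_cat_json and aggregating by
##safegraph_place_id, instead of expanding row by row inside the loop, might avoid the loop entirely.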
###If anyone can make this loop faster, or suggest another way to represent the JSON fields as columns without having to spend a fortune on computing equipment,
###it would be really helpful.