Bulk data files download with R

InvTech · December 5, 2022, 7:50pm

Hello Everyone.

I needed to download a bunch of files from Dewey to my hard drive in R. I found one example in Python ( Tutorial: How to Access Files via the API - Help - Dewey Community (deweydata.io)) but not in R. So I made one. Hope this helps R users.

# Load R libraries
library(httr);
library(rjson);
library(jsonlite);
library(rstudioapi);

# Dewey Marketplace endpoints used by the helper functions below
DEWEY_MP_ROOT <- "https://marketplace.deweydata.io"
DEWEY_DATA_ROOT <- "https://marketplace.deweydata.io/api/data/v2/list"
DEWEY_TOKEN_URL <- "https://marketplace.deweydata.io/api/auth/tks/get_token"

# Request an API access token from the Dewey Marketplace.
#
# Sends a POST to DEWEY_TOKEN_URL using HTTP basic authentication.
#   username: Dewey account user name (email address)
#   passw:    Dewey account password
# Returns the `access_token` field of the parsed response body.
# Stops with an error when the server answers with an HTTP error status or
# when the response carries no token, instead of silently returning NULL.
get_access_token <- function(username, passw) {
  response <- POST(DEWEY_TOKEN_URL, authenticate(username, passw))

  # Turn 4xx/5xx answers (e.g. wrong password) into an R error right here.
  stop_for_status(response, task = "get a Dewey access token")

  response_content <- content(response)
  if (!is.list(response_content) || is.null(response_content$access_token)) {
    stop("Dewey token response did not contain an access_token.",
         call. = FALSE)
  }

  response_content$access_token
}

# List the files available in a Dewey data folder.
#
#   token:    access token returned by get_access_token()
#   sub_path: folder path appended to DEWEY_DATA_ROOT, e.g.
#             "/2018/01/01/SAFEGRAPH/MPSP"; NULL lists DEWEY_DATA_ROOT itself
# Returns the JSON listing converted to a data.frame.
# Stops with an error when the server answers with an HTTP error status.
get_file_paths <- function(token, sub_path = NULL) {
  # add_headers() must be passed UNNAMED: httr treats named arguments in `...`
  # as URL components and only folds unnamed ones into the request config, so
  # `headers = add_headers(...)` would not attach the Authorization header.
  response <- GET(
    paste0(DEWEY_DATA_ROOT, sub_path),
    add_headers(Authorization = paste0("Bearer ", token))
  )
  stop_for_status(response, task = "list Dewey files")

  json_text <- content(response, as = "text", encoding = "UTF-8")

  # rjson and jsonlite both export fromJSON(); be explicit about which one.
  as.data.frame(jsonlite::fromJSON(json_text))
}

# Download a single file from Dewey (src_url) to a local destination file
# (dest_file).
#
#   token:     access token returned by get_access_token()
#   src_url:   full URL of the file to download
#   dest_file: local path the file is written to (binary mode)
# Returns the status code of download.file() invisibly (0 on success).
download_file <- function(token, src_url, dest_file) {
  # Allow at least 200 seconds for large files (raise the number if needed),
  # never lower a longer timeout the user already set, and restore the
  # session's previous value when the function exits.
  old_opts <- options(timeout = max(200, getOption("timeout")))
  on.exit(options(old_opts), add = TRUE)

  download.file(src_url, dest_file, mode = "wb",
                headers = c(Authorization = paste0("Bearer ", token)))
}

# Example ----------------------------------------------------------

# Prompt for the credentials at run time rather than storing them in the code.
# (Typing them directly into the script also works.)
user_name <- askForPassword("User name (email address)")
pass_word <- askForPassword("Password")

# Exchange the credentials for an access token and show it
tkn <- get_access_token(user_name, pass_word)
tkn

# List the files in the "/2018/01/01/SAFEGRAPH/MPSP" sub folder.
file_paths <- get_file_paths(
  token = tkn,
  sub_path = "/2018/01/01/SAFEGRAPH/MPSP"
)
head(file_paths)

# Download the first listed file to C:/temp/, as an example.
# In the file_paths data.frame, url[1] looks like:
# "/api/data/v2/data/2018/01/01/SAFEGRAPH/MPSP/20180101-safegraph_mpsp_cpgp_part9_0"
# and name[1] looks like: "core_poi-geometry-patterns-part9.csv.gz".
src_url <- paste0(DEWEY_MP_ROOT, file_paths[["url"]][1])
dest_file <- paste0("C:/temp/", file_paths[["name"]][1])

download_file(tkn, src_url, dest_file)

# Done!

evan-barry-dewey · December 5, 2022, 8:19pm

Awesome! Thanks for creating this. cc @ryank

Christian_Gunning_University_of_Georgia · December 20, 2022, 4:46am

This is great, thanks!
One suggestion/request - can you place code inside “preformatted output” tags (the </> in editing bar):

# url[1] looks like:
# "/api/data/v2/data/2018/01/01/SAFEGRAPH/MPSP/20180101-safegraph_mpsp_cpgp_part9_0"
# and name[1] looks like: "core_poi-geometry-patterns-part9.csv.gz".

This solves the problem of long lines getting cut off, as a scroll bar gets added.

Paola_Jaimes · February 9, 2023, 10:18pm

@InvTech, thank you so much for doing this! With almost no knowledge of R, I was able to get the csv files. I was wondering if you could show me how I can process the data before downloading it. I need to group by state. Thanks again!

InvTech · June 9, 2023, 11:44pm

@Paola_Jaimes, I know it’s very late, but I have created a version for Python. This can be implemented in R as well. A lot of things are going on on my side, and I haven’t had time to do it in R.

rachelf · August 17, 2023, 2:49pm

Hi @InvTech

I’m trying to download some ADVAN data via dewey. I’m really struggling to download the bulk data and hope you can assist me on this. I have put my code down below so far. Dewey has organised all the files by year, with month subfolders and >100 files within each month subfolder. I need to download all files from Jan 2020 to July 2023 and am struggling to do this. Hope you can help:

#—

Load R libraries

library(httr)
library(jsonlite)
library(rstudioapi)

Define global variables

DEWEY_TOKEN_URL ← “https://marketplace.deweydata.io/api/auth/tks/get_token”
DEWEY_MP_ROOT ← “https://marketplace.deweydata.io”
DEWEY_DATA_ROOT ← “https://marketplace.deweydata.io/api/data/v2/list”

Define functions ------------------------------------------------

Get access token

get_access_token ← function(username, passw) {
response ← POST(DEWEY_TOKEN_URL, authenticate(username, passw))
response_content ← content(response)

Print the response content for debugging

print(response_content)

return(response_content$access_token)
}

Return file paths in the sub_path folder

get_file_paths ← function(token, sub_path = NULL) {
response ← GET(paste0(DEWEY_DATA_ROOT, sub_path),
headers = add_headers(Authorization = paste0("Bearer ", token)))

json_text ← content(response, as = “text”, encoding = “UTF-8”, flatten = TRUE)

print(json_text) # Print the JSON response for debugging

response_df ← as.data.frame(fromJSON(json_text))
return(response_df)
}

Download a single file from Dewey (src_url) to a local destination file (dest_file).

download_file ← function(token, src_url, dest_file) {
options(timeout=200); # increase the timeout if you have a large file to download
download.file(src_url, dest_file, mode = “wb”,
headers = c(Authorization = paste0("Bearer ", token)));
}

Main Code

user_name ← askForPassword(“User name (email address)”);
pass_word ← askForPassword(“Password”);

Get access token

tkn ← get_access_token(user_name, pass_word);
tkn;

Get file paths in the “/2023/01/01/ADVAN/MP” sub folder.

file_paths ← get_file_paths(token = tkn,
sub_path = “/2023/01/01/ADVAN/MP”);
head(file_paths)

#Download Files
src_url ← paste0(DEWEY_MP_ROOT, file_paths$url[1]);

desktop_folder ← file.path(path.expand(“~”), “Desktop”, “DataFiles”) #destination folder
dest_file ← file.path(desktop_folder, file_paths$name[1])

download_file(tkn, src_url, dest_file)

InvTech · August 17, 2023, 3:45pm

Hi Rachel,

Try this out. This is exactly what you are looking for: Bulk data download and Python in R.

The main code is written in Python, but you can run Python in RStudio.

rachelf · August 17, 2023, 4:16pm

Hi Dong!

Thank you so much for getting back to me with this. I’ve tried to put the code into R but am still facing some issues. I’m not sure if I have executed it correctly:

SetUp - only run once

install.packages(“reticulate”);

Load library

library(reticulate);

Install miniconda if first-time use-----------------

miniconda is package management software for Python and otehr

Install miniconda. This will take a while

install_miniconda(path = “C:/Temp/miniconda”, update = T);

This doesn’t do anything.

This will display the current virtual environment list

Virtual environment list. This will be explained and setup later

conda_list(conda = “C:/Temp/miniconda/_conda.exe”);

----------------------------------------------------

Create new virtual environment (venv)

Python keeps a copy version of Python environment and

installed packages in venv.

Change env_path only. ------------------------------

env_path = “C:/Temp/miniconda/envs/venv_sample”;

----------------------------------------------------

conda.exe path

conda_path = “C:/Temp/miniconda/_conda.exe”;

Extract venv_name (venv_sample) from env_path

evn_split = strsplit(env_path, “/”)[[1]];
env_name = evn_split[length(evn_split)]; # env_name
env_name;

Create venv if first time use

Create virtual enviroment “venv_sample”

conda_create(envname = env_path, conda = conda_path);

This will show newly created “venv_sample”

conda_list(conda = conda_path);
#conda_remove(“venv_test”)

------------------------------------------------------------------------------

Set Python venv to “venv_sample”.

to make sure Python uses “venv_sample” as the venv.

Otherwise, R will use system defalut Python

which may cause error (especially when some required Python packages are

not installed.)

use_condaenv(condaenv = env_name, conda = conda_path);

Set up Python system path.

Assuming Python sources are in the ./Py folder,

add “.\Py” to the system path so that Python can search source code files.

Impoort Python’s “sys” package

py_sys = import(“sys”)

Append “.\Py” to the system path.

R users!!:

Python is case sensitive for folders when calling source code.

Be cautious in using upper and lower characters for folders.

“from py.sub_module import *” will be error because “py” is in lower character.

py_sys$path = c(py_sys$path, “.\Py”)

------------------------------------------------------------------------------

Install required packages

List installed Python packages

py_list_packages();

Everything will be installed to “venv_sample” folder

May take time. Be patient…

py_install(packages = c(“pandas”)); # provides Python version of data.frame
py_install(packages = c(“requests”)); # allows you to send HTTP/1.1 requests

py_install(packages = c(“scikit-learn”); # regression

py_install(packages = c(“matplotlib”)); # plot

py_install(packages = c(“seaborn”)); # plot

py_install(packages = c(“geopy”)); # geo

Test block

if(F) {
print(“Python setup test pandas data frame test.—”);

Test

pd = import(“pandas”);
pd$array(c(1, 2, 3));

print(“--------------------------------------------”);
}

####Run everytime

library(reticulate);

venv path

env_path = “C:/Temp/miniconda/envs/venv_sample”;

conda.exe path

conda_path = “C:/Temp/miniconda/_conda.exe”;

Extract venv_name (venv_sample) from env_path

evn_split = strsplit(env_path, “/”)[[1]];
env_name = evn_split[length(evn_split)]; # env_name
env_name;

use_condaenv(condaenv = env_name, conda = conda_path);

If you saved your Python codes in “./Py” folder

py_sys = import(“sys”)
py_sys$path = c(py_sys$path, “.\Py”)

#Saving Code

Import your Python functions from dewey_mp.py

source_python(“dewey_mp.py”)

Now you can use Python functions in R

Replace “user_name” and “pass_word” with your actual credentials

user_name ← askForPassword(“User name (email address)”);
pass_word ← askForPassword(“Password”);

Get access token

tkn ← get_access_token(user_name, pass_word)
print(tkn)

Download monthly files for the specified time range

download_files(user_name, pass_word, “C:/Temp”, 202301, 202307, “ADVAN”, “MP”)

InvTech · August 18, 2023, 12:26am

Hi,

Are you getting a tkn value? If so, your initial setup is right. If you post your error codes it will help a lot to figure out issues.

Following needs to be corrected.

download_files(user_name, pass_word, “C:/Temp”, 202301, 202307, “ADVAN”, “MP”)

Try wrapping the yyyymm values in as.integer(), as shown below. (This is due to differing data type expectations between R and Python.)

download_files(user_name, pass_word, “C:/Temp”, as.integer(202301), as.integer(202307), “ADVAN”, “MP”)

Dewey will be launching a new data delivery system soon, and I expect data downloads will get a lot easier.

Donn

rachelf · August 18, 2023, 12:55pm

Hi Donn,

I have tried to rerun the code but am facing a different error now (as below). I have attempted to delete the miniconda and reinstall it but the error still persists:

###SetUp - only run once
install.packages(“reticulate”);

#Load library
library(reticulate);

#Install miniconda if first-time use-----------------
#miniconda is package management software for Python and otehr

#Install miniconda. This will take a while
install_miniconda(path = “C:/Temp/miniconda”, update = T);

#This doesn’t do anything.
#This will display the current virtual environment list
#Virtual environment list. This will be explained and setup later
conda_list(conda = “C:/Temp/miniconda/_conda.exe”);

----------------------------------------------------

#Create new virtual environment (venv)
python keeps a copy version of Python environment and
#installed packages in venv.
#Change env_path only. ------------------------------
env_path = “C:/Temp/miniconda/envs/venv_sample”;

----------------------------------------------------

#conda.exe path
conda_path = “C:/Temp/miniconda/_conda.exe”;

#Extract venv_name (venv_sample) from env_path
evn_split = strsplit(env_path, “/”)[[1]];
env_name = evn_split[length(evn_split)]; # env_name
env_name;

#Create venv if first time use
#Create virtual enviroment “venv_sample”
conda_create(envname = env_path, conda = conda_path);

#This will show newly created “venv_sample”
conda_list(conda = conda_path);
#conda_remove(“venv_test”)

------------------------------------------------------------------------------

use_condaenv(condaenv = env_name, conda = conda_path);

#Impoort Python’s “sys” package
py_sys = import(“sys”)

#Append “.\Py” to the system path.
r users!!:
python is case sensitive for folders when calling source code.
#Be cautious in using upper and lower characters for folders.
#“from py.sub_module import *” will be error because “py” is in lower character.
py_sys$path = c(py_sys$path, “.\Py”)

------------------------------------------------------------------------------

#Install required packages
#List installed Python packages
py_list_packages();

#Everything will be installed to “venv_sample” folder
#May take time. Be patient…
py_install(packages = c(“pandas”)); # provides Python version of data.frame
py_install(packages = c(“requests”)); # allows you to send HTTP/1.1 requests
#py_install(packages = c(“scikit-learn”); # regression
#py_install(packages = c(“matplotlib”)); # plot
#py_install(packages = c(“seaborn”)); # plot
#py_install(packages = c(“geopy”)); # geo

Test block

if(F) {
print(“Python setup test pandas data frame test.—”);

Test

pd = import(“pandas”);
pd$array(c(1, 2, 3));

print(“--------------------------------------------”);
}

####Run everytime

library(reticulate);

#venv path
env_path = “C:/Temp/miniconda/envs/venv_sample”;

conda.exe path

conda_path = “C:/Temp/miniconda/_conda.exe”;

#Extract venv_name (venv_sample) from env_path
evn_split = strsplit(env_path, “/”)[[1]];
env_name = evn_split[length(evn_split)]; # env_name
env_name;

use_condaenv(condaenv = env_name, conda = conda_path);

#If you saved your Python codes in “./Py” folder
py_sys = import(“sys”)
py_sys$path = c(py_sys$path, “.\Py”)

#Saving Code

Import your Python functions from dewey_mp.py

source_python(“dewey_mp.py”)

#Now you can use Python functions in R

#Replace “user_name” and “pass_word” with your actual credentials
user_name ← askForPassword(“User name (email address)”);
pass_word ← askForPassword(“Password”);

#Get access token
tkn ← get_access_token(user_name, pass_word)
print(tkn)

#Download monthly files for the specified time range
download_files(user_name, pass_word, “C:/Temp”, as.integer(202301), as.integer(202307), “ADVAN”, “MP”)

Error:

conda_list(conda = “C:/Temp/miniconda/_conda.exe”);
Error: Specified conda binary ‘C:/Temp/miniconda/_conda.exe’ does not exist.
In addition: Warning message:
In conda_binary(conda) :
Supplied path is not a conda binary: ‘C:/Temp/miniconda/_conda.exe’

Martin_Andersen_UNC_Greensboro · August 18, 2023, 1:13pm

Since you are using R, here is my code for bulk downloading files (and checking versions, etc.)

library(jsonlite)
library(httr2)
library(glue)
library(tidyverse)
library(furrr)

oldwd=getwd()

advan_root=XXXXXXXXXXXX

setwd(advan_root)

plan(sequential)
plan(multisession,workers=12)

username=XXXXXXXXXXXXXXXX

auth=XXXXXXXXXXXXXXX

base64_auth=openssl::base64_encode(auth)

# Request an access token using HTTP basic authentication.
# Never hard-code real credentials in a script that is shared or posted:
# the user name comes from `username` (set above) and, because no password is
# supplied, httr2 prompts for it interactively.
# NOTE(review): assumes `username` holds the account email address — confirm.
# The default verbosity is used so that the authorization header and the
# returned token are not echoed to the console.
resp <-
  request("https://marketplace.deweydata.io/api/auth/tks/get_token") |>
  req_method("POST") |>
  req_auth_basic(username) |>
  req_perform()

token <- resp_body_json(resp)$access_token

u0=“/api/data/v2/list”

# Fetch one listing from the Dewey Marketplace API and return its entries.
#   u: URL path appended to "https://marketplace.deweydata.io"
#      (the script starts from u0, "/api/data/v2/list").
# Reads the globals `username` and `token` defined earlier in the script.
# NOTE(review): the curly quotes in this block look like forum rendering
# artifacts; R needs straight quotes — confirm against the author's file.
get_url<-function(u) {
# glue() interpolates {u} and {username}, so the request URL ends in
# "?$<username>". NOTE(review): the purpose of that query string is not
# evident from this script — confirm.
# The request is a GET asking for JSON, authenticated with the bearer token;
# the parsed JSON body is turned into a one-column tibble (enframe) whose
# entries are then spread into columns (unnest_wider).
r<-
request(glue(‘https://marketplace.deweydata.io{u}?${username}’))|>
req_method(“GET”)|>
req_headers(
Accept = “application/json”,
)|>
req_auth_bearer_token(token)|>
req_perform()|>
resp_body_json()|>
enframe(name = NULL)|>
unnest_wider(col=c(value))

# For each element of r: when its `directory` equals "TRUE", call get_url()
# again on its `url`; otherwise keep the element wrapped in a list. The result
# is flattened by one level.
# NOTE(review): r is a tibble at this point, so map() iterates over its
# columns rather than its rows, and the caller below also walks directories
# itself — verify this step behaves as intended.
r=map(r,~ifelse(.$directory==“TRUE”,get_url(.$url),list(.)))|>

unlist(recursive = FALSE)

return(r)
}

# Download one file from Dewey to a local path.
#
#   token:     bearer token for the Authorization header
#   src_url:   full URL of the file to download
#   dest_file: final local path of the file
# The file is first downloaded to a temporary file and only copied to
# dest_file once the download has finished, so an interrupted download does
# not leave a truncated file at the destination. (The original created the
# temporary file but downloaded straight to dest_file, then tried to copy a
# temp file that never existed.)
# Returns dest_file invisibly; stops with an error if the copy fails.
download_file <- function(token, src_url, dest_file) {
  print(c("dl", dest_file))

  # Allow at least 200 seconds for large files and restore the session's
  # previous timeout on exit.
  old_opts <- options(timeout = max(200, getOption("timeout")))
  on.exit(options(old_opts), add = TRUE)

  tf <- tempfile()
  # Always clean up the temporary file, even when the download fails.
  on.exit(if (file.exists(tf)) file.remove(tf), add = TRUE)

  download.file(src_url, tf, mode = "wb",
                headers = c(Authorization = paste0("Bearer ", token)),
                quiet = TRUE)

  copied <- file.copy(tf, dest_file, overwrite = TRUE)
  if (!copied) {
    stop("Could not copy the downloaded file to ", dest_file, call. = FALSE)
  }

  invisible(dest_file)
}

safe_download_file=purrr::possibly(download_file)

# Walk the remote folder tree level by level, starting at u0.
# Each pass lists every directory found in the previous pass; plain files are
# accumulated in file_list and sub-directories become the next pass's work.
file_list <- tibble()
dir_list <- list(u0)
i <- 0

# `i` caps the number of passes at 20 as a safety net against endless
# listings. (The original never incremented `i`, so its cap had no effect;
# this assumes a depth limit was intended.)
while (length(dir_list) > 0 && i < 20) {
  res <- map_dfr(dir_list, ~get_url(.))

  # Directories to visit next; drop entries without a URL.
  dir_list <- res |>
    filter(directory) |>
    pull(url)
  dir_list <- dir_list[!is.na(dir_list)]

  # Everything that is not a directory is a downloadable file.
  file_list <- bind_rows(
    file_list,
    res |> filter(!directory)
  )

  i <- i + 1
}

# Derive the local destination folder from each file's remote parent path:
# "<yyyy>/<mm>/<dd>/ADVAN/<GROUP>" becomes "<GROUP>/<yyyy>/<mm>/<dd>/".
# The backreferences in the replacement must be written "\\4" etc.; a single
# backslash is read by R as a character escape, not as a capture group.
file_list$dest_dir <-
  str_extract(
    file_list$parent,
    "[0-9]{4}/[0-9]{2}/[0-9]{2}/ADVAN/[A-Z]+"
  ) %>%
  str_remove(., "/ADVAN") %>%
  sub("([0-9]{4})/([0-9]{2})/([0-9]{2})/([A-Z]+)", "\\4/\\1/\\2/\\3/", .)

file_list <-
  file_list %>%
  mutate(
    # dest_dir already ends in "/", so collapse the resulting "//".
    dest_file_path = str_replace(
      paste0(dest_dir, "/", fid, ".csv.gz"),
      "//",
      "/"
    ),
    # Convert the API timestamps to dates for the comparisons further down.
    across(c(createdAt, updatedAt), ~as_date(.))
  )

# Get dates and sizes for the raw files that may already be on disk.
raw_info <- file.info(file_list$dest_file_path) |>
  as_tibble(rownames = "file_path")

# One row per expected local path: its size on disk and the latest of its
# three timestamps. Rows whose check_date is NA are treated below as files
# that have not been downloaded yet.
local_state <- raw_info |>
  mutate(check_date = pmax(mtime, ctime, atime, na.rm = TRUE)) |>
  select(file_path, dest_size = size, check_date)

# Remote files with no usable local copy.
new_files <-
  inner_join(
    file_list,
    local_state |> filter(is.na(check_date)),
    by = join_by(dest_file_path == file_path)
  ) |>
  select(url, dest_dir, dest_file_path)

# Remote files that do have a local copy, with both sets of metadata.
on_disk <-
  inner_join(
    file_list,
    local_state |> filter(!is.na(check_date)),
    by = join_by(dest_file_path == file_path)
  )

# A local copy is current when its size matches the remote size and it is at
# least as recent as the remote updatedAt; anything else needs re-downloading.
updated_files <- on_disk |>
  filter(!(size == dest_size & check_date >= updatedAt) | is.na(dest_size)) |>
  select(url, dest_dir, dest_file_path, check_date, updatedAt)

existing_files <- on_disk |>
  filter((size == dest_size & check_date >= updatedAt)) |>
  select(
    url, dest_dir, dest_file_path,
    source_size = size, dest_size, createdAt, updatedAt, check_date
  )

# Check for local files that need to be removed because they no longer appear
# in the remote listing.
# list.files() accepts several folders at once and returns character(0) when
# nothing is found.
all_files <- list.files(
  c("MP", "NP", "NPCA", "WP"),
  recursive = TRUE,
  full.names = TRUE
)

all_files <- all_files[str_detect(all_files, "^.P")]

# TRUE for every local file that is still part of the remote listing.
check_list <- all_files %in% file_list$dest_file_path

# Delete everything that is not listed, except paths containing "mp_brand".
file.remove(all_files[!check_list & !str_detect(all_files, "mp_brand")])

# Now do the same for the clean folders.
# For clean folders, the date is the oldest date for a given type-by-date
# combination.
clean_info <-
  file.info(list.files("clean", recursive = TRUE, full.names = TRUE)) |>
  as_tibble(rownames = "file_path") |>
  mutate(
    # Date embedded in the file name (yyyy-mm-dd).
    data_date = str_extract(
      basename(file_path),
      "[0-9]{4}-[0-9]{2}-[0-9]{2}"
    ),
    # First path component below "clean/".
    group = str_extract(str_remove(file_path, "clean/"), "[^/]*")
  ) |>
  summarize(
    across(c(mtime, ctime, atime), ~min(.)),
    .by = c(data_date, group)
  ) |>
  mutate(
    # Same "<group>/<yyyy>/<mm>/<dd>/" layout that dest_dir uses.
    dest_dir = paste0(group, "/", str_replace_all(data_date, "-", "/"), "/"),
    data_date = NULL,
    group = NULL
  )

# Build the to-do list: every new or outdated file that has a usable
# destination folder.
to_do_list <-
  bind_rows(
    new_files,
    updated_files
  ) |>
  filter(!is.na(dest_dir), dest_dir != "NA") |>
  select(url, dest_dir, dest_file_path)

# Distinct destination folders, sorted.
ds <- unique(to_do_list$dest_dir) |>
  unlist() |>
  sort()

# Create the folders. walk() is used because only the side effect matters;
# showWarnings = FALSE keeps re-runs quiet when a folder already exists.
walk(ds, ~dir.create(., recursive = TRUE, showWarnings = FALSE))

# Top-level group folders ("MP/", "NP/", ...) used to batch the downloads.
gs <- str_extract(ds, "^[^/]+/") |>
  unique()

# Download the to-do list one destination folder at a time, grouped by
# top-level group, using the parallel plan configured above.
for (g in gs) {
  ds_g <- ds[str_detect(ds, g)]

  for (d in ds_g) {
    w <- to_do_list |>
      filter(dest_dir == d)

    print(d)

    future_walk2(
      w$url,
      w$dest_file_path,
      # `dest` is named differently from the loop variable `d` on purpose.
      function(f, dest) {
        safe_download_file(
          token,
          paste0("https://marketplace.deweydata.io", f),
          dest
        )
      },
      # The comma above was missing in the original, which made this call a
      # syntax error.
      .progress = TRUE
    )
  }
}

plan(sequential)

InvTech · August 18, 2023, 10:24pm

Please check whether miniconda has been installed correctly. It takes time (~5-10 minutes), and you will see the following message in your R upon the successful installation.

Also check that your miniconda folder has a “_conda.exe” file.

Hope this works out.

InvTech · August 18, 2023, 10:26pm

Thanks for chiming in!

rachelf · August 19, 2023, 10:46am

hi @InvTech and

I think the issue is arising from the installation of miniconda. I am getting the error below:

InvTech · August 22, 2023, 7:00pm

Dewey launches its new data deployment system tomorrow, and I have posted a guide for it (Dewey Data Bulk Download in R (new system)).

As your problem comes from installing miniconda, it seems we are reaching a dead end. If you really wish to continue with this approach, I recommend using Python directly instead of running it via R.