Since you are using R, here is my code for bulk downloading files (and checking versions, etc.)
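# Setup: packages, the working directory where the data will live, and a
# parallel plan for the downloads. The XXXX placeholders are stand-ins --
# replace them with your own local path and Dewey Marketplace credentials.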
library(jsonlite)
library(httr2)
library(glue)
library(tidyverse)
library(furrr)
oldwd=getwd()
advan_root="XXXXXXXXXXXX"
setwd(advan_root)
plan(sequential)
plan(multisession,workers=12)
username="XXXXXXXXXXXXXXXX"
auth="XXXXXXXXXXXXXXX"
base64_auth=openssl::base64_encode(auth)
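# request a bearer token from the Dewey auth endpoint using HTTP basic auth;
# the token is attached to every later API call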
resp<-
  request("https://marketplace.deweydata.io/api/auth/tks/get_token")|>
  req_method("POST")|>
  req_auth_basic(username,auth)|>
  req_perform(verbosity = 3)
token=resp_body_json(resp)$access_token
u0="/api/data/v2/list"
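# get_url(): fetch one listing page from the v2 API, flatten the JSON into one
# entry per item, and recurse into any sub-directories it contains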
get_url<-function(u) {
  r<-
    request(glue("https://marketplace.deweydata.io{u}?${username}"))|>
    req_method("GET")|>
    req_headers(
      Accept = "application/json"
    )|>
    req_auth_bearer_token(token)|>
    req_perform()|>
    resp_body_json()|>
    enframe(name = NULL)|>
    unnest_wider(col=c(value))
  # recurse into sub-directories row by row; keep plain files as single-row entries
  r=map(transpose(r),~if(isTRUE(.$directory=="TRUE")) get_url(.$url) else list(.))|>
    unlist(recursive = FALSE)
  return(r)
}
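# download a single file with the bearer token attached; the download goes to a
# temporary file first so a partial download never overwrites a good local copy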
download_file = function(token, src_url, dest_file) {
  print(c("dl",dest_file))
  options(timeout=200) # increase the timeout if you have a large file to download
  tf=tempfile()
  download.file(src_url, tf, mode = "wb",
                headers = c(Authorization = paste0("Bearer ", token)),
                quiet=TRUE)
  file.copy(tf, dest_file, overwrite = TRUE)
  if(file.exists(tf)) {file.remove(tf)}
}
safe_download_file=purrr::possibly(download_file)
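# walk the remote listing: start at the top-level list endpoint and keep
# expanding directories until none are left (capped at 20 rounds)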
file_list=tibble()
dir_list=list(u0)
i=0
while(length(dir_list)>0 & i<20){
  res=map_dfr(dir_list,~get_url(.))
  dir_list=res|>filter(directory)%>%.$url
  file_list=bind_rows(
    file_list,
    res|>filter(!directory)
  )
  dir_list=dir_list[!is.na(dir_list)]
  i=i+1
}
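# build the local destination directory: the remote parent folder looks like
# YYYY/MM/DD/ADVAN/GROUP and is rewritten as GROUP/YYYY/MM/DD/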
file_list$dest_dir=
  str_extract(file_list$parent,"[0-9]{4}/[0-9]{2}/[0-9]{2}/ADVAN/[A-Z]+")%>%
  str_remove(.,"/ADVAN")%>%
  sub("([0-9]{4})/([0-9]{2})/([0-9]{2})/([A-Z]+)","\\4/\\1/\\2/\\3/",.)
file_list<-
  file_list%>%
  mutate(dest_file_path=paste0(dest_dir,"/",fid,".csv.gz"))
file_list<-
  file_list%>%
  mutate(
    across(c(createdAt,updatedAt),~as_date(.))
  )
file_list<-
  file_list|>
  mutate(dest_file_path=str_replace(dest_file_path,"//","/"))
# get dates and sizes for the raw files
raw_info=file.info(file_list$dest_file_path)|>as_tibble(rownames = "file_path")
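# compare the remote listing with what is already on disk:
#   new_files      - listed remotely but not present locally
#   updated_files  - present locally but stale (size mismatch or older than updatedAt)
#   existing_files - present locally and up to date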
new_files<-
  inner_join(
    file_list,
    raw_info|>
      mutate(
        group=str_extract(str_remove(file_path,"clean/"),"[^/]*"),
        check_date=pmax(mtime,ctime,atime,na.rm=TRUE)
      )|>
      filter(is.na(check_date))|>
      select(file_path,dest_size=size,check_date),
    by=join_by(dest_file_path==file_path)
  )|>
  select(url,dest_dir,dest_file_path)
updated_files<-
  inner_join(
    file_list,
    raw_info|>
      mutate(
        group=str_extract(str_remove(file_path,"clean/"),"[^/]*"),
        check_date=pmax(mtime,ctime,atime,na.rm=TRUE)
      )|>
      filter(!is.na(check_date))|>
      select(file_path,dest_size=size,check_date),
    by=join_by(dest_file_path==file_path)
  )|>
  filter(!(size==dest_size & check_date>=updatedAt) | is.na(dest_size))|>
  select(url,dest_dir,dest_file_path,check_date,updatedAt)
existing_files<-
  inner_join(
    file_list,
    raw_info|>
      mutate(
        group=str_extract(str_remove(file_path,"clean/"),"[^/]*"),
        check_date=pmax(mtime,ctime,atime,na.rm=TRUE)
      )|>
      filter(!is.na(check_date))|>
      select(file_path,dest_size=size,check_date),
    by=join_by(dest_file_path==file_path)
  )|>
  filter((size==dest_size & check_date>=updatedAt))|>
  select(url,dest_dir,dest_file_path,source_size=size,dest_size,createdAt,updatedAt,check_date)
# check for local files that need to be removed
all_files<-
  map(
    list("MP","NP","NPCA","WP"),
    ~list.files(.,recursive = TRUE,full.names = TRUE)
  )|>unlist()
all_files<-all_files[str_detect(all_files,"^.P")]
check_list<-
  map_lgl(
    all_files,
    ~. %in% file_list$dest_file_path
  )
file.remove(all_files[!check_list & !str_detect(all_files,"mp_brand")])
# now do the same for the clean folders
# for clean folders, the date is the oldest date for a given type-by-date combination
clean_info<-
  file.info(list.files("clean",recursive = TRUE,full.names = TRUE))|>
  as_tibble(rownames = "file_path")|>
  mutate(
    data_date=str_extract(basename(file_path),"[0-9]{4}-[0-9]{2}-[0-9]{2}"),
    group=str_extract(str_remove(file_path,"clean/"),"[^/]*")
  )|>
  summarize(
    across(c(mtime,ctime,atime),~min(.)),
    .by=c(data_date,group)
  )|>
  mutate(
    dest_dir=paste0(group,"/",str_replace_all(data_date,"-","/"),"/"),
    data_date=NULL,
    group=NULL
  )
# now build the "to-do" list of downloads
to_do_list<-
  bind_rows(
    new_files,
    updated_files
  )|>
  filter(!is.na(dest_dir), dest_dir!="NA")|>
  select(url,dest_dir,dest_file_path)
ds=unique(to_do_list$dest_dir)|>unlist()|>sort()
map(ds,~dir.create(.,recursive = TRUE))
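# download group by group, one destination directory at a time, with the files
# inside each directory fetched in parallel by the furrr workers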
gs=str_extract(ds,"^[^/]+/")|>unique()
for(g in gs){
  ds_g=ds[str_detect(ds,g)]
  for(d in ds_g) {
    w<-to_do_list|>
      filter(dest_dir==d)
    print(d)
    future_walk2(
      w$url,
      w$dest_file_path,
      function(f,d) {
        safe_download_file(token,paste0("https://marketplace.deweydata.io", f),d)
      },
      .progress = TRUE
    )
  }
}
plan(sequential)