UPDATE: There is a new version of the API (v3). Please find the R tutorial for that API here. The Python (API v3) tutorial can be found here as well.
Dewey launched a new data deployment system on August 23, 2023. Here is a step-by-step guide to downloading data in bulk or reading sample data in R. (You can find a Python version here.)
1. Create API Key
In the system, click Connections → Add Connection to create your API key.
As the message says, make a copy of your API key and store it somewhere safe. Also, be sure to hit the Save button before using the key.
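If you prefer not to paste the key directly into your scripts, one common R convention (not required by Dewey; the variable name below is just an example) is to keep the key in your .Renviron file and read it with Sys.getenv():
# Hypothetical example: add a line like DEWEY_API_KEY=your-key-here to ~/.Renviron,
# restart R, then read the key without hard-coding it in your script
apikey_ = Sys.getenv("DEWEY_API_KEY")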
2. Get a product path
Choose your product, then click Get / Subscribe → Connect to API to get the API endpoint (product path). Make a copy of it.
3. R Code
Here is the R code for file downloads.
library(httr);
library(rjson);
library(jsonlite);
# Get the list of files on server
get_file_list = function(apikey, product_path, print_info = T) {
response = tryCatch(
{
GET(url=product_path,
add_headers(.headers = c("X-API-KEY" = apikey,
"accept" = "application/json")))
}, warning = function(cond) {
message("Warning in GET.")
message(cond)
message("")
return(NULL)
}, error = function(cond) {
message("Error in GET.")
message(cond)
message("")
return(NULL)
}
)
if(is.null(response)) {
return(NULL)
} else if(response$status_code != 200) {
# Report a failed request (e.g., 401 Unauthorized) and stop
print(response);
return(NULL);
}
res_json = content(response)
num_files = res_json$metadata$num_files
total_size_mb = res_json$metadata$total_size_mb
avg_file_size_mb = res_json$metadata$avg_file_size_mb
expires_at = res_json$metadata$expires_at
if(print_info) {
message("Files information---------------------------------------")
message(paste0("Number of files: ", num_files))
message(paste0("Total size (MB): ", total_size_mb))
message(paste0("Average file size (MB): ", avg_file_size_mb))
message(paste0("Link expires: ", expires_at))
message("--------------------------------------------------------")
}
files_df = data.frame(download_link = unlist(res_json$download_links))
split_links = do.call(rbind.data.frame,
strsplit(files_df$download_link, "?", fixed = T))
files_df$file_link = split_links[, 1];
files_df = files_df[order(files_df$file_link), ];
# Extract the file name
file_names = apply(data.frame(files_df$file_link), 1,
function(x) tail(unlist(strsplit(x, "/")), n= 1) );
files_df$file_name = file_names;
return(files_df);
}
# Read URL data into memory
read_sample_data = function(url, nrows = 100) {
# if(nrows > 1000) {
# message("Warning: set nrows no greater than 1000.");
# nrows = 1000;
# }
url_con = gzcon(url(url), text = T);
df = read.csv(url_con, nrows = nrows);
return(df);
}
# Read first file data into memory
read_sample_data0 = function(apikey, product_path, nrows = 100) {
files_df = get_file_list(apikey, product_path, print_info = T);
message(" ");
# && short-circuits, so nrow() is not called when files_df is NULL
if(!is.null(files_df) && (nrow(files_df) > 0)) {
return(read_sample_data(files_df$download_link[1], nrows));
}
}
# Download files from file list to a destination folder
download_files = function(files_df, dest_folder, filename_prefix = NULL) {
dest_folder = gsub("\\", "/", dest_folder, fixed = T);
if(!endsWith(dest_folder, "/")) {
dest_folder = paste0(dest_folder, "/");
}
# number of files
num_files = nrow(files_df);
for (i in 1:num_files) {
message(paste0("Downloading ", i, "/", num_files))
download_link = files_df$download_link[i];
file_name = paste0(filename_prefix, files_df$file_name[i]);
dest_path = paste0(dest_folder, file_name)
message(paste0("Writing ", dest_path))
message("Please be patient. It may take a while...")
data = GET(url = download_link)
file_con = file(dest_path, "wb")
# Write the raw response bytes to disk (files are gzip-compressed CSVs)
writeBin(content(data, as = "raw"), file_con)
close(file_con)
message(" ")
}
}
# Download files with apikey and product path to a destination folder
download_files0 = function(apikey, product_path, dest_folder, filename_prefix = NULL) {
files_df = get_file_list(apikey, product_path, print_info = T);
message(" ");
# Stop if the file list could not be retrieved
if(is.null(files_df)) {
return(invisible(NULL));
}
download_files(files_df, dest_folder, filename_prefix);
}
It has the following functions:
- get_file_list: gets the list of available files as a data.frame
- read_sample_data: reads a sample of data from a file download URL
- read_sample_data0: reads a sample of data from the first file, given an API key and product path
- download_files: downloads the files in a file list to a destination folder
- download_files0: downloads files to a destination folder, given an API key and product path
4. Examples
I am going to use the Advan Monthly Patterns data as an example.
# API Key
apikey_ = "Paste your API key from step 1 here."
# Advan product path
product_path_= "Paste product path from step 2 here."
You will have only one API key, but a different product path for each product.
You can now see the list of files to download by
files_df = get_file_list(apikey_, product_path_, print_info = T);
files_df;
Setting print_info = TRUE prints the files' meta information, like below:
Advan has a total of 2,560 files with an average file size of 197.9 MB.
files_df contains the file list (a data.frame) like below:
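For reference (this is just a sketch; the exact contents depend on your product and when the links were generated), you can check what came back with standard R tools:
# Inspect the returned data.frame; it has three character columns:
# download_link (pre-signed URL), file_link (URL without the query string),
# and file_name (e.g., "xxxxxxx.csv.gz")
str(files_df)
head(files_df$file_name)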
You can quickly load and inspect sample data with
sample_data = read_sample_data(files_df$download_link[1], nrows = 100);
This loads the first 100 rows of the first file in files_df (files_df$download_link[1]). You can sample any file in the list the same way.
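For example, to peek at a different file, pass its download link (the index 10 below is arbitrary):
# Sample the first 100 rows of the 10th file in the list
sample_data10 = read_sample_data(files_df$download_link[10], nrows = 100);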
If you want to see the first n rows of the first file without calling get_file_list yourself, you can use
sample_data = read_sample_data0(apikey_, product_path_, nrows = 1000);
This will load the first 1000 rows of the first file of the Advan data.
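Once the sample is in memory, you can inspect it with the usual R tools, for example:
# Quick checks on the sample just loaded
dim(sample_data)      # number of rows and columns
colnames(sample_data) # column names
head(sample_data)     # first few rows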
Now it’s time to download data to your local drive. First, you can download all the files by
download_files0(apikey_, product_path_, "E:/temp", "advan_mp_")
The third parameter is the destination folder ("E:/temp"), and the last parameter ("advan_mp_") is a filename prefix, so all the files will be saved as "advan_mp_xxxxxxx.csv.gz", etc. You can leave the prefix empty or NULL to save the files without a prefix.
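For example, to download the same files without a prefix (so they keep their original names):
# Omit the prefix argument to keep the original file names
download_files0(apikey_, product_path_, "E:/temp")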
The second approach to download files is to pass files_df:
download_files(files_df, "E:/temp", "advan_mp_")
Sometimes the download may stop or fail partway through. If you want to resume from specific files, you can pass a slice of files_df:
download_files(files_df[5:7, ], "E:/temp", "advan_mp_")