From c2c71a981c01c29a48433971c9f66e2e7a43b3e6 Mon Sep 17 00:00:00 2001
From: Tyler
Date: Sun, 6 Jul 2025 12:29:53 -0700
Subject: [PATCH] Added a script to auto-download the data from Dryad

---
 README.md        |   2 +-
 download_data.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 download_data.py

diff --git a/README.md b/README.md
index ea78782..5109f61 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ The code is organized into five main directories: `utils`, `analyses`, `data`, `
 ## Data
 
 ### Data Overview
-The data used in this repository (which can be downloaded from [Dryad](https://datadryad.org/stash/dataset/doi:10.5061/dryad.dncjsxm85)) consists of various datasets for recreating figures and training/evaluating the brain-to-text model:
+The data used in this repository (which can be downloaded from [Dryad](https://datadryad.org/stash/dataset/doi:10.5061/dryad.dncjsxm85), either manually from the website or by running `download_data.py`) consists of various datasets for recreating figures and training/evaluating the brain-to-text model:
 - `t15_copyTask.pkl`: This file contains the online Copy Task results required for generating Figure 2.
 - `t15_personalUse.pkl`: This file contains the Conversation Mode data required for generating Figure 4.
 - `t15_copyTask_neuralData.zip`: This dataset contains the neural data for the Copy Task.
diff --git a/download_data.py b/download_data.py
new file mode 100644
index 0000000..9b37b2b
--- /dev/null
+++ b/download_data.py
@@ -0,0 +1,115 @@
+"""
+Run this file to download data from Dryad and unzip the zip files. Downloaded files end
+up in this repository's data/ directory.
+
+First create the b2txt25 conda environment. Then, in a terminal at this repository's
+top-level directory (nejm-brain-to-text/), run:
+
+conda activate b2txt25
+python download_data.py
+"""
+
+import sys
+import os
+import urllib.request
+import json
+import zipfile
+
+
+########################################################################################
+#
+# Helpers.
+#
+########################################################################################
+
+
+def display_progress_bar(block_num, block_size, total_size, message=""):
+    """Report download progress on a single line (urllib reporthook signature)."""
+    bytes_downloaded_so_far = block_num * block_size
+    MB_downloaded_so_far = bytes_downloaded_so_far / 1e6
+    MB_total = total_size / 1e6
+    sys.stdout.write(
+        f"\r{message}\t\t{MB_downloaded_so_far:.1f} MB / {MB_total:.1f} MB"
+    )
+    sys.stdout.flush()
+
+
+########################################################################################
+#
+# Main function.
+#
+########################################################################################
+
+
+def main():
+    """Download the dataset files from Dryad into data/ and unzip any zip archives."""
+    DRYAD_DOI = "10.5061/dryad.dncjsxm85"
+
+    ## Make sure the command is being run from the right place and we can see the data/
+    ## directory.
+
+    DATA_DIR = "data/"
+    data_dirpath = os.path.abspath(DATA_DIR)
+    assert os.getcwd().endswith(
+        "nejm-brain-to-text"
+    ), f"Please run the download command from the nejm-brain-to-text directory (instead of {os.getcwd()})"
+    assert os.path.exists(
+        data_dirpath
+    ), "Cannot find the data directory to download into."
+
+    ## Get the list of files from the latest version on Dryad.
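+    # NOTE: the JSON shape assumed below follows Dryad's "stash" REST API: the
+    # versions listing exposes "_embedded" -> "stash:versions" ordered oldest to
+    # newest (so [-1] is the latest version), and each version links to its file
+    # listing via "_links" -> "stash:files".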
+
+    DRYAD_ROOT = "https://datadryad.org"
+    urlified_doi = DRYAD_DOI.replace("/", "%2F")
+
+    versions_url = f"{DRYAD_ROOT}/api/v2/datasets/doi:{urlified_doi}/versions"
+    with urllib.request.urlopen(versions_url) as response:
+        versions_info = json.loads(response.read().decode())
+
+    files_url_path = versions_info["_embedded"]["stash:versions"][-1]["_links"][
+        "stash:files"
+    ]["href"]
+    files_url = f"{DRYAD_ROOT}{files_url_path}"
+    with urllib.request.urlopen(files_url) as response:
+        files_info = json.loads(response.read().decode())
+
+    file_infos = files_info["_embedded"]["stash:files"]
+
+    ## Download each file into the data directory (and unzip for certain files).
+
+    for file_info in file_infos:
+        filename = file_info["path"]
+
+        if filename == "README.md":
+            continue
+
+        download_path = file_info["_links"]["stash:download"]["href"]
+        download_url = f"{DRYAD_ROOT}{download_path}"
+
+        download_to_filepath = os.path.join(data_dirpath, filename)
+
+        urllib.request.urlretrieve(
+            download_url,
+            download_to_filepath,
+            reporthook=lambda *args: display_progress_bar(
+                *args, message=f"Downloading {filename}"
+            ),
+        )
+        sys.stdout.write("\n")
+
+        # If this file is a zip file, unzip it into the data directory.
+
+        if file_info["mimeType"] == "application/zip":
+            print(f"Extracting files from {filename} ...")
+            with zipfile.ZipFile(download_to_filepath, "r") as zf:
+                zf.extractall(data_dirpath)
+
+    print(f"Download complete. See data files in {data_dirpath}\n")
+
+
+if __name__ == "__main__":
+    main()