| 
									
										
										
										
											2025-07-06 12:29:53 -07:00
										 |  |  | """
 | 
					
						
							|  |  |  | Run this file to download data from Dryad and unzip the zip files. Downloaded files end | 
					
						
							|  |  |  | up in this repository's data/ directory. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | First create the b2txt25 conda environment. Then in a Terminal, at this repository's | 
					
						
							|  |  |  | top-level directory (nejm-brain-to-text/), run: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | conda activate b2txt25 | 
					
						
							|  |  |  | python download_data.py | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import sys | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import urllib.request | 
					
						
							|  |  |  | import json | 
					
						
							|  |  |  | import zipfile | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ######################################################################################## | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Helpers. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ######################################################################################## | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def display_progress_bar(block_num, block_size, total_size, message=""): | 
					
						
							|  |  |  |     """""" | 
					
						
							|  |  |  |     bytes_downloaded_so_far = block_num * block_size | 
					
						
							|  |  |  |     MB_downloaded_so_far = bytes_downloaded_so_far / 1e6 | 
					
						
							|  |  |  |     MB_total = total_size / 1e6 | 
					
						
							|  |  |  |     sys.stdout.write( | 
					
						
							|  |  |  |         f"\r{message}\t\t{MB_downloaded_so_far:.1f} MB / {MB_total:.1f} MB" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     sys.stdout.flush() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ######################################################################################## | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Main function. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ######################################################################################## | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def main(): | 
					
						
							|  |  |  |     """""" | 
					
						
							|  |  |  |     DRYAD_DOI = "10.5061/dryad.dncjsxm85" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ## Make sure the command is being run from the right place and we can see the data/ | 
					
						
							|  |  |  |     ## directory. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     DATA_DIR = "data/" | 
					
						
							|  |  |  |     data_dirpath = os.path.abspath(DATA_DIR) | 
					
						
							|  |  |  |     assert os.getcwd().endswith( | 
					
						
							|  |  |  |         "nejm-brain-to-text" | 
					
						
							|  |  |  |     ), f"Please run the download command from the nejm-brain-to-text directory (instead of {os.getcwd()})" | 
					
						
							|  |  |  |     assert os.path.exists( | 
					
						
							|  |  |  |         data_dirpath | 
					
						
							|  |  |  |     ), "Cannot find the data directory to download into." | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ## Get the list of files from the latest version on Dryad. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     DRYAD_ROOT = "https://datadryad.org" | 
					
						
							|  |  |  |     urlified_doi = DRYAD_DOI.replace("/", "%2F") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     versions_url = f"{DRYAD_ROOT}/api/v2/datasets/doi:{urlified_doi}/versions" | 
					
						
							|  |  |  |     with urllib.request.urlopen(versions_url) as response: | 
					
						
							|  |  |  |         versions_info = json.loads(response.read().decode()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     files_url_path = versions_info["_embedded"]["stash:versions"][-1]["_links"][ | 
					
						
							|  |  |  |         "stash:files" | 
					
						
							|  |  |  |     ]["href"] | 
					
						
							|  |  |  |     files_url = f"{DRYAD_ROOT}{files_url_path}" | 
					
						
							|  |  |  |     with urllib.request.urlopen(files_url) as response: | 
					
						
							|  |  |  |         files_info = json.loads(response.read().decode()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     file_infos = files_info["_embedded"]["stash:files"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ## Download each file into the data directory (and unzip for certain files). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for file_info in file_infos: | 
					
						
							|  |  |  |         filename = file_info["path"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if filename == "README.md": | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         download_path = file_info["_links"]["stash:download"]["href"] | 
					
						
							|  |  |  |         download_url = f"{DRYAD_ROOT}{download_path}" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         download_to_filepath = os.path.join(data_dirpath, filename) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         urllib.request.urlretrieve( | 
					
						
							|  |  |  |             download_url, | 
					
						
							|  |  |  |             download_to_filepath, | 
					
						
							|  |  |  |             reporthook=lambda *args: display_progress_bar( | 
					
						
							|  |  |  |                 *args, message=f"Downloading {filename}" | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         sys.stdout.write("\n") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # If this file is a zip file, unzip it into the data directory. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if file_info["mimeType"] == "application/zip": | 
					
						
							|  |  |  |             print(f"Extracting files from {filename} ...") | 
					
						
							|  |  |  |             with zipfile.ZipFile(download_to_filepath, "r") as zf: | 
					
						
							|  |  |  |                 zf.extractall(data_dirpath) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-07-06 17:18:43 -07:00
										 |  |  |     print(f"\nDownload complete. See data files in {data_dirpath}\n") | 
					
						
							| 
									
										
										
										
											2025-07-06 12:29:53 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Run the download only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()