# Copyright (C) 2014-2016 DNAnexus, Inc. # # This file is part of dx-toolkit (DNAnexus platform client libraries). # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy # of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. ''' This module handles download commands for the dx command-line client. ''' from __future__ import print_function, unicode_literals, division, absolute_import import collections import os import subprocess import sys import logging import dxpy from ..utils.resolver import (resolve_existing_path, get_first_pos_of_char, is_project_explicit, object_exists_in_project, is_jbor_str) from ..exceptions import err_exit from . import try_call from dxpy.utils.printing import (fill) from dxpy.utils import pathmatch # Check if a program (wget, curl, etc.) is on the path, and # can be called. def _which(program): def is_exe(fpath): return os.path.isfile(fpath) and os.access(fpath, os.X_OK) for path in os.environ["PATH"].split(os.pathsep): exe_file = os.path.join(path, program) if is_exe(exe_file): return exe_file return None # Caluclate the md5 checkum for [filename], and raise # an exception if the checksum is wrong. 
def _verify(filename, md5digest):
    """Verify that the md5 checksum of local file *filename* equals *md5digest*.

    Shells out to the external ``md5sum`` utility and exits (via err_exit)
    if it is missing, fails to run, produces unparseable output, or reports
    a digest different from the expected one.
    """
    md5sum_exe = _which("md5sum")
    if md5sum_exe is None:
        err_exit("md5sum is not installed on this system")
    cmd = [md5sum_exe, "-b", filename]
    try:
        print("Calculating checksum")
        cmd_out = subprocess.check_output(cmd)
    except subprocess.CalledProcessError:
        err_exit("Failed to run md5sum: " + str(cmd))

    line = cmd_out.strip().split()
    if len(line) != 2:
        err_exit("md5sum returned weird results: " + str(line))
    actual_md5 = line[0]
    # BUG FIX: subprocess.check_output returns bytes on Python 3, so the
    # comparison against the (text) expected digest would always fail
    # without decoding first.
    if not isinstance(actual_md5, str):
        actual_md5 = actual_md5.decode("ascii")

    if actual_md5 != md5digest:
        err_exit("Checksum doesn't match " + actual_md5 + "  expected:" + md5digest)
    print("Checksum correct")


def download_one_file(project, file_desc, dest_filename, args):
    """Download a single file object to the local path *dest_filename*.

    :param project: project ID to download from (or a no-project hint)
    :param file_desc: describe output for the object; must include at least
        'class', 'state', 'name' and 'id'
    :param dest_filename: local destination path
    :param args: parsed command-line arguments (overwrite, show_progress, ...)

    Non-file or non-closed objects are skipped with a message on stderr.
    Exits via err_exit if the destination exists without --overwrite, or if
    the download itself fails.
    """
    if not args.overwrite:
        if os.path.exists(dest_filename):
            err_exit(fill('Error: path "' + dest_filename + '" already exists but -f/--overwrite was not set'))

    if file_desc['class'] != 'file':
        print("Skipping non-file data object {name} ({id})".format(**file_desc), file=sys.stderr)
        return

    if file_desc['state'] != 'closed':
        print("Skipping file {name} ({id}) because it is not closed".format(**file_desc), file=sys.stderr)
        return

    # show_progress is optional on args; default to no progress bar.
    try:
        show_progress = args.show_progress
    except AttributeError:
        show_progress = False

    try:
        # symlink_max_tries is also optional; fall back to 15 attempts.
        symlink_max_tries = args.symlink_max_tries if vars(args).get('symlink_max_tries') is not None else 15
        dxpy.download_dxfile(
            file_desc['id'],
            dest_filename,
            show_progress=show_progress,
            project=project,
            describe_output=file_desc,
            symlink_max_tries=symlink_max_tries)
        return
    except:
        # err_exit() with no arguments reports the in-flight exception.
        err_exit()


def do_debug(msg):
    """Emit *msg* on the root logger at DEBUG level."""
    logging.debug(msg)


# dest_filename = local file where downloaded file will go
# src_filename = name of parquet file or folder being downloaded from database
def download_one_database_file(project, database_desc, dest_filename, src_filename, file_status, args):
    """Download one file (or folder of parquet parts) from a database object.

    :param project: project ID containing the database
    :param database_desc: describe output for the database object
    :param dest_filename: local path where the downloaded file will go
    :param src_filename: name of the parquet file or folder within the database
    :param file_status: per-file status metadata passed through to dxpy, may be None
    :param args: parsed command-line arguments (overwrite, show_progress, ...)
    """
    do_debug("download.py#download_one_database_file - src_filename = {}".format(src_filename))
    if file_status is not None:
        do_debug("download.py#download_one_database_file - file_status = {}".format(file_status))
    if not args.overwrite:
        if os.path.exists(dest_filename):
            err_exit(fill('Error: path "' + dest_filename + '" already exists but -f/--overwrite was not set'))

    if database_desc['class'] != 'database':
        print("Skipping non-database data object {name} ({id})".format(**database_desc), file=sys.stderr)
        return

    try:
        show_progress = args.show_progress
    except AttributeError:
        show_progress = False

    try:
        dxpy.download_dxdatabasefile(
            database_desc['id'],
            dest_filename,
            src_filename,
            file_status,
            show_progress=show_progress,
            project=project,
            describe_output=database_desc)
    except:
        err_exit()


def _ensure_local_dir(d):
    """Create local directory *d* (and parents) if needed; exit if *d* exists
    but is not a directory."""
    if not os.path.isdir(d):
        if os.path.exists(d):
            err_exit(fill('Error: path "' + d + '" already exists and is not a directory'))
        os.makedirs(d)


def _is_glob(path):
    """Return True if *path* contains a glob metacharacter ('*' or '?')."""
    return get_first_pos_of_char('*', path) > -1 or get_first_pos_of_char('?', path) > -1


def _rel2abs(path, project):
    """Resolve *path* against the current remote working directory.

    Returns (abs_path, strip_prefix) where strip_prefix is the leading remote
    folder that should be removed when computing local destination paths.
    Paths that are already absolute, or that target a project other than the
    current workspace, are left as given.
    """
    if path.startswith('/') or dxpy.WORKSPACE_ID != project:
        abs_path, strip_prefix = path, os.path.dirname(path.rstrip('/'))
    else:
        wd = dxpy.config.get('DX_CLI_WD', u'/')
        abs_path, strip_prefix = os.path.join(wd, path), os.path.dirname(os.path.join(wd, path).rstrip('/'))
    if len(abs_path) > 1:
        abs_path = abs_path.rstrip('/')
    return abs_path, strip_prefix


def _download_files(files, destdir, args, dest_filename=None):
    """Download every resolved file in *files* (a project -> matches mapping).

    Each file goes to *dest_filename* if given, otherwise into *destdir*
    under its remote name ('/' in names is escaped as '%2F').
    """
    for project in files:
        for f in files[project]:
            file_desc = f['describe']
            dest = dest_filename or os.path.join(destdir, file_desc['name'].replace('/', '%2F'))
            download_one_file(project, file_desc, dest, args)


def _download_folders(folders, destdir, args):
    """Recursively download every resolved folder in *folders* (a mapping of
    project -> list of (folder, strip_prefix) pairs) into *destdir*.

    Exits if a folder was requested without -r/--recursive, or if the
    download fails.
    """
    try:
        show_progress = args.show_progress
    except AttributeError:
        show_progress = False

    for project in folders:
        for folder, strip_prefix in folders[project]:
            if not args.recursive:
                err_exit('Error: "' + folder + '" is a folder but the -r/--recursive option was not given')
            assert(folder.startswith(strip_prefix))
            folder_destdir = os.path.join(destdir, folder[len(strip_prefix):].lstrip('/'))
            try:
                dxpy.download_folder(project, folder_destdir, folder=folder,
                                     overwrite=args.overwrite, show_progress=show_progress)
            except:
                err_exit()


# Main entry point.
def download(args):
    """Entry point for 'dx download': resolve each requested path to files
    and/or folders, validate the requests, then download everything.

    :param args: parsed command-line arguments (paths, all, output, overwrite,
        recursive, lightweight, ...)
    """
    folders_to_get, files_to_get, count = collections.defaultdict(list), collections.defaultdict(list), 0
    foldernames, filenames = [], []
    for path in args.paths:
        # Attempt to resolve name. If --all is given or the path looks like a glob, download all matches.
        # Otherwise, the resolver will display a picker (or error out if there is no tty to display to).
        resolver_kwargs = {'allow_empty_string': False}
        if args.all or _is_glob(path):
            resolver_kwargs.update({'allow_mult': True, 'all_mult': True})

        # include "parts" and a few additional fields in the description so that
        # we don't have to call a separate describe method downstream
        resolver_kwargs.update({"describe": {"parts": True,
                                             "size": True,
                                             "drive": True,
                                             "md5": True,
                                             "perPartCheckSum": True}})

        project, folderpath, matching_files = try_call(resolve_existing_path, path, **resolver_kwargs)
        if matching_files is None:
            matching_files = []
        elif not isinstance(matching_files, list):
            matching_files = [matching_files]

        # TODO: this could also be returned as metadata by resolve_path since
        # resolve_path knows these things in some circumstances
        path_has_explicit_proj = is_project_explicit(path) or is_jbor_str(path)

        if is_jbor_str(path):
            assert len(matching_files) == 1
            project = matching_files[0]["describe"]["project"]

        matching_folders = []
        # project may be none if path is an ID and there is no project context
        if project is not None:
            colon_pos = get_first_pos_of_char(":", path)
            if colon_pos >= 0:
                path = path[colon_pos + 1:]
            abs_path, strip_prefix = _rel2abs(path, project)
            parent_folder = os.path.dirname(abs_path)
            folder_listing = dxpy.list_subfolders(project, parent_folder, recurse=False)
            matching_folders = pathmatch.filter(folder_listing, abs_path)
            if '/' in matching_folders and len(matching_folders) > 1:
                # The list of subfolders is {'/', '/A', '/B'}.
                # Remove '/', otherwise we will download everything twice.
                matching_folders.remove('/')

        if len(matching_files) == 0 and len(matching_folders) == 0:
            err_exit(fill('Error: {path} is neither a file nor a folder name'.format(path=path)))

        # If the user did not explicitly provide the project, don't pass any
        # project parameter to the API call but continue with the download.
        if not path_has_explicit_proj:
            project = dxpy.DXFile.NO_PROJECT_HINT

        # If the user explicitly provided the project and it doesn't contain
        # the files, don't allow the download.
        # For speed's sake, skip this check (i.e. one API call) if the user
        # passed the lightweight argument
        #
        # If length of matching_files is 0 then we're only downloading folders
        # so skip this logic since the files will be verified in the API call.
        if not args.lightweight \
                and len(matching_files) > 0 \
                and path_has_explicit_proj \
                and not any(object_exists_in_project(f['describe']['id'], project) for f in matching_files):
            err_exit(fill('Error: specified project does not contain specified file object'))

        files_to_get[project].extend(matching_files)
        folders_to_get[project].extend(((f, strip_prefix) for f in matching_folders))
        count += len(matching_files) + len(matching_folders)

        filenames.extend(f["describe"]["name"] for f in matching_files)
        foldernames.extend(f[len(strip_prefix):].lstrip('/') for f in matching_folders)

    # A name that is both a file and a folder would collide at the same
    # local destination, so refuse that up front.
    if len(filenames) > 0 and len(foldernames) > 0:
        name_conflicts = set(filenames) & set(foldernames)
        if len(name_conflicts) > 0:
            msg = "Error: The following paths are both file and folder names, and " \
                  "cannot be downloaded to the same destination: "
            msg += ", ".join(sorted(name_conflicts))
            err_exit(fill(msg))

    # Decide on the local destination directory and (for a single object)
    # an optional explicit output filename.
    if args.output is None:
        destdir, dest_filename = os.getcwd(), None
    elif count > 1:
        if not os.path.exists(args.output):
            err_exit(fill("Error: When downloading multiple objects, --output must be an existing directory"))
        destdir, dest_filename = args.output, None
    elif os.path.isdir(args.output):
        destdir, dest_filename = args.output, None
    elif args.output.endswith('/'):
        err_exit(fill("Error: {path} could not be found".format(path=args.output)))
    else:
        destdir, dest_filename = os.getcwd(), args.output

    _download_folders(folders_to_get, destdir, args)
    _download_files(files_to_get, destdir, args, dest_filename=dest_filename)