From 915e8d7a4e7b81b97a7256eabc5b761424a48f9b Mon Sep 17 00:00:00 2001 From: Nick H <34072991+nickssl@users.noreply.github.com> Date: Tue, 23 Apr 2024 16:07:46 -0700 Subject: [PATCH] Fixed docstrings, added examples. --- pyspedas/utilities/download.py | 451 +++++++++++++++++------------ pyspedas/utilities/download_ftp.py | 17 +- 2 files changed, 285 insertions(+), 183 deletions(-) diff --git a/pyspedas/utilities/download.py b/pyspedas/utilities/download.py index dad39d85..e8590cfe 100644 --- a/pyspedas/utilities/download.py +++ b/pyspedas/utilities/download.py @@ -15,16 +15,49 @@ from cdflib import CDF from time import sleep -# the following is used to parse the links from an HTML index file + class LinkParser(HTMLParser): + """ + A custom HTML parser to extract links from an HTML document. + + This class is a subclass of HTMLParser from the html.parser module. It overrides the handle_starttag method to + extract 'href' attributes from 'a' tags, which represent links in an HTML document. + + Attributes + ---------- + links : list + List of links extracted from the HTML document. + + Methods + ------- + handle_starttag(tag, attrs) + Handle the start of an HTML tag. + """ + def handle_starttag(self, tag, attrs): - if tag == 'a': + """ + Handle the start of an HTML tag. + + If the tag is an 'a' tag, this method extracts the 'href' attribute (if present) and adds it to the links list. + + Parameters + ---------- + tag : str + Name of the HTML tag. + attrs : list of (str, str) tuples + List of (name, value) pairs containing the attributes of the HTML tag. + + Notes + ----- + This method is called by the HTMLParser feed method for each start tag encountered in the HTML document. 
+ """ + if tag == "a": attrs = {k: v for (k, v) in attrs} - if 'href' in attrs: - link = attrs['href'] + if "href" in attrs: + link = attrs["href"] # kludge to support http://rbspice?.ftecs.com/ - if '/' in link: - link = link.split('/')[-1] + if "/" in link: + link = link.split("/")[-1] try: self.links.append((link)) except AttributeError: @@ -36,11 +69,26 @@ def check_downloaded_file(filename): Check if a file exists and if it can be opened (for CDF and netCDF files). If the file exists but it is not CDF or netCDF, it returns True without trying to open the file. + + Parameters + ---------- + filename : str + Name of the file to check. + + Returns + ------- + bool + True if the file exists and can be opened, False otherwise. + + Notes + ----- + This function specifically checks for CDF and netCDF files. If the file is of a different type, + it simply checks for its existence without trying to open it. """ result = False fpath = Path(filename) if fpath.is_file() and len(filename) > 3: - if filename[-4:] == '.cdf': + if filename[-4:] == ".cdf": # Try to open the cdf file try: cdf_file = CDF(filename) @@ -48,7 +96,7 @@ def check_downloaded_file(filename): except: logging.info("Cannot open CDF file: " + filename) result = False - elif filename[-3:] == '.nc': + elif filename[-3:] == ".nc": # Try to open the netCDF file try: netcdf_file = Dataset(filename) @@ -64,49 +112,49 @@ def check_downloaded_file(filename): return result -def download_file(url=None, - filename=None, - headers={}, - username=None, - password=None, - verify=False, - session=None, - basic_auth=False, - nbr_tries=0): +def download_file( + url=None, + filename=None, + headers={}, + username=None, + password=None, + verify=False, + session=None, + basic_auth=False, + nbr_tries=0, +): """ - Download a file and return its local path; this function is primarily meant to be called by the download function - - Parameters: - url: str - Remote URL to download - - filename: str - Local file name - - headers: 
dict - Dictionary containing the headers to be passed to the requests get call - - username: str - Username to be used in HTTP authentication - - password: str - password to be used in HTTP authentication - - verify: bool - Flag indicating whether to verify the SSL/TLS certificate - - session: requests.Session object - Requests session object that allows you to persist things like HTTP authentication through multiple calls - - nbr_tries: int - Counts how many times we tried to download the file. Default is 0. - - Notes: - Checks if the CDF or netCDF file can be opened, and if it can't, tries to download the file for a second time. - - Returns: - String containing the local file name - + Download a file and return its local path; this function is primarily meant to be called by the download function. + + Parameters + ---------- + url : str + Remote URL to download. + filename : str + Local file name. + headers : dict + Dictionary containing the headers to be passed to the requests get call. + username : str, optional + Username to be used in HTTP authentication. + password : str, optional + Password to be used in HTTP authentication. + verify : bool, optional + Flag indicating whether to verify the SSL/TLS certificate. + session : requests.Session object, optional + Requests session object that allows you to persist things like HTTP authentication through multiple calls. + basic_auth : bool, optional + Flag to indicate that the remote server uses basic authentication instead of digest authentication. + nbr_tries : int, optional + Counts how many times we tried to download the file. Default is 0. + + Returns + ------- + str + String containing the local file name. + + Notes + ----- + Checks if the CDF or netCDF file can be opened, and if it can't, tries to download the file for a second time. 
""" headers_original = headers session_original = session @@ -120,39 +168,49 @@ def download_file(url=None, # check if the file exists, and if so, set the last modification time in the header # this allows you to avoid re-downloading files that haven't changed if os.path.exists(filename): - mod_tm = (datetime.datetime.fromtimestamp(os.path.getmtime(filename),datetime.timezone.utc)).strftime('%a, %d %b %Y %H:%M:%S GMT') - headers['If-Modified-Since'] = mod_tm + mod_tm = ( + datetime.datetime.fromtimestamp( + os.path.getmtime(filename), datetime.timezone.utc + ) + ).strftime("%a, %d %b %Y %H:%M:%S GMT") + headers["If-Modified-Since"] = mod_tm with warnings.catch_warnings(): warnings.simplefilter("ignore", category=ResourceWarning) if not basic_auth: fsrc = session.get(url, stream=True, verify=verify, headers=headers) else: - fsrc = session.get(url, stream=True, verify=verify, headers=headers, auth=(username, password)) + fsrc = session.get( + url, + stream=True, + verify=verify, + headers=headers, + auth=(username, password), + ) # need to delete the If-Modified-Since header so it's not set in the dictionary in subsequent calls - if headers.get('If-Modified-Since') is not None: - del headers['If-Modified-Since'] + if headers.get("If-Modified-Since") is not None: + del headers["If-Modified-Since"] needs_to_download_file = False if fsrc.status_code == 304: # the file hasn't changed - logging.info('File is current: ' + filename) + logging.info("File is current: " + filename) fsrc.close() elif fsrc.status_code == 404: # file not found - logging.error('Remote file not found: ' + url) + logging.error("Remote file not found: " + url) fsrc.close() return None elif fsrc.status_code == 401 or fsrc.status_code == 403: # authentication issues - logging.error('Unauthorized: ' + url) + logging.error("Unauthorized: " + url) fsrc.close() return None elif fsrc.status_code == 200: # this is the main download case needs_to_download_file = True - logging.info('Downloading ' + url + ' to 
' + filename) + logging.info("Downloading " + url + " to " + filename) else: # all other problems logging.error(fsrc.reason) @@ -162,11 +220,14 @@ def download_file(url=None, if needs_to_download_file: ftmp = NamedTemporaryFile(delete=False) - with open(ftmp.name, 'wb') as f: + with open(ftmp.name, "wb") as f: copyfileobj(fsrc.raw, f) # make sure the directory exists - if not os.path.exists(os.path.dirname(filename)) and os.path.dirname(filename) != '': + if ( + not os.path.exists(os.path.dirname(filename)) + and os.path.dirname(filename) != "" + ): os.makedirs(os.path.dirname(filename)) # if the download was successful, copy to data directory @@ -176,32 +237,34 @@ def download_file(url=None, ftmp.close() os.unlink(ftmp.name) # delete the temporary file - logging.info('Download complete: ' + filename) + logging.info("Download complete: " + filename) # At this point, we check if the file can be opened. # If it cannot be opened, we delete the file and try again. if nbr_tries == 0 and check_downloaded_file(filename) == False: nbr_tries = 1 - logging.info('There was a problem with the file: ' + filename) - logging.info('We are going to download it for a second time.') + logging.info("There was a problem with the file: " + filename) + logging.info("We are going to download it for a second time.") if os.path.exists(filename): os.unlink(filename) - download_file(url=url, - filename=filename, - headers=headers_original, - username=username, - password=password, - verify=verify, - session=session_original, - basic_auth=basic_auth, - nbr_tries=nbr_tries) + download_file( + url=url, + filename=filename, + headers=headers_original, + username=username, + password=password, + verify=verify, + session=session_original, + basic_auth=basic_auth, + nbr_tries=nbr_tries, + ) # If the file again cannot be opened, we give up. if nbr_tries > 0 and check_downloaded_file(filename) == False: nbr_tries = 2 - logging.info('Tried twice. 
There was a problem with the file: ' + filename) - logging.info('File will be removed. Try to download it again at a later time.') + logging.info("Tried twice. There was a problem with the file: " + filename) + logging.info("File will be removed. Try to download it again at a later time.") if os.path.exists(filename): os.unlink(filename) filename = None @@ -209,88 +272,86 @@ def download_file(url=None, return filename -def download(remote_path='', - remote_file='', - local_path='', - local_file='', - headers={}, - username=None, - password=None, - verify=True, - session=None, - no_download=False, - last_version=False, - basic_auth=False, - regex=False, - no_wildcards=False): +def download( + remote_path="", + remote_file="", + local_path="", + local_file="", + headers={}, + username=None, + password=None, + verify=True, + session=None, + no_download=False, + last_version=False, + basic_auth=False, + regex=False, + no_wildcards=False, +): """ Download one or more remote files and return their local paths. 
- Parameters: - remote_path: str - String consisting of a common URL base for all remote files - - remote_file: str or list of str - String or string array of URLs to remote files - - local_path: str - String consisting of a common local path for all local files - - local_file: str or list of str - String or string array of local destination file names - - headers: dict - Dictionary containing the headers to be passed to the requests get call - - username: str - Username to be used in HTTP authentication - - password: str - Password to be used in HTTP authentication - - basic_auth: bool - Flag to indicate that the remote server uses basic authentication - instead of digest authentication - - verify: bool - Flag indicating whether to verify the SSL/TLS certificate - - session: requests.Session object - Requests session object that allows you to persist things like HTTP authentication through multiple calls - - no_download: bool - Flag to not download remote files - - last_version: bool - Flag to only download the last in file in a lexically sorted - list when multiple matches are found using wildcards - - regex: bool - Flag to allow regular expressions in the file name matching, - instead of unix style matching - - no_wildcards: bool - Flag to assume no wild cards in the requested url/filename - - Returns: - String list specifying the full local path to all requested files - + Parameters + ---------- + remote_path : str + String consisting of a common URL base for all remote files. + remote_file : str or list of str + String or string array of URLs to remote files. + local_path : str + String consisting of a common local path for all local files. + local_file : str or list of str + String or string array of local destination file names. + headers : dict + Dictionary containing the headers to be passed to the requests get call. + username : str, optional + Username to be used in HTTP authentication. 
+ password : str, optional + Password to be used in HTTP authentication. + basic_auth : bool, optional + Flag to indicate that the remote server uses basic authentication instead of digest authentication. + verify : bool, optional + Flag indicating whether to verify the SSL/TLS certificate. + session : requests.Session object, optional + Requests session object that allows you to persist things like HTTP authentication through multiple calls. + no_download : bool, optional + Flag to not download remote files. + last_version : bool, optional + Flag to only download the last file in a lexically sorted list when multiple matches are found using wildcards. + regex : bool, optional + Flag to allow regular expressions in the file name matching, instead of unix style matching. + no_wildcards : bool, optional + Flag to assume no wild cards in the requested url/filename. + + Returns + ------- + list of str + String list specifying the full local path to all requested files. + + Examples + -------- + >>> from pyspedas import download + >>> remote_path = "https://spdf.gsfc.nasa.gov/pub/data/omni/omni_cdaweb/hro_5min/2012/" + >>> remote_files = ["omni_hro_5min_20121101_v01.cdf", "omni_hro_5min_20121201_v01.cdf"] + >>> local_path = "/tmp/omni/" + >>> files = download(remote_path=remote_path, remote_file=remote_files, local_path=local_path) + >>> print(files) + ['/tmp/omni/omni_hro_5min_20121101_v01.cdf', '/tmp/omni/omni_hro_5min_20121201_v01.cdf'] """ local_file_in = local_file if isinstance(remote_path, list): - logging.error('Remote path must be a string') + logging.error("Remote path must be a string") return if isinstance(local_path, list): - logging.error('Local path must be a string') + logging.error("Local path must be a string") return - if local_path == '': - local_path = str(Path('').resolve()) + if local_path == "": + local_path = str(Path("").resolve()) if username is not None and password is None: - logging.error('Username provided without password') +
logging.error("Username provided without password") return if session is None: @@ -299,12 +360,12 @@ def download(remote_path='', if username is not None: session.auth = requests.auth.HTTPDigestAuth(username, password) - if headers.get('User-Agent') is None: + if headers.get("User-Agent") is None: try: release_version = version("pyspedas") except PackageNotFoundError: release_version = "bleeding edge" - headers['User-Agent'] = 'pySPEDAS ' + release_version + headers["User-Agent"] = "pySPEDAS " + release_version out = [] index_table = {} @@ -316,47 +377,60 @@ def download(remote_path='', if not isinstance(remote_file, list): remote_file = [remote_file] - urls = [remote_path+rfile for rfile in remote_file] + urls = [remote_path + rfile for rfile in remote_file] for url in urls: resp_data = None - url_file = url[url.rfind("/")+1:] - url_base = url.replace(url_file, '') + url_file = url[url.rfind("/") + 1 :] + url_base = url.replace(url_file, "") # automatically use remote_file locally if local_file is not specified - if local_file_in == '': + if local_file_in == "": # if remote_file is the entire url then only use the filename - if remote_path == '': + if remote_path == "": local_file = url_file else: - local_file = url.replace(remote_path, '') + local_file = url.replace(remote_path, "") - if local_file == '': # remote_path was the full file name - local_file = remote_path[remote_path.rfind("/")+1:] + if local_file == "": # remote_path was the full file name + local_file = remote_path[remote_path.rfind("/") + 1 :] filename = os.path.join(local_path, local_file) - short_path = local_file[:1+local_file.rfind("/")] + short_path = local_file[: 1 + local_file.rfind("/")] if not no_download: # expand the wildcards in the url - if ('?' in url or '*' in url or regex) and (not no_download and not no_wildcards): + if ("?" 
in url or "*" in url or regex) and ( + not no_download and not no_wildcards + ): if index_table.get(url_base) is not None: links = index_table[url_base] elif url_base in bad_index_set: - logging.info('Skipping remote index: ' + url_base + ' (previous attempt failed)') + logging.info( + "Skipping remote index: " + + url_base + + " (previous attempt failed)" + ) continue else: - logging.info('Downloading remote index: ' + url_base) + logging.info("Downloading remote index: " + url_base) # we'll need to parse the HTML index file for the file list with warnings.catch_warnings(): warnings.simplefilter("ignore", category=ResourceWarning) try: if not basic_auth: - html_index = session.get(url_base, verify=verify, headers=headers) + html_index = session.get( + url_base, verify=verify, headers=headers + ) else: - html_index = session.get(url_base, verify=verify, headers=headers, auth=(username, password)) + html_index = session.get( + url_base, + verify=verify, + headers=headers, + auth=(username, password), + ) except requests.exceptions.ConnectionError: # Add this index to bad_index_set and cool down a bit bad_index_set.add(url_base) @@ -364,14 +438,14 @@ def download(remote_path='', continue if html_index.status_code == 404: - logging.error('Remote index not found: ' + url_base) + logging.error("Remote index not found: " + url_base) # Add this index to bad_index_set and cool down a bit bad_index_set.add(url_base) sleep(2) continue if html_index.status_code == 401 or html_index.status_code == 403: - logging.error('Unauthorized: ' + url_base) + logging.error("Unauthorized: " + url_base) # Add this index to bad_index_set and cool down a bit bad_index_set.add(url_base) sleep(2) @@ -396,28 +470,48 @@ def download(remote_path='', new_links = list(filter(reg_expression.match, links)) if len(new_links) == 0: - logging.info("No links matching pattern %s found at remote index %s", url_file, url_base) + logging.info( + "No links matching pattern %s found at remote index %s", + 
url_file, + url_base, + ) if last_version and len(new_links) > 1: new_links = sorted(new_links) new_links = [new_links[-1]] - if '?' in remote_path or '*' in remote_path: + if "?" in remote_path or "*" in remote_path: # the user specified a wild card in the remote_path remote_path = url_base # download the files for new_link in new_links: - resp_data = download(remote_path=remote_path, remote_file=short_path+new_link, - local_path=local_path, username=username, password=password, - verify=verify, headers=headers, session=session, basic_auth=basic_auth) + resp_data = download( + remote_path=remote_path, + remote_file=short_path + new_link, + local_path=local_path, + username=username, + password=password, + verify=verify, + headers=headers, + session=session, + basic_auth=basic_auth, + ) if resp_data is not None: for file in resp_data: out.append(file) session.close() continue - resp_data = download_file(url=url, filename=filename, username=username, password=password, verify=verify, - headers=headers, session=session, basic_auth=basic_auth) + resp_data = download_file( + url=url, + filename=filename, + username=username, + password=password, + verify=verify, + headers=headers, + session=session, + basic_auth=basic_auth, + ) if resp_data is not None: if not isinstance(resp_data, list): @@ -426,17 +520,17 @@ def download(remote_path='', out.append(file) else: # download wasn't successful, search for local files - logging.info('Searching for local files...') - + logging.info("Searching for local files...") + temp_out = [] - - if local_path == '': - local_path_to_search = str(Path('.').resolve()) + + if local_path == "": + local_path_to_search = str(Path(".").resolve()) else: local_path_to_search = local_path for dirpath, dirnames, filenames in os.walk(local_path_to_search): - local = local_file[local_file.rfind("/")+1:] + local = local_file[local_file.rfind("/") + 1 :] if not regex: matching_files = fnmatch.filter(filenames, local) else: @@ -446,19 +540,18 @@ def 
download(remote_path='', for file in matching_files: # out.append(os.path.join(dirpath, file)) temp_out.append(os.path.join(dirpath, file)) - + # check if the file exists, and if so, set the last modification time in the header if len(temp_out) == 0: - logging.info('No local files found for ' + url) + logging.info("No local files found for " + url) continue temp_out = sorted(temp_out) - + if last_version: - out.append(temp_out[-1]) # append the latest version + out.append(temp_out[-1]) # append the latest version else: for file in temp_out: out.append(file) - session.close() return out diff --git a/pyspedas/utilities/download_ftp.py b/pyspedas/utilities/download_ftp.py index 3f22ea17..576c427e 100644 --- a/pyspedas/utilities/download_ftp.py +++ b/pyspedas/utilities/download_ftp.py @@ -28,14 +28,13 @@ def download_ftp( local_path : str Local directory to save the file. local_file : str, optional - Name of the file to save locally. Default is the same as the remote_file. + Name of the file to save locally. If not provided, the name of the remote file is used. username : str, optional Username for the FTP server. Default is 'anonymous'. password : str, optional Password for the FTP server. Default is 'anonymous@'. force_download : bool, optional - Force the download even if the remote file is not newer than the local file. - Default is False. + Force the download even if the remote file is not newer than the local file. Default is False. Returns ------- @@ -47,6 +46,16 @@ def download_ftp( Exception If the remote file is not found on the FTP server. 
+ Examples + -------- + >>> from pyspedas import download_ftp + >>> ftp_site = "ftp.gfz-potsdam.de" + >>> kp_dir = "/pub/home/obs/kp-ap/wdc/yearly/" + >>> remote_file = "kp2012.wdc" + >>> local_dir = "/tmp/" + >>> files = download_ftp(ftp_site, kp_dir, remote_file, local_dir) + >>> print(files) + ['/tmp/kp2012.wdc'] """ return_files = [] @@ -83,7 +92,7 @@ def download_ftp( with open(local_file, "wb") as local_file_r: ftp.retrbinary("RETR " + remote_file, local_file_r.write) logging.warning( - f"File '{local_file}' downloaded successfully to '{local_path}'" + f"File '{remote_file}' downloaded successfully to '{local_file}'" ) else: logging.warning(