From 3afad6f4a961458db3f1880a04fc522988c44984 Mon Sep 17 00:00:00 2001 From: husby036 Date: Wed, 2 Jun 2021 09:55:30 -0500 Subject: [PATCH 1/2] pgc_ortho.py handle src CSV argument list exported from ArcGIS --- pgc_ortho.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pgc_ortho.py b/pgc_ortho.py index 532b3dd..e2a5b52 100755 --- a/pgc_ortho.py +++ b/pgc_ortho.py @@ -174,15 +174,28 @@ def main(): image_list1 = utils.find_images(src, True, ortho_functions.exts) elif srctype == 'csvfile': # Load CSV data - csv_arg_data = np.char.strip(np.loadtxt(src, dtype=str, delimiter=','), '\'"') - csv_header_argname_list = [argname.lstrip('-').replace('-', '_').lower() for argname in csv_arg_data[0, :]] - csv_arg_data = csv_arg_data[1:, :] # remove header row + csv_arg_data = np.char.strip(np.loadtxt(src, dtype=str, delimiter=',', encoding="utf-8-sig"), '\'"') + csv_header = csv_arg_data[0, :] + + # Adjust CSV header argument names + # Support ArcGIS export of typical fp.py output + if 'OID_' in csv_header: + csv_arg_data = np.delete(csv_arg_data, np.where(csv_header == 'OID_')[0], axis=1) + csv_header = csv_arg_data[0, :] + if 'O_FILEPATH' in csv_header: + csv_header[csv_header == 'O_FILEPATH'] = 'src' + + # Convert CSV header argument names to argparse namespace variable format + csv_header_argname_list = [argname.lstrip('-').replace('-', '_').lower() for argname in csv_header] + + # Remove header row + csv_arg_data = np.delete(csv_arg_data, 0, axis=0) # Verify CSV arguments and values if len(csv_header_argname_list) >= 1 and 'src' in csv_header_argname_list: pass else: - parser.error("'src' should be the header of the first colum of source CSV argument list file") + parser.error("'src' should be the header of the first column of source CSV argument list file") if 'epsg' in csv_header_argname_list: csv_epsg_array = csv_arg_data[:, csv_header_argname_list.index('epsg')].astype(int) invalid_epsg_code = False From 
bd129e22baedc2a5b964612874d33cdf5377b292 Mon Sep 17 00:00:00 2001 From: husby036 Date: Wed, 2 Jun 2021 10:36:40 -0500 Subject: [PATCH 2/2] pgc_ortho.py subset VRT tile mosaic DEM argument thru src CSV argument list --- lib/utils.py | 203 +++++++++++++++++++++++++++++++++++++++++++++++++++ pgc_ortho.py | 45 +++++++----- 2 files changed, 228 insertions(+), 20 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 7fdcae2..f432ec5 100755 --- a/lib/utils.py +++ b/lib/utils.py @@ -723,3 +723,206 @@ def yield_task_args(task_list, script_args, exec(exec_statement) yield task_args + + +def subset_vrt_dem(csv_arg_data, csv_header_argname_list, script_args): + """ + If source CSV argument list has tasks listed multiple times but with + different argument DEM values, and the script DEM argument is a large + VRT file composed of tiled DEMs with locations matching those specified + at the task-level argument DEMs, create new VRT files from subsets of + the larger script DEM VRT and have the tasks use those instead. + This can significantly speed up GDAL operations over using a single, + large (~20,000-element) VRT DEM. + + Subset VRT files are created in script argument 'scratch' directory. + + Parameters + ---------- + csv_arg_data : NumPy array, 2D + Source CSV argument list in NumPy array format, with header row removed. + csv_header_argname_list : list of strings + Header row of source CSV argument list, containing script argument names + in ArgumentParser namespace variable format. + script_args : ArgumentParser argument namespace object + ArgumentParser argument namespace object for the main script. + + Returns + ------- + csv_arg_data_trimmed : NumPy array, 2D + Source CSV argument list in NumPy array format, with header row and + duplicate task rows replaced with single tasks having updated DEM + argument value to use new subset VRT files. 
+ """ + csv_col_idx_src = csv_header_argname_list.index('src') + csv_col_idx_dem = csv_header_argname_list.index('dem') + + script_arg_vrt_dem = script_args.dem + if not script_arg_vrt_dem.endswith('.vrt'): + raise InvalidArgumentError( + "Script DEM argument does not end with expected .vrt suffix: {}".format(script_arg_vrt_dem) + ) + + # Get the longest common prefix of all component DEMs + # that make up the script argument VRT file. + main_vrt_component_dem_prefix = [] + tree = ET.parse(script_arg_vrt_dem) + root = tree.getroot() + for sourceFilename in root.iter('SourceFilename'): + dem_filename = sourceFilename.text + main_vrt_component_dem_prefix.append(dem_filename) + main_vrt_component_dem_prefix = [os.path.commonprefix(main_vrt_component_dem_prefix)] + if len(main_vrt_component_dem_prefix) == 0: + raise InvalidArgumentError( + "Cannot find 'SourceFilename' elements in script DEM argument VRT file: {}".format(script_arg_vrt_dem) + ) + else: + main_vrt_component_dem_prefix = main_vrt_component_dem_prefix[0] + + # Parse CSV task argument values. + # Each src can be listed multiple times, once for each DEM + # it intersects with. + vrt_src_set = set() + vrt_dem_set = set() + nonvrt_src_set = set() + csv_src_keeprownum_dict = dict() + vrt_src_dem_dict = dict() + for rownum, task in enumerate(csv_arg_data): + task_src = task[csv_col_idx_src] + task_dem = task[csv_col_idx_dem] + + # Quick check if task DEM could be a component DEM of the main VRT, + # and would be applicable for subsetting. 
+ if not task_dem.startswith(main_vrt_component_dem_prefix): + nonvrt_src_set.add(task_src) + csv_src_keeprownum_dict[task_src] = rownum + continue + + vrt_src_set.add(task_src) + vrt_dem_set.add(task_dem) + if task_src not in vrt_src_dem_dict: + vrt_src_dem_dict[task_src] = [] + vrt_src_dem_dict[task_src].append(task_dem) + if task_src not in csv_src_keeprownum_dict: + csv_src_keeprownum_dict[task_src] = rownum + else: + task_first_src = csv_arg_data[csv_src_keeprownum_dict[task_src]] + task_first_src[csv_col_idx_dem] = task_dem + if task_first_src.tolist() != task.tolist(): + raise InvalidArgumentError( + "Source CSV argument list rows {} and {} differ more than allowed " + "for VRT DEM subsetting".format(csv_src_keeprownum_dict[task_src], rownum) + ) + + invalid_src_set = set.intersection(vrt_src_set, nonvrt_src_set) + if len(invalid_src_set) > 0: + print("Source CSV argument list invalid 'src':\n{}".format('\n'.join(list(invalid_src_set)))) + raise InvalidArgumentError( + "Source CSV argument list has duplicate 'src' tasks listed with DEMs " + "not allowed for VRT DEM subsetting" + ) + + if len(vrt_src_set) == 0: + return csv_arg_data + + # Trim CSV data array to first occurrence of unique 'src' arguments, + # and tasks with DEM argument that is not part of a VRT subset. + keep_rows_idx = sorted(list(csv_src_keeprownum_dict.values())) + csv_arg_data_trimmed = csv_arg_data[np.asarray(keep_rows_idx)] + vrt_dem_list = sorted(list(vrt_dem_set)) + + subset_vrts_all = set() + + # It's likely that the same combination of DEMs is required + # by multiple src images. + # For each task src: + # - Derive a subset VRT filename for the particular combination of DEMs. + # - Change the DEM argument value for the task to the VRT filename. + # - For each DEM in the subset, add the VRT filename to a set of + # VRTs that need information for that DEM. 
+ process_time = datetime.now().strftime("%Y%m%d%H%M%S") + process_pid = os.getpid() + dem_vrts_dict = dict() + for task in csv_arg_data_trimmed: + task_src = task[csv_col_idx_src] + if task_src in nonvrt_src_set: + continue + task_subset_dems = vrt_src_dem_dict[task_src] + demgroupid = '-'.join([str(vrt_dem_list.index(dem)) for dem in sorted(task_subset_dems)]) + task_subset_vrt = os.path.join( + script_args.scratch, + 'Or_dem_{}_{}_{}.vrt'.format( + process_time, process_pid, demgroupid + ) + ) + subset_vrts_all.add(task_subset_vrt) + task[csv_col_idx_dem] = task_subset_vrt # modifies mutable csv_arg_data + for dem in task_subset_dems: + if dem not in dem_vrts_dict: + dem_vrts_dict[dem] = set() + dem_vrts_dict[dem].add(task_subset_vrt) + + # Get all information subset VRT files need to contain beyond + # the 'SimpleSource' information for each DEM in the subset. + vrt_contents_prefix = '' + with open(script_arg_vrt_dem, 'r') as main_vrt_fp: + for line in main_vrt_fp.readlines(): + if '<SimpleSource>' in line: + vrt_contents_prefix += " " + break + vrt_contents_prefix += line + vrt_contents_suffix = "</VRTRasterBand>\n</VRTDataset>\n" + + print("Writing subsets of VRT DEM in directory: {}".format(script_args.scratch)) + subset_vrts_started = set() + + # Loop through all 'SimpleSource' items in the main script VRT DEM. + # If the 'SourceFilename' of the SimpleSource matches a DEM filename + # needed by any of the subset VRT files, append its SimpleSource info + # to all subset VRT files that DEM is part of. + # Assume main VRT was built from a sorted text file list of component + # DEM filenames, so that we only need to scan through the main VRT once. 
+ tree = ET.parse(script_arg_vrt_dem) + root = tree.getroot() + current_vrt_dem_idx = 0 + for simpleSource in root.iter('SimpleSource'): + sourceFilename = simpleSource.find('SourceFilename') + if sourceFilename is None: + raise InvalidArgumentError( + "Source CSV argument list 'SimpleSource' missing 'SourceFilename'" + ) + dem_filename = sourceFilename.text + if dem_filename == vrt_dem_list[current_vrt_dem_idx]: + for vrt in dem_vrts_dict[dem_filename]: + if vrt not in subset_vrts_started: + subset_vrts_started.add(vrt) + with open(vrt, 'a') as subset_vrt_fp: + subset_vrt_fp.write(vrt_contents_prefix) + with open(vrt, 'a') as subset_vrt_fp: + subset_vrt_fp.write(ET.tostring(simpleSource, encoding='unicode', method='xml')) + current_vrt_dem_idx += 1 + if current_vrt_dem_idx == len(vrt_dem_list): + break + + if current_vrt_dem_idx != len(vrt_dem_list): + raise InvalidArgumentError( + "Could not find CSV DEM filename '{}' in scan of main VRT DEM (script argument) " + "'SimpleSource/SourceFilename' elements. Make sure CSV DEM filenames match exactly " + "the elements in the main VRT DEM, and that the VRT DEM was built from a SORTED " + "text file list of component DEM filenames.".format(vrt_dem_list[current_vrt_dem_idx]) + ) + + subset_vrts_not_started = subset_vrts_all - subset_vrts_started + if len(subset_vrts_not_started) > 0: + raise InvalidArgumentError( + "For source CSV argument list, {} of {} subset VRT DEMs were not written. 
" + "Make sure main VRT DEM (script argument) was built from a SORTED text file list " + "of component DEM filenames.".format(len(subset_vrts_not_started), len(subset_vrts_all)) + ) + + # Finish writing all subset VRT files + for vrt in subset_vrts_started: + with open(vrt, 'a') as subset_vrt_fp: + subset_vrt_fp.write(vrt_contents_suffix) + + return csv_arg_data_trimmed diff --git a/pgc_ortho.py b/pgc_ortho.py index e2a5b52..f9febf1 100755 --- a/pgc_ortho.py +++ b/pgc_ortho.py @@ -127,26 +127,8 @@ def main(): parser.error("--dem and --ortho_height options are mutually exclusive. Please choose only one.") #### Test if DEM exists - if args.dem: - if not os.path.isfile(args.dem): - parser.error("DEM does not exist: {}".format(args.dem)) - if args.l is None: - if args.dem.endswith('.vrt'): - total_dem_filesz_gb = 0.0 - tree = ET.parse(args.dem) - root = tree.getroot() - for sourceFilename in root.iter('SourceFilename'): - dem_filename = sourceFilename.text - if not os.path.isfile(dem_filename): - parser.error("VRT DEM component raster does not exist: {}".format(dem_filename)) - dem_filesz_gb = os.path.getsize(dem_filename) / 1024.0 / 1024 / 1024 - total_dem_filesz_gb += dem_filesz_gb - dem_filesz_gb = total_dem_filesz_gb - else: - dem_filesz_gb = os.path.getsize(args.dem) / 1024.0 / 1024 / 1024 - - pbs_req_mem_gb = int(max(math.ceil(dem_filesz_gb) + 2, 4)) - args.l = 'mem={}gb'.format(pbs_req_mem_gb) + if args.dem is not None and not os.path.isfile(args.dem): + parser.error("DEM does not exist: {}".format(args.dem)) #### Set up console logging handler lso = logging.StreamHandler() @@ -210,6 +192,10 @@ def main(): elif args.epsg == 0: parser.error("A valid EPSG argument must be specified") + # Create subsets of VRT DEM and trim CSV data if applicable + if args.dem is not None and args.dem.endswith('.vrt') and 'dem' in csv_header_argname_list: + csv_arg_data = utils.subset_vrt_dem(csv_arg_data, csv_header_argname_list, args) + # Extract src image paths and send to 
utils.find_images csv_src_array = csv_arg_data[:, csv_header_argname_list.index('src')] image_list1 = utils.find_images(csv_src_array.tolist(), True, ortho_functions.exts) @@ -222,6 +208,25 @@ def main(): else: image_list1 = [src] + ## Automatically set PBS job requested memory if applicable + if ( args.dem is not None and args.pbs and args.l is None + and (srctype != 'csvfile' or 'dem' not in csv_header_argname_list)): + if args.dem.endswith('.vrt'): + total_dem_filesz_gb = 0.0 + tree = ET.parse(args.dem) + root = tree.getroot() + for sourceFilename in root.iter('SourceFilename'): + dem_filename = sourceFilename.text + if not os.path.isfile(dem_filename): + parser.error("VRT DEM component raster does not exist: {}".format(dem_filename)) + dem_filesz_gb = os.path.getsize(dem_filename) / 1024.0 / 1024 / 1024 + total_dem_filesz_gb += dem_filesz_gb + dem_filesz_gb = total_dem_filesz_gb + else: + dem_filesz_gb = os.path.getsize(args.dem) / 1024.0 / 1024 / 1024 + pbs_req_mem_gb = int(min(50, max(4, math.ceil(dem_filesz_gb) + 2))) + args.l = 'mem={}gb'.format(pbs_req_mem_gb) + ## Group Ikonos image_list2 = [] for i, srcfp in enumerate(image_list1):