PolarGeospatialCenter · ehusby · Jan 4, 2022 · Jun 2, 2021 · Jun 2, 2021 · Jul 16, 2021
diff --git a/lib/utils.py b/lib/utils.py
@@ -723,3 +723,206 @@ def yield_task_args(task_list, script_args,
             exec(exec_statement)
 
         yield task_args
+
+
+def subset_vrt_dem(csv_arg_data, csv_header_argname_list, script_args):
+    """
+    If source CSV argument list has tasks listed multiple times but with
+    different argument DEM values, and the script DEM argument is a large
+    VRT file composed of tiled DEMs with locations matching those specified
+    at the task-level argument DEMs, create new VRT files from subsets of
+    the larger script DEM VRT and have the tasks use those instead.
+    This can significantly speed up GDAL operations over using a single,
+    large (~20,000-element) VRT DEM.
+
+    Subset VRT files are created in script argument 'scratch' directory.
+
+    Parameters
+    ----------
+    csv_arg_data : NumPy array, 2D
+        Source CSV argument list in NumPy array format, with header row removed.
+    csv_header_argname_list : list of strings
+        Header row of source CSV argument list, containing script argument names
+        in ArgumentParser namescape variable format.
+    script_args : ArgumentParser argument namespace object
+        ArgumentParser argument namespace object for the main script.
+
+    Returns
+    -------
+    csv_arg_data_trimmed : NumPy array, 2D
+        Source CSV argument list in NumPy array format, with header row and
+        duplicate task rows replaced with single tasks having updated DEM
+        argument value to use new subset VRT files.
+    """
+    csv_col_idx_src = csv_header_argname_list.index('src')
+    csv_col_idx_dem = csv_header_argname_list.index('dem')
+
+    script_arg_vrt_dem = script_args.dem
+    if not script_arg_vrt_dem.endswith('.vrt'):
+        raise InvalidArgumentError(
+            "Script DEM argument does not end with expected .vrt suffix: {}".format(script_arg_vrt_dem)
+        )
+
+    # Get the longest common prefix of all component DEMs
+    # that make up the script argument VRT file.
+    main_vrt_component_dem_prefix = []
+    tree = ET.parse(script_arg_vrt_dem)
+    root = tree.getroot()
+    for sourceFilename in root.iter('SourceFilename'):
+        dem_filename = sourceFilename.text
+        main_vrt_component_dem_prefix.append(dem_filename)
+        main_vrt_component_dem_prefix = [os.path.commonprefix(main_vrt_component_dem_prefix)]
+    if len(main_vrt_component_dem_prefix) == 0:
+        raise InvalidArgumentError(
+            "Cannot find 'SourceFilename' elements in script DEM argument VRT file: {}".format(script_arg_vrt_dem)
+        )
+    else:
+        main_vrt_component_dem_prefix = main_vrt_component_dem_prefix[0]
+
+    # Parse CSV task argument values.
+    # Each src can be listed multiple times, once for each DEM
+    # it intersects with.
+    vrt_src_set = set()
+    vrt_dem_set = set()
+    nonvrt_src_set = set()
+    csv_src_keeprownum_dict = dict()
+    vrt_src_dem_dict = dict()
+    for rownum, task in enumerate(csv_arg_data):
+        task_src = task[csv_col_idx_src]
+        task_dem = task[csv_col_idx_dem]
+
+        # Quick check if task DEM could be a component DEM of the main VRT,
+        # and would be applicable for subsetting.
+        if not task_dem.startswith(main_vrt_component_dem_prefix):
+            nonvrt_src_set.add(task_src)
+            csv_src_keeprownum_dict[task_src] = rownum
+            continue
+
+        vrt_src_set.add(task_src)
+        vrt_dem_set.add(task_dem)
+        if task_src not in vrt_src_dem_dict:
+            vrt_src_dem_dict[task_src] = []
+        vrt_src_dem_dict[task_src].append(task_dem)
+        if task_src not in csv_src_keeprownum_dict:
+            csv_src_keeprownum_dict[task_src] = rownum
+        else:
+            task_first_src = csv_arg_data[csv_src_keeprownum_dict[task_src]]
+            task_first_src[csv_col_idx_dem] = task_dem
+            if task_first_src.tolist() != task.tolist():
+                raise InvalidArgumentError(
+                    "Source CSV argument list rows {} and {} differ more than allowed "
+                    "for VRT DEM subsetting".format(csv_src_keeprownum_dict[task_src], rownum)
+                )
+
+    invalid_src_set = set.intersection(vrt_src_set, nonvrt_src_set)
+    if len(invalid_src_set) > 0:
+        print("Source CSV argument list invalid 'src':\n{}".format('\n'.join(list(invalid_src_set))))
+        raise InvalidArgumentError(
+            "Source CSV argument list has duplicate 'src' tasks listed with DEMs "
+            "not allowed for VRT DEM subsetting"
+        )
+
+    if len(vrt_src_set) == 0:
+        return csv_arg_data
+
+    # Trim CSV data array to first occurrence of unique 'src' arguments,
+    # and tasks with DEM argument that is not part of a VRT subset.
+    keep_rows_idx = sorted(list(csv_src_keeprownum_dict.values()))
+    csv_arg_data_trimmed = csv_arg_data[np.asarray(keep_rows_idx)]
+    vrt_dem_list = sorted(list(vrt_dem_set))
+
+    subset_vrts_all = set()
+
+    # It's likely that the same combination of DEMs is required
+    # by multiple src images.
+    # For each task src:
+    #  - Derive a subset VRT filename for the particular combination of DEMs.
+    #  - Change the DEM argument value for the task to the VRT filename.
+    #  - For each DEM in the subset, add the VRT filename to a set of
+    #    VRTs that need information for that DEM.
+    process_time = datetime.now().strftime("%Y%m%d%H%M%S")
+    process_pid = os.getpid()
+    dem_vrts_dict = dict()
+    for task in csv_arg_data_trimmed:
+        task_src = task[csv_col_idx_src]
+        if task_src in nonvrt_src_set:
+            continue
+        task_subset_dems = vrt_src_dem_dict[task_src]
+        demgroupid = '-'.join([str(vrt_dem_list.index(dem)) for dem in sorted(task_subset_dems)])
+        task_subset_vrt = os.path.join(
+            script_args.scratch,
+            'Or_dem_{}_{}_{}.vrt'.format(
+                process_time, process_pid, demgroupid
+            )
+        )
+        subset_vrts_all.add(task_subset_vrt)
+        task[csv_col_idx_dem] = task_subset_vrt  # modifies mutable csv_arg_data
+        for dem in task_subset_dems:
+            if dem not in dem_vrts_dict:
+                dem_vrts_dict[dem] = set()
+            dem_vrts_dict[dem].add(task_subset_vrt)
+
+    # Get all information subset VRT files need to contain beyond
+    # the 'SimpleSource' information for each DEM in the subset.
+    vrt_contents_prefix = ''
+    with open(script_arg_vrt_dem, 'r') as main_vrt_fp:
+        for line in main_vrt_fp.readlines():
+            if '<SimpleSource>' in line:
+                vrt_contents_prefix += "    "
+                break
+            vrt_contents_prefix += line
+    vrt_contents_suffix = "</VRTRasterBand>\n</VRTDataset>\n"
+
+    print("Writing subsets of VRT DEM in directory: {}".format(script_args.scratch))
+    subset_vrts_started = set()
+
+    # Loop through all 'SimpleSource' items in the main script VRT DEM.
+    # If the 'SourceFilename' of the SimpleSource matches a DEM filename
+    # needed by any of the subset VRT files, append its SimpleSource info
+    # to all subset VRT files that DEM is part of.
+    # Assume main VRT was built from a sorted text file list of component
+    # DEM filenames, so that we only need to scan through the main VRT once.
+    tree = ET.parse(script_arg_vrt_dem)
+    root = tree.getroot()
+    current_vrt_dem_idx = 0
+    for simpleSource in root.iter('SimpleSource'):
+        sourceFilename = simpleSource.find('SourceFilename')
+        if sourceFilename is None:
+            raise InvalidArgumentError(
+                "Source CSV argument list 'SimpleSource' missing 'SourceFilename'"
+            )
+        dem_filename = sourceFilename.text
+        if dem_filename == vrt_dem_list[current_vrt_dem_idx]:
+            for vrt in dem_vrts_dict[dem_filename]:
+                if vrt not in subset_vrts_started:
+                    subset_vrts_started.add(vrt)
+                    with open(vrt, 'a') as subset_vrt_fp:
+                        subset_vrt_fp.write(vrt_contents_prefix)
+                with open(vrt, 'a') as subset_vrt_fp:
+                    subset_vrt_fp.write(ET.tostring(simpleSource, encoding='unicode', method='xml'))
+            current_vrt_dem_idx += 1
+            if current_vrt_dem_idx == len(vrt_dem_list):
+                break
+
+    if current_vrt_dem_idx != len(vrt_dem_list):
+        raise InvalidArgumentError(
+            "Could not find CSV DEM filename '{}' in scan of main VRT DEM (script argument) "
+            "'SimpleSource/SourceFilename' elements. Make sure CSV DEM filenames match exactly "
+            "the elements in the main VRT DEM, and that the VRT DEM was built from a SORTED "
+            "text file list of component DEM filenames.".format(vrt_dem_list[current_vrt_dem_idx])
+        )
+
+    subset_vrts_not_started = subset_vrts_all - subset_vrts_started
+    if len(subset_vrts_not_started) > 0:
+        raise InvalidArgumentError(
+            "For source CSV argument list, {} of {} subset VRT DEMs were not written. "
+            "Make sure main VRT DEM (script argument) was built from a SORTED text file list "
+            "of component DEM filenames.".format(len(subset_vrts_not_started), len(subset_vrts_all))
+        )
+
+    # Finish writing all subset VRT files
+    for vrt in subset_vrts_started:
+        with open(vrt, 'a') as subset_vrt_fp:
+            subset_vrt_fp.write(vrt_contents_suffix)
+
+    return csv_arg_data_trimmed
diff --git a/pgc_ortho.py b/pgc_ortho.py
@@ -136,26 +136,8 @@ def main():
         parser.error("--dem and --ortho_height options are mutually exclusive.  Please choose only one.")
 
     #### Test if DEM exists
-    if args.dem:
-        if not os.path.isfile(args.dem):
-            parser.error("DEM does not exist: {}".format(args.dem))
-        if args.l is None:
-            if args.dem.endswith('.vrt'):
-                total_dem_filesz_gb = 0.0
-                tree = ET.parse(args.dem)
-                root = tree.getroot()
-                for sourceFilename in root.iter('SourceFilename'):
-                    dem_filename = sourceFilename.text
-                    if not os.path.isfile(dem_filename):
-                        parser.error("VRT DEM component raster does not exist: {}".format(dem_filename))
-                    dem_filesz_gb = os.path.getsize(dem_filename) / 1024.0 / 1024 / 1024
-                    total_dem_filesz_gb += dem_filesz_gb
-                dem_filesz_gb = total_dem_filesz_gb
-            else:
-                dem_filesz_gb = os.path.getsize(args.dem) / 1024.0 / 1024 / 1024
-
-            pbs_req_mem_gb = int(max(math.ceil(dem_filesz_gb) + 2, 4))
-            args.l = 'mem={}gb'.format(pbs_req_mem_gb)
+    if args.dem is not None and not os.path.isfile(args.dem):
+        parser.error("DEM does not exist: {}".format(args.dem))
 
     #### Set up console logging handler
     lso = logging.StreamHandler()
@@ -183,15 +165,28 @@ def main():
         image_list1 = utils.find_images(src, True, ortho_functions.exts)
     elif srctype == 'csvfile':
         # Load CSV data
-        csv_arg_data = np.char.strip(np.loadtxt(src, dtype=str, delimiter=','), '\'"')
-        csv_header_argname_list = [argname.lstrip('-').replace('-', '_').lower() for argname in csv_arg_data[0, :]]
-        csv_arg_data = csv_arg_data[1:, :]  # remove header row
+        csv_arg_data = np.char.strip(np.loadtxt(src, dtype=str, delimiter=',', encoding="utf-8-sig"), '\'"')
+        csv_header = csv_arg_data[0, :]
+
+        # Adjust CSV header argument names
+        # Support ArcGIS export of typical fp.py output
+        if 'OID_' in csv_header:
+            csv_arg_data = np.delete(csv_arg_data, np.where(csv_header == 'OID_')[0], axis=1)
+            csv_header = csv_arg_data[0, :]
+        if 'O_FILEPATH' in csv_header:
+            csv_header[csv_header == 'O_FILEPATH'] = 'src'
+
+        # Convert CSV header argument names to argparse namespace variable format
+        csv_header_argname_list = [argname.lstrip('-').replace('-', '_').lower() for argname in csv_header]
+
+        # Remove header row
+        csv_arg_data = np.delete(csv_arg_data, 0, axis=0)
 
         # Verify CSV arguments and values
         if len(csv_header_argname_list) >= 1 and 'src' in csv_header_argname_list:
             pass
         else:
-            parser.error("'src' should be the header of the first colum of source CSV argument list file")
+            parser.error("'src' should be the header of the first column of source CSV argument list file")
         if 'epsg' in csv_header_argname_list:
             csv_epsg_array = csv_arg_data[:, csv_header_argname_list.index('epsg')].astype(int)
             invalid_epsg_code = False
@@ -206,6 +201,10 @@ def main():
         elif args.epsg is None:
             parser.error("A valid EPSG argument must be specified")
 
+        # Create subsets of VRT DEM and trim CSV data if applicable
+        if args.dem is not None and args.dem.endswith('.vrt') and 'dem' in csv_header_argname_list:
+            csv_arg_data = utils.subset_vrt_dem(csv_arg_data, csv_header_argname_list, args)
+
         # Extract src image paths and send to utils.find_images
         csv_src_array = csv_arg_data[:, csv_header_argname_list.index('src')]
         image_list1 = utils.find_images(csv_src_array.tolist(), True, ortho_functions.exts)
@@ -218,6 +217,25 @@ def main():
     else:
         image_list1 = [src]
 
+    ## Automatically set PBS job requested memory if applicable
+    if (    args.dem is not None and args.pbs and args.l is None
+        and (srctype != 'csvfile' or 'dem' not in csv_header_argname_list)):
+        if args.dem.endswith('.vrt'):
+            total_dem_filesz_gb = 0.0
+            tree = ET.parse(args.dem)
+            root = tree.getroot()
+            for sourceFilename in root.iter('SourceFilename'):
+                dem_filename = sourceFilename.text
+                if not os.path.isfile(dem_filename):
+                    parser.error("VRT DEM component raster does not exist: {}".format(dem_filename))
+                dem_filesz_gb = os.path.getsize(dem_filename) / 1024.0 / 1024 / 1024
+                total_dem_filesz_gb += dem_filesz_gb
+            dem_filesz_gb = total_dem_filesz_gb
+        else:
+            dem_filesz_gb = os.path.getsize(args.dem) / 1024.0 / 1024 / 1024
+        pbs_req_mem_gb = int(min(50, max(4, math.ceil(dem_filesz_gb) + 2)))
+        args.l = 'mem={}gb'.format(pbs_req_mem_gb)
+
     ## Group Ikonos
     image_list2 = []
     for i, srcfp in enumerate(image_list1):