From 339001de48b2d693319001a8d9c6b1a1cfd4b26c Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 3 Mar 2021 15:38:08 +0000 Subject: [PATCH 01/66] converting catfim pipeline to open source --- lib/utils/shared_variables.py | 2 +- tests/inundation.py | 1 + tools/generate_categorical_fim.py | 324 ++++++++++++++++++++++-------- 3 files changed, 241 insertions(+), 86 deletions(-) diff --git a/lib/utils/shared_variables.py b/lib/utils/shared_variables.py index 40a8feacb..244a12d2b 100644 --- a/lib/utils/shared_variables.py +++ b/lib/utils/shared_variables.py @@ -3,7 +3,7 @@ # Projections. #PREP_PROJECTION = "+proj=aea +datum=NAD83 +x_0=0.0 +y_0=0.0 +lon_0=96dW +lat_0=23dN +lat_1=29d30'N +lat_2=45d30'N +towgs84=-0.9956000824677655,1.901299877314078,0.5215002840524426,0.02591500053005733,0.009425998542707753,0.01159900118427752,-0.00062000005129903 +no_defs +units=m" PREP_PROJECTION = 'PROJCS["USA_Contiguous_Albers_Equal_Area_Conic_USGS_version",GEOGCS["NAD83",DATUM["North_American_Datum_1983",SPHEROID["GRS 1980",6378137,298.2572221010042,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6269"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433],AUTHORITY["EPSG","4269"]],PROJECTION["Albers_Conic_Equal_Area"],PARAMETER["standard_parallel_1",29.5],PARAMETER["standard_parallel_2",45.5],PARAMETER["latitude_of_center",23],PARAMETER["longitude_of_center",-96],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]]]' - +VIZ_PROJECTION ='PROJCS["WGS_1984_Web_Mercator_Auxiliary_Sphere",GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Mercator_Auxiliary_Sphere"],PARAMETER["False_Easting",0.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",0.0],PARAMETER["Standard_Parallel_1",0.0],PARAMETER["Auxiliary_Sphere_Type",0.0],UNIT["Meter",1.0]]' # -- Data URLs-- # NHD_URL_PARENT = r'https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/NHDPlusHR/Beta/GDB/' NWM_HYDROFABRIC_URL = r'http://www.nohrsc.noaa.gov/pub/staff/keicher/NWM_live/web/data_tools/NWM_channel_hydrofabric.tar.gz' # Temporary diff --git a/tests/inundation.py b/tests/inundation.py index b4db4fa49..e7c600510 100755 --- a/tests/inundation.py +++ b/tests/inundation.py @@ -11,6 +11,7 @@ from shapely.geometry import shape from rasterio.mask import mask from rasterio.io import DatasetReader,DatasetWriter +from rasterio.features import shapes from collections import OrderedDict import argparse from warnings import warn diff --git a/tools/generate_categorical_fim.py b/tools/generate_categorical_fim.py index 9423f6c81..e976fb349 100644 --- a/tools/generate_categorical_fim.py +++ b/tools/generate_categorical_fim.py @@ -1,155 +1,309 @@ +#!/usr/bin/env python3 + +import sys +sys.path.insert(1, 'foss_fim/tests') +sys.path.insert(1, 'foss_fim/lib') import os from multiprocessing import Pool import argparse import traceback -import sys - -sys.path.insert(1, 'foss_fim/tests') +import rasterio +import geopandas as gpd +import pandas as pd +import shutil +from rasterio.features import shapes +from shapely.geometry.polygon import Polygon +from shapely.geometry.multipolygon import MultiPolygon +from utils.shared_variables import PREP_PROJECTION,VIZ_PROJECTION +from utils.shared_functions import getDriver from inundation import inundate INPUTS_DIR = r'/data/inputs' +magnitude_list = ['action', 'minor', 'moderate','major'] -# Define necessary variables for inundation(). 
+# map path to points with attributes +all_mapped_ahps_conus_hipr = '/data/inputs/ahp_sites/all_mapped_ahps.csv' + +# define necessary variables for inundation() hucs, hucs_layerName = os.path.join(INPUTS_DIR, 'wbd', 'WBD_National.gpkg'), 'WBDHU8' mask_type, catchment_poly = 'huc', '' - -def generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, job_number, gpkg, extif, depthtif): - - # Create output directory and log directory. +def generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, number_of_jobs, depthtif): + + # create output directory and log directory if not os.path.exists(output_cat_fim_dir): os.mkdir(output_cat_fim_dir) + + # create error log log_dir = os.path.join(output_cat_fim_dir, 'logs') if not os.path.exists(log_dir): os.mkdir(log_dir) - + no_data_list = [] procs_list = [] - - # Loop through huc directories in the source_flow directory. + log_file = os.path.join(log_dir, 'errors.log') + + source_flow_dir_list = os.listdir(source_flow_dir) - for huc in source_flow_dir_list: + output_flow_dir_list = os.listdir(fim_run_dir) + + # log missing hucs + missing_hucs = list(set(source_flow_dir_list) - set(output_flow_dir_list)) + missing_hucs = [huc for huc in missing_hucs if "." not in huc] + if len(missing_hucs) > 0: + f = open(log_file, 'a+') + f.write(f"Missing hucs from output directory: {', '.join(missing_hucs)}\n") + f.close() + + # loop through matching huc directories in the source_flow directory + matching_hucs = list(set(output_flow_dir_list) & set(source_flow_dir_list)) + for huc in matching_hucs: + if "." not in huc: - - # Get list of AHPS site directories. + + # get list of AHPS site directories ahps_site_dir = os.path.join(source_flow_dir, huc) ahps_site_dir_list = os.listdir(ahps_site_dir) - - # Map paths to HAND files needed for inundation(). + + # map paths to HAND files needed for inundation() fim_run_huc_dir = os.path.join(fim_run_dir, huc) rem = os.path.join(fim_run_huc_dir, 'rem_zeroed_masked.tif') catchments = os.path.join(fim_run_huc_dir, 'gw_catchments_reaches_filtered_addedAttributes.tif') hydroTable = os.path.join(fim_run_huc_dir, 'hydroTable.csv') - + exit_flag = False # Default to False. - - # Check if necessary data exist; set exit_flag to True if they don't exist. + + # check if necessary data exist; set exit_flag to True if they don't exist for f in [rem, catchments, hydroTable]: if not os.path.exists(f): - print(f) no_data_list.append(f) exit_flag = True - - # Log "Missing data" if missing TODO improve this. + + # log missing data if exit_flag == True: - f = open(os.path.join(log_dir, huc + '.txt'), 'w') - f.write("Missing data") - continue - - # Map path to huc directory inside out output_cat_fim_dir. + f = open(log_file, 'a+') + f.write(f"Missing data for: {fim_run_huc_dir}\n") + f.close() + + # map path to huc directory inside out output_cat_fim_dir cat_fim_huc_dir = os.path.join(output_cat_fim_dir, huc) if not os.path.exists(cat_fim_huc_dir): os.mkdir(cat_fim_huc_dir) - - # Loop through AHPS sites. + + # loop through AHPS sites for ahps_site in ahps_site_dir_list: - # Map parent directory for AHPS source data dir and list AHPS thresholds (act, min, mod, maj). + # map parent directory for AHPS source data dir and list AHPS thresholds (act, min, mod, maj) ahps_site_parent = os.path.join(ahps_site_dir, ahps_site) thresholds_dir_list = os.listdir(ahps_site_parent) - + # Map parent directory for all inundation output filesoutput files. 
cat_fim_huc_ahps_dir = os.path.join(cat_fim_huc_dir, ahps_site) if not os.path.exists(cat_fim_huc_ahps_dir): os.mkdir(cat_fim_huc_ahps_dir) - - # Loop through thresholds/magnitudes and define inundation output files paths + + # loop through thresholds/magnitudes and define inundation output files paths for magnitude in thresholds_dir_list: + if "." not in magnitude: + magnitude_flows_csv = os.path.join(ahps_site_parent, magnitude, 'ahps_' + ahps_site + '_huc_' + huc + '_flows_' + magnitude + '.csv') + if os.path.exists(magnitude_flows_csv): - if gpkg: - output_extent_gpkg = os.path.join(cat_fim_huc_ahps_dir, ahps_site + '_' + magnitude + '_extent.gpkg') - else: - output_extent_gpkg = None - if extif: - output_extent_grid = os.path.join(cat_fim_huc_ahps_dir, ahps_site + '_' + magnitude + '_extent.tif') - else: - output_extent_grid = None + + output_extent_grid = os.path.join(cat_fim_huc_ahps_dir, ahps_site + '_' + magnitude + '_extent.tif') + if depthtif: output_depth_grid = os.path.join(cat_fim_huc_ahps_dir, ahps_site + '_' + magnitude + '_depth.tif') else: output_depth_grid = None - - # Append necessary variables to list for multiprocessing. - procs_list.append([rem, catchments, catchment_poly, magnitude_flows_csv, huc, hydroTable, output_extent_gpkg, output_extent_grid, output_depth_grid, ahps_site, magnitude, log_dir]) - # Initiate multiprocessing. - pool = Pool(job_number) - pool.map(run_inundation, procs_list) + # append necessary variables to list for multiprocessing. + procs_list.append([rem, catchments, catchment_poly, magnitude_flows_csv, huc, hydroTable, output_extent_grid, output_depth_grid, ahps_site, magnitude, log_dir]) + + # initiate multiprocessing + print(f"Running inundation for {len(procs_list)} sites using {number_of_jobs} jobs") + pool = Pool(number_of_jobs) + pool.map(run_inundation, procs_list) def run_inundation(args): - - # Parse args. - rem = args[0] - catchments = args[1] - catchment_poly = args[2] + + rem = args[0] + catchments = args[1] + catchment_poly = args[2] magnitude_flows_csv = args[3] - huc = args[4] - hydroTable = args[5] - output_extent_gpkg = args[6] - output_extent_grid = args[7] - output_depth_grid = args[8] - ahps_site = args[9] - magnitude = args[10] - log_dir = args[11] - - print("Running inundation for " + str(os.path.split(os.path.split(output_extent_gpkg)[0])[0])) + huc = args[4] + hydroTable = args[5] + output_extent_grid = args[6] + output_depth_grid = args[7] + ahps_site = args[8] + magnitude = args[9] + log_dir = args[10] + try: - inundate( - rem,catchments,catchment_poly,hydroTable,magnitude_flows_csv,mask_type,hucs=hucs,hucs_layerName=hucs_layerName, - subset_hucs=huc,num_workers=1,aggregate=False,inundation_raster=output_extent_grid,inundation_polygon=output_extent_gpkg, + inundate(rem,catchments,catchment_poly,hydroTable,magnitude_flows_csv,mask_type,hucs=hucs,hucs_layerName=hucs_layerName, + subset_hucs=huc,num_workers=1,aggregate=False,inundation_raster=output_extent_grid,inundation_polygon=None, depths=output_depth_grid,out_raster_profile=None,out_vector_profile=None,quiet=True ) + except Exception: - # Log errors and their tracebacks. 
- f = open(os.path.join(log_dir, huc + "_" + ahps_site + "_" + magnitude + '.txt'), 'w') - f.write(traceback.format_exc()) + # log errors and their tracebacks + f = open(log_file, 'a+') + f.write(f"{output_extent_gpkg} - inundation error: {traceback.format_exc()}\n") f.close() - - + +def post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir): + + # create workspace + gpkg_dir = os.path.join(output_cat_fim_dir, 'gpkg') + if not os.path.exists(gpkg_dir): + os.mkdir(gpkg_dir) + + fim_version = os.path.basename(output_cat_fim_dir) + merged_layer = os.path.join(output_cat_fim_dir, 'catfim_library.gpkg') + + if not os.path.exists(merged_layer): # prevents appending to existing output + + huc_ahps_dir_list = os.listdir(output_cat_fim_dir) + skip_list=['errors','logs','gpkg',merged_layer] + + for magnitude in magnitude_list: + + procs_list = [] + + # loop through all categories + for huc in huc_ahps_dir_list: + + if huc not in skip_list: + + huc_dir = os.path.join(output_cat_fim_dir, huc) + ahps_dir_list = os.listdir(huc_dir) + + # loop through ahps sites + for ahps_lid in ahps_dir_list: + ahps_lid_dir = os.path.join(huc_dir, ahps_lid) + + extent_grid = os.path.join(ahps_lid_dir, ahps_lid + '_' + magnitude + '_extent_' + huc + '.tif') + + if os.path.exists(extent_grid): + procs_list.append([ahps_lid, extent_grid, gpkg_dir, fim_version, huc, magnitude]) + + else: + try: + f = open(log_file, 'a+') + f.write(f"Missing layers: {extent_gpkg}\n") + f.close() + except: + pass + + # multiprocess with instructions + pool = Pool(number_of_jobs) + pool.map(reformat_inundation_maps, procs_list) + + # merge all layers + print(f"Merging {len(os.listdir(gpkg_dir))} layers...") + + for layer in os.listdir(gpkg_dir): + + diss_extent_filename = os.path.join(gpkg_dir, layer) + + # open diss_extent + diss_extent = gpd.read_file(diss_extent_filename) + + # write/append aggregate diss_extent + if os.path.isfile(merged_layer): + diss_extent.to_file(merged_layer,driver=getDriver(merged_layer),index=False, mode='a') + else: + diss_extent.to_file(merged_layer,driver=getDriver(merged_layer),index=False) + + del diss_extent + + # join attributes + all_mapped_ahps_conus_hipr_fl = pd.read_table(all_mapped_ahps_conus_hipr, sep=",") + merged_layer_gpd = gpd.read_file(merged_layer) + merged_layer_gpd = merged_layer_gpd.merge(all_mapped_ahps_conus_hipr_fl, left_on='ahps_lid', right_on='nws_lid') + + # save final output + merged_layer_gpd.to_file(merged_layer,driver=getDriver(merged_layer),index=False) + + shutil.rmtree(gpkg_dir) + + else: + print(f"{merged_layer} already exists.") + +def reformat_inundation_maps(args): + + try: + lid = args[0] + grid_path = args[1] + gpkg_dir = args[2] + fim_version = args[3] + huc = args[4] + magnitude = args[5] + + # convert raster to to shapes + # with rasterio.Env(): + with rasterio.open(grid_path) as src: + image = src.read(1) + mask = image > 0 + + # aggregate shapes + results = ({'properties': {'extent': 1}, 'geometry': s} for i, (s, v) in enumerate(shapes(image, mask=mask,transform=src.transform))) + + # convert list of shapes to polygon + extent_poly = gpd.GeoDataFrame.from_features(list(results), crs=PREP_PROJECTION) + + # dissolve polygons + extent_poly_diss = extent_poly.dissolve(by='extent') + + # update attributes + extent_poly_diss = extent_poly_diss.reset_index(drop=True) + extent_poly_diss['ahps_lid'] = lid + extent_poly_diss['magnitude'] = magnitude + extent_poly_diss['version'] = fim_version + extent_poly_diss['huc'] = huc + + # project to Web Mercator + extent_poly = 
extent_poly.to_crs(VIZ_PROJECTION) + + # copy gdb and save to feature class + handle = os.path.split(grid_path)[1].replace('.tif', '') + + diss_extent_filename = os.path.join(gpkg_dir, handle + "_dissolved.gpkg") + + extent_poly_diss["geometry"] = [MultiPolygon([feature]) if type(feature) == Polygon else feature for feature in extent_poly_diss["geometry"]] + + extent_poly_diss.to_file(diss_extent_filename,driver=getDriver(diss_extent_filename),index=False) + + except Exception as e: + # log and clean out the gdb so it's not merged in later + try: + f = open(log_dir, 'a+') + f.write("f{diss_extent_filename} - dissolve error: {e}\n") + f.close() + except: + pass + + if __name__ == '__main__': - - # Parse arguments. + + # parse arguments parser = argparse.ArgumentParser(description='Inundation mapping and regression analysis for FOSS FIM. Regression analysis results are stored in the test directory.') parser.add_argument('-r','--fim-run-dir',help='Name of directory containing outputs of fim_run.sh',required=True) parser.add_argument('-s', '--source-flow-dir',help='Path to directory containing flow CSVs to use to generate categorical FIM.',required=True, default="") parser.add_argument('-o', '--output-cat-fim-dir',help='Path to directory where categorical FIM outputs will be written.',required=True, default="") - parser.add_argument('-j','--job-number',help='Number of processes to use. Default is 1.',required=False, default="1") - parser.add_argument('-gpkg','--write-geopackage',help='Using this option will write a geopackage.',required=False, action='store_true') - parser.add_argument('-extif','--write-extent-tiff',help='Using this option will write extent TIFFs. This is the default.',required=False, action='store_true') + parser.add_argument('-j','--number-of-jobs',help='Number of processes to use. 
Default is 1.',required=False, default="1",type=int) parser.add_argument('-depthtif','--write-depth-tiff',help='Using this option will write depth TIFFs.',required=False, action='store_true') - + args = vars(parser.parse_args()) - + fim_run_dir = args['fim_run_dir'] source_flow_dir = args['source_flow_dir'] output_cat_fim_dir = args['output_cat_fim_dir'] - job_number = int(args['job_number']) - gpkg = args['write_geopackage'] - extif = args['write_extent_tiff'] + number_of_jobs = int(args['number_of_jobs']) depthtif = args['write_depth_tiff'] - - generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, job_number, gpkg, extif, depthtif) - - - + + print("Generating Categorical FIM") + generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, number_of_jobs, depthtif) + + print("Aggregating Categorical FIM") + post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir) From efdf609d1c126ddf11a62a6fc8d2d3c3a0c91808 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 3 Mar 2021 16:49:55 +0000 Subject: [PATCH 02/66] updating aggregate grid blocksize --- src/aggregate_fim_outputs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aggregate_fim_outputs.py b/src/aggregate_fim_outputs.py index edafd93a3..67aed0514 100644 --- a/src/aggregate_fim_outputs.py +++ b/src/aggregate_fim_outputs.py @@ -118,7 +118,7 @@ def aggregate_fim_outputs(fim_out_dir): out_meta = rem_src.meta.copy() out_meta.update({"driver": "GTiff", "height": mosaic.shape[1], "width": mosaic.shape[2], "dtype": str(mosaic.dtype), "transform": out_trans,"crs": PREP_PROJECTION,'compress': 'lzw'}) - with rasterio.open(rem_mosaic, "w", **out_meta, tiled=True, blockxsize=256, blockysize=256, BIGTIFF='YES') as dest: + with rasterio.open(rem_mosaic, "w", **out_meta, tiled=True, blockxsize=1024, blockysize=1024, BIGTIFF='YES') as dest: dest.write(mosaic) del rem_files_to_mosaic,rem_src,out_meta,mosaic @@ -143,7 +143,7 @@ def aggregate_fim_outputs(fim_out_dir): out_meta.update({"driver": "GTiff", "height": mosaic.shape[1], "width": mosaic.shape[2], "dtype": str(mosaic.dtype), "transform": out_trans,"crs": PREP_PROJECTION,'compress': 'lzw'}) - with rasterio.open(catchment_mosaic, "w", **out_meta, tiled=True, blockxsize=256, blockysize=256, BIGTIFF='YES') as dest: + with rasterio.open(catchment_mosaic, "w", **out_meta, tiled=True, blockxsize=1024, blockysize=1024, BIGTIFF='YES') as dest: dest.write(mosaic) del cat_files_to_mosaic,cat_src,out_meta,mosaic From 3fff6c28ffabdb680ed600b26406a258c97554bf Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 3 Mar 2021 19:13:51 +0000 Subject: [PATCH 03/66] parallelizing aggregation process --- src/aggregate_fim_outputs.py | 162 ++++++++++++++++++----------------- 1 file changed, 85 insertions(+), 77 deletions(-) diff --git a/src/aggregate_fim_outputs.py b/src/aggregate_fim_outputs.py index 67aed0514..62c6b2fc5 100644 --- a/src/aggregate_fim_outputs.py +++ b/src/aggregate_fim_outputs.py @@ -2,6 +2,7 @@ import os import argparse +from multiprocessing import Pool import pandas as pd import json import rasterio @@ -10,28 +11,25 @@ import csv from utils.shared_variables import PREP_PROJECTION +def aggregate_fim_outputs(args): -def aggregate_fim_outputs(fim_out_dir): + fim_out_dir = args[0] + huc6 = args[1] + huc_list = args[2] - print ("aggregating outputs to HUC6 scale") + huc6_dir = os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc6)) + os.makedirs(huc6_dir, exist_ok=True) - drop_folders = ['logs'] - huc_list = [huc for huc in 
os.listdir(fim_out_dir) if huc not in drop_folders] - huc6_list = [str(huc[0:6]) for huc in os.listdir(fim_out_dir) if huc not in drop_folders] - huc6_list = list(set(huc6_list)) + # aggregate file name paths + aggregate_hydrotable = os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc6),'hydroTable.csv') + aggregate_src = os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc6),f'rating_curves_{huc6}.json') for huc in huc_list: - os.makedirs(os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc[0:6])), exist_ok=True) - # original file paths hydrotable_filename = os.path.join(fim_out_dir,huc,'hydroTable.csv') src_filename = os.path.join(fim_out_dir,huc,'src.json') - # aggregate file name paths - aggregate_hydrotable = os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc[0:6]),'hydroTable.csv') - aggregate_src = os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc[0:6]),f'rating_curves_{huc[0:6]}.json') - if len(huc)> 6: # open hydrotable @@ -68,107 +66,117 @@ def aggregate_fim_outputs(fim_out_dir): shutil.copy(hydrotable_filename, aggregate_hydrotable) shutil.copy(src_filename, aggregate_src) - for huc6 in huc6_list: - - ## add feature_id to aggregate src - aggregate_hydrotable = os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc6),'hydroTable.csv') - aggregate_src = os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc6),f'rating_curves_{huc6}.json') - - # Open aggregate src for writing feature_ids to - src_data = {} - with open(aggregate_src) as jsonf: - src_data = json.load(jsonf) - - with open(aggregate_hydrotable) as csvf: - csvReader = csv.DictReader(csvf) + ## add feature_id to aggregate src + # Open aggregate src for writing feature_ids to + src_data = {} + with open(aggregate_src) as jsonf: + src_data = json.load(jsonf) - for row in csvReader: - if row['HydroID'].lstrip('0') in src_data and 'nwm_feature_id' not in src_data[row['HydroID'].lstrip('0')]: - src_data[row['HydroID'].lstrip('0')]['nwm_feature_id'] = row['feature_id'] + with open(aggregate_hydrotable) as csvf: + csvReader = csv.DictReader(csvf) - # Write src_data to JSON file - with open(aggregate_src, 'w') as jsonf: - json.dump(src_data, jsonf) + for row in csvReader: + if row['HydroID'].lstrip('0') in src_data and 'nwm_feature_id' not in src_data[row['HydroID'].lstrip('0')]: + src_data[row['HydroID'].lstrip('0')]['nwm_feature_id'] = row['feature_id'] - ## aggregate rasters - huc6_dir = os.path.join(fim_out_dir,'aggregate_fim_outputs',huc6) + # Write src_data to JSON file + with open(aggregate_src, 'w') as jsonf: + json.dump(src_data, jsonf) - # aggregate file paths - rem_mosaic = os.path.join(huc6_dir,f'hand_grid_{huc6}.tif') - catchment_mosaic = os.path.join(huc6_dir,f'catchments_{huc6}.tif') + ## aggregate rasters + # aggregate file paths + rem_mosaic = os.path.join(huc6_dir,f'hand_grid_{huc6}.tif') + catchment_mosaic = os.path.join(huc6_dir,f'catchments_{huc6}.tif') - if huc6 not in huc_list: + if huc6 not in huc_list: - huc6_filter = [path.startswith(huc6) for path in huc_list] - subset_huc6_list = [i for (i, v) in zip(huc_list, huc6_filter) if v] + huc6_filter = [path.startswith(huc6) for path in huc_list] + subset_huc6_list = [i for (i, v) in zip(huc_list, huc6_filter) if v] - # aggregate and mosaic rem - rem_list = [os.path.join(fim_out_dir,huc,'rem_zeroed_masked.tif') for huc in subset_huc6_list] + # aggregate and mosaic rem + rem_list = [os.path.join(fim_out_dir,huc,'rem_zeroed_masked.tif') for huc in subset_huc6_list] - if len(rem_list) > 1: + if len(rem_list) > 1: - 
rem_files_to_mosaic = [] + rem_files_to_mosaic = [] - for rem in rem_list: + for rem in rem_list: - rem_src = rasterio.open(rem) - rem_files_to_mosaic.append(rem_src) + rem_src = rasterio.open(rem) + rem_files_to_mosaic.append(rem_src) - mosaic, out_trans = merge(rem_files_to_mosaic) - out_meta = rem_src.meta.copy() - out_meta.update({"driver": "GTiff", "height": mosaic.shape[1], "width": mosaic.shape[2], "dtype": str(mosaic.dtype), "transform": out_trans,"crs": PREP_PROJECTION,'compress': 'lzw'}) + mosaic, out_trans = merge(rem_files_to_mosaic) + out_meta = rem_src.meta.copy() + out_meta.update({"driver": "GTiff", "height": mosaic.shape[1], "width": mosaic.shape[2], "dtype": str(mosaic.dtype), "transform": out_trans,"crs": PREP_PROJECTION,'compress': 'lzw'}) - with rasterio.open(rem_mosaic, "w", **out_meta, tiled=True, blockxsize=1024, blockysize=1024, BIGTIFF='YES') as dest: - dest.write(mosaic) + with rasterio.open(rem_mosaic, "w", **out_meta, tiled=True, blockxsize=1024, blockysize=1024, BIGTIFF='YES') as dest: + dest.write(mosaic) - del rem_files_to_mosaic,rem_src,out_meta,mosaic + del rem_files_to_mosaic,rem_src,out_meta,mosaic - elif len(rem_list)==1: + elif len(rem_list)==1: - shutil.copy(rem_list[0], rem_mosaic) + shutil.copy(rem_list[0], rem_mosaic) - # aggregate and mosaic catchments - catchment_list = [os.path.join(fim_out_dir,huc,'gw_catchments_reaches_filtered_addedAttributes.tif') for huc in subset_huc6_list] + # aggregate and mosaic catchments + catchment_list = [os.path.join(fim_out_dir,huc,'gw_catchments_reaches_filtered_addedAttributes.tif') for huc in subset_huc6_list] - if len(catchment_list) > 1: + if len(catchment_list) > 1: - cat_files_to_mosaic = [] + cat_files_to_mosaic = [] - for cat in catchment_list: - cat_src = rasterio.open(cat) - cat_files_to_mosaic.append(cat_src) + for cat in catchment_list: + cat_src = rasterio.open(cat) + cat_files_to_mosaic.append(cat_src) - mosaic, out_trans = merge(cat_files_to_mosaic) - out_meta = cat_src.meta.copy() + mosaic, out_trans = merge(cat_files_to_mosaic) + out_meta = cat_src.meta.copy() - out_meta.update({"driver": "GTiff", "height": mosaic.shape[1], "width": mosaic.shape[2], "dtype": str(mosaic.dtype), "transform": out_trans,"crs": PREP_PROJECTION,'compress': 'lzw'}) + out_meta.update({"driver": "GTiff", "height": mosaic.shape[1], "width": mosaic.shape[2], "dtype": str(mosaic.dtype), "transform": out_trans,"crs": PREP_PROJECTION,'compress': 'lzw'}) - with rasterio.open(catchment_mosaic, "w", **out_meta, tiled=True, blockxsize=1024, blockysize=1024, BIGTIFF='YES') as dest: - dest.write(mosaic) + with rasterio.open(catchment_mosaic, "w", **out_meta, tiled=True, blockxsize=1024, blockysize=1024, BIGTIFF='YES') as dest: + dest.write(mosaic) - del cat_files_to_mosaic,cat_src,out_meta,mosaic + del cat_files_to_mosaic,cat_src,out_meta,mosaic - elif len(catchment_list)==1: + elif len(catchment_list)==1: - shutil.copy(catchment_list[0], catchment_mosaic) - - else: - # original file paths - rem_filename = os.path.join(fim_out_dir,huc6,'rem_zeroed_masked.tif') - catchment_filename = os.path.join(fim_out_dir,huc6,'gw_catchments_reaches_filtered_addedAttributes.tif') + shutil.copy(catchment_list[0], catchment_mosaic) - shutil.copy(rem_filename, rem_mosaic) - shutil.copy(catchment_filename, catchment_mosaic) + else: + # original file paths + rem_filename = os.path.join(fim_out_dir,huc6,'rem_zeroed_masked.tif') + catchment_filename = os.path.join(fim_out_dir,huc6,'gw_catchments_reaches_filtered_addedAttributes.tif') + 
shutil.copy(rem_filename, rem_mosaic) + shutil.copy(catchment_filename, catchment_mosaic) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Aggregate layers buy HUC6') parser.add_argument('-d','--fim-outputs-directory', help='FIM outputs directory', required=True) + parser.add_argument('-j','--number-of-jobs',help='Number of processes to use. Default is 1.',required=False, default="1",type=int) args = vars(parser.parse_args()) fim_outputs_directory = args['fim_outputs_directory'] + number_of_jobs = int(args['number_of_jobs']) + + drop_folders = ['logs'] + huc_list = [huc for huc in os.listdir(fim_outputs_directory) if huc not in drop_folders] + huc6_list = [str(huc[0:6]) for huc in os.listdir(fim_outputs_directory) if huc not in drop_folders] + huc6_list = list(set(huc6_list)) + + + procs_list = [] + + for huc6 in huc6_list: + + limited_huc_list = [huc for huc in huc_list if huc.startswith(huc6)] + + procs_list.append([fim_outputs_directory,huc6,limited_huc_list]) - aggregate_fim_outputs(fim_outputs_directory) + print(f"aggregating {len(huc_list)} hucs to HUC6 scale using {number_of_jobs} jobs") + pool = Pool(number_of_jobs) + pool.map(aggregate_fim_outputs, procs_list) From 19e364ea71179abf8b15bf2c7769b779b08d9093 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 3 Mar 2021 19:15:48 +0000 Subject: [PATCH 04/66] cleanup --- tools/generate_categorical_fim.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/generate_categorical_fim.py b/tools/generate_categorical_fim.py index e976fb349..4af00f7c1 100644 --- a/tools/generate_categorical_fim.py +++ b/tools/generate_categorical_fim.py @@ -241,7 +241,6 @@ def reformat_inundation_maps(args): magnitude = args[5] # convert raster to to shapes - # with rasterio.Env(): with rasterio.open(grid_path) as src: image = src.read(1) mask = image > 0 From 174335156d1a90ea582429bdec762628cafc935f Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 3 Mar 2021 14:14:20 -0600 Subject: [PATCH 05/66] updated comment in generate_categorical_fim.py --- tools/generate_categorical_fim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/generate_categorical_fim.py b/tools/generate_categorical_fim.py index 4af00f7c1..1605d2f1d 100644 --- a/tools/generate_categorical_fim.py +++ b/tools/generate_categorical_fim.py @@ -264,7 +264,7 @@ def reformat_inundation_maps(args): # project to Web Mercator extent_poly = extent_poly.to_crs(VIZ_PROJECTION) - # copy gdb and save to feature class + # save dissolved multipolygon handle = os.path.split(grid_path)[1].replace('.tif', '') diss_extent_filename = os.path.join(gpkg_dir, handle + "_dissolved.gpkg") From 61a866ce2f12387fc52c92b9257fada60b406057 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 3 Mar 2021 20:38:31 +0000 Subject: [PATCH 06/66] reprojecting rasters to Web Mercator --- src/aggregate_fim_outputs.py | 44 +++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/src/aggregate_fim_outputs.py b/src/aggregate_fim_outputs.py index 62c6b2fc5..2009250d0 100644 --- a/src/aggregate_fim_outputs.py +++ b/src/aggregate_fim_outputs.py @@ -7,9 +7,10 @@ import json import rasterio from rasterio.merge import merge +from rasterio.warp import calculate_default_transform, reproject, Resampling import shutil import csv -from utils.shared_variables import PREP_PROJECTION +from utils.shared_variables import PREP_PROJECTION,VIZ_PROJECTION def aggregate_fim_outputs(args): @@ -85,8 +86,8 @@ def aggregate_fim_outputs(args): ## aggregate 
rasters # aggregate file paths - rem_mosaic = os.path.join(huc6_dir,f'hand_grid_{huc6}.tif') - catchment_mosaic = os.path.join(huc6_dir,f'catchments_{huc6}.tif') + rem_mosaic = os.path.join(huc6_dir,f'hand_grid_{huc6}_unprj.tif') + catchment_mosaic = os.path.join(huc6_dir,f'catchments_{huc6}_unprj.tif') if huc6 not in huc_list: @@ -151,6 +152,43 @@ def aggregate_fim_outputs(args): shutil.copy(rem_filename, rem_mosaic) shutil.copy(catchment_filename, catchment_mosaic) + ## reproject rasters + reproject_raster(rem_mosaic) + os.remove(rem_mosaic) + + reproject_raster(catchment_mosaic) + os.remove(catchment_mosaic) + + +def reproject_raster(raster_name): + + with rasterio.open(raster_name) as src: + transform, width, height = calculate_default_transform( + src.crs, VIZ_PROJECTION, src.width, src.height, *src.bounds) + kwargs = src.meta.copy() + kwargs.update({ + 'crs': VIZ_PROJECTION, + 'transform': transform, + 'width': width, + 'height': height, + 'compress': 'lzw' + }) + + raster_proj_rename = os.path.split(raster_name)[1].replace('_unprj.tif', '.tif') + raster_proj_dir = os.path.join(os.path.dirname(raster_name), raster_proj_rename) + + with rasterio.open(raster_proj_dir, 'w', **kwargs, tiled=True, blockxsize=1024, blockysize=1024, BIGTIFF='YES') as dst: + # for i in range(1, src.count + 1): + reproject( + source=rasterio.band(src, 1), + destination=rasterio.band(dst, 1), + src_transform=src.transform, + src_crs=src.crs, + dst_transform=transform, + dst_crs=VIZ_PROJECTION, + resampling=Resampling.nearest) + del src, dst + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Aggregate layers buy HUC6') From 5c108e20a727d6994bed14d4d367ec1964f11a94 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 3 Mar 2021 21:03:19 +0000 Subject: [PATCH 07/66] adding jobs to fim_run.sh --- fim_run.sh | 2 +- src/aggregate_fim_outputs.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fim_run.sh b/fim_run.sh index 5acdeff71..42a5d022e 100755 --- a/fim_run.sh +++ b/fim_run.sh @@ -152,5 +152,5 @@ fi echo "$viz" if [[ "$viz" -eq 1 ]]; then # aggregate outputs - python3 /foss_fim/src/aggregate_fim_outputs.py -d $outputRunDataDir + python3 /foss_fim/src/aggregate_fim_outputs.py -d $outputRunDataDir -j 4 fi diff --git a/src/aggregate_fim_outputs.py b/src/aggregate_fim_outputs.py index 2009250d0..9d8676364 100644 --- a/src/aggregate_fim_outputs.py +++ b/src/aggregate_fim_outputs.py @@ -18,6 +18,8 @@ def aggregate_fim_outputs(args): huc6 = args[1] huc_list = args[2] + print(f"aggregating {huc6}") + huc6_dir = os.path.join(fim_out_dir,'aggregate_fim_outputs',str(huc6)) os.makedirs(huc6_dir, exist_ok=True) @@ -158,7 +160,7 @@ def aggregate_fim_outputs(args): reproject_raster(catchment_mosaic) os.remove(catchment_mosaic) - + def reproject_raster(raster_name): From b57dff67d0a2b6d6d8d1d1e69ccd3855c071ff1a Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 4 Mar 2021 17:45:17 +0000 Subject: [PATCH 08/66] removing multiple util folders --- tools/__init__.py | 0 tools/aggregate_mannings_calibration.py | 1 - tools/aggregate_metrics.py | 64 +++--- tools/cache_metrics.py | 0 tools/comparing_src.py | 2 + tools/generate_categorical_fim.py | 107 +++++----- tools/inundation_wrapper_custom_flow.py | 1 - tools/inundation_wrapper_nwm_flows.py | 1 - tools/mannings_calibration_run.sh | 18 +- tools/mannings_run_by_set.sh | 2 +- tools/plots/{utils => }/__init__.py | 0 tools/plots/eval_plots.py | 186 +++++++++--------- .../shared_functions.py => plot_functions.py} | 0 
tools/preprocess/create_flow_forecast_file.py | 67 +++---- tools/preprocess/preprocess_benchmark.py | 51 +++-- tools/preprocess/preprocess_fimx.py | 74 ++++--- tools/run_test_case.py | 48 ++--- tools/run_test_case_calibration.py | 2 +- tools/{utils => }/shapefile_to_raster.py | 11 +- tools/synthesize_test_cases.py | 83 ++++---- tools/time_and_tee_mannings_calibration.sh | 2 +- ...functions.py => tools_shared_functions.py} | 0 ...variables.py => tools_shared_variables.py} | 0 tools/utils/__init__.py | 0 24 files changed, 350 insertions(+), 370 deletions(-) mode change 100644 => 100755 tools/__init__.py mode change 100644 => 100755 tools/aggregate_metrics.py mode change 100644 => 100755 tools/cache_metrics.py mode change 100644 => 100755 tools/generate_categorical_fim.py mode change 100644 => 100755 tools/inundation_wrapper_custom_flow.py rename tools/plots/{utils => }/__init__.py (100%) mode change 100644 => 100755 mode change 100644 => 100755 tools/plots/eval_plots.py rename tools/plots/{utils/shared_functions.py => plot_functions.py} (100%) mode change 100644 => 100755 mode change 100644 => 100755 tools/preprocess/create_flow_forecast_file.py mode change 100644 => 100755 tools/preprocess/preprocess_benchmark.py mode change 100644 => 100755 tools/preprocess/preprocess_fimx.py rename tools/{utils => }/shapefile_to_raster.py (88%) mode change 100644 => 100755 mode change 100644 => 100755 tools/synthesize_test_cases.py rename tools/{utils/shared_functions.py => tools_shared_functions.py} (100%) mode change 100644 => 100755 rename tools/{utils/shared_variables.py => tools_shared_variables.py} (100%) mode change 100644 => 100755 delete mode 100644 tools/utils/__init__.py diff --git a/tools/__init__.py b/tools/__init__.py old mode 100644 new mode 100755 diff --git a/tools/aggregate_mannings_calibration.py b/tools/aggregate_mannings_calibration.py index f94b1d025..c57b17776 100755 --- a/tools/aggregate_mannings_calibration.py +++ b/tools/aggregate_mannings_calibration.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 - import os import pandas as pd import csv diff --git a/tools/aggregate_metrics.py b/tools/aggregate_metrics.py old mode 100644 new mode 100755 index d8a462d5b..7cc5951b5 --- a/tools/aggregate_metrics.py +++ b/tools/aggregate_metrics.py @@ -3,17 +3,17 @@ import json import os import csv - + import argparse TEST_CASES_DIR = r'/data/test_cases_new/' # TEMP = r'/data/temp' # Search through all previous_versions in test_cases -from utils.shared_functions import compute_stats_from_contingency_table +from tools_shared_functions import compute_stats_from_contingency_table def create_master_metrics_csv(): - + # Construct header metrics_to_write = ['true_negatives_count', 'false_negatives_count', @@ -57,33 +57,33 @@ def create_master_metrics_csv(): 'masked_perc', 'masked_area_km2' ] - + additional_header_info_prefix = ['version', 'nws_lid', 'magnitude', 'huc'] list_to_write = [additional_header_info_prefix + metrics_to_write + ['full_json_path'] + ['flow'] + ['benchmark_source']] - + for benchmark_type in ['ble', 'ahps']: - + if benchmark_type == 'ble': - + test_cases = r'/data/test_cases' test_cases_list = os.listdir(test_cases) # AHPS test_ids versions_to_aggregate = ['fim_1_0_0', 'fim_2_3_3', 'fim_3_0_0_3_fr_c'] - + for test_case in test_cases_list: try: int(test_case.split('_')[0]) - + huc = test_case.split('_')[0] previous_versions = os.path.join(test_cases, test_case, 'performance_archive', 'previous_versions') - + for magnitude in ['100yr', '500yr']: for version in versions_to_aggregate: 
version_dir = os.path.join(previous_versions, version) magnitude_dir = os.path.join(version_dir, magnitude) if os.path.exists(magnitude_dir): - + magnitude_dir_list = os.listdir(magnitude_dir) for f in magnitude_dir_list: if '.json' in f: @@ -99,40 +99,40 @@ def create_master_metrics_csv(): sub_list_to_append.append(full_json_path) sub_list_to_append.append(flow) sub_list_to_append.append(benchmark_source) - + list_to_write.append(sub_list_to_append) - + except ValueError: pass - + if benchmark_type == 'ahps': - + test_cases = r'/data/test_cases_ahps_testing' test_cases_list = os.listdir(test_cases) # AHPS test_ids - versions_to_aggregate = ['fim_1_0_0_nws_1_21_2021', 'fim_1_0_0_usgs_1_21_2021', + versions_to_aggregate = ['fim_1_0_0_nws_1_21_2021', 'fim_1_0_0_usgs_1_21_2021', 'fim_2_x_ms_nws_1_21_2021', 'fim_2_x_ms_usgs_1_21_2021', 'fim_3_0_0_3_ms_c_nws_1_21_2021', 'fim_3_0_0_3_ms_c_usgs_1_21_2021', 'ms_xwalk_fill_missing_cal_nws', 'ms_xwalk_fill_missing_cal_usgs'] - + for test_case in test_cases_list: try: int(test_case.split('_')[0]) - + huc = test_case.split('_')[0] previous_versions = os.path.join(test_cases, test_case, 'performance_archive', 'previous_versions') - + for magnitude in ['action', 'minor', 'moderate', 'major']: for version in versions_to_aggregate: - + if 'nws' in version: benchmark_source = 'ahps_nws' if 'usgs' in version: benchmark_source = 'ahps_usgs' - + version_dir = os.path.join(previous_versions, version) magnitude_dir = os.path.join(version_dir, magnitude) - + if os.path.exists(magnitude_dir): magnitude_dir_list = os.listdir(magnitude_dir) for f in magnitude_dir_list: @@ -147,7 +147,7 @@ def create_master_metrics_csv(): parent_dir = 'usgs_1_21_2021' if 'nws' in version: parent_dir = 'nws_1_21_2021' - + flow_file = os.path.join(test_cases, parent_dir, huc, nws_lid, magnitude, 'ahps_' + nws_lid + '_huc_' + huc + '_flows_' + magnitude + '.csv') if os.path.exists(flow_file): with open(flow_file, newline='') as csv_file: @@ -157,7 +157,7 @@ def create_master_metrics_csv(): flow = row[1] if nws_lid == 'mcc01': print(flow) - + stats_dict = json.load(open(full_json_path)) for metric in metrics_to_write: sub_list_to_append.append(stats_dict[metric]) @@ -165,10 +165,10 @@ def create_master_metrics_csv(): sub_list_to_append.append(flow) sub_list_to_append.append(benchmark_source) list_to_write.append(sub_list_to_append) - + except ValueError: pass - + with open(output_csv, 'w', newline='') as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerows(list_to_write) @@ -201,7 +201,7 @@ def aggregate_metrics(config="DEV", branch="", hucs="", special_string="", outfo for magnitude in ['100yr', '500yr', 'action', 'minor', 'moderate', 'major']: huc_path_list = [['huc', 'path']] true_positives, true_negatives, false_positives, false_negatives, cell_area, masked_count = 0, 0, 0, 0, 0, 0 - + for test_case in test_cases_dir_list: if test_case not in ['other', 'validation_data_ble', 'validation_data_legacy', 'validation_data_ahps']: @@ -227,11 +227,11 @@ def aggregate_metrics(config="DEV", branch="", hucs="", special_string="", outfo cell_area = json_dict['cell_area_m2'] huc_path_list.append([huc, stats_json_path]) - - + + if cell_area == 0: continue - + # Pass all sums to shared function to calculate metrics. 
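# For reference: compute_stats_from_contingency_table derives the standard
# contingency-table verification metrics from these summed counts. The helper
# body is not shown in this hunk, so the textbook definitions are given here:
#   CSI = TP / (TP + FP + FN)
#   FAR = FP / (TP + FP)
#   POD (TPR) = TP / (TP + FN)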
stats_dict = compute_stats_from_contingency_table(true_negatives, false_negatives, false_positives, true_positives, cell_area=cell_area, masked_count=masked_count) @@ -239,7 +239,7 @@ def aggregate_metrics(config="DEV", branch="", hucs="", special_string="", outfo for stat in stats_dict: list_to_write.append([stat, stats_dict[stat]]) - + # Map path to output directory for aggregate metrics. output_file = os.path.join(aggregate_output_dir, branch + '_aggregate_metrics_' + magnitude + special_string + '.csv') @@ -249,7 +249,7 @@ def aggregate_metrics(config="DEV", branch="", hucs="", special_string="", outfo csv_writer.writerows(list_to_write) csv_writer.writerow([]) csv_writer.writerows(huc_path_list) - + print() print("Finished aggregating for the '" + magnitude + "' magnitude. Aggregated metrics over " + str(len(huc_path_list)-1) + " test cases.") print() diff --git a/tools/cache_metrics.py b/tools/cache_metrics.py old mode 100644 new mode 100755 diff --git a/tools/comparing_src.py b/tools/comparing_src.py index a9c8a1c8a..977b05794 100755 --- a/tools/comparing_src.py +++ b/tools/comparing_src.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import matplotlib.pyplot as plt import numpy as np import json diff --git a/tools/generate_categorical_fim.py b/tools/generate_categorical_fim.py old mode 100644 new mode 100755 index 4af00f7c1..ce493d196 --- a/tools/generate_categorical_fim.py +++ b/tools/generate_categorical_fim.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 import sys -sys.path.insert(1, 'foss_fim/tests') -sys.path.insert(1, 'foss_fim/lib') import os from multiprocessing import Pool import argparse @@ -14,6 +12,7 @@ from rasterio.features import shapes from shapely.geometry.polygon import Polygon from shapely.geometry.multipolygon import MultiPolygon +sys.path.append('/foss_fim/src') from utils.shared_variables import PREP_PROJECTION,VIZ_PROJECTION from utils.shared_functions import getDriver from inundation import inundate @@ -21,33 +20,23 @@ INPUTS_DIR = r'/data/inputs' magnitude_list = ['action', 'minor', 'moderate','major'] -# map path to points with attributes +# Map path to points with attributes all_mapped_ahps_conus_hipr = '/data/inputs/ahp_sites/all_mapped_ahps.csv' -# define necessary variables for inundation() +# Define necessary variables for inundation() hucs, hucs_layerName = os.path.join(INPUTS_DIR, 'wbd', 'WBD_National.gpkg'), 'WBDHU8' mask_type, catchment_poly = 'huc', '' -def generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, number_of_jobs, depthtif): - # create output directory and log directory - if not os.path.exists(output_cat_fim_dir): - os.mkdir(output_cat_fim_dir) - - # create error log - log_dir = os.path.join(output_cat_fim_dir, 'logs') - if not os.path.exists(log_dir): - os.mkdir(log_dir) +def generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, number_of_jobs, depthtif, log_file): no_data_list = [] procs_list = [] - log_file = os.path.join(log_dir, 'errors.log') - source_flow_dir_list = os.listdir(source_flow_dir) output_flow_dir_list = os.listdir(fim_run_dir) - # log missing hucs + # Log missing hucs missing_hucs = list(set(source_flow_dir_list) - set(output_flow_dir_list)) missing_hucs = [huc for huc in missing_hucs if "." 
not in huc] if len(missing_hucs) > 0: @@ -55,17 +44,17 @@ def generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, n f.write(f"Missing hucs from output directory: {', '.join(missing_hucs)}\n") f.close() - # loop through matching huc directories in the source_flow directory + # Loop through matching huc directories in the source_flow directory matching_hucs = list(set(output_flow_dir_list) & set(source_flow_dir_list)) for huc in matching_hucs: if "." not in huc: - # get list of AHPS site directories + # Get list of AHPS site directories ahps_site_dir = os.path.join(source_flow_dir, huc) ahps_site_dir_list = os.listdir(ahps_site_dir) - # map paths to HAND files needed for inundation() + # Map paths to HAND files needed for inundation() fim_run_huc_dir = os.path.join(fim_run_dir, huc) rem = os.path.join(fim_run_huc_dir, 'rem_zeroed_masked.tif') catchments = os.path.join(fim_run_huc_dir, 'gw_catchments_reaches_filtered_addedAttributes.tif') @@ -73,24 +62,24 @@ def generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, n exit_flag = False # Default to False. - # check if necessary data exist; set exit_flag to True if they don't exist + # Check if necessary data exist; set exit_flag to True if they don't exist for f in [rem, catchments, hydroTable]: if not os.path.exists(f): no_data_list.append(f) exit_flag = True - # log missing data + # Log missing data if exit_flag == True: f = open(log_file, 'a+') f.write(f"Missing data for: {fim_run_huc_dir}\n") f.close() - # map path to huc directory inside out output_cat_fim_dir + # Map path to huc directory inside out output_cat_fim_dir cat_fim_huc_dir = os.path.join(output_cat_fim_dir, huc) if not os.path.exists(cat_fim_huc_dir): os.mkdir(cat_fim_huc_dir) - # loop through AHPS sites + # Loop through AHPS sites for ahps_site in ahps_site_dir_list: # map parent directory for AHPS source data dir and list AHPS thresholds (act, min, mod, maj) ahps_site_parent = os.path.join(ahps_site_dir, ahps_site) @@ -101,7 +90,7 @@ def generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, n if not os.path.exists(cat_fim_huc_ahps_dir): os.mkdir(cat_fim_huc_ahps_dir) - # loop through thresholds/magnitudes and define inundation output files paths + # Loop through thresholds/magnitudes and define inundation output files paths for magnitude in thresholds_dir_list: if "." not in magnitude: @@ -117,14 +106,15 @@ def generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, n else: output_depth_grid = None - # append necessary variables to list for multiprocessing. - procs_list.append([rem, catchments, catchment_poly, magnitude_flows_csv, huc, hydroTable, output_extent_grid, output_depth_grid, ahps_site, magnitude, log_dir]) + # Append necessary variables to list for multiprocessing. 
+ procs_list.append([rem, catchments, catchment_poly, magnitude_flows_csv, huc, hydroTable, output_extent_grid, output_depth_grid, ahps_site, magnitude, log_file]) - # initiate multiprocessing + # Initiate multiprocessing print(f"Running inundation for {len(procs_list)} sites using {number_of_jobs} jobs") pool = Pool(number_of_jobs) pool.map(run_inundation, procs_list) + def run_inundation(args): rem = args[0] @@ -137,7 +127,7 @@ def run_inundation(args): output_depth_grid = args[7] ahps_site = args[8] magnitude = args[9] - log_dir = args[10] + log_file = args[10] try: inundate(rem,catchments,catchment_poly,hydroTable,magnitude_flows_csv,mask_type,hucs=hucs,hucs_layerName=hucs_layerName, @@ -146,14 +136,15 @@ def run_inundation(args): ) except Exception: - # log errors and their tracebacks + # Log errors and their tracebacks f = open(log_file, 'a+') f.write(f"{output_extent_gpkg} - inundation error: {traceback.format_exc()}\n") f.close() -def post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir): - # create workspace +def post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir, log_file): + + # Create workspace gpkg_dir = os.path.join(output_cat_fim_dir, 'gpkg') if not os.path.exists(gpkg_dir): os.mkdir(gpkg_dir) @@ -170,7 +161,7 @@ def post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir): procs_list = [] - # loop through all categories + # Loop through all categories for huc in huc_ahps_dir_list: if huc not in skip_list: @@ -178,7 +169,7 @@ def post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir): huc_dir = os.path.join(output_cat_fim_dir, huc) ahps_dir_list = os.listdir(huc_dir) - # loop through ahps sites + # Loop through ahps sites for ahps_lid in ahps_dir_list: ahps_lid_dir = os.path.join(huc_dir, ahps_lid) @@ -195,21 +186,21 @@ def post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir): except: pass - # multiprocess with instructions + # Multiprocess with instructions pool = Pool(number_of_jobs) pool.map(reformat_inundation_maps, procs_list) - # merge all layers + # Merge all layers print(f"Merging {len(os.listdir(gpkg_dir))} layers...") for layer in os.listdir(gpkg_dir): diss_extent_filename = os.path.join(gpkg_dir, layer) - # open diss_extent + # Open diss_extent diss_extent = gpd.read_file(diss_extent_filename) - # write/append aggregate diss_extent + # Write/append aggregate diss_extent if os.path.isfile(merged_layer): diss_extent.to_file(merged_layer,driver=getDriver(merged_layer),index=False, mode='a') else: @@ -217,12 +208,12 @@ def post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir): del diss_extent - # join attributes + # Join attributes all_mapped_ahps_conus_hipr_fl = pd.read_table(all_mapped_ahps_conus_hipr, sep=",") merged_layer_gpd = gpd.read_file(merged_layer) merged_layer_gpd = merged_layer_gpd.merge(all_mapped_ahps_conus_hipr_fl, left_on='ahps_lid', right_on='nws_lid') - # save final output + # Save final output merged_layer_gpd.to_file(merged_layer,driver=getDriver(merged_layer),index=False) shutil.rmtree(gpkg_dir) @@ -230,6 +221,7 @@ def post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir): else: print(f"{merged_layer} already exists.") + def reformat_inundation_maps(args): try: @@ -240,31 +232,31 @@ def reformat_inundation_maps(args): huc = args[4] magnitude = args[5] - # convert raster to to shapes + # Convert raster to to shapes with rasterio.open(grid_path) as src: image = src.read(1) mask = image > 0 - # aggregate shapes + # Cggregate shapes results = ({'properties': {'extent': 1}, 
'geometry': s} for i, (s, v) in enumerate(shapes(image, mask=mask,transform=src.transform))) # convert list of shapes to polygon extent_poly = gpd.GeoDataFrame.from_features(list(results), crs=PREP_PROJECTION) - # dissolve polygons + # Dissolve polygons extent_poly_diss = extent_poly.dissolve(by='extent') - # update attributes + # Update attributes extent_poly_diss = extent_poly_diss.reset_index(drop=True) extent_poly_diss['ahps_lid'] = lid extent_poly_diss['magnitude'] = magnitude extent_poly_diss['version'] = fim_version extent_poly_diss['huc'] = huc - # project to Web Mercator + # Project to Web Mercator extent_poly = extent_poly.to_crs(VIZ_PROJECTION) - # copy gdb and save to feature class + # Copy gdb and save to feature class handle = os.path.split(grid_path)[1].replace('.tif', '') diss_extent_filename = os.path.join(gpkg_dir, handle + "_dissolved.gpkg") @@ -274,9 +266,9 @@ def reformat_inundation_maps(args): extent_poly_diss.to_file(diss_extent_filename,driver=getDriver(diss_extent_filename),index=False) except Exception as e: - # log and clean out the gdb so it's not merged in later + # Log and clean out the gdb so it's not merged in later try: - f = open(log_dir, 'a+') + f = open(log_file, 'a+') f.write("f{diss_extent_filename} - dissolve error: {e}\n") f.close() except: @@ -285,8 +277,8 @@ def reformat_inundation_maps(args): if __name__ == '__main__': - # parse arguments - parser = argparse.ArgumentParser(description='Inundation mapping and regression analysis for FOSS FIM. Regression analysis results are stored in the test directory.') + # Parse arguments + parser = argparse.ArgumentParser(description='Categorical inundation mapping for FOSS FIM.') parser.add_argument('-r','--fim-run-dir',help='Name of directory containing outputs of fim_run.sh',required=True) parser.add_argument('-s', '--source-flow-dir',help='Path to directory containing flow CSVs to use to generate categorical FIM.',required=True, default="") parser.add_argument('-o', '--output-cat-fim-dir',help='Path to directory where categorical FIM outputs will be written.',required=True, default="") @@ -301,8 +293,21 @@ def reformat_inundation_maps(args): number_of_jobs = int(args['number_of_jobs']) depthtif = args['write_depth_tiff'] + + # Create output directory + if not os.path.exists(output_cat_fim_dir): + os.mkdir(output_cat_fim_dir) + + # Create log directory + log_dir = os.path.join(output_cat_fim_dir, 'logs') + if not os.path.exists(log_dir): + os.mkdir(log_dir) + + # Create error log path + log_file = os.path.join(log_dir, 'errors.log') + print("Generating Categorical FIM") - generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, number_of_jobs, depthtif) + generate_categorical_fim(fim_run_dir, source_flow_dir, output_cat_fim_dir, number_of_jobs, depthtif,log_file) print("Aggregating Categorical FIM") - post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir) + post_process_cat_fim_for_viz(number_of_jobs, output_cat_fim_dir,log_file) diff --git a/tools/inundation_wrapper_custom_flow.py b/tools/inundation_wrapper_custom_flow.py old mode 100644 new mode 100755 index e82a474e6..530585793 --- a/tools/inundation_wrapper_custom_flow.py +++ b/tools/inundation_wrapper_custom_flow.py @@ -12,7 +12,6 @@ import shutil # insert python path at runtime for accessing scripts in foss_fim/tests dir (e.g. inundation.py) -sys.path.insert(1, 'foss_fim/tests') from inundation import inundate TEST_CASES_DIR = r'/data/inundation_review/inundation_custom_flow/' # Will update. 
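The util-folder consolidation in this patch drops the runtime sys.path.insert hacks from the tools scripts. A minimal sketch of the import pattern the reorganized scripts rely on (assuming /foss_fim/src is appended to sys.path, as generate_categorical_fim.py does above, and that inundation.py remains importable, e.g. via the container's PYTHONPATH):

    import sys
    sys.path.append('/foss_fim/src')
    from utils.shared_variables import PREP_PROJECTION, VIZ_PROJECTION   # src/utils package
    from utils.shared_functions import getDriver
    from tools_shared_functions import compute_stats_from_contingency_table  # formerly tools/utils/shared_functions.py
    from inundation import inundate   # how this import resolves is assumed, not shown in this hunk
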
diff --git a/tools/inundation_wrapper_nwm_flows.py b/tools/inundation_wrapper_nwm_flows.py index f6d158a79..8a5fe0cf5 100755 --- a/tools/inundation_wrapper_nwm_flows.py +++ b/tools/inundation_wrapper_nwm_flows.py @@ -12,7 +12,6 @@ import shutil # insert python path at runtime for accessing scripts in foss_fim/tests dir (e.g. inundation.py) -sys.path.insert(1, 'foss_fim/tests') from inundation import inundate TEST_CASES_DIR = r'/data/inundation_review/inundation_nwm_recurr/' # Will update. diff --git a/tools/mannings_calibration_run.sh b/tools/mannings_calibration_run.sh index 8dc737ee4..89d54cd17 100755 --- a/tools/mannings_calibration_run.sh +++ b/tools/mannings_calibration_run.sh @@ -71,7 +71,7 @@ fi export input_NWM_Catchments=$inputDataDir/nwm_hydrofabric/nwm_catchments.gpkg export outdir=$outdir -export testdir="/foss_fim/tests" +export toolsdir="/foss_fim/tools" if [ -f "$huclist" ]; then @@ -84,15 +84,15 @@ if [ -f "$huclist" ]; then ## RUN ## if [ -f "$paramfile" ]; then if [ "$jobLimit" -eq 1 ]; then - parallel --verbose --lb -j $jobLimit -- $testdir/time_and_tee_mannings_calibration.sh :::: $paramfile + parallel --verbose --lb -j $jobLimit -- $toolsdir/time_and_tee_mannings_calibration.sh :::: $paramfile else - parallel --eta -j $jobLimit -- $testdir/time_and_tee_mannings_calibration.sh :::: $paramfile + parallel --eta -j $jobLimit -- $toolsdir/time_and_tee_mannings_calibration.sh :::: $paramfile fi else if [ "$jobLimit" -eq 1 ]; then - parallel --verbose --lb -j $jobLimit -- $testdir/time_and_tee_mannings_calibration.sh ::: $paramfile + parallel --verbose --lb -j $jobLimit -- $toolsdir/time_and_tee_mannings_calibration.sh ::: $paramfile else - parallel --eta -j $jobLimit -- $testdir/time_and_tee_mannings_calibration.sh ::: $paramfile + parallel --eta -j $jobLimit -- $toolsdir/time_and_tee_mannings_calibration.sh ::: $paramfile fi fi done <$huclist @@ -108,15 +108,15 @@ else ## RUN ## if [ -f "$paramfile" ]; then if [ "$jobLimit" -eq 1 ]; then - parallel --verbose --lb -j $jobLimit -- $testdir/time_and_tee_mannings_calibration.sh :::: $paramfile + parallel --verbose --lb -j $jobLimit -- $toolsdir/time_and_tee_mannings_calibration.sh :::: $paramfile else - parallel --eta -j $jobLimit -- $testdir/time_and_tee_mannings_calibration.sh :::: $paramfile + parallel --eta -j $jobLimit -- $toolsdir/time_and_tee_mannings_calibration.sh :::: $paramfile fi else if [ "$jobLimit" -eq 1 ]; then - parallel --verbose --lb -j $jobLimit -- $testdir/time_and_tee_mannings_calibration.sh ::: $paramfile + parallel --verbose --lb -j $jobLimit -- $toolsdir/time_and_tee_mannings_calibration.sh ::: $paramfile else - parallel --eta -j $jobLimit -- $testdir/time_and_tee_mannings_calibration.sh ::: $paramfile + parallel --eta -j $jobLimit -- $toolsdir/time_and_tee_mannings_calibration.sh ::: $paramfile fi fi done diff --git a/tools/mannings_run_by_set.sh b/tools/mannings_run_by_set.sh index 917672a63..8394b3a5f 100755 --- a/tools/mannings_run_by_set.sh +++ b/tools/mannings_run_by_set.sh @@ -12,4 +12,4 @@ mkdir -p $subdir $srcDir/add_crosswalk.py -d $hucdir/gw_catchments_reaches_filtered_addedAttributes.gpkg -a $hucdir/demDerived_reaches_split_filtered.gpkg -s $hucdir/src_base.csv -l $subdir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -f $subdir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -r $subdir/src_full_crosswalked.csv -j $subdir/src.json -x $subdir/crosswalk_table.csv -t $subdir/hydroTable.csv -w $hucdir/wbd8_clp.gpkg -b $hucdir/nwm_subset_streams.gpkg -y 
$hucdir/nwm_catchments_proj_subset.tif -m $param_set -z $input_NWM_Catchments -p FR -c
-python3 foss_fim/tests/run_test_case_calibration.py -r $fimdir/$huc -d $subdir -t $huc"_ble" -b "mannings_calibration"/$strorder/$mannings_value
+python3 foss_fim/tools/run_test_case_calibration.py -r $fimdir/$huc -d $subdir -t $huc"_ble" -b "mannings_calibration"/$strorder/$mannings_value
diff --git a/tools/plots/utils/__init__.py b/tools/plots/__init__.py
old mode 100644
new mode 100755
similarity index 100%
rename from tools/plots/utils/__init__.py
rename to tools/plots/__init__.py
diff --git a/tools/plots/eval_plots.py b/tools/plots/eval_plots.py
old mode 100644
new mode 100755
index 9c29087e1..e04b2fd11
--- a/tools/plots/eval_plots.py
+++ b/tools/plots/eval_plots.py
@@ -4,42 +4,42 @@
 import argparse
 from natsort import natsorted
 import geopandas as gpd
-from utils.shared_functions import filter_dataframe, boxplot, scatterplot, barplot
+from plot_functions import filter_dataframe, boxplot, scatterplot, barplot
 def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR'] , alternate_ahps_query = False, spatial_ahps = False, fim_1_ms = False):
     '''
-    Creates plots and summary statistics using metrics compiled from
-    synthesize_test_cases. Required inputs are metrics_csv and workspace.
+    Creates plots and summary statistics using metrics compiled from
+    synthesize_test_cases. Required inputs are metrics_csv and workspace.
     Outputs include:
-        aggregate_<benchmark>_<configuration>.csv: this csv
+        aggregate_<benchmark>_<configuration>.csv: this csv
            contains the aggregated total statistics (i.e. CSI, FAR, POD)
            using the summed area_sq_km fields
-        <benchmark>_<configuration>_common_sites.csv: this csv
-           contains the unique sites (e.g usgs/nws: nws_lid; ble: huc08)
+        <benchmark>_<configuration>_common_sites.csv: this csv
+           contains the unique sites (e.g usgs/nws: nws_lid; ble: huc08)
           considered for aggregation/plots for each magnitude. The selected
          sites occur in all versions analyzed. For example, if FIM 1,
-          FIM 2, FIM 3.0.0.3 were versions analyzed, the common sites
-          would be those that had data for ALL versions. This
+          FIM 2, FIM 3.0.0.3 were versions analyzed, the common sites
+          would be those that had data for ALL versions. This
         analysis is then redone for each magnitude. As such, the number
        of sites may vary with magnitude. The number of sites for each
       magnitude is annotated on generated plots.
-        <benchmark>_<configuration>_analyzed_data.csv: this is the
-           dataset used to create plots and aggregate statistics. It is
+        <benchmark>_<configuration>_analyzed_data.csv: this is the
+           dataset used to create plots and aggregate statistics. It is
          a subset of the input metrics file and consists of the common sites.
-        csi_aggr_<benchmark>_<configuration>.png: bar plot of the
+        csi_aggr_<benchmark>_<configuration>.png: bar plot of the
          aggregated CSI scores. Number of common sites is annotated
         (see list of sites listed in *_*_common_sites.csv).
-        csi_<benchmark>_<configuration>.png: box plot of CSI scores
-          (sites weighted equally). Number of common sites is annotated
+        csi_<benchmark>_<configuration>.png: box plot of CSI scores
+          (sites weighted equally). Number of common sites is annotated
         (see list of sites listed in *_*_common_sites.csv).
         far_<benchmark>_<configuration>*.png: box plot of FAR scores
-          (sites weighted equally). Number of common sites is annotated
+          (sites weighted equally). Number of common sites is annotated
         (see list of sites listed in *_*_common_sites.csv).
-        tpr_<benchmark>_<configuration>*.png: box plot of TPR/POD
-          scores (sites weighted equally). Number of common sites is
+          scores (sites weighted equally). Number of common sites is
         annotated (see list of sites listed in *_*_common_sites.csv).
-        csi_scatter_<magnitude>_<configuration>*.png: scatter plot comparing
+          two versions for a given magnitude. This is only generated if
         there are exactly two versions analyzed.
@@ -49,50 +49,50 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR'
         Path to csv produced as part of synthesize_test_cases containing
         all metrics across all versions.
     workspace : STRING
-        Path to the output workspace. Subdirectories will be created
+        Path to the output workspace. Subdirectories will be created
         reflecting the evaluation datasets.
     versions: LIST
-        A list of versions to be aggregated/plotted. Uses the "startswith"
-        approach. Versions should be supplied in the order they are to
-        be plotted. For example: ['fim_', 'fb']; This will evaluate all
+        A list of versions to be aggregated/plotted. Uses the "startswith"
+        approach. Versions should be supplied in the order they are to
+        be plotted. For example: ['fim_', 'fb']; This will evaluate all
         versions that start with fim_ (e.g. fim_1, fim_2, fim_3) and any
         feature branch that starts with "fb". To establish version order,
-        the fim versions are naturally sorted and then fb versions
-        (naturally sorted) are appended. These versions are also used to
-        filter the input metric csv as only these versions are retained
-        for analysis.
+        the fim versions are naturally sorted and then fb versions
+        (naturally sorted) are appended. These versions are also used to
+        filter the input metric csv as only these versions are retained
+        for analysis.
     stats: LIST
-        A list of statistics to be plotted. Must be identical to column
-        field in metrics_csv. CSI, POD, TPR are currently calculated, if
+        A list of statistics to be plotted. Must be identical to column
+        field in metrics_csv. CSI, POD, TPR are currently calculated, if
        additional statistics are desired formulas would need to be coded.
     alternate_ahps_query : STRING, optional
-        The default is false. Currently the default ahps query is same
-        as done for apg goals. If a different query is desired it can be
-        supplied and it will supersede the default query.
+        The default is false. Currently the default ahps query is same
+        as done for apg goals. If a different query is desired it can be
+        supplied and it will supersede the default query.
     spatial_ahps : DICTIONARY, optional
-        The default is false. A dictionary with keys as follows:
+        The default is false. A dictionary with keys as follows:
            'static': Path to AHPS point file created during creation of
               FIM 3 static libraries.
            'evaluated': Path to extent file created during the creation
              of the NWS/USGS AHPS preprocessing.
-           'metadata': Path to previously created file that contains
+           'metadata': Path to previously created file that contains
            metadata about each site (feature_id, wfo, rfc and etc).
        No spatial layers will be created if set to False, if a dictionary
       is supplied then a spatial layer is produced.
    fim_1_ms: BOOL
-        Default is false. If True then fim_1 rows are duplicated with
-        extent_config set to MS. This allows for FIM 1 to be included
+        Default is false. If True then fim_1 rows are duplicated with
+        extent_config set to MS. This allows for FIM 1 to be included
       in MS plots/stats (helpful for nws/usgs ahps comparisons).
    Returns
    -------
    all_datasets : DICT
-        Dictionary containing all datasets generated.
+ Keys: (benchmark_source, extent_config), Values: (filtered dataframe, common sites) ''' - + #Import metrics csv as DataFrame and initialize all_datasets dictionary csv_df = pd.read_csv(metrics_csv) @@ -104,34 +104,34 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' fim_1_rows['extent_config'] = 'MS' #Append duplicate FIM 1 rows to original dataframe csv_df = csv_df.append(fim_1_rows, ignore_index = True) - - #If versions are supplied then filter out + + #If versions are supplied then filter out if versions: #Filter out versions based on supplied version list metrics = csv_df.query('version.str.startswith(tuple(@versions))') else: metrics = csv_df - + #Group by benchmark source benchmark_by_source = metrics.groupby(['benchmark_source', 'extent_config']) - #Iterate through benchmark_by_source. Pre-filter metrics dataframe - #as needed (e.g. usgs/nws filter query). Then further filtering to - #discard all hucs/nws_lid that are not present across all analyzed - #versions for a given magnitude. The final filtered dataset is written - #to a dictionary with the key (benchmark source, extent config) + #Iterate through benchmark_by_source. Pre-filter metrics dataframe + #as needed (e.g. usgs/nws filter query). Then further filtering to + #discard all hucs/nws_lid that are not present across all analyzed + #versions for a given magnitude. The final filtered dataset is written + #to a dictionary with the key (benchmark source, extent config) #and values (filtered dataframe, common sites). all_datasets = {} - for (benchmark_source, extent_configuration), benchmark_metrics in benchmark_by_source: - - #If source is usgs/nws define the base resolution and query - #(use alternate query if passed). Append filtered datasets to + for (benchmark_source, extent_configuration), benchmark_metrics in benchmark_by_source: + + #If source is usgs/nws define the base resolution and query + #(use alternate query if passed). Append filtered datasets to #all_datasets dictionary. if benchmark_source in ['usgs','nws']: - + #Set the base processing unit for the ahps runs. base_resolution = 'nws_lid' - + #Default query (used for APG) it could be that bad_sites should be modified. If so pass an alternate query using the "alternate_ahps_query" bad_sites = ['grfi2','ksdm7','hohn4','rwdn4'] query = "not flow.isnull() & masked_perc<97 & not nws_lid in @bad_sites" @@ -142,35 +142,35 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' #Filter the dataset based on query ahps_metrics = benchmark_metrics.query(query) - - #Filter out all instances where the base_resolution doesn't + + #Filter out all instances where the base_resolution doesn't #exist across all desired fim versions for a given magnitude. all_datasets[(benchmark_source, extent_configuration)] = filter_dataframe(ahps_metrics, base_resolution) - - #If source is 'ble', set base_resolution and append ble dataset + + #If source is 'ble', set base_resolution and append ble dataset #to all_datasets dictionary elif benchmark_source == 'ble': - + #Set the base processing unit for ble runs base_resolution = 'huc' - - #Filter out all instances where base_resolution doesn't exist + + #Filter out all instances where base_resolution doesn't exist #across all desired fim versions for a given magnitude. all_datasets[(benchmark_source, extent_configuration)] = filter_dataframe(benchmark_metrics, base_resolution) - + #For each dataset in all_datasets, generate plots and aggregate statistics. 
for (dataset_name,configuration), (dataset, sites) in all_datasets.items(): - - #Define and create the output workspace as a subfolder within + + #Define and create the output workspace as a subfolder within #the supplied workspace output_workspace = Path(workspace) / dataset_name / configuration.lower() - output_workspace.mkdir(parents = True, exist_ok = True) - + output_workspace.mkdir(parents = True, exist_ok = True) + #Write out the filtered dataset and common sites to file dataset.to_csv(output_workspace / (f'{dataset_name}_{configuration.lower()}_analyzed_data.csv'), index = False) sites_pd = pd.DataFrame.from_dict(sites, orient = 'index').transpose() sites_pd.to_csv(output_workspace / (f'{dataset_name}_{configuration.lower()}_common_sites.csv'), index = False) - + #set the order of the magnitudes and define base resolution. if dataset_name == 'ble': magnitude_order = ['100yr', '500yr'] @@ -185,22 +185,22 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' dataset_sums['far'] = dataset_sums['FP_area_km2']/(dataset_sums['TP_area_km2'] + dataset_sums['FP_area_km2']) dataset_sums['pod'] = dataset_sums['TP_area_km2']/(dataset_sums['TP_area_km2'] + dataset_sums['FN_area_km2']) dataset_sums = dataset_sums.reset_index() - + #Write aggregated metrics to file. dataset_sums.to_csv(output_workspace / f'aggregate_{dataset_name}_{configuration.lower()}.csv', index = False ) - #This section naturally orders analyzed versions which defines + #This section naturally orders analyzed versions which defines #the hue order for the generated plots. #Get all versions in dataset - all_versions = list(dataset.version.unique()) - version_order = [] - #If versions are not specified then use all available versions + all_versions = list(dataset.version.unique()) + version_order = [] + #If versions are not specified then use all available versions #and assign to versions_list if not versions: versions_list = all_versions #if versions are supplied assign to versions_list else: - versions_list = versions + versions_list = versions #For each version supplied by the user for version in versions_list: #Select all the versions that start with the supplied version. @@ -209,7 +209,7 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' selected_versions = natsorted(selected_versions) #Populate version order based on the sorted subsets. version_order.extend(selected_versions) - + #Define textbox which will contain the counts of each magnitude. textbox = [] for magnitude in sites: @@ -219,16 +219,16 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' textbox = '\n'.join(textbox) #Create aggregate barplot - aggregate_file = output_workspace / (f'csi_aggr_{dataset_name}_{configuration.lower()}.png') + aggregate_file = output_workspace / (f'csi_aggr_{dataset_name}_{configuration.lower()}.png') barplot(dataframe = dataset_sums, x_field = 'magnitude', x_order = magnitude_order, y_field = 'csi', hue_field = 'version', ordered_hue = version_order, title_text = f'Aggregate {dataset_name.upper()} FIM Scores', fim_configuration = configuration, textbox_str = textbox, simplify_legend = True, dest_file = aggregate_file) - + #Create box plots for each metric in supplied stats. 
for stat in stats: - output_file = output_workspace / (f'{stat.lower()}_{dataset_name}_{configuration.lower()}.png') + output_file = output_workspace / (f'{stat.lower()}_{dataset_name}_{configuration.lower()}.png') boxplot(dataframe = dataset, x_field = 'magnitude', x_order = magnitude_order, y_field = stat, hue_field = 'version', ordered_hue = version_order, title_text = f'{dataset_name.upper()} FIM Sites', fim_configuration = configuration, textbox_str = textbox, simplify_legend = True, dest_file = output_file) - + #Get the last 2 versions from the version order for scatter plot. - if len(version_order) == 2: + if len(version_order) == 2: x_version, y_version = version_order for magnitude in magnitude_order: #Scatterplot comparison between last 2 versions. @@ -239,7 +239,7 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' title_text = f'CSI {magnitude}' dest_file = output_workspace / f'csi_scatter_{magnitude}_{configuration.lower()}.png' scatterplot(dataframe = plotdf, x_field = f'CSI_{x_version}', y_field = f'CSI_{y_version}', title_text = title_text, annotate = False, dest_file = dest_file) - + ####################################################################### #Create spatial layers with threshold and mapping information @@ -247,21 +247,21 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' if spatial_ahps: #Read in supplied shapefile layers - #Layer containing metadata for each site (feature_id, wfo, etc). + #Layer containing metadata for each site (feature_id, wfo, etc). #Convert nws_lid to lower case. ahps_metadata = gpd.read_file(spatial_ahps['metadata']) ahps_metadata['nws_lid'] = ahps_metadata['nws_lid'].str.lower() metadata_crs = ahps_metadata.crs - + #Extent layer generated from preprocessing NWS/USGS datasets evaluated_ahps_extent = gpd.read_file(spatial_ahps['evaluated']) - + #Extent layer generated from static ahps library preprocessing static_library = gpd.read_file(spatial_ahps['static']) - + #Fields to keep #Get list of fields to keep in merge - preserved_static_library_fields = ['nws_lid'] + [i for i in static_library.columns if i.startswith(('Q','S'))] + preserved_static_library_fields = ['nws_lid'] + [i for i in static_library.columns if i.startswith(('Q','S'))] #Get list of fields to keep in merge. preserved_evaluated_ahps_fields = ['nws_lid', 'source', 'geometry'] + [i for i in evaluated_ahps_extent.columns if i.startswith(('action','minor','moderate','major'))] @@ -271,27 +271,27 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' evaluated_ahps_extent['geometry'] = evaluated_ahps_extent['geometry_y'] evaluated_ahps_extent.drop(columns = ['geometry_y','geometry_x'], inplace = True) evaluated_ahps_extent = evaluated_ahps_extent.merge(static_library[preserved_static_library_fields], on = 'nws_lid') - - #Join dataset metrics to evaluated_ahps_extent data. + + #Join dataset metrics to evaluated_ahps_extent data. 
final_join = pd.DataFrame() for (dataset_name, configuration), (dataset, sites) in all_datasets.items(): #Only select ahps from dataset if config is MS if dataset_name in ['usgs','nws'] and configuration == 'MS': #Select records from evaluated_ahps_extent that match the dataset name - subset = evaluated_ahps_extent.query(f'source == "{dataset_name}"') + subset = evaluated_ahps_extent.query(f'source == "{dataset_name}"') #Join to dataset dataset_with_subset = dataset.merge(subset, on = 'nws_lid') #Append rows to final_join dataframe final_join = final_join.append(dataset_with_subset) - + #Modify version field final_join['version'] = final_join.version.str.split('_nws|_usgs').str[0] - + #Write geodataframe to file gdf = gpd.GeoDataFrame(final_join, geometry = final_join['geometry'], crs = metadata_crs) output_shapefile = Path(workspace) / 'nws_usgs_site_info.shp' - gdf.to_file(output_shapefile) - + gdf.to_file(output_shapefile) + ####################################################################### @@ -305,10 +305,10 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' parser.add_argument('-q', '--alternate_ahps_query',help = 'Alternate filter query for AHPS. Default is: "not nws_lid.isnull() & not flow.isnull() & masked_perc<97 & not nws_lid in @bad_sites" where bad_sites are (grfi2,ksdm7,hohn4,rwdn4)', default = False, required = False) parser.add_argument('-sp', '--spatial_ahps', help = 'If spatial point layer is desired, supply a csv with 3 lines of the following format: metadata, path/to/metadata/shapefile\nevaluated, path/to/evaluated/shapefile\nstatic, path/to/static/shapefile.', default = False, required = False) parser.add_argument('-f', '--fim_1_ms', help = 'If enabled fim_1 rows will be duplicated and extent config assigned "ms" so that fim_1 can be shown on mainstems plots/stats', action = 'store_true', required = False) - + #Extract to dictionary and assign to variables. args = vars(parser.parse_args()) - + #If errors occur reassign error to True error = False #Create dictionary if file specified for spatial_ahps @@ -339,5 +339,5 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' f = args['fim_1_ms'] #Run eval_plots function - if not error: - eval_plots(metrics_csv = m, workspace = w, versions = v, stats = s, alternate_ahps_query = q, spatial_ahps = sp, fim_1_ms = f) \ No newline at end of file + if not error: + eval_plots(metrics_csv = m, workspace = w, versions = v, stats = s, alternate_ahps_query = q, spatial_ahps = sp, fim_1_ms = f) diff --git a/tools/plots/utils/shared_functions.py b/tools/plots/plot_functions.py old mode 100644 new mode 100755 similarity index 100% rename from tools/plots/utils/shared_functions.py rename to tools/plots/plot_functions.py diff --git a/tools/preprocess/create_flow_forecast_file.py b/tools/preprocess/create_flow_forecast_file.py old mode 100644 new mode 100755 index 9de7abfe5..bb8833343 --- a/tools/preprocess/create_flow_forecast_file.py +++ b/tools/preprocess/create_flow_forecast_file.py @@ -1,10 +1,5 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Jul 29 11:48:37 2020 -@author: Fernando Aristizabal with edits by Trevor Grout -""" import os import geopandas as gpd import argparse @@ -21,54 +16,54 @@ def create_flow_forecast_file(ble_geodatabase, nwm_geodatabase, output_parent_di Path to nwm geodatabase. output_parent_dir : STRING Output parent directory of output. Flow files will be output to subdirectories within parent directory. 
- ble_xs_layer_name : STRING - The cross section layer in the ble geodatabase to be imported. Default is 'XS' (sometimes it is 'XS_1D') - ble_huc_layer_name : STRING + ble_xs_layer_name : STRING + The cross section layer in the ble geodatabase to be imported. Default is 'XS' (sometimes it is 'XS_1D') + ble_huc_layer_name : STRING The huc layer in the ble geodatabase. Default is 'S_HUC_Ar' (sometimes it is 'S_HUC_ar' ) - ble_huc_id_field : STRING + ble_huc_id_field : STRING The attribute field within the ble_huc_layer_name containing the huc code. Default is 'HUC_CODE'. Assumes only 1 unique code. - nwm_stream_layer_name : STRING + nwm_stream_layer_name : STRING The stream centerline layer name (or partial layer name) for the NWM geodatabase. Default is 'RouteLink_FL_2020_04_07'. - nwm_feature_id_field : STRING + nwm_feature_id_field : STRING The feature id of the nwm segments. Default is 'ID' (applicable if nwmv2.1 is used) Returns ------- None. ''' - #Read the ble xs layer into a geopandas dataframe. + # Read the ble xs layer into a geopandas dataframe. xs_layer = gpd.read_file(ble_geodatabase,layer = ble_xs_layer_name) - #Read ble huc layer into a geopandas dataframe and extract the huc code. By default it assumes only one HUC in the layer (typically always the case). + # Read ble huc layer into a geopandas dataframe and extract the huc code. By default it assumes only one HUC in the layer (typically always the case). huc_layer = gpd.read_file(ble_geodatabase, layer = ble_huc_layer_name) [huc] = huc_layer[ble_huc_id_field].unique() - - #Read in the NWM stream layer into a geopandas dataframe using the bounding box option based on the extents of the BLE XS layer. + + # Read in the NWM stream layer into a geopandas dataframe using the bounding box option based on the extents of the BLE XS layer. nwm_river_layer = gpd.read_file(nwm_geodatabase, bbox = xs_layer, layer = nwm_stream_layer_name) - - #Make sure xs_layer is in same projection as nwm_river_layer. + + # Make sure xs_layer is in same projection as nwm_river_layer. xs_layer_proj = xs_layer.to_crs(nwm_river_layer.crs) - - #Perform an intersection of the BLE layers and the NWM layers, using the keep_geom_type set to False produces a point output. + + # Perform an intersection of the BLE layers and the NWM layers, using the keep_geom_type set to False produces a point output. intersection = gpd.overlay(xs_layer_proj, nwm_river_layer, how = 'intersection', keep_geom_type = False) - #Create the flow forecast files - #define fields containing flow (typically these won't change for BLE) + ## Create the flow forecast files + # Define fields containing flow (typically these won't change for BLE) flow_fields = ['E_Q_01PCT','E_Q_0_2PCT'] - #define return period associated with flow_fields (in same order as flow_fields). These will also serve as subdirectory names. + # Define return period associated with flow_fields (in same order as flow_fields). These will also serve as subdirectory names. 
return_period = ['100yr','500yr'] - #Conversion factor from CFS to CMS - dischargeMultiplier = 0.3048 ** 3 - - #Write individual flow csv files + # Conversion factor from CFS to CMS + dischargeMultiplier = 0.3048 ** 3 + + # Write individual flow csv files for i,flow in enumerate(flow_fields): - #Write dataframe with just ID and single flow event + # Write dataframe with just ID and single flow event forecast = intersection[[nwm_feature_id_field,flow]] - #Rename field names and re-define datatypes + # Rename field names and re-define datatypes forecast = forecast.rename(columns={nwm_feature_id_field :'feature_id',flow : 'discharge'}) forecast = forecast.astype({'feature_id' : int , 'discharge' : float}) @@ -76,18 +71,18 @@ def create_flow_forecast_file(ble_geodatabase, nwm_geodatabase, output_parent_di forecast = forecast.groupby('feature_id').median() forecast = forecast.reset_index(level=0) - #Convert CFS to CMS + # Convert CFS to CMS forecast['discharge'] = forecast['discharge'] * dischargeMultiplier - #Set paths and write file + # Set paths and write file output_dir = os.path.join(output_parent_dir, huc) dir_of_csv = os.path.join(output_dir,return_period[i]) os.makedirs(dir_of_csv,exist_ok = True) path_to_csv = os.path.join(dir_of_csv,"ble_huc_{}_flows_{}.csv".format(huc,return_period[i])) - forecast.to_csv(path_to_csv,index=False) - + forecast.to_csv(path_to_csv,index=False) + if __name__ == '__main__': - #Parse arguments + # Parse arguments parser = argparse.ArgumentParser(description = 'Produce forecast flow files from BLE datasets') parser.add_argument('-b', '--ble-geodatabase', help = 'BLE geodatabase (.gdb file extension). Will look for layer with "XS" in name. It is assumed the 100 year flow field is "E_Q_01PCT" and the 500 year flow field is "E_Q_0_2_PCT" as these are the default field names.', required = True) parser.add_argument('-n', '--nwm-geodatabase', help = 'NWM geodatabase (.gdb file extension).', required = True) @@ -97,9 +92,7 @@ def create_flow_forecast_file(ble_geodatabase, nwm_geodatabase, output_parent_di parser.add_argument('-huid', '--ble-huc-id-field', help = 'BLE id field in the ble-huc-layer-name. Default field is "HUC_CODE".', required = False, default = 'HUC_CODE') parser.add_argument('-l', '--nwm-stream-layer-name', help = 'NWM streams layer. Default layer is "RouteLink_FL_2020_04_07")', required = False, default = 'RouteLink_FL_2020_04_07') parser.add_argument('-f', '--nwm-feature-id-field', help = 'id field for nwm streams. Not required if NWM v2.1 is used (default id field is "ID")', required = False, default = 'ID') - #Extract to dictionary and assign to variables. + # Extract to dictionary and assign to variables. args = vars(parser.parse_args()) - #Run create_flow_forecast_file + # Run create_flow_forecast_file create_flow_forecast_file(**args) - - diff --git a/tools/preprocess/preprocess_benchmark.py b/tools/preprocess/preprocess_benchmark.py old mode 100644 new mode 100755 index 02f8e5ea8..81a65db2d --- a/tools/preprocess/preprocess_benchmark.py +++ b/tools/preprocess/preprocess_benchmark.py @@ -1,12 +1,5 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Jul 23 15:17:04 2020 - -@author: trevor.grout -""" - import rasterio from rasterio.warp import calculate_default_transform, reproject, Resampling import rasterio.mask @@ -15,7 +8,7 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_path = None): ''' - This function will preprocess a benchmark dataset for purposes of evaluating FIM output. 
A benchmark dataset will be transformed using properties (CRS, resolution) from an input reference dataset. The benchmark raster will also be converted to a boolean (True/False) raster with inundated areas (True or 1) and dry areas (False or 0). + This function will preprocess a benchmark dataset for purposes of evaluating FIM output. A benchmark dataset will be transformed using properties (CRS, resolution) from an input reference dataset. The benchmark raster will also be converted to a boolean (True/False) raster with inundated areas (True or 1) and dry areas (False or 0). Parameters ---------- @@ -34,59 +27,59 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_p Raster profile information for the preprocessed benchmark array (required for writing to output dataset). ''' - #Open and read raster and benchmark rasters + # Open and read raster and benchmark rasters reference = rasterio.open(reference_raster) benchmark = rasterio.open(benchmark_raster) - benchmark_arr = benchmark.read(1) + benchmark_arr = benchmark.read(1) - #Set arbitrary no data value that is not possible value of the benchmark dataset. This will be reassigned later. + # Set arbitrary no data value that is not possible value of the benchmark dataset. This will be reassigned later nodata_value = -2147483648 - - #Determine the new transform and dimensions of reprojected/resampled raster. + + # Determine the new transform and dimensions of reprojected/resampled raster new_transform, new_width, new_height = calculate_default_transform(benchmark.crs, reference.crs, benchmark.width, benchmark.height, *benchmark.bounds, resolution = reference.res) - #Define an empty array that is same dimensions as output by the "calculate_default_transform" command. + # Define an empty array that is same dimensions as output by the "calculate_default_transform" command benchmark_projected = np.empty((new_height,new_width), dtype=np.int32) - #Reproject and resample the benchmark dataset. Bilinear resampling due to continuous depth data. - reproject(benchmark_arr, + # Reproject and resample the benchmark dataset. Bilinear resampling due to continuous depth data + reproject(benchmark_arr, destination = benchmark_projected, - src_transform = benchmark.transform, + src_transform = benchmark.transform, src_crs = benchmark.crs, src_nodata = benchmark.nodata, - dst_transform = new_transform, + dst_transform = new_transform, dst_crs = reference.crs, dst_nodata = nodata_value, dst_resolution = reference.res, resampling = Resampling.bilinear) - #Convert entire depth grid to boolean (1 = Flood, 0 = No Flood) + # Convert entire depth grid to boolean (1 = Flood, 0 = No Flood) boolean_benchmark = np.where(benchmark_projected != nodata_value, 1, 0) - #Update profile (data type, NODATA, transform, width/height). + #Update profile (data type, NODATA, transform, width/height) profile = reference.profile profile.update(transform = new_transform) profile.update(dtype = rasterio.int8) - profile.update(nodata = 2) #Update NODATA to some integer so we can keep int8 datatype. There are no NODATA in the raster dataset. + profile.update(nodata = 2) #Update NODATA to some integer so we can keep int8 datatype. There are no NODATA in the raster dataset profile.update (width = new_width) profile.update(height = new_height) - #Write out preprocessed benchmark array to raster if path is supplied - if out_raster_path is not None: - with rasterio.Env(): - #Write out reassigned values to raster dataset. 
+ # Write out preprocessed benchmark array to raster if path is supplied + if out_raster_path is not None: + with rasterio.Env(): + # Write out reassigned values to raster dataset with rasterio.open(out_raster_path, 'w', **profile) as dst: - dst.write(boolean_benchmark.astype('int8'),1) + dst.write(boolean_benchmark.astype('int8'),1) return boolean_benchmark.astype('int8'), profile if __name__ == '__main__': - #Parse arguments + # Parse arguments parser = argparse.ArgumentParser(description = 'Preprocess BLE grids (in tiff format) for use in run_test_cast.py. Preprocessing includes reprojecting and converting to boolean raster (1 = Flooding, 0 = No Flooding)') parser.add_argument('-b','--benchmark-raster', help = 'BLE depth or water surface elevation grid (in GTiff format).', required = True) parser.add_argument('-r', '--reference-raster', help = 'Benchmark will use reference raster to set CRS and resolution to reference raster CRS.', required = True) parser.add_argument('-o', '--out-raster-path', help = 'Output raster path (include name and extension).', required = True) - #Extract to dictionary and assign to variables. + # Extract to dictionary and assign to variables args = vars(parser.parse_args()) - #Run preprocess benchmark function + # Run preprocess benchmark function preprocess_benchmark_static(**args) diff --git a/tools/preprocess/preprocess_fimx.py b/tools/preprocess/preprocess_fimx.py old mode 100644 new mode 100755 index 344fecf7d..cad6058d0 --- a/tools/preprocess/preprocess_fimx.py +++ b/tools/preprocess/preprocess_fimx.py @@ -1,9 +1,5 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Jul 24 13:50:59 2020 +#!/usr/bin/env python3 -@author: trevor.grout -""" import rasterio from rasterio.warp import calculate_default_transform, reproject, Resampling from rasterio import features @@ -47,74 +43,75 @@ def fimx_to_fim3(catchments_path, raster_value_field, hand_raster_path, template Preprocessed catchment raster profile. ''' - - - #Read in template raster as band object. + + + # Read in template raster as band object reference = rasterio.open(template_raster) - - #Step 1: Convert HAND grid - #Read in the hand raster + + ## Step 1: Convert HAND grid + # Read in the hand raster hand = rasterio.open(hand_raster_path) hand_arr = hand.read(1) - #Determine the new transform and dimensions of reprojected raster (CRS = reference raster). + #Determine the new transform and dimensions of reprojected raster (CRS = reference raster) new_transform, new_width, new_height = calculate_default_transform(hand.crs, reference.crs, hand.width, hand.height, *hand.bounds) - #Define an empty array that is same dimensions as output by the "calculate_default_transform" command. - hand_proj = np.empty((new_height,new_width), dtype=np.float) - #Reproject to target dataset (resample method is bilinear due to elevation type data). 
+ # Define an empty array that is same dimensions as output by the "calculate_default_transform" command + hand_proj = np.empty((new_height,new_width), dtype=np.float) + # Reproject to target dataset (resample method is bilinear due to elevation type data) hand_nodata_value = -2147483648 - reproject(hand_arr, + reproject(hand_arr, destination = hand_proj, - src_transform = hand.transform, + src_transform = hand.transform, src_crs = hand.crs, src_nodata = hand.nodata, - dst_transform = new_transform, + dst_transform = new_transform, dst_crs = reference.crs, dst_nodata = hand_nodata_value, dst_resolution = hand.res, resampling = Resampling.bilinear) - #Update profile data type and no data value. + + # Update profile data type and no data value hand_profile = reference.profile hand_profile.update(dtype = rasterio.float32) hand_profile.update(nodata = hand_nodata_value) hand_profile.update(width = new_width) hand_profile.update(height = new_height) hand_profile.update(transform = new_transform) - - #Step 2: Catchments to Polygons (same extent as the HAND raster) - #Read in the catchment layer to geopandas dataframe and convert to same CRS as reference raster. + + ## Step 2: Catchments to Polygons (same extent as the HAND raster) + # Read in the catchment layer to geopandas dataframe and convert to same CRS as reference raster gdbpath, layername = os.path.split(catchments_path) gdb_layer=gpd.read_file(gdbpath, driver='FileGDB', layer=layername) proj_gdb_layer = gdb_layer.to_crs(reference.crs) - #Prepare vector data to be written to raster. - shapes = list(zip(proj_gdb_layer['geometry'],proj_gdb_layer[raster_value_field].astype('int32'))) - #Write vector data to raster image. Fill raster with zeros for areas that do not have data. We will set nodata to be zero later. - catchment_proj = features.rasterize(((geometry, value) for geometry, value in shapes), fill = 0, out_shape=hand_proj.shape, transform=hand_profile['transform'], dtype = 'int32' ) - #Save raster image to in-memory dataset. Reset dtype and nodata values. + # Prepare vector data to be written to raster + shapes = list(zip(proj_gdb_layer['geometry'],proj_gdb_layer[raster_value_field].astype('int32'))) + # Write vector data to raster image. Fill raster with zeros for areas that do not have data. We will set nodata to be zero later + catchment_proj = features.rasterize(((geometry, value) for geometry, value in shapes), fill = 0, out_shape=hand_proj.shape, transform=hand_profile['transform'], dtype = 'int32' ) + # Save raster image to in-memory dataset. Reset dtype and nodata values. catchment_profile = hand_profile.copy() catchment_profile.update(dtype = 'int32') catchment_profile.update(nodata=0) - - #Step 3: Union of NODATA locations applied to both HAND and Catchment grids. + + ## Step 3: Union of NODATA locations applied to both HAND and Catchment grids catchment_masked = np.where(np.logical_or(hand_proj == hand_profile['nodata'], catchment_proj == catchment_profile['nodata']), catchment_profile['nodata'],catchment_proj) - #Assign NODATA to hand where both catchment and hand have NODATA else assign hand values. + # Assign NODATA to hand where both catchment and hand have NODATA else assign hand values. 
hand_masked = np.where(np.logical_or(hand_proj == hand_profile['nodata'], catchment_proj == catchment_profile['nodata']), hand_profile['nodata'],hand_proj) - #Step 4: Write out hand and catchment rasters to file if path is specified + ## Step 4: Write out hand and catchment rasters to file if path is specified if out_hand_path is not None: - os.makedirs(os.path.split(out_hand_path)[0], exist_ok = True) + os.makedirs(os.path.split(out_hand_path)[0], exist_ok = True) with rasterio.Env(): with rasterio.open(out_hand_path, 'w', **hand_profile) as hnd_dst: hnd_dst.write(hand_masked.astype('float32'),1) if out_catchment_path is not None: - os.makedirs(os.path.split(out_catchment_path)[0], exist_ok = True) + os.makedirs(os.path.split(out_catchment_path)[0], exist_ok = True) with rasterio.Env(): with rasterio.open(out_catchment_path, 'w', **catchment_profile) as cat_dst: - cat_dst.write(catchment_masked.astype('int32'),1) - + cat_dst.write(catchment_masked.astype('int32'),1) + return hand_masked, hand_profile, catchment_masked, catchment_profile if __name__ == '__main__': - #Parse arguments + # Parse arguments parser = argparse.ArgumentParser(description = 'Preprocess FIM 1 and FIM 2 HAND and Catchment grids to be compatible with FIM 3.') parser.add_argument('-c','--catchments-path', help = 'Path to catchments vector file', required = True) parser.add_argument('-f', '--raster-value-field', help = 'Attribute ID field from which raster values will be assigned. Typically this will be "HydroID" for FIM2 and "feature_ID" for fim 1.', required = True) @@ -122,8 +119,7 @@ def fimx_to_fim3(catchments_path, raster_value_field, hand_raster_path, template parser.add_argument('-t', '--template-raster', help = 'Path to a template raster. Properties (CRS, resolution) of the template raster will be used to preprocess HAND and Catchments grids', required = True) parser.add_argument('-oh', '--out-hand-path', help = 'Path to the output HAND raster. Raster must be named "rem_clipped_zeroed_masked.tif', required = True) parser.add_argument('-oc', '--out-catchment-path', help = 'Path to the output Catchment raster. Raster must be named "gw_catchments_reaches_clipped_addedAttributes.tif"', required = True) - #Extract to dictionary and assign to variables. + # Extract to dictionary and assign to variables args = vars(parser.parse_args()) - #Run fimx to fim3 function. + # Run fimx to fim3 function fimx_to_fim3(**args) - diff --git a/tools/run_test_case.py b/tools/run_test_case.py index 2a0a279c5..e3168a422 100755 --- a/tools/run_test_case.py +++ b/tools/run_test_case.py @@ -5,16 +5,16 @@ import shutil import argparse -from utils.shared_functions import compute_contingency_stats_from_rasters -from utils.shared_variables import (TEST_CASES_DIR, INPUTS_DIR, ENDC, TRED_BOLD, WHITE_BOLD, CYAN_BOLD, AHPS_BENCHMARK_CATEGORIES) +from tools_shared_functions import compute_contingency_stats_from_rasters +from tools_shared_variables import (TEST_CASES_DIR, INPUTS_DIR, ENDC, TRED_BOLD, WHITE_BOLD, CYAN_BOLD, AHPS_BENCHMARK_CATEGORIES) from inundation import inundate def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous=False, archive_results=False, mask_type='huc', inclusion_area='', inclusion_area_buffer=0, light_run=False, overwrite=True): - + benchmark_category = test_id.split('_')[1] # Parse benchmark_category from test_id. current_huc = test_id.split('_')[0] # Break off HUC ID and assign to variable. - + # Construct paths to development test results if not existent. 
if archive_results: version_test_case_dir_parent = os.path.join(TEST_CASES_DIR, benchmark_category + '_test_cases', test_id, 'official_versions', version) @@ -28,7 +28,7 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous else: print("Metrics for ({version}: {test_id}) already exist. Use overwrite flag (-o) to overwrite metrics.".format(version=version, test_id=test_id)) return - + os.mkdir(version_test_case_dir_parent) print("Running the alpha test for test_id: " + test_id + ", " + version + "...") @@ -49,13 +49,13 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous else: catchment_poly = os.path.join(fim_run_parent, 'gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg') hydro_table = os.path.join(fim_run_parent, 'hydroTable.csv') - + # Map necessary inputs for inundation(). hucs, hucs_layerName = os.path.join(INPUTS_DIR, 'wbd', 'WBD_National.gpkg'), 'WBDHU8' # Create list of shapefile paths to use as exclusion areas. zones_dir = os.path.join(TEST_CASES_DIR, 'other', 'zones') - mask_dict = {'levees': + mask_dict = {'levees': {'path': os.path.join(zones_dir, 'leveed_areas_conus.shp'), 'buffer': None, 'operation': 'exclude' @@ -66,7 +66,7 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous 'operation': 'exclude', }, } - + if inclusion_area != '': inclusion_area_name = os.path.split(inclusion_area)[1].split('.')[0] # Get layer name mask_dict.update({inclusion_area_name: {'path': inclusion_area, @@ -75,7 +75,7 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous # Append the concatenated inclusion_area_name and buffer. if inclusion_area_buffer == None: inclusion_area_buffer = 0 - stats_modes_list.append(inclusion_area_name + '_b' + str(inclusion_area_buffer) + 'm') + stats_modes_list.append(inclusion_area_name + '_b' + str(inclusion_area_buffer) + 'm') # Check if magnitude is list of magnitudes or single value. magnitude_list = magnitude @@ -88,13 +88,13 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous version_test_case_dir = os.path.join(version_test_case_dir_parent, magnitude) if not os.path.exists(version_test_case_dir): os.mkdir(version_test_case_dir) - + # Construct path to validation raster and forecast file. 
if benchmark_category in AHPS_BENCHMARK_CATEGORIES: benchmark_raster_path_list, forecast_list = [], [] lid_dir_list = os.listdir(os.path.join(validation_data_path, current_huc)) lid_list, inundation_raster_list, domain_file_list = [], [], [] - + for lid in lid_dir_list: lid_dir = os.path.join(validation_data_path, current_huc, lid) benchmark_raster_path_list.append(os.path.join(lid_dir, magnitude, 'ahps_' + lid + '_huc_' + current_huc + '_extent_' + magnitude + '.tif')) # TEMP @@ -109,7 +109,7 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous forecast_path = os.path.join(TEST_CASES_DIR, benchmark_category + '_test_cases', 'validation_data_' + benchmark_category, current_huc, magnitude, benchmark_category + '_huc_' + current_huc + '_flows_' + magnitude + '.csv') forecast_list = [forecast_path] inundation_raster_list = [os.path.join(version_test_case_dir, 'inundation_extent.tif')] - + for index in range(0, len(benchmark_raster_path_list)): benchmark_raster_path = benchmark_raster_path_list[index] forecast = forecast_list[index] @@ -123,7 +123,7 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous 'buffer': None, 'operation': 'include'} }) - + if not os.path.exists(benchmark_raster_path) or not os.path.exists(ahps_domain_file) or not os.path.exists(forecast): # Skip loop instance if the benchmark raster doesn't exist. continue else: # If not in AHPS_BENCHMARK_CATEGORIES. @@ -137,16 +137,16 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous subset_hucs=current_huc,num_workers=1,aggregate=False,inundation_raster=inundation_raster,inundation_polygon=None, depths=None,out_raster_profile=None,out_vector_profile=None,quiet=True ) - + print("-----> Inundation mapping complete.") predicted_raster_path = os.path.join(os.path.split(inundation_raster)[0], os.path.split(inundation_raster)[1].replace('.tif', '_' + current_huc + '.tif')) # The inundate adds the huc to the name so I account for that here. - + # Define outputs for agreement_raster, stats_json, and stats_csv. if benchmark_category in AHPS_BENCHMARK_CATEGORIES: agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, lid + 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') else: agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') - + compute_contingency_stats_from_rasters(predicted_raster_path, benchmark_raster_path, agreement_raster, @@ -157,16 +157,16 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous test_id=test_id, mask_dict=mask_dict, ) - + if benchmark_category in AHPS_BENCHMARK_CATEGORIES: del mask_dict[ahps_lid] - + print(" ") print("Evaluation complete. All metrics for " + test_id + ", " + version + ", " + magnitude + " are available at " + CYAN_BOLD + version_test_case_dir + ENDC) print(" ") except Exception as e: - print(e) - + print(e) + if benchmark_category in AHPS_BENCHMARK_CATEGORIES: # -- Delete temp files -- # # List all files in the output directory. @@ -217,12 +217,12 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous print(WHITE_BOLD + "Please provide the parent directory name for fim_run.sh outputs. These outputs are usually written in a subdirectory, e.g. outputs/123456/123456." 
+ ENDC) print() exit_flag = True - + # Ensure inclusion_area path exists. if args['inclusion_area'] != "" and not os.path.exists(args['inclusion_area']): print(TRED_BOLD + "Error: " + WHITE_BOLD + "The provided inclusion_area (-i) " + CYAN_BOLD + args['inclusion_area'] + WHITE_BOLD + " could not be located." + ENDC) exit_flag = True - + try: inclusion_buffer = int(args['inclusion_area_buffer']) except ValueError: @@ -235,8 +235,8 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous args['magnitude'] = ['action', 'minor', 'moderate', 'major'] else: print(TRED_BOLD + "Error: " + WHITE_BOLD + "The provided magnitude (-y) " + CYAN_BOLD + args['magnitude'] + WHITE_BOLD + " is invalid. ble options include: 100yr, 500yr. ahps options include action, minor, moderate, major." + ENDC) - exit_flag = True - + exit_flag = True + if exit_flag: print() sys.exit() diff --git a/tools/run_test_case_calibration.py b/tools/run_test_case_calibration.py index f630360b0..728b87abe 100755 --- a/tools/run_test_case_calibration.py +++ b/tools/run_test_case_calibration.py @@ -9,7 +9,7 @@ import argparse import shutil -from utils.shared_functions import get_contingency_table_from_binary_rasters, compute_stats_from_contingency_table +from tools_shared_functions import get_contingency_table_from_binary_rasters, compute_stats_from_contingency_table from inundation import inundate TEST_CASES_DIR = r'/data/test_cases/' # Will update. diff --git a/tools/utils/shapefile_to_raster.py b/tools/shapefile_to_raster.py old mode 100644 new mode 100755 similarity index 88% rename from tools/utils/shapefile_to_raster.py rename to tools/shapefile_to_raster.py index 4d1a61ed9..fc1689954 --- a/tools/utils/shapefile_to_raster.py +++ b/tools/shapefile_to_raster.py @@ -1,9 +1,4 @@ -# -*- coding: utf-8 -*- -""" -Created on Tue Jul 14 16:19:26 2020 - -@author: bradford.bates -""" +#!/usr/bin/env python3 # A script to rasterise a shapefile to the same projection & pixel resolution as a reference image. 
from osgeo import ogr, gdal @@ -28,7 +23,7 @@ print("Rasterising shapefile...") Output = gdal.GetDriverByName(gdalformat).Create(OutputImage, Image.RasterXSize, Image.RasterYSize, 1, datatype, options=['COMPRESS=DEFLATE']) Output.SetProjection(Image.GetProjectionRef()) -Output.SetGeoTransform(Image.GetGeoTransform()) +Output.SetGeoTransform(Image.GetGeoTransform()) # Write data to band 1 Band = Output.GetRasterBand(1) @@ -43,4 +38,4 @@ # Build image overviews subprocess.call("gdaladdo --config COMPRESS_OVERVIEW DEFLATE "+OutputImage+" 2 4 8 16 32 64", shell=True) -print("Done.") \ No newline at end of file +print("Done.") diff --git a/tools/synthesize_test_cases.py b/tools/synthesize_test_cases.py old mode 100644 new mode 100755 index 1fdb0a4dc..f3d02192c --- a/tools/synthesize_test_cases.py +++ b/tools/synthesize_test_cases.py @@ -7,11 +7,11 @@ import csv from run_test_case import run_alpha_test -from utils.shared_variables import TEST_CASES_DIR, PREVIOUS_FIM_DIR, OUTPUTS_DIR, AHPS_BENCHMARK_CATEGORIES +from tools_shared_variables import TEST_CASES_DIR, PREVIOUS_FIM_DIR, OUTPUTS_DIR, AHPS_BENCHMARK_CATEGORIES def create_master_metrics_csv(master_metrics_csv_output): - + # Construct header metrics_to_write = ['true_negatives_count', 'false_negatives_count', @@ -55,26 +55,26 @@ def create_master_metrics_csv(master_metrics_csv_output): 'masked_perc', 'masked_area_km2' ] - + additional_header_info_prefix = ['version', 'nws_lid', 'magnitude', 'huc'] list_to_write = [additional_header_info_prefix + metrics_to_write + ['full_json_path'] + ['flow'] + ['benchmark_source'] + ['extent_config'] + ["calibrated"]] - + versions_to_aggregate = os.listdir(PREVIOUS_FIM_DIR) - + for benchmark_source in ['ble', 'nws', 'usgs']: - + benchmark_test_case_dir = os.path.join(TEST_CASES_DIR, benchmark_source + '_test_cases') - + if benchmark_source == 'ble': test_cases_list = os.listdir(benchmark_test_case_dir) - + for test_case in test_cases_list: try: int(test_case.split('_')[0]) - + huc = test_case.split('_')[0] official_versions = os.path.join(benchmark_test_case_dir, test_case, 'official_versions') - + for magnitude in ['100yr', '500yr']: for version in versions_to_aggregate: if '_fr' in version: @@ -108,21 +108,21 @@ def create_master_metrics_csv(master_metrics_csv_output): sub_list_to_append.append(benchmark_source) sub_list_to_append.append(extent_config) sub_list_to_append.append(calibrated) - + list_to_write.append(sub_list_to_append) except ValueError: pass - + if benchmark_source in AHPS_BENCHMARK_CATEGORIES: test_cases_list = os.listdir(benchmark_test_case_dir) for test_case in test_cases_list: try: int(test_case.split('_')[0]) - + huc = test_case.split('_')[0] official_versions = os.path.join(benchmark_test_case_dir, test_case, 'official_versions') - + for magnitude in ['action', 'minor', 'moderate', 'major']: for version in versions_to_aggregate: if '_fr' in version: @@ -135,7 +135,7 @@ def create_master_metrics_csv(master_metrics_csv_output): calibrated = "yes" else: calibrated = "no" - + version_dir = os.path.join(official_versions, version) magnitude_dir = os.path.join(version_dir, magnitude) if os.path.exists(magnitude_dir): @@ -147,8 +147,8 @@ def create_master_metrics_csv(master_metrics_csv_output): full_json_path = os.path.join(magnitude_dir, f) flow = '' if os.path.exists(full_json_path): - - # Get flow used to map. + + # Get flow used to map. 
flow_file = os.path.join(benchmark_test_case_dir, 'validation_data_' + benchmark_source, huc, nws_lid, magnitude, 'ahps_' + nws_lid + '_huc_' + huc + '_flows_' + magnitude + '.csv') if os.path.exists(flow_file): with open(flow_file, newline='') as csv_file: @@ -158,7 +158,7 @@ def create_master_metrics_csv(master_metrics_csv_output): flow = row[1] if nws_lid == 'mcc01': print(flow) - + stats_dict = json.load(open(full_json_path)) for metric in metrics_to_write: sub_list_to_append.append(stats_dict[metric]) @@ -167,27 +167,27 @@ def create_master_metrics_csv(master_metrics_csv_output): sub_list_to_append.append(benchmark_source) sub_list_to_append.append(extent_config) sub_list_to_append.append(calibrated) - + list_to_write.append(sub_list_to_append) except ValueError: pass - + with open(master_metrics_csv_output, 'w', newline='') as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerows(list_to_write) def process_alpha_test(args): - + fim_run_dir = args[0] version = args[1] test_id = args[2] magnitude = args[3] archive_results = args[4] overwrite = args[5] - - mask_type = 'huc' - + + mask_type = 'huc' + if archive_results == False: compare_to_previous = True else: @@ -210,7 +210,7 @@ def process_alpha_test(args): parser.add_argument('-b','--benchmark-category',help='A benchmark category to specify. Defaults to process all categories.',required=False, default="all") parser.add_argument('-o','--overwrite',help='Overwrite all metrics or only fill in missing metrics.',required=False, action="store_true") parser.add_argument('-m','--master-metrics-csv',help='Define path for master metrics CSV file.',required=True) - + # Assign variables from arguments. args = vars(parser.parse_args()) config = args['config'] @@ -220,11 +220,11 @@ def process_alpha_test(args): benchmark_category = args['benchmark_category'] overwrite = args['overwrite'] master_metrics_csv = args['master_metrics_csv'] - + if overwrite: if input("Are you sure you want to overwrite metrics? y/n: ") == "n": quit - + # Default to processing all possible versions in PREVIOUS_FIM_DIR. Otherwise, process only the user-supplied version. if fim_version != "all": previous_fim_list = [fim_version] @@ -233,7 +233,7 @@ def process_alpha_test(args): previous_fim_list = os.listdir(PREVIOUS_FIM_DIR) elif config == 'DEV': previous_fim_list = os.listdir(OUTPUTS_DIR) - + # Define whether or not to archive metrics in "official_versions" or "testing_versions" for each test_id. if config == 'PREV': archive_results = True @@ -241,7 +241,7 @@ def process_alpha_test(args): archive_results = False else: print('Config (-c) option incorrectly set. Use "DEV" or "PREV"') - + # List all available benchmark categories and test_cases. test_cases_dir_list = os.listdir(TEST_CASES_DIR) benchmark_category_list = [] @@ -251,41 +251,41 @@ def process_alpha_test(args): benchmark_category_list.append(d.replace('_test_cases', '')) else: benchmark_category_list = [benchmark_category] - + # Loop through benchmark categories. procs_list = [] for bench_cat in benchmark_category_list: - + # Map path to appropriate test_cases folder and list test_ids into bench_cat_id_list. bench_cat_test_case_dir = os.path.join(TEST_CASES_DIR, bench_cat + '_test_cases') bench_cat_id_list = os.listdir(bench_cat_test_case_dir) - + # Loop through test_ids in bench_cat_id_list. for test_id in bench_cat_id_list: if 'validation' and 'other' not in test_id: current_huc = test_id.split('_')[0] if test_id.split('_')[1] in bench_cat: - + # Loop through versions. 
for version in previous_fim_list: if config == 'DEV': fim_run_dir = os.path.join(OUTPUTS_DIR, version, current_huc) elif config == 'PREV': fim_run_dir = os.path.join(PREVIOUS_FIM_DIR, version, current_huc) - + # For previous versions of HAND computed at HUC6 scale if not os.path.exists(fim_run_dir): if config == 'DEV': fim_run_dir = os.path.join(OUTPUTS_DIR, version, current_huc[:6]) elif config == 'PREV': - fim_run_dir = os.path.join(PREVIOUS_FIM_DIR, version, current_huc[:6]) - + fim_run_dir = os.path.join(PREVIOUS_FIM_DIR, version, current_huc[:6]) + if os.path.exists(fim_run_dir): - + # If a user supplies a specia_string (-s), then add it to the end of the created dirs. if special_string != "": version = version + '_' + special_string - + # Define the magnitude lists to use, depending on test_id. if 'ble' in test_id: magnitude = ['100yr', '500yr'] @@ -293,19 +293,18 @@ def process_alpha_test(args): magnitude = ['action', 'minor', 'moderate', 'major'] else: continue - + # Either add to list to multiprocess or process serially, depending on user specification. if job_number > 1: procs_list.append([fim_run_dir, version, test_id, magnitude, archive_results, overwrite]) - else: + else: process_alpha_test([fim_run_dir, version, test_id, magnitude, archive_results, overwrite]) # Multiprocess alpha test runs. if job_number > 1: pool = Pool(job_number) pool.map(process_alpha_test, procs_list) - + # Do aggregate_metrics. print("Creating master metrics CSV...") create_master_metrics_csv(master_metrics_csv_output=master_metrics_csv) - \ No newline at end of file diff --git a/tools/time_and_tee_mannings_calibration.sh b/tools/time_and_tee_mannings_calibration.sh index d45976cc8..7a1c06cea 100755 --- a/tools/time_and_tee_mannings_calibration.sh +++ b/tools/time_and_tee_mannings_calibration.sh @@ -1,4 +1,4 @@ #!/bin/bash -e -/usr/bin/time -v $testdir/mannings_run_by_set.sh $1 |& tee +/usr/bin/time -v $toolsdir/mannings_run_by_set.sh $1 |& tee exit ${PIPESTATUS[0]} diff --git a/tools/utils/shared_functions.py b/tools/tools_shared_functions.py old mode 100644 new mode 100755 similarity index 100% rename from tools/utils/shared_functions.py rename to tools/tools_shared_functions.py diff --git a/tools/utils/shared_variables.py b/tools/tools_shared_variables.py old mode 100644 new mode 100755 similarity index 100% rename from tools/utils/shared_variables.py rename to tools/tools_shared_variables.py diff --git a/tools/utils/__init__.py b/tools/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 From f74f7d27f7e96858b145ac60200b57657495d7fe Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 4 Mar 2021 12:00:27 -0600 Subject: [PATCH 09/66] removing comment in inundation_wrapper_custom_flow.py --- tools/inundation_wrapper_custom_flow.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/inundation_wrapper_custom_flow.py b/tools/inundation_wrapper_custom_flow.py index 530585793..6867bea5f 100755 --- a/tools/inundation_wrapper_custom_flow.py +++ b/tools/inundation_wrapper_custom_flow.py @@ -10,8 +10,6 @@ import sys import argparse import shutil - -# insert python path at runtime for accessing scripts in foss_fim/tests dir (e.g. inundation.py) from inundation import inundate TEST_CASES_DIR = r'/data/inundation_review/inundation_custom_flow/' # Will update. 
From 6197f32e76cf2bee72c62c179cac1812fc29afc5 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 4 Mar 2021 12:00:55 -0600 Subject: [PATCH 10/66] removing comment in inundation_wrapper_nwm_flow.py --- tools/inundation_wrapper_nwm_flows.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/inundation_wrapper_nwm_flows.py b/tools/inundation_wrapper_nwm_flows.py index 8a5fe0cf5..f2b641e83 100755 --- a/tools/inundation_wrapper_nwm_flows.py +++ b/tools/inundation_wrapper_nwm_flows.py @@ -10,8 +10,6 @@ import csv import argparse import shutil - -# insert python path at runtime for accessing scripts in foss_fim/tests dir (e.g. inundation.py) from inundation import inundate TEST_CASES_DIR = r'/data/inundation_review/inundation_nwm_recurr/' # Will update. From dd8952c7ee72725d3108cfd9a44ca7bd66982f2f Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 4 Mar 2021 12:09:23 -0600 Subject: [PATCH 11/66] formatting eval_plots.py --- tools/plots/eval_plots.py | 134 +++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/tools/plots/eval_plots.py b/tools/plots/eval_plots.py index e04b2fd11..f18390f5e 100755 --- a/tools/plots/eval_plots.py +++ b/tools/plots/eval_plots.py @@ -1,10 +1,13 @@ #!/usr/bin/env python3 + import pandas as pd from pathlib import Path import argparse from natsort import natsorted import geopandas as gpd from plot_functions import filter_dataframe, boxplot, scatterplot, barplot + + def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR'] , alternate_ahps_query = False, spatial_ahps = False, fim_1_ms = False): ''' @@ -93,10 +96,10 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' ''' - #Import metrics csv as DataFrame and initialize all_datasets dictionary + # Import metrics csv as DataFrame and initialize all_datasets dictionary csv_df = pd.read_csv(metrics_csv) - #fim_1_ms flag enables FIM 1 to be shown on MS plots/stats + # fim_1_ms flag enables FIM 1 to be shown on MS plots/stats if fim_1_ms: #Query FIM 1 rows based on version beginning with "fim_1" fim_1_rows = csv_df.query('version.str.startswith("fim_1")').copy() @@ -105,73 +108,71 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' #Append duplicate FIM 1 rows to original dataframe csv_df = csv_df.append(fim_1_rows, ignore_index = True) - #If versions are supplied then filter out + # If versions are supplied then filter out if versions: #Filter out versions based on supplied version list metrics = csv_df.query('version.str.startswith(tuple(@versions))') else: metrics = csv_df - #Group by benchmark source + # Group by benchmark source benchmark_by_source = metrics.groupby(['benchmark_source', 'extent_config']) - #Iterate through benchmark_by_source. Pre-filter metrics dataframe - #as needed (e.g. usgs/nws filter query). Then further filtering to - #discard all hucs/nws_lid that are not present across all analyzed - #versions for a given magnitude. The final filtered dataset is written - #to a dictionary with the key (benchmark source, extent config) - #and values (filtered dataframe, common sites). + ''' Iterate through benchmark_by_source. Pre-filter metrics dataframe + as needed (e.g. usgs/nws filter query). Then further filtering to + discard all hucs/nws_lid that are not present across all analyzed + versions for a given magnitude. 
The final filtered dataset is written + to a dictionary with the key (benchmark source, extent config) + and values (filtered dataframe, common sites). ''' + all_datasets = {} for (benchmark_source, extent_configuration), benchmark_metrics in benchmark_by_source: - #If source is usgs/nws define the base resolution and query - #(use alternate query if passed). Append filtered datasets to - #all_datasets dictionary. + '''If source is usgs/nws define the base resolution and query + (use alternate query if passed). Append filtered datasets to + all_datasets dictionary.''' + if benchmark_source in ['usgs','nws']: - #Set the base processing unit for the ahps runs. + # Set the base processing unit for the ahps runs. base_resolution = 'nws_lid' #Default query (used for APG) it could be that bad_sites should be modified. If so pass an alternate query using the "alternate_ahps_query" bad_sites = ['grfi2','ksdm7','hohn4','rwdn4'] query = "not flow.isnull() & masked_perc<97 & not nws_lid in @bad_sites" - #If alternate ahps evaluation query argument is passed, use that. + # If alternate ahps evaluation query argument is passed, use that. if alternate_ahps_query: query = alternate_ahps_query - #Filter the dataset based on query + # Filter the dataset based on query ahps_metrics = benchmark_metrics.query(query) - #Filter out all instances where the base_resolution doesn't - #exist across all desired fim versions for a given magnitude. + # Filter out all instances where the base_resolution doesn't exist across all desired fim versions for a given magnitude all_datasets[(benchmark_source, extent_configuration)] = filter_dataframe(ahps_metrics, base_resolution) - #If source is 'ble', set base_resolution and append ble dataset - #to all_datasets dictionary + # If source is 'ble', set base_resolution and append ble dataset to all_datasets dictionary elif benchmark_source == 'ble': - #Set the base processing unit for ble runs + # Set the base processing unit for ble runs base_resolution = 'huc' - #Filter out all instances where base_resolution doesn't exist - #across all desired fim versions for a given magnitude. + # Filter out all instances where base_resolution doesn't exist across all desired fim versions for a given magnitude all_datasets[(benchmark_source, extent_configuration)] = filter_dataframe(benchmark_metrics, base_resolution) - #For each dataset in all_datasets, generate plots and aggregate statistics. + # For each dataset in all_datasets, generate plots and aggregate statistics for (dataset_name,configuration), (dataset, sites) in all_datasets.items(): - #Define and create the output workspace as a subfolder within - #the supplied workspace + # Define and create the output workspace as a subfolder within the supplied workspace output_workspace = Path(workspace) / dataset_name / configuration.lower() output_workspace.mkdir(parents = True, exist_ok = True) - #Write out the filtered dataset and common sites to file + # Write out the filtered dataset and common sites to file dataset.to_csv(output_workspace / (f'{dataset_name}_{configuration.lower()}_analyzed_data.csv'), index = False) sites_pd = pd.DataFrame.from_dict(sites, orient = 'index').transpose() sites_pd.to_csv(output_workspace / (f'{dataset_name}_{configuration.lower()}_common_sites.csv'), index = False) - #set the order of the magnitudes and define base resolution. 
+ # Set the order of the magnitudes and define base resolution if dataset_name == 'ble': magnitude_order = ['100yr', '500yr'] base_resolution = 'huc' @@ -179,29 +180,28 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' magnitude_order = ['action','minor','moderate','major'] base_resolution = 'nws_lid' - #Calculate aggregated metrics based on total_sq_km fields. + # Calculate aggregated metrics based on total_sq_km fields dataset_sums = dataset.groupby(['version', 'magnitude'])[['TP_area_km2','FP_area_km2','FN_area_km2']].sum() dataset_sums['csi'] = dataset_sums['TP_area_km2']/(dataset_sums['TP_area_km2'] + dataset_sums['FP_area_km2'] + dataset_sums['FN_area_km2']) dataset_sums['far'] = dataset_sums['FP_area_km2']/(dataset_sums['TP_area_km2'] + dataset_sums['FP_area_km2']) dataset_sums['pod'] = dataset_sums['TP_area_km2']/(dataset_sums['TP_area_km2'] + dataset_sums['FN_area_km2']) dataset_sums = dataset_sums.reset_index() - #Write aggregated metrics to file. + # Write aggregated metrics to file dataset_sums.to_csv(output_workspace / f'aggregate_{dataset_name}_{configuration.lower()}.csv', index = False ) - #This section naturally orders analyzed versions which defines - #the hue order for the generated plots. - #Get all versions in dataset + ## This section naturally orders analyzed versions which defines the hue order for the generated plots + # Get all versions in dataset all_versions = list(dataset.version.unique()) version_order = [] - #If versions are not specified then use all available versions - #and assign to versions_list + + # If versions are not specified then use all available versions and assign to versions_list if not versions: versions_list = all_versions - #if versions are supplied assign to versions_list + # If versions are supplied assign to versions_list else: versions_list = versions - #For each version supplied by the user + # For each version supplied by the user for version in versions_list: #Select all the versions that start with the supplied version. selected_versions = [sel_version for sel_version in all_versions if sel_version.startswith(version)] @@ -210,7 +210,7 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' #Populate version order based on the sorted subsets. version_order.extend(selected_versions) - #Define textbox which will contain the counts of each magnitude. + # Define textbox which will contain the counts of each magnitude textbox = [] for magnitude in sites: count = len(sites[magnitude]) @@ -218,24 +218,24 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' textbox.append(line_text) textbox = '\n'.join(textbox) - #Create aggregate barplot + # Create aggregate barplot aggregate_file = output_workspace / (f'csi_aggr_{dataset_name}_{configuration.lower()}.png') barplot(dataframe = dataset_sums, x_field = 'magnitude', x_order = magnitude_order, y_field = 'csi', hue_field = 'version', ordered_hue = version_order, title_text = f'Aggregate {dataset_name.upper()} FIM Scores', fim_configuration = configuration, textbox_str = textbox, simplify_legend = True, dest_file = aggregate_file) - #Create box plots for each metric in supplied stats. 
+ # Create box plots for each metric in supplied stats for stat in stats: output_file = output_workspace / (f'{stat.lower()}_{dataset_name}_{configuration.lower()}.png') boxplot(dataframe = dataset, x_field = 'magnitude', x_order = magnitude_order, y_field = stat, hue_field = 'version', ordered_hue = version_order, title_text = f'{dataset_name.upper()} FIM Sites', fim_configuration = configuration, textbox_str = textbox, simplify_legend = True, dest_file = output_file) - #Get the last 2 versions from the version order for scatter plot. + # Get the last 2 versions from the version order for scatter plot if len(version_order) == 2: x_version, y_version = version_order for magnitude in magnitude_order: - #Scatterplot comparison between last 2 versions. + # Scatterplot comparison between last 2 versions x_csi = dataset.query(f'version == "{x_version}" & magnitude == "{magnitude}"')[[base_resolution, 'CSI']] y_csi = dataset.query(f'version == "{y_version}" & magnitude == "{magnitude}"')[[base_resolution, 'CSI']] plotdf = pd.merge(x_csi, y_csi, on = base_resolution, suffixes = (f"_{x_version}",f"_{y_version}")) - #Define arguments for scatterplot function. + # Define arguments for scatterplot function title_text = f'CSI {magnitude}' dest_file = output_workspace / f'csi_scatter_{magnitude}_{configuration.lower()}.png' scatterplot(dataframe = plotdf, x_field = f'CSI_{x_version}', y_field = f'CSI_{y_version}', title_text = title_text, annotate = False, dest_file = dest_file) @@ -246,48 +246,48 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' ######################################################################## if spatial_ahps: - #Read in supplied shapefile layers - #Layer containing metadata for each site (feature_id, wfo, etc). - #Convert nws_lid to lower case. + # Read in supplied shapefile layers + # Layer containing metadata for each site (feature_id, wfo, etc) + # Convert nws_lid to lower case ahps_metadata = gpd.read_file(spatial_ahps['metadata']) ahps_metadata['nws_lid'] = ahps_metadata['nws_lid'].str.lower() metadata_crs = ahps_metadata.crs - #Extent layer generated from preprocessing NWS/USGS datasets + # Extent layer generated from preprocessing NWS/USGS datasets evaluated_ahps_extent = gpd.read_file(spatial_ahps['evaluated']) - #Extent layer generated from static ahps library preprocessing + # Extent layer generated from static ahps library preprocessing static_library = gpd.read_file(spatial_ahps['static']) - #Fields to keep - #Get list of fields to keep in merge + # Fields to keep + # Get list of fields to keep in merge preserved_static_library_fields = ['nws_lid'] + [i for i in static_library.columns if i.startswith(('Q','S'))] - #Get list of fields to keep in merge. + # Get list of fields to keep in merge preserved_evaluated_ahps_fields = ['nws_lid', 'source', 'geometry'] + [i for i in evaluated_ahps_extent.columns if i.startswith(('action','minor','moderate','major'))] - #Join tables to evaluated_ahps_extent + # Join tables to evaluated_ahps_extent evaluated_ahps_extent = evaluated_ahps_extent[preserved_evaluated_ahps_fields] evaluated_ahps_extent = evaluated_ahps_extent.merge(ahps_metadata, on = 'nws_lid') evaluated_ahps_extent['geometry'] = evaluated_ahps_extent['geometry_y'] evaluated_ahps_extent.drop(columns = ['geometry_y','geometry_x'], inplace = True) evaluated_ahps_extent = evaluated_ahps_extent.merge(static_library[preserved_static_library_fields], on = 'nws_lid') - #Join dataset metrics to evaluated_ahps_extent data. 
+ # Join dataset metrics to evaluated_ahps_extent data final_join = pd.DataFrame() for (dataset_name, configuration), (dataset, sites) in all_datasets.items(): - #Only select ahps from dataset if config is MS + # Only select ahps from dataset if config is MS if dataset_name in ['usgs','nws'] and configuration == 'MS': - #Select records from evaluated_ahps_extent that match the dataset name + # Select records from evaluated_ahps_extent that match the dataset name subset = evaluated_ahps_extent.query(f'source == "{dataset_name}"') - #Join to dataset + # Join to dataset dataset_with_subset = dataset.merge(subset, on = 'nws_lid') - #Append rows to final_join dataframe + # Append rows to final_join dataframe final_join = final_join.append(dataset_with_subset) - #Modify version field + # Modify version field final_join['version'] = final_join.version.str.split('_nws|_usgs').str[0] - #Write geodataframe to file + # Write geodataframe to file gdf = gpd.GeoDataFrame(final_join, geometry = final_join['geometry'], crs = metadata_crs) output_shapefile = Path(workspace) / 'nws_usgs_site_info.shp' gdf.to_file(output_shapefile) @@ -296,7 +296,7 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' ####################################################################### if __name__ == '__main__': - #Parse arguments + # Parse arguments parser = argparse.ArgumentParser(description = 'Plot and aggregate statistics for benchmark datasets (BLE/AHPS libraries)') parser.add_argument('-m','--metrics_csv', help = 'Metrics csv created from synthesize test cases.', required = True) parser.add_argument('-w', '--workspace', help = 'Output workspace', required = True) @@ -306,21 +306,21 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' parser.add_argument('-sp', '--spatial_ahps', help = 'If spatial point layer is desired, supply a csv with 3 lines of the following format: metadata, path/to/metadata/shapefile\nevaluated, path/to/evaluated/shapefile\nstatic, path/to/static/shapefile.', default = False, required = False) parser.add_argument('-f', '--fim_1_ms', help = 'If enabled fim_1 rows will be duplicated and extent config assigned "ms" so that fim_1 can be shown on mainstems plots/stats', action = 'store_true', required = False) - #Extract to dictionary and assign to variables. 
+ # Extract to dictionary and assign to variables args = vars(parser.parse_args()) - #If errors occur reassign error to True + # If errors occur reassign error to True error = False - #Create dictionary if file specified for spatial_ahps + # Create dictionary if file specified for spatial_ahps if args['spatial_ahps']: - #Create dictionary + # Create dictionary spatial_dict = {} with open(args['spatial_ahps']) as file: for line in file: key, value = line.strip('\n').split(',') spatial_dict[key] = Path(value) args['spatial_ahps'] = spatial_dict - #Check that all required keys are present and overwrite args with spatial_dict + # Check that all required keys are present and overwrite args with spatial_dict required_keys = set(['metadata', 'evaluated', 'static']) if required_keys - spatial_dict.keys(): print('\n Required keys are: metadata, evaluated, static') @@ -329,7 +329,7 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' args['spatial_ahps'] = spatial_dict - #Finalize Variables + # Finalize Variables m = args['metrics_csv'] w = args['workspace'] v = args['versions'] @@ -338,6 +338,6 @@ def eval_plots(metrics_csv, workspace, versions = [], stats = ['CSI','FAR','TPR' sp= args['spatial_ahps'] f = args['fim_1_ms'] - #Run eval_plots function + # Run eval_plots function if not error: eval_plots(metrics_csv = m, workspace = w, versions = v, stats = s, alternate_ahps_query = q, spatial_ahps = sp, fim_1_ms = f) From a7f7e2c9957ee3c7736f748a0b6d6dae07d7f274 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Fri, 5 Mar 2021 14:49:28 +0000 Subject: [PATCH 12/66] adding usgs pixel catchment ID crosswalk --- src/add_crosswalk.py | 4 +- src/run_by_unit.sh | 8 +++ src/usgs_catchment_pixel_crosswalk.py | 96 +++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 2 deletions(-) create mode 100755 src/usgs_catchment_pixel_crosswalk.py diff --git a/src/add_crosswalk.py b/src/add_crosswalk.py index eb4198cb3..2958c2882 100755 --- a/src/add_crosswalk.py +++ b/src/add_crosswalk.py @@ -260,12 +260,12 @@ def add_crosswalk(input_catchments_fileName,input_flows_fileName,input_srcbase_f if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Subset vector layers') + parser = argparse.ArgumentParser(description='Crosswalk for MS/FR networks; calculate synthetic rating curves; update short rating curves') parser.add_argument('-d','--input-catchments-fileName', help='DEM derived catchments', required=True) parser.add_argument('-a','--input-flows-fileName', help='DEM derived streams', required=True) parser.add_argument('-s','--input-srcbase-fileName', help='Base synthetic rating curve table', required=True) parser.add_argument('-l','--output-catchments-fileName', help='Subset crosswalked catchments', required=True) - parser.add_argument('-f','--output-flows-fileName', help='Subset crosswalked streams', required=True) + parser.add_argument('-f','--output-flows-fileName', help='Subset crosswalked streams', required=True) parser.add_argument('-r','--output-src-fileName', help='Output crosswalked synthetic rating curve table', required=True) parser.add_argument('-j','--output-src-json-fileName',help='Output synthetic rating curve json',required=True) parser.add_argument('-x','--output-crosswalk-fileName',help='Crosswalk table',required=True) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 68866729d..a66231b36 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -432,6 +432,14 @@ Tstart $srcDir/add_crosswalk.py -d 
$outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes.gpkg -a $outputHucDataDir/demDerived_reaches_split_filtered.gpkg -s $outputHucDataDir/src_base.csv -l $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -f $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -r $outputHucDataDir/src_full_crosswalked.csv -j $outputHucDataDir/src.json -x $outputHucDataDir/crosswalk_table.csv -t $outputHucDataDir/hydroTable.csv -w $outputHucDataDir/wbd8_clp.gpkg -b $outputHucDataDir/nwm_subset_streams.gpkg -y $outputHucDataDir/nwm_catchments_proj_subset.tif -m $manning_n -z $input_NWM_Catchments -p $extent -k $outputHucDataDir/small_segments.csv Tcount + +## USGS CROSSWALK ## +echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv +date -u +Tstart +$srcDir/usgs_catchment_pixel_crosswalk.py -gages /data/temp/tsg/sample_gage_sites/evaluated_active_gages.shp -catpix $outputHucDataDir/gw_catchments_pixels.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered.gpkg +Tcount + ## CLEANUP OUTPUTS ## echo -e $startDiv"Cleaning up outputs $hucNumber"$stopDiv args=() diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_catchment_pixel_crosswalk.py new file mode 100755 index 000000000..6bb1302da --- /dev/null +++ b/src/usgs_catchment_pixel_crosswalk.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +import os +import geopandas as gpd +import pandas as pd +from numpy import unique +import rasterio +from rasterstats import zonal_stats +import json +import argparse +import sys +from utils.shared_functions import getDriver +import numpy as np +from os.path import splitext +import pygeos +from shapely.geometry import Point,LineString +from shapely.ops import split +from shapely.wkb import dumps, loads + + +''' crosswalk USGS gages to catchment pixels +3 linear reference to final stream segments layer +5 save to output table either hydroTable, src.json, or ''' + +def crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename): + + + # usgs_gages_filename='/data/temp/tsg/sample_gage_sites/evaluated_active_gages.shp' + # catchment_pixels_filename='/data/outputs/usgs_rc_xwalk/04050001/gw_catchments_pixels.tif' + # input_flows_filename='/data/outputs/usgs_rc_xwalk/04050001/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg' + # input_catchment_filename='/data/outputs/usgs_rc_xwalk/04050001/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg' + # wbd_buffer_filename='/data/outputs/usgs_rc_xwalk/04050001/wbd_buffered.gpkg' + + wbd_buffer = gpd.read_file(wbd_buffer_filename) + usgs_gages = gpd.read_file(usgs_gages_filename, mask=wbd_buffer) + catchment_pixels = rasterio.open(catchment_pixels_filename,'r') + input_flows = gpd.read_file(input_flows_filename) + input_catchment = gpd.read_file(input_catchment_filename) + + ##################### Itentify closest HydroID + closest_catchment = gpd.sjoin(usgs_gages, input_catchment, how='left', op='within').reset_index(drop=True) + closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID']) + + if input_flows.HydroID.dtype != 'int': input_flows.HydroID = input_flows.HydroID.astype(int) + + ##################### Move USGS gage to stream + for index, point in usgs_gages.iterrows(): + print (f"usgs gage: {point.site_no}") + pre_reference_catpix_id = list(rasterio.sample.sample_gen(catchment_pixels,point.geometry.coords))[0].item() + # find better way to retrieve cat ID + print(f"pre adjusted 
catchment pixel ID: {pre_reference_catpix_id}") + hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==point.site_no].HydroID.item() + # convert headwaterpoint geometries to WKB representation + wkb_points = dumps(point.geometry) + # create pygeos headwaterpoint geometries from WKB representation + pointbin_geom = pygeos.io.from_wkb(wkb_points) + # Closest segment to headwater + closest_stream = input_flows.loc[input_flows.HydroID==hydro_id] + wkb_closest_stream = dumps(closest_stream.geometry.item()) + streambin_geom = pygeos.io.from_wkb(wkb_closest_stream) + # Linear reference headwater to closest stream segment + pointdistancetoline = pygeos.linear.line_locate_point(streambin_geom, pointbin_geom) + referencedpoint = pygeos.linear.line_interpolate_point(streambin_geom, pointdistancetoline) + # convert geometries to wkb representation + bin_referencedpoint = pygeos.io.to_wkb(referencedpoint) + # convert to shapely geometries + shply_referencedpoint = loads(bin_referencedpoint) + ##################### Sample from + reference_catpix_id = list(rasterio.sample.sample_gen(catchment_pixels,shply_referencedpoint.coords))[0].item() + # find better way to retrieve cat ID + print(f"post adjusted catchment pixel ID: {reference_catpix_id}") + + # append reference_catpix_id, hydro_id, and point.site_no to file + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Crosswalk USGS sites to HydroID and Catchment Pixel ID') + parser.add_argument('-gages','--usgs-gages-filename', help='USGS gages', required=True) + parser.add_argument('-catpix','--catchment-pixels-filename',help='catchment pixel raster',required=True) + parser.add_argument('-flows','--input-flows-filename', help='DEM derived streams', required=True) + # parser.add_argument('-r','--output-src-filename', help='Output crosswalked synthetic rating curve table', required=True) + # parser.add_argument('-j','--output-src-json-filename',help='Output synthetic rating curve json',required=True) + # parser.add_argument('-t','--output-hydro-table-filename',help='Hydrotable',required=True) + + args = vars(parser.parse_args()) + + usgs_gages_filename = args['usgs_gages_filename'] + catchment_pixels_filename = args['catchment_pixels_filename'] + input_flows_filename = args['input_flows_filename'] + # output_src_filename = args['output_src_filename'] + # output_src_json_filename = args['output_src_json_filename'] + # output_hydro_table_filename = args['output_hydro_table_filename'] + + + crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename) From 2cbe061280f27ea0226bdd503f3093de82bcbae7 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Mon, 8 Mar 2021 15:57:35 +0000 Subject: [PATCH 13/66] adding dem value samples --- src/run_by_unit.sh | 2 +- src/usgs_catchment_pixel_crosswalk.py | 62 +++++++++++++++------------ 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index a66231b36..646a679b9 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -437,7 +437,7 @@ Tcount echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv date -u Tstart -$srcDir/usgs_catchment_pixel_crosswalk.py -gages /data/temp/tsg/sample_gage_sites/evaluated_active_gages.shp -catpix $outputHucDataDir/gw_catchments_pixels.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered.gpkg +$srcDir/usgs_catchment_pixel_crosswalk.py -gages /data/temp/tsg/sample_gage_sites/evaluated_active_gages.shp -catpix 
$outputHucDataDir/gw_catchments_pixels.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem $dem_thalwegCond -table $outputHucDataDir/hand_ref_elev_table.csv Tcount ## CLEANUP OUTPUTS ## diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_catchment_pixel_crosswalk.py index 6bb1302da..24be40d7a 100755 --- a/src/usgs_catchment_pixel_crosswalk.py +++ b/src/usgs_catchment_pixel_crosswalk.py @@ -19,58 +19,63 @@ ''' crosswalk USGS gages to catchment pixels -3 linear reference to final stream segments layer -5 save to output table either hydroTable, src.json, or ''' +5 save to output table either hydroTable, src.json, or hand_ref_elev_table''' -def crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename): - - # usgs_gages_filename='/data/temp/tsg/sample_gage_sites/evaluated_active_gages.shp' - # catchment_pixels_filename='/data/outputs/usgs_rc_xwalk/04050001/gw_catchments_pixels.tif' - # input_flows_filename='/data/outputs/usgs_rc_xwalk/04050001/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg' - # input_catchment_filename='/data/outputs/usgs_rc_xwalk/04050001/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg' - # wbd_buffer_filename='/data/outputs/usgs_rc_xwalk/04050001/wbd_buffered.gpkg' +def crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename,dem_filename,table_filename): wbd_buffer = gpd.read_file(wbd_buffer_filename) usgs_gages = gpd.read_file(usgs_gages_filename, mask=wbd_buffer) catchment_pixels = rasterio.open(catchment_pixels_filename,'r') input_flows = gpd.read_file(input_flows_filename) input_catchment = gpd.read_file(input_catchment_filename) + dem = rasterio.open(dem_filename,'r') + table = pd.read_csv(table_filename) + - ##################### Itentify closest HydroID + # Identify closest HydroID closest_catchment = gpd.sjoin(usgs_gages, input_catchment, how='left', op='within').reset_index(drop=True) closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID']) if input_flows.HydroID.dtype != 'int': input_flows.HydroID = input_flows.HydroID.astype(int) - ##################### Move USGS gage to stream + # Move USGS gage to stream for index, point in usgs_gages.iterrows(): + print (f"usgs gage: {point.site_no}") - pre_reference_catpix_id = list(rasterio.sample.sample_gen(catchment_pixels,point.geometry.coords))[0].item() - # find better way to retrieve cat ID - print(f"pre adjusted catchment pixel ID: {pre_reference_catpix_id}") + # Get HydroID hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==point.site_no].HydroID.item() - # convert headwaterpoint geometries to WKB representation + + # Convert headwaterpoint geometries to WKB representation wkb_points = dumps(point.geometry) - # create pygeos headwaterpoint geometries from WKB representation + + # Create pygeos headwaterpoint geometries from WKB representation pointbin_geom = pygeos.io.from_wkb(wkb_points) + # Closest segment to headwater closest_stream = input_flows.loc[input_flows.HydroID==hydro_id] wkb_closest_stream = dumps(closest_stream.geometry.item()) streambin_geom = pygeos.io.from_wkb(wkb_closest_stream) + # Linear reference headwater to closest stream segment pointdistancetoline = pygeos.linear.line_locate_point(streambin_geom, 
pointbin_geom) referencedpoint = pygeos.linear.line_interpolate_point(streambin_geom, pointdistancetoline) - # convert geometries to wkb representation + + # Convert geometries to wkb representation bin_referencedpoint = pygeos.io.to_wkb(referencedpoint) - # convert to shapely geometries + + # Convert to shapely geometries shply_referencedpoint = loads(bin_referencedpoint) - ##################### Sample from + + # Sample rasters at adjusted point reference_catpix_id = list(rasterio.sample.sample_gen(catchment_pixels,shply_referencedpoint.coords))[0].item() + reference_elev = list(rasterio.sample.sample_gen(dem,shply_referencedpoint.coords))[0].item() # round to n decimal places + # find better way to retrieve cat ID print(f"post adjusted catchment pixel ID: {reference_catpix_id}") + print(f"post adjusted elevation: {reference_elev}") - # append reference_catpix_id, hydro_id, and point.site_no to file + # append reference_catpix_id, reference_elev, hydro_id, and point.site_no to table if __name__ == '__main__': @@ -79,18 +84,19 @@ def crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flow parser.add_argument('-gages','--usgs-gages-filename', help='USGS gages', required=True) parser.add_argument('-catpix','--catchment-pixels-filename',help='catchment pixel raster',required=True) parser.add_argument('-flows','--input-flows-filename', help='DEM derived streams', required=True) - # parser.add_argument('-r','--output-src-filename', help='Output crosswalked synthetic rating curve table', required=True) - # parser.add_argument('-j','--output-src-json-filename',help='Output synthetic rating curve json',required=True) - # parser.add_argument('-t','--output-hydro-table-filename',help='Hydrotable',required=True) + parser.add_argument('-cat','--input-catchment-filename', help='DEM derived catchments', required=True) + parser.add_argument('-wbd','--wbd-buffer-filename', help='WBD buffer', required=True) + parser.add_argument('-dem','--dem-filename', help='Thalweg adjusted DEM', required=True) + parser.add_argument('-table','--table-filename', help='Table to append data', required=True) args = vars(parser.parse_args()) usgs_gages_filename = args['usgs_gages_filename'] catchment_pixels_filename = args['catchment_pixels_filename'] input_flows_filename = args['input_flows_filename'] - # output_src_filename = args['output_src_filename'] - # output_src_json_filename = args['output_src_json_filename'] - # output_hydro_table_filename = args['output_hydro_table_filename'] - + input_catchment_filename = args['input_catchment_filename'] + wbd_buffer_filename = args['wbd_buffer_filename'] + dem_filename = args['dem_filename'] + table_filename = args['table_filename'] - crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename) + crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename, dem_filename,table_filename) From 816c1b7a17227be479fa78344e4ce89b89e1b3eb Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Tue, 9 Mar 2021 15:31:38 +0000 Subject: [PATCH 14/66] refactoring tables and adding evelation values --- src/add_crosswalk.py | 3 +- src/rem.py | 13 +++- src/run_by_unit.sh | 2 +- src/usgs_catchment_pixel_crosswalk.py | 95 ++++++++++++++------------- 4 files changed, 63 insertions(+), 50 deletions(-) diff --git a/src/add_crosswalk.py b/src/add_crosswalk.py index 2958c2882..96f2805c0 100755 --- a/src/add_crosswalk.py +++ b/src/add_crosswalk.py 
@@ -220,9 +220,8 @@ def add_crosswalk(input_catchments_fileName,input_flows_fileName,input_srcbase_f output_hydro_table = output_hydro_table.merge(input_huc.loc[:,[FIM_ID,'HUC8']],how='left',on=FIM_ID) if output_flows.HydroID.dtype != 'str': output_flows.HydroID = output_flows.HydroID.astype(str) - output_hydro_table = output_hydro_table.merge(output_flows.loc[:,['HydroID','LakeID','Median_Thal_Elev_m']],how='left',on='HydroID') + output_hydro_table = output_hydro_table.merge(output_flows.loc[:,['HydroID','LakeID']],how='left',on='HydroID') output_hydro_table['LakeID'] = output_hydro_table['LakeID'].astype(int) - output_hydro_table['Median_Thal_Elev_m'] = output_hydro_table['Median_Thal_Elev_m'].astype(float).round(2) output_hydro_table = output_hydro_table.rename(columns={'HUC8':'HUC'}) if output_hydro_table.HUC.dtype != 'str': output_hydro_table.HUC = output_hydro_table.HUC.astype(str) diff --git a/src/rem.py b/src/rem.py index 403edf9db..d61271850 100755 --- a/src/rem.py +++ b/src/rem.py @@ -118,10 +118,17 @@ def make_catchment_min_dict(flat_dem, catchment_min_dict, flat_catchments, thalw merge_df.index.name = 'pixelcatch_id' merge_df.to_csv(hand_ref_elev_fileName,index=True) # export dataframe to csv file - # Merge the HAND reference elvation by HydroID dataframe with the demDerived_reaches layer (add new layer attribute) - merge_df = merge_df.groupby(['HydroID']).median() # median value of all Median_Thal_Elev_m for pixel catchments in each HydroID reach + # Merge the HAND reference elevation by HydroID dataframe with the demDerived_reaches layer (add new layer attribute) + min_by_hydroid = merge_df.groupby(['HydroID']).min() # min value of all Median_Thal_Elev_m for pixel catchments in each HydroID reach + min_by_hydroid.columns = ['Min_Thal_Elev_m'] + med_by_hydroid = merge_df.groupby(['HydroID']).median() # median value of all Median_Thal_Elev_m for pixel catchments in each HydroID reach + med_by_hydroid.columns = ['Median_Thal_Elev_m'] + max_by_hydroid = merge_df.groupby(['HydroID']).max() # max value of all Median_Thal_Elev_m for pixel catchments in each HydroID reach + max_by_hydroid.columns = ['Max_Thal_Elev_m'] input_reaches = gpd.read_file(dem_reaches_filename) - input_reaches = input_reaches.merge(merge_df, on='HydroID') # merge dataframes by HydroID variable + input_reaches = input_reaches.merge(min_by_hydroid, on='HydroID') # merge dataframes by HydroID variable + input_reaches = input_reaches.merge(med_by_hydroid, on='HydroID') # merge dataframes by HydroID variable + input_reaches = input_reaches.merge(max_by_hydroid, on='HydroID') # merge dataframes by HydroID variable input_reaches.to_file(dem_reaches_filename,driver=getDriver(dem_reaches_filename),index=False) # ------------------------------------------------------------------------------------------------------------------------ # diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 646a679b9..839379fda 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -437,7 +437,7 @@ Tcount echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv date -u Tstart -$srcDir/usgs_catchment_pixel_crosswalk.py -gages /data/temp/tsg/sample_gage_sites/evaluated_active_gages.shp -catpix $outputHucDataDir/gw_catchments_pixels.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem $dem_thalwegCond -table $outputHucDataDir/hand_ref_elev_table.csv 
+$srcDir/usgs_catchment_pixel_crosswalk.py -gages /data/temp/tsg/sample_gage_sites/evaluated_active_gages.shp -dem_m $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -reftable $outputHucDataDir/hand_ref_elev_table.csv -outtable $outputHucDataDir/usgs_elev_table.csv Tcount ## CLEANUP OUTPUTS ## diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_catchment_pixel_crosswalk.py index 24be40d7a..6bab9d6d5 100755 --- a/src/usgs_catchment_pixel_crosswalk.py +++ b/src/usgs_catchment_pixel_crosswalk.py @@ -3,100 +3,107 @@ import os import geopandas as gpd import pandas as pd -from numpy import unique import rasterio -from rasterstats import zonal_stats -import json import argparse -import sys -from utils.shared_functions import getDriver -import numpy as np -from os.path import splitext import pygeos -from shapely.geometry import Point,LineString -from shapely.ops import split from shapely.wkb import dumps, loads -''' crosswalk USGS gages to catchment pixels -5 save to output table either hydroTable, src.json, or hand_ref_elev_table''' +''' Get elevation at adjusted USGS gages locations''' -def crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename,dem_filename,table_filename): +def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename,dem_adj_filename,ref_table_filename,output_table_filename): wbd_buffer = gpd.read_file(wbd_buffer_filename) usgs_gages = gpd.read_file(usgs_gages_filename, mask=wbd_buffer) - catchment_pixels = rasterio.open(catchment_pixels_filename,'r') + dem_m = rasterio.open(dem_filename,'r') input_flows = gpd.read_file(input_flows_filename) input_catchment = gpd.read_file(input_catchment_filename) - dem = rasterio.open(dem_filename,'r') - table = pd.read_csv(table_filename) + dem_adj = rasterio.open(dem_adj_filename,'r') + ref_table = pd.read_csv(ref_table_filename) # Identify closest HydroID closest_catchment = gpd.sjoin(usgs_gages, input_catchment, how='left', op='within').reset_index(drop=True) - closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID']) + closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID','Min_Thal_Elev_m','Median_Thal_Elev_m','Max_Thal_Elev_m']) if input_flows.HydroID.dtype != 'int': input_flows.HydroID = input_flows.HydroID.astype(int) + columns = ['usgs_gage_id','HydroID','dem_elevation','dem_adj_elevation','min_thal_elev', 'med_thal_elev','max_thal_elev'] + gage_data = [] + # Move USGS gage to stream - for index, point in usgs_gages.iterrows(): + for index, gage in usgs_gages.iterrows(): - print (f"usgs gage: {point.site_no}") - # Get HydroID - hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==point.site_no].HydroID.item() + print (f"usgs gage: {gage.site_no}") + # Get stream attributes + hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() + min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Min_Thal_Elev_m.item(),2) + med_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Median_Thal_Elev_m.item(),2) + max_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Max_Thal_Elev_m.item(),2) - # Convert headwaterpoint 
geometries to WKB representation - wkb_points = dumps(point.geometry) + # Convert headwater point geometries to WKB representation + wkb_gages = dumps(gage.geometry) - # Create pygeos headwaterpoint geometries from WKB representation - pointbin_geom = pygeos.io.from_wkb(wkb_points) + # Create pygeos headwater point geometries from WKB representation + gage_bin_geom = pygeos.io.from_wkb(wkb_gages) # Closest segment to headwater closest_stream = input_flows.loc[input_flows.HydroID==hydro_id] wkb_closest_stream = dumps(closest_stream.geometry.item()) - streambin_geom = pygeos.io.from_wkb(wkb_closest_stream) + stream_bin_geom = pygeos.io.from_wkb(wkb_closest_stream) # Linear reference headwater to closest stream segment - pointdistancetoline = pygeos.linear.line_locate_point(streambin_geom, pointbin_geom) - referencedpoint = pygeos.linear.line_interpolate_point(streambin_geom, pointdistancetoline) + gage_distance_to_line = pygeos.linear.line_locate_point(stream_bin_geom, gage_bin_geom) + referenced_gage = pygeos.linear.line_interpolate_point(stream_bin_geom, gage_distance_to_line) # Convert geometries to wkb representation - bin_referencedpoint = pygeos.io.to_wkb(referencedpoint) + bin_referencedgage = pygeos.io.to_wkb(referenced_gage) # Convert to shapely geometries - shply_referencedpoint = loads(bin_referencedpoint) + shply_referenced_gage = loads(bin_referenced_gage) + + # Sample rasters at adjusted gage + dem_m_elev = list(rasterio.sample.sample_gen(dem_m,shply_referenced_gage.coords))[0].item().astype(float).round(2) + dem_adj_elev = list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item().astype(float).round(2) + + # Print elevations to log file + print(f"post adjusted catchment pixel ID: {dem_m_elev}") + print(f"post adjusted elevation: {dem_adj_elev}") - # Sample rasters at adjusted point - reference_catpix_id = list(rasterio.sample.sample_gen(catchment_pixels,shply_referencedpoint.coords))[0].item() - reference_elev = list(rasterio.sample.sample_gen(dem,shply_referencedpoint.coords))[0].item() # round to n decimal places + # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table + site_elevations = [gage.site_no, hydro_id, dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev] + gage_data.append(site_elevations) - # find better way to retrieve cat ID - print(f"post adjusted catchment pixel ID: {reference_catpix_id}") - print(f"post adjusted elevation: {reference_elev}") - # append reference_catpix_id, reference_elev, hydro_id, and point.site_no to table + elev_table = pd.DataFrame(gage_data, columns=columns) + # elev_table = elev_table.merge(ref_table, on='HydroID') + + if not elev_table.empty: + elev_table.to_csv(output_table_filename,index=False) if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Crosswalk USGS sites to HydroID and Catchment Pixel ID') + parser = argparse.ArgumentParser(description='Crosswalk USGS sites to HydroID and get elevations') parser.add_argument('-gages','--usgs-gages-filename', help='USGS gages', required=True) - parser.add_argument('-catpix','--catchment-pixels-filename',help='catchment pixel raster',required=True) + parser.add_argument('-dem_m','--dem-filename',help='Catchment pixel raster',required=True) parser.add_argument('-flows','--input-flows-filename', help='DEM derived streams', required=True) parser.add_argument('-cat','--input-catchment-filename', help='DEM derived catchments', required=True) parser.add_argument('-wbd','--wbd-buffer-filename', help='WBD buffer', 
required=True) - parser.add_argument('-dem','--dem-filename', help='Thalweg adjusted DEM', required=True) - parser.add_argument('-table','--table-filename', help='Table to append data', required=True) + parser.add_argument('-dem_adj','--dem-adj-filename', help='Thalweg adjusted DEM', required=True) + parser.add_argument('-reftable','--ref-table-filename', help='Hand reference table', required=True) + parser.add_argument('-outtable','--output-table-filename', help='Table to append data', required=True) args = vars(parser.parse_args()) usgs_gages_filename = args['usgs_gages_filename'] - catchment_pixels_filename = args['catchment_pixels_filename'] + dem_filename = args['dem_filename'] input_flows_filename = args['input_flows_filename'] input_catchment_filename = args['input_catchment_filename'] wbd_buffer_filename = args['wbd_buffer_filename'] - dem_filename = args['dem_filename'] - table_filename = args['table_filename'] + dem_adj_filename = args['dem_adj_filename'] + ref_table_filename = args['ref_table_filename'] + output_table_filename = args['output_table_filename'] - crosswalk_usgs_gage(usgs_gages_filename,catchment_pixels_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename, dem_filename,table_filename) + crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename, dem_adj_filename,ref_table_filename,output_table_filename) From b91ba04a0123dbc5c54b14a1775863ca743bc289 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Tue, 9 Mar 2021 15:46:25 +0000 Subject: [PATCH 15/66] adding tables to prod whitelist --- src/output_cleanup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/output_cleanup.py b/src/output_cleanup.py index 7e211bdc5..ccbb7c33f 100755 --- a/src/output_cleanup.py +++ b/src/output_cleanup.py @@ -31,7 +31,9 @@ def output_cleanup(huc_number, output_folder_path, additional_whitelist, is_prod 'gw_catchments_reaches_filtered_addedAttributes.tif', 'hydroTable.csv', 'src.json', - 'small_segments.csv' + 'small_segments.csv', + 'usgs_elev_table.csv', + 'hand_ref_elev_table.csv' ] # List of files that will be saved during a viz run From 7708f3b2958daa3ff32ac31044ce40c6760cee40 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Tue, 9 Mar 2021 22:47:50 +0000 Subject: [PATCH 16/66] moving usgs gage shp to inputs --- src/run_by_unit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 839379fda..d2dba8a2a 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -437,7 +437,7 @@ Tcount echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv date -u Tstart -$srcDir/usgs_catchment_pixel_crosswalk.py -gages /data/temp/tsg/sample_gage_sites/evaluated_active_gages.shp -dem_m $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -reftable $outputHucDataDir/hand_ref_elev_table.csv -outtable $outputHucDataDir/usgs_elev_table.csv +$srcDir/usgs_catchment_pixel_crosswalk.py -gages $inputDataDir/ahp_sites/evaluated_active_gages.shp -dem_m $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond 
-reftable $outputHucDataDir/hand_ref_elev_table.csv -outtable $outputHucDataDir/usgs_elev_table.csv Tcount ## CLEANUP OUTPUTS ## From 957c0376d831974f78a4f08dcf2e71b58eb1ce97 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 10 Mar 2021 16:58:03 +0000 Subject: [PATCH 17/66] fixed var name --- src/usgs_catchment_pixel_crosswalk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_catchment_pixel_crosswalk.py index 6bab9d6d5..b5a9279ce 100755 --- a/src/usgs_catchment_pixel_crosswalk.py +++ b/src/usgs_catchment_pixel_crosswalk.py @@ -58,7 +58,7 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in referenced_gage = pygeos.linear.line_interpolate_point(stream_bin_geom, gage_distance_to_line) # Convert geometries to wkb representation - bin_referencedgage = pygeos.io.to_wkb(referenced_gage) + bin_referenced_gage = pygeos.io.to_wkb(referenced_gage) # Convert to shapely geometries shply_referenced_gage = loads(bin_referenced_gage) From 8e36fab37c3fb1fff0cb5f650f11f988c34120fc Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 10 Mar 2021 20:04:01 +0000 Subject: [PATCH 18/66] handles no nearby hydroids --- src/usgs_catchment_pixel_crosswalk.py | 58 ++++++++++++++------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_catchment_pixel_crosswalk.py index b5a9279ce..229770af3 100755 --- a/src/usgs_catchment_pixel_crosswalk.py +++ b/src/usgs_catchment_pixel_crosswalk.py @@ -3,6 +3,7 @@ import os import geopandas as gpd import pandas as pd +import numpy as np import rasterio import argparse import pygeos @@ -38,42 +39,45 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in print (f"usgs gage: {gage.site_no}") # Get stream attributes hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() - min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Min_Thal_Elev_m.item(),2) - med_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Median_Thal_Elev_m.item(),2) - max_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Max_Thal_Elev_m.item(),2) - # Convert headwater point geometries to WKB representation - wkb_gages = dumps(gage.geometry) + if not np.isnan(hydro_id): - # Create pygeos headwater point geometries from WKB representation - gage_bin_geom = pygeos.io.from_wkb(wkb_gages) + min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Min_Thal_Elev_m.item(),2) + med_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Median_Thal_Elev_m.item(),2) + max_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Max_Thal_Elev_m.item(),2) - # Closest segment to headwater - closest_stream = input_flows.loc[input_flows.HydroID==hydro_id] - wkb_closest_stream = dumps(closest_stream.geometry.item()) - stream_bin_geom = pygeos.io.from_wkb(wkb_closest_stream) + # Convert headwater point geometries to WKB representation + wkb_gages = dumps(gage.geometry) - # Linear reference headwater to closest stream segment - gage_distance_to_line = pygeos.linear.line_locate_point(stream_bin_geom, gage_bin_geom) - referenced_gage = pygeos.linear.line_interpolate_point(stream_bin_geom, gage_distance_to_line) + # Create pygeos headwater point geometries from WKB representation + gage_bin_geom = pygeos.io.from_wkb(wkb_gages) - # Convert geometries 
to wkb representation - bin_referenced_gage = pygeos.io.to_wkb(referenced_gage) + # Closest segment to headwater + closest_stream = input_flows.loc[input_flows.HydroID==hydro_id] + wkb_closest_stream = dumps(closest_stream.geometry.item()) + stream_bin_geom = pygeos.io.from_wkb(wkb_closest_stream) - # Convert to shapely geometries - shply_referenced_gage = loads(bin_referenced_gage) + # Linear reference headwater to closest stream segment + gage_distance_to_line = pygeos.linear.line_locate_point(stream_bin_geom, gage_bin_geom) + referenced_gage = pygeos.linear.line_interpolate_point(stream_bin_geom, gage_distance_to_line) - # Sample rasters at adjusted gage - dem_m_elev = list(rasterio.sample.sample_gen(dem_m,shply_referenced_gage.coords))[0].item().astype(float).round(2) - dem_adj_elev = list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item().astype(float).round(2) + # Convert geometries to wkb representation + bin_referenced_gage = pygeos.io.to_wkb(referenced_gage) - # Print elevations to log file - print(f"post adjusted catchment pixel ID: {dem_m_elev}") - print(f"post adjusted elevation: {dem_adj_elev}") + # Convert to shapely geometries + shply_referenced_gage = loads(bin_referenced_gage) - # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table - site_elevations = [gage.site_no, hydro_id, dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev] - gage_data.append(site_elevations) + # Sample rasters at adjusted gage + dem_m_elev = list(rasterio.sample.sample_gen(dem_m,shply_referenced_gage.coords))[0].item().astype(float).round(2) + dem_adj_elev = list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item().astype(float).round(2) + + # Print elevations to log file + print(f"post adjusted catchment pixel ID: {dem_m_elev}") + print(f"post adjusted elevation: {dem_adj_elev}") + + # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table + site_elevations = [gage.site_no, hydro_id, dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev] + gage_data.append(site_elevations) elev_table = pd.DataFrame(gage_data, columns=columns) From 6d42f61053068b9cae027a965825dcf2dbb8f29d Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 11 Mar 2021 15:49:41 +0000 Subject: [PATCH 19/66] rounding elevation values --- src/usgs_catchment_pixel_crosswalk.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_catchment_pixel_crosswalk.py index 229770af3..9ffa8aa25 100755 --- a/src/usgs_catchment_pixel_crosswalk.py +++ b/src/usgs_catchment_pixel_crosswalk.py @@ -34,11 +34,12 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in gage_data = [] # Move USGS gage to stream - for index, gage in usgs_gages.iterrows(): +for index, gage in usgs_gages.iterrows(): - print (f"usgs gage: {gage.site_no}") - # Get stream attributes - hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() + print (f"usgs gage: {gage.site_no}") + + # Get stream attributes + hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() if not np.isnan(hydro_id): @@ -68,8 +69,8 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in shply_referenced_gage = loads(bin_referenced_gage) # Sample rasters at adjusted gage - dem_m_elev = list(rasterio.sample.sample_gen(dem_m,shply_referenced_gage.coords))[0].item().astype(float).round(2) - dem_adj_elev = 
list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item().astype(float).round(2) + dem_m_elev = round(list(rasterio.sample.sample_gen(dem_m,shply_referenced_gage.coords))[0].item(),2) + dem_adj_elev = round(list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item(),2) # Print elevations to log file print(f"post adjusted catchment pixel ID: {dem_m_elev}") From 76d4aa23049db23805ff56cadcc9c92a33932304 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 11 Mar 2021 16:34:28 +0000 Subject: [PATCH 20/66] formatting --- src/usgs_catchment_pixel_crosswalk.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_catchment_pixel_crosswalk.py index 9ffa8aa25..67154d596 100755 --- a/src/usgs_catchment_pixel_crosswalk.py +++ b/src/usgs_catchment_pixel_crosswalk.py @@ -34,12 +34,12 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in gage_data = [] # Move USGS gage to stream -for index, gage in usgs_gages.iterrows(): + for index, gage in usgs_gages.iterrows(): - print (f"usgs gage: {gage.site_no}") - - # Get stream attributes - hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() + print (f"usgs gage: {gage.site_no}") + + # Get stream attributes + hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() if not np.isnan(hydro_id): From 430a0e7ed99c36e4f7c05fb7f106cd28b5ae50bf Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Fri, 12 Mar 2021 16:01:39 +0000 Subject: [PATCH 21/66] temporary patch for BED run --- fim_run.sh | 2 +- src/output_cleanup.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fim_run.sh b/fim_run.sh index 42a5d022e..4bd19a115 100755 --- a/fim_run.sh +++ b/fim_run.sh @@ -152,5 +152,5 @@ fi echo "$viz" if [[ "$viz" -eq 1 ]]; then # aggregate outputs - python3 /foss_fim/src/aggregate_fim_outputs.py -d $outputRunDataDir -j 4 + time python3 /foss_fim/src/aggregate_fim_outputs.py -d $outputRunDataDir -j 4 fi diff --git a/src/output_cleanup.py b/src/output_cleanup.py index 7e211bdc5..2f12c31a4 100755 --- a/src/output_cleanup.py +++ b/src/output_cleanup.py @@ -37,9 +37,12 @@ def output_cleanup(huc_number, output_folder_path, additional_whitelist, is_prod # List of files that will be saved during a viz run viz_whitelist = [ 'rem_zeroed_masked.tif', + 'gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg', + 'demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg', 'gw_catchments_reaches_filtered_addedAttributes.tif', 'hydroTable.csv', - 'src.json' + 'src.json', + 'small_segments.csv' ] # If "production" run, only keep whitelisted files From 7ea4e44c47a7008c5f53ec7e2e9a4ea298a77433 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Mon, 15 Mar 2021 18:35:33 +0000 Subject: [PATCH 22/66] merging with dev and increasing agg jobs from 4 to 6 --- fim_run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fim_run.sh b/fim_run.sh index 4bd19a115..8d1875e5f 100755 --- a/fim_run.sh +++ b/fim_run.sh @@ -152,5 +152,5 @@ fi echo "$viz" if [[ "$viz" -eq 1 ]]; then # aggregate outputs - time python3 /foss_fim/src/aggregate_fim_outputs.py -d $outputRunDataDir -j 4 + time python3 /foss_fim/src/aggregate_fim_outputs.py -d $outputRunDataDir -j 6 fi From 1bb70203b50fc479b16f5e3e44da36cb04833b0a Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 25 Mar 2021 14:15:07 +0000 Subject: [PATCH 23/66] adding post-processing script to 
gather elevation values and calculate metrics --- src/rem.py | 18 +- src/run_by_unit.sh | 2 +- src/usgs_catchment_pixel_crosswalk.py | 16 +- tools/rating_curve_comparison.py | 334 ++++++++++++++++++++++++++ 4 files changed, 343 insertions(+), 27 deletions(-) create mode 100755 tools/rating_curve_comparison.py diff --git a/src/rem.py b/src/rem.py index d61271850..27dd4ad1b 100755 --- a/src/rem.py +++ b/src/rem.py @@ -11,7 +11,7 @@ from utils.shared_functions import getDriver -def rel_dem(dem_fileName, pixel_watersheds_fileName, rem_fileName, thalweg_raster, hydroid_fileName, hand_ref_elev_fileName, dem_reaches_filename): +def rel_dem(dem_fileName, pixel_watersheds_fileName, rem_fileName, thalweg_raster, hydroid_fileName, dem_reaches_filename): """ Calculates REM/HAND/Detrended DEM @@ -25,8 +25,6 @@ def rel_dem(dem_fileName, pixel_watersheds_fileName, rem_fileName, thalweg_raste File name of output relative elevation raster. hydroid_fileName : str File name of the hydroid raster (i.e. gw_catchments_reaches.tif) - hand_ref_elev_fileName - File name of the output csv containing list of hydroid values and HAND zero/reference elev dem_reaches_filename File name of the reaches layer to populate HAND elevation attribute values and overwrite as output @@ -108,16 +106,6 @@ def make_catchment_min_dict(flat_dem, catchment_min_dict, flat_catchments, thalw gw_catchments_pixels_masked_object.close() thalweg_raster_object.close() -############################################### - # Merge and export dictionary to to_csv - catchment_min_dict_df = pd.DataFrame.from_dict(catchment_min_dict, orient='index') # convert dict to dataframe - catchment_min_dict_df.columns = ['Median_Thal_Elev_m'] - catchment_hydroid_dict_df = pd.DataFrame.from_dict(catchment_hydroid_dict, orient='index') # convert dict to dataframe - catchment_hydroid_dict_df.columns = ['HydroID'] - merge_df = catchment_hydroid_dict_df.merge(catchment_min_dict_df, left_index=True, right_index=True) - merge_df.index.name = 'pixelcatch_id' - merge_df.to_csv(hand_ref_elev_fileName,index=True) # export dataframe to csv file - # Merge the HAND reference elevation by HydroID dataframe with the demDerived_reaches layer (add new layer attribute) min_by_hydroid = merge_df.groupby(['HydroID']).min() # min value of all Median_Thal_Elev_m for pixel catchments in each HydroID reach min_by_hydroid.columns = ['Min_Thal_Elev_m'] @@ -178,7 +166,6 @@ def calculate_rem(flat_dem,catchmentMinDict,flat_catchments,ndv): parser.add_argument('-t','--thalweg-raster',help='A binary raster representing the thalweg. 
1 for thalweg, 0 for non-thalweg.',required=True) parser.add_argument('-o','--rem',help='Output REM raster',required=True) parser.add_argument('-i','--hydroid', help='HydroID raster to use within project path', required=True) - parser.add_argument('-r','--hand_ref_elev_table',help='Output table of HAND reference elev by catchment',required=True) parser.add_argument('-s','--dem_reaches_in_out',help='DEM derived reach layer to join HAND reference elevation attribute',required=True) @@ -191,7 +178,6 @@ def calculate_rem(flat_dem,catchmentMinDict,flat_catchments,ndv): rem_fileName = args['rem'] thalweg_raster = args['thalweg_raster'] hydroid_fileName = args['hydroid'] - hand_ref_elev_fileName = args['hand_ref_elev_table'] dem_reaches_filename = args['dem_reaches_in_out'] - rel_dem(dem_fileName, pixel_watersheds_fileName, rem_fileName, thalweg_raster, hydroid_fileName, hand_ref_elev_fileName, dem_reaches_filename) + rel_dem(dem_fileName, pixel_watersheds_fileName, rem_fileName, thalweg_raster, hydroid_fileName, dem_reaches_filename) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index d2dba8a2a..cf95d5286 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -321,7 +321,7 @@ echo -e $startDiv"D8 REM $hucNumber"$stopDiv date -u Tstart [ ! -f $outputHucDataDir/rem.tif ] && \ -$srcDir/rem.py -d $dem_thalwegCond -w $outputHucDataDir/gw_catchments_pixels.tif -o $outputHucDataDir/rem.tif -t $demDerived_streamPixels -i $outputHucDataDir/gw_catchments_reaches.tif -r $outputHucDataDir/hand_ref_elev_table.csv -s $outputHucDataDir/demDerived_reaches_split.gpkg +$srcDir/rem.py -d $dem_thalwegCond -w $outputHucDataDir/gw_catchments_pixels.tif -o $outputHucDataDir/rem.tif -t $demDerived_streamPixels -i $outputHucDataDir/gw_catchments_reaches.tif -s $outputHucDataDir/demDerived_reaches_split.gpkg Tcount ## DINF DISTANCE DOWN ## diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_catchment_pixel_crosswalk.py index 67154d596..ad0d13349 100755 --- a/src/usgs_catchment_pixel_crosswalk.py +++ b/src/usgs_catchment_pixel_crosswalk.py @@ -13,7 +13,7 @@ ''' Get elevation at adjusted USGS gages locations''' -def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename,dem_adj_filename,ref_table_filename,output_table_filename): +def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename,dem_adj_filename,output_table_filename): wbd_buffer = gpd.read_file(wbd_buffer_filename) usgs_gages = gpd.read_file(usgs_gages_filename, mask=wbd_buffer) @@ -21,16 +21,14 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in input_flows = gpd.read_file(input_flows_filename) input_catchment = gpd.read_file(input_catchment_filename) dem_adj = rasterio.open(dem_adj_filename,'r') - ref_table = pd.read_csv(ref_table_filename) - # Identify closest HydroID closest_catchment = gpd.sjoin(usgs_gages, input_catchment, how='left', op='within').reset_index(drop=True) - closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID','Min_Thal_Elev_m','Median_Thal_Elev_m','Max_Thal_Elev_m']) + closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID','Min_Thal_Elev_m','Median_Thal_Elev_m','Max_Thal_Elev_m', 'order_']) if input_flows.HydroID.dtype != 'int': input_flows.HydroID = input_flows.HydroID.astype(int) - columns = ['usgs_gage_id','HydroID','dem_elevation','dem_adj_elevation','min_thal_elev', 'med_thal_elev','max_thal_elev'] + columns = 
['usgs_gage_id','HydroID','dem_elevation','dem_adj_elevation','min_thal_elev', 'med_thal_elev','max_thal_elev','str_order'] gage_data = [] # Move USGS gage to stream @@ -40,6 +38,7 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in # Get stream attributes hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() + str_order = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].order_.item() if not np.isnan(hydro_id): @@ -77,12 +76,11 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in print(f"post adjusted elevation: {dem_adj_elev}") # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table - site_elevations = [gage.site_no, hydro_id, dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev] + site_elevations = [gage.site_no, hydro_id, dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev,str_order] gage_data.append(site_elevations) elev_table = pd.DataFrame(gage_data, columns=columns) - # elev_table = elev_table.merge(ref_table, on='HydroID') if not elev_table.empty: elev_table.to_csv(output_table_filename,index=False) @@ -97,7 +95,6 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in parser.add_argument('-cat','--input-catchment-filename', help='DEM derived catchments', required=True) parser.add_argument('-wbd','--wbd-buffer-filename', help='WBD buffer', required=True) parser.add_argument('-dem_adj','--dem-adj-filename', help='Thalweg adjusted DEM', required=True) - parser.add_argument('-reftable','--ref-table-filename', help='Hand reference table', required=True) parser.add_argument('-outtable','--output-table-filename', help='Table to append data', required=True) args = vars(parser.parse_args()) @@ -108,7 +105,6 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in input_catchment_filename = args['input_catchment_filename'] wbd_buffer_filename = args['wbd_buffer_filename'] dem_adj_filename = args['dem_adj_filename'] - ref_table_filename = args['ref_table_filename'] output_table_filename = args['output_table_filename'] - crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename, dem_adj_filename,ref_table_filename,output_table_filename) + crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename, dem_adj_filename,output_table_filename) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py new file mode 100755 index 000000000..603f697ed --- /dev/null +++ b/tools/rating_curve_comparison.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 + +import os +import sys +import geopandas as gpd +import pandas as pd +import numpy as np +import argparse +import matplotlib.pyplot as plt +import seaborn as sns +from functools import reduce +from multiprocessing import Pool +from os.path import isfile, join, dirname +sys.path.append('/foss_fim/src') +from utils.shared_functions import getDriver + +""" + Plot Rating Curves and Compare to USGS Gages + + Parameters + ---------- + output_dir : str + Directory containing FIM output folders. + usgs_gages_filename : str + File name of USGS rating curves. + nwm_flow_dir : str + Directory containing NWM recurrence flows files. 
+""" + +# recurr_intervals = ['recurr_1_5_cms.csv','recurr_5_0_cms.csv','recurr_10_0_cms.csv'] + +def generate_rating_curve_metrics(args): + + elev_table_filename = args[0] + hydrotable_filename = args[1] + usgs_gages_filename = args[2] + usgs_recurr_stats_filename = args[3] + nwm_recurr_data_filename = args[4] + rc_comparison_plot_filename = args[5] + nwm_flow_dir = args[6] + huc = args[7] + + elev_table = pd.read_csv(elev_table_filename) + hydrotable = pd.read_csv(hydrotable_filename) + usgs_gages = pd.read_csv(usgs_gages_filename) + + # Join rating curves with elevation data + hydrotable = hydrotable.merge(elev_table, on="HydroID") + relevant_gages = list(hydrotable.usgs_gage_id.unique()) + usgs_gages = usgs_gages[usgs_gages['location_id'].isin(relevant_gages)] + usgs_gages = usgs_gages.reset_index(drop=True) + + if len(usgs_gages) > 0: + + # Adjust rating curve to elevation + hydrotable['thal_elevation'] = (hydrotable.stage + hydrotable.dem_adj_elevation) * 3.28084 # convert from m to ft + # hydrotable['raw_elevation'] = (hydrotable.stage + hydrotable.dem_elevation) * 3.28084 # convert from m to ft + hydrotable['discharge_cfs'] = hydrotable.discharge_cms * 35.3147 + usgs_gages = usgs_gages.rename(columns={"flow": "discharge_cfs", "elevation_navd88": "elevation"}) + + hydrotable['Source'] = "FIM" + usgs_gages['Source'] = "USGS" + limited_hydrotable = hydrotable.filter(items=['usgs_gage_id','thal_elevation','discharge_cfs','Source']) + limited_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation', 'discharge_cfs','Source']) + + rating_curves = limited_hydrotable.rename(columns={"usgs_gage_id": "location_id","thal_elevation": "elevation"}) + + rating_curves = rating_curves.append(limited_usgs_gages) + rating_curves = rating_curves.rename(columns={"location_id": "USGS Gage"}) + + generate_facet_plot(rating_curves, rc_comparison_plot_filename) + + ## Calculate metrics for NWM reccurence intervals + # NWM recurr intervals + recurr_1_5_yr_filename = join(nwm_flow_dir,'recurr_1_5_cms.csv') + recurr_5_yr_filename = join(nwm_flow_dir,'recurr_5_0_cms.csv') + recurr_10_yr_filename = join(nwm_flow_dir,'recurr_10_0_cms.csv') + + recurr_1_5_yr = pd.read_csv(recurr_1_5_yr_filename) + recurr_1_5_yr = recurr_1_5_yr.rename(columns={"discharge": "1.5"}) + recurr_5_yr = pd.read_csv(recurr_5_yr_filename) + recurr_5_yr = recurr_5_yr.rename(columns={"discharge": "5.0"}) + recurr_10_yr = pd.read_csv(recurr_10_yr_filename) + recurr_10_yr = recurr_10_yr.rename(columns={"discharge": "10.0"}) + + nwm_recurr_intervals_all = reduce(lambda x,y: pd.merge(x,y, on='feature_id', how='outer'), [recurr_1_5_yr, recurr_5_yr, recurr_10_yr]) + nwm_recurr_intervals_all = pd.melt(nwm_recurr_intervals_all, id_vars=['feature_id'], value_vars=['1.5','5.0','10.0'], var_name='recurr_interval', value_name='discharge_cms') + nwm_recurr_intervals_all['discharge_cfs'] = nwm_recurr_intervals_all.discharge_cms * 35.3147 + nwm_recurr_intervals_all = nwm_recurr_intervals_all.filter(items=['discharge_cfs', 'recurr_interval','feature_id']).drop_duplicates() + + usgs_crosswalk = hydrotable.filter(items=['usgs_gage_id', 'feature_id']).drop_duplicates() + + nwm_recurr_data_table = pd.DataFrame() + columns = ['usgs_gage','NRMSE','mean_abs_y_diff','percent_bias'] + usgs_recurr_stats = [] + + for index, gage in usgs_crosswalk.iterrows(): + ## Interpolate USGS/FIM elevation at NWM recurrence intervals + # Interpolate USGS elevation at NWM recurrence intervals + usgs_rc = rating_curves.loc[(rating_curves["USGS Gage"]==gage.usgs_gage_id) & 
(rating_curves.Source=="USGS")] + usgs_pred_elev = get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) + + # handle sites missing data + if len(usgs_pred_elev) <1: + continue + + # clean up data + usgs_pred_elev['usgs_gage'] = gage.usgs_gage_id + usgs_pred_elev = usgs_pred_elev.filter(items=['usgs_gage','recurr_interval', 'discharge_cfs','pred_elev']) + usgs_pred_elev = usgs_pred_elev.rename(columns={"pred_elev": "usgs_pred_elev"}) + + # Interpolate FIM elevation at NWM recurrence intervals + fim_rc = rating_curves.loc[(rating_curves["USGS Gage"]==gage.usgs_gage_id) & (rating_curves.Source=="FIM")] + fim_pred_elev = get_reccur_intervals(fim_rc, usgs_crosswalk,nwm_recurr_intervals_all) + + # handle sites missing data + if len(fim_pred_elev) <1: + print(f"missing fim elevation data for usgs station {gage.usgs_gage_id} in huc {huc}") + continue + + # clean up data + fim_pred_elev = fim_pred_elev.rename(columns={"pred_elev": "fim_pred_elev"}) + fim_pred_elev = fim_pred_elev.filter(items=['recurr_interval', 'discharge_cfs','fim_pred_elev']) + usgs_pred_elev = usgs_pred_elev.merge(fim_pred_elev, on=['recurr_interval','discharge_cfs']) # str_order + usgs_pred_elev['HUC'] = huc + nwm_recurr_data_table = nwm_recurr_data_table.append(usgs_pred_elev) + + ## Interpolate FIM elevation at USGS observations + # Sort stage in ascending order + usgs_rc = usgs_rc.sort_values('elevation',ascending=True) + fim_rc = fim_rc.merge(usgs_crosswalk, left_on="USGS Gage", right_on="usgs_gage_id") + usgs_rc['pred_elev'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation'], left = np.nan, right = np.nan) + + usgs_rc = usgs_rc[usgs_rc['pred_elev'].notna()] + rc_stats_plot_filename = join(dirname(rc_comparison_plot_filename),'rating_curve_stats' + str(gage.usgs_gage_id) +'.png') + + if not usgs_rc.empty: + gage_stats = calculate_rc_stats_stage(usgs_rc,rc_stats_plot_filename) + + usgs_recurr_stats.append(gage_stats) + + usgs_recurr_stats_table = pd.DataFrame(usgs_recurr_stats, columns=columns) + + if not usgs_recurr_stats_table.empty: + usgs_recurr_stats_table.to_csv(usgs_recurr_stats_filename,index=False) + + if not nwm_recurr_data_table.empty: + nwm_recurr_data_table.to_csv(nwm_recurr_data_filename,index=False) + + else: + print(f"no USGS data for gage(s): {relevant_gages} in huc {huc}") + +def aggregate_metrics(output_dir,procs_list): + + agg_usgs_interp_elev_stats = join(output_dir,'agg_usgs_interp_elev_stats.csv') + agg_nwm_recurr_flow_elev = join(output_dir,'agg_nwm_recurr_flow_elevations.csv') + + for huc in procs_list: + if os.path.isfile(huc[3]): + usgs_recurr_stats = pd.read_csv(huc[3]) + + # Write/append usgs_recurr_stats + if os.path.isfile(agg_usgs_interp_elev_stats): + usgs_recurr_stats.to_csv(agg_usgs_interp_elev_stats,index=False, mode='a',header=False) + else: + usgs_recurr_stats.to_csv(agg_usgs_interp_elev_stats,index=False) + + if os.path.isfile(huc[4]): + nwm_recurr_data = pd.read_csv(huc[4]) + + # Write/append nwm_recurr_data + if os.path.isfile(agg_nwm_recurr_flow_elev): + nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False, mode='a',header=False) + else: + nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False) + + +def generate_facet_plot(rating_curves, rc_comparison_plot_filename): + # Filter FIM elevation based on USGS data + for gage in rating_curves['USGS Gage'].unique(): + + min_elev = rating_curves.loc[(rating_curves['USGS Gage']==gage) & (rating_curves.Source=='USGS')].elevation.min() + max_elev = 
rating_curves.loc[(rating_curves['USGS Gage']==gage) & (rating_curves.Source=='USGS')].elevation.max() + + rating_curves_map = rating_curves.drop(rating_curves[(rating_curves['USGS Gage']==gage) & (rating_curves.Source=='FIM') & (rating_curves.elevation > (max_elev + 2))].index) + rating_curves_map = rating_curves.drop(rating_curves[(rating_curves['USGS Gage']==gage) & (rating_curves.Source=='FIM') & (rating_curves.elevation < min_elev - 2)].index) + + ## Generate rating curve plots + sns.set(style="ticks") + g = sns.FacetGrid(rating_curves_map, col="USGS Gage", hue="Source",sharex=False, sharey=False,col_wrap=3) + g.map(sns.scatterplot, "discharge_cfs", "elevation", palette="tab20c", marker="o") + g.set_axis_labels(x_var="Discharge (cfs)", y_var="Stage (ft)") + + # Adjust the arrangement of the plots + g.fig.tight_layout(w_pad=1) + g.add_legend() + + plt.savefig(rc_comparison_plot_filename) + + +def get_reccur_intervals(site_rc, usgs_crosswalk,nwm_recurr_intervals): + + usgs_site = site_rc.merge(usgs_crosswalk, left_on="USGS Gage", right_on="usgs_gage_id") + nwm_ids = len(usgs_site.feature_id.drop_duplicates()) + + if nwm_ids > 0: + + nwm_recurr_intervals = nwm_recurr_intervals.copy().loc[nwm_recurr_intervals.feature_id==usgs_site.feature_id.drop_duplicates().item()] + nwm_recurr_intervals['pred_elev'] = np.interp(nwm_recurr_intervals.discharge_cfs.values, usgs_site['discharge_cfs'], usgs_site['elevation'], left = np.nan, right = np.nan) + + return nwm_recurr_intervals + + else: + return [] + + +def calculate_rc_stats_stage(rating_curve, fig_path): + station = rating_curve["USGS Gage"].unique().item() + + # Get the interpolated hand column, for now it is just the last column but THIS NEEDS TO BE BETTER FORMALIZED. + usgs_stage = "elevation" + flows = "discharge_cfs" + hand_stage = "pred_elev" + + # Calculate variables for NRMSE + rating_curve["yhat_minus_y"] = rating_curve[hand_stage] - rating_curve[usgs_stage] + rating_curve["yhat_minus_y_squared"] = rating_curve["yhat_minus_y"] ** 2 + sum_y_diff = rating_curve["yhat_minus_y_squared"].sum() + + # determine number of events that are modeled + n = rating_curve[usgs_stage].count() + + # Determine the maximum/minimum USGS stage + y_max = rating_curve[usgs_stage].max() + y_min = rating_curve[usgs_stage].min() + + # Calculate NRMSE + NRMSE_numerator = (sum_y_diff / n) ** 0.5 + NRMSE_denominator = y_max - y_min + NRMSE = NRMSE_numerator / NRMSE_denominator + + # Calculate Mean Absolute Depth Difference + mean_abs_y_diff = abs(rating_curve["yhat_minus_y"]).mean() + + # Calculate Percent Bias + percent_bias = 100 * (rating_curve["yhat_minus_y"].sum() / rating_curve[usgs_stage].sum()) + + ## plot USGS rating curve and HAND rating curve and display statistics + fig, ax = plt.subplots() + rating_curve.plot( + x=flows, + y=usgs_stage, + ax=ax, + legend=False, + style="-", + color="orange", + zorder=2, + ) + rating_curve.plot( + x=flows, + y=usgs_stage, + ax=ax, + legend=False, + kind="scatter", + marker="o", + s=30.0, + color="black", + zorder=3, + ) + rating_curve.plot( + x=flows, y=hand_stage, ax=ax, legend=False, style="--", color="gray", zorder=2 + ) + rating_curve.plot( + x=flows, + y=hand_stage, + ax=ax, + legend=False, + kind="scatter", + marker="x", + s=30.0, + color="blue", + zorder=3, + ) + ax.set_xlabel("Flow (cfs)") + ax.set_ylabel("Elevation (ft)") + ax.legend(["USGS Curve", "HAND Curve"], loc="best") + ax.grid(zorder=1) + fig.suptitle( + "Rating Curve Plot ({})\nNRMSE = {}; Mean Abs Diff = {} ft; Bias = {}%".format( + station, + 
round(NRMSE, 2), + round(mean_abs_y_diff, 2), + round(percent_bias, 1), + ) + ) + fig.savefig(fig_path) + plt.close(fig) + return [station, NRMSE, mean_abs_y_diff, percent_bias] + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='generate rating curve plots and tables for FIM and USGS gages') + parser.add_argument('-output_dir','--output-dir', help='FIM output dir', required=True) + parser.add_argument('-gages','--usgs-gages-filename',help='USGS rating curves',required=True) + parser.add_argument('-flows','--nwm-flow-dir',help='NWM recurrence flows dir',required=True) + parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) + + args = vars(parser.parse_args()) + + output_dir = args['output_dir'] + usgs_gages_filename = args['usgs_gages_filename'] + nwm_flow_dir = args['nwm_flow_dir'] + number_of_jobs = args['number_of_jobs'] + + procs_list = [] + + huc_list = os.listdir(output_dir) + for huc in huc_list: + elev_table_filename = join(output_dir,huc,'usgs_elev_table.csv') + hydrotable_filename = join(output_dir,huc,'hydroTable.csv') + usgs_recurr_stats_filename = join(output_dir,huc,'usgs_interpolated_elevation_stats.csv') + nwm_recurr_data_filename = join(output_dir,huc,'nwm_recurrence_flow_elevations.csv') + rc_comparison_plot_filename = join(output_dir,huc,'FIM-USGS_rating_curve_comparison.png') + + if isfile(elev_table_filename): + procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir,huc]) + + # Initiate multiprocessing + print(f"Generating rating curve metrics for {len(procs_list)} hucs using {number_of_jobs} jobs") + pool = Pool(number_of_jobs) + pool.map(generate_rating_curve_metrics, procs_list) + + print(f"Aggregating rating curve metrics for {len(procs_list)} hucs") + aggregate_metrics(output_dir,procs_list) From 0ff56b553deaddd7667e219dbd61a4e6894286f4 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 25 Mar 2021 14:26:32 +0000 Subject: [PATCH 24/66] fixing merge conflict --- tools/synthesize_test_cases.py | 80 +++++++++------------------------- 1 file changed, 20 insertions(+), 60 deletions(-) diff --git a/tools/synthesize_test_cases.py b/tools/synthesize_test_cases.py index e922abf2b..06f55b4a0 100755 --- a/tools/synthesize_test_cases.py +++ b/tools/synthesize_test_cases.py @@ -68,7 +68,6 @@ def create_master_metrics_csv(master_metrics_csv_output, dev_comparison): for benchmark_source in ['ble', 'nws', 'usgs']: benchmark_test_case_dir = os.path.join(TEST_CASES_DIR, benchmark_source + '_test_cases') - if benchmark_source == 'ble': test_cases_list = os.listdir(benchmark_test_case_dir) @@ -77,16 +76,16 @@ def create_master_metrics_csv(master_metrics_csv_output, dev_comparison): int(test_case.split('_')[0]) huc = test_case.split('_')[0] - + for iteration in iteration_list: - + if iteration == "official": versions_to_crawl = os.path.join(benchmark_test_case_dir, test_case, 'official_versions') versions_to_aggregate = os.listdir(PREVIOUS_FIM_DIR) if iteration == "comparison": versions_to_crawl = os.path.join(benchmark_test_case_dir, test_case, 'testing_versions') versions_to_aggregate = [dev_comparison] - + for magnitude in ['100yr', '500yr']: for version in versions_to_aggregate: if '_fr' in version: @@ -101,7 +100,7 @@ def create_master_metrics_csv(master_metrics_csv_output, dev_comparison): calibrated = "no" version_dir = os.path.join(versions_to_crawl, version) magnitude_dir = 
os.path.join(version_dir, magnitude) - + if os.path.exists(magnitude_dir): magnitude_dir_list = os.listdir(magnitude_dir) for f in magnitude_dir_list: @@ -120,60 +119,22 @@ def create_master_metrics_csv(master_metrics_csv_output, dev_comparison): sub_list_to_append.append(benchmark_source) sub_list_to_append.append(extent_config) sub_list_to_append.append(calibrated) - + list_to_write.append(sub_list_to_append) + except ValueError: + pass - official_versions = os.path.join(benchmark_test_case_dir, test_case, 'official_versions') + if benchmark_source in AHPS_BENCHMARK_CATEGORIES: + test_cases_list = os.listdir(benchmark_test_case_dir) - for magnitude in ['action', 'minor', 'moderate', 'major']: - for version in versions_to_aggregate: - if '_fr' in version: - extent_config = 'FR' - elif '_ms' in version: - extent_config = 'MS' - else: - extent_config = 'FR' - if "_c" in version and version.split('_c')[1] == "": - calibrated = "yes" - else: - calibrated = "no" - - version_dir = os.path.join(official_versions, version) - magnitude_dir = os.path.join(version_dir, magnitude) - if os.path.exists(magnitude_dir): - magnitude_dir_list = os.listdir(magnitude_dir) - for f in magnitude_dir_list: - if '.json' in f and 'total_area' not in f: - nws_lid = f[:5] - sub_list_to_append = [version, nws_lid, magnitude, huc] - full_json_path = os.path.join(magnitude_dir, f) - flow = '' - if os.path.exists(full_json_path): - - # Get flow used to map. - flow_file = os.path.join(benchmark_test_case_dir, 'validation_data_' + benchmark_source, huc, nws_lid, magnitude, 'ahps_' + nws_lid + '_huc_' + huc + '_flows_' + magnitude + '.csv') - if os.path.exists(flow_file): - with open(flow_file, newline='') as csv_file: - reader = csv.reader(csv_file) - next(reader) - for row in reader: - flow = row[1] - if nws_lid == 'mcc01': - print(flow) - - stats_dict = json.load(open(full_json_path)) - for metric in metrics_to_write: - sub_list_to_append.append(stats_dict[metric]) - sub_list_to_append.append(full_json_path) - sub_list_to_append.append(flow) - sub_list_to_append.append(benchmark_source) - sub_list_to_append.append(extent_config) - sub_list_to_append.append(calibrated) - - list_to_write.append(sub_list_to_append) + for test_case in test_cases_list: + try: + int(test_case.split('_')[0]) + huc = test_case.split('_')[0] + for iteration in iteration_list: - + if iteration == "official": versions_to_crawl = os.path.join(benchmark_test_case_dir, test_case, 'official_versions') versions_to_aggregate = os.listdir(PREVIOUS_FIM_DIR) @@ -193,7 +154,7 @@ def create_master_metrics_csv(master_metrics_csv_output, dev_comparison): calibrated = "yes" else: calibrated = "no" - + version_dir = os.path.join(versions_to_crawl, version) magnitude_dir = os.path.join(version_dir, magnitude) if os.path.exists(magnitude_dir): @@ -205,7 +166,7 @@ def create_master_metrics_csv(master_metrics_csv_output, dev_comparison): full_json_path = os.path.join(magnitude_dir, f) flow = '' if os.path.exists(full_json_path): - + # Get flow used to map. 
flow_file = os.path.join(benchmark_test_case_dir, 'validation_data_' + benchmark_source, huc, nws_lid, magnitude, 'ahps_' + nws_lid + '_huc_' + huc + '_flows_' + magnitude + '.csv') if os.path.exists(flow_file): @@ -216,7 +177,7 @@ def create_master_metrics_csv(master_metrics_csv_output, dev_comparison): flow = row[1] if nws_lid == 'mcc01': print(flow) - + stats_dict = json.load(open(full_json_path)) for metric in metrics_to_write: sub_list_to_append.append(stats_dict[metric]) @@ -225,9 +186,8 @@ def create_master_metrics_csv(master_metrics_csv_output, dev_comparison): sub_list_to_append.append(benchmark_source) sub_list_to_append.append(extent_config) sub_list_to_append.append(calibrated) - + list_to_write.append(sub_list_to_append) - except ValueError: pass @@ -366,7 +326,7 @@ def process_alpha_test(args): # Do aggregate_metrics. print("Creating master metrics CSV...") - + if config == 'DEV': dev_comparison = fim_version + "_" + special_string else: From eb670e0f20525792849ad00033554a6db9fe3bd8 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 25 Mar 2021 16:21:27 +0000 Subject: [PATCH 25/66] updating args and renaming crosswalk --- src/run_by_unit.sh | 2 +- ...el_crosswalk.py => usgs_gage_crosswalk.py} | 27 ++++++++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) rename src/{usgs_catchment_pixel_crosswalk.py => usgs_gage_crosswalk.py} (88%) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index cf95d5286..86a409bfe 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -437,7 +437,7 @@ Tcount echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv date -u Tstart -$srcDir/usgs_catchment_pixel_crosswalk.py -gages $inputDataDir/ahp_sites/evaluated_active_gages.shp -dem_m $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -reftable $outputHucDataDir/hand_ref_elev_table.csv -outtable $outputHucDataDir/usgs_elev_table.csv +$srcDir/usgs_catchment_pixel_crosswalk.py -gages $inputDataDir/ahp_sites/evaluated_active_gages.shp -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv Tcount ## CLEANUP OUTPUTS ## diff --git a/src/usgs_catchment_pixel_crosswalk.py b/src/usgs_gage_crosswalk.py similarity index 88% rename from src/usgs_catchment_pixel_crosswalk.py rename to src/usgs_gage_crosswalk.py index ad0d13349..296c9cdea 100755 --- a/src/usgs_catchment_pixel_crosswalk.py +++ b/src/usgs_gage_crosswalk.py @@ -9,8 +9,25 @@ import pygeos from shapely.wkb import dumps, loads - -''' Get elevation at adjusted USGS gages locations''' +''' Get elevation at adjusted USGS gages locations + + Parameters + ---------- + usgs_gages_filename : str + File name of USGS stations layer. + dem_filename : str + File name of original DEM. + input_flows_filename : str + File name of FIM streams layer. + input_catchment_filename : str + File name of FIM catchment layer. + wbd_buffer_filename : str + File name of buffered wbd. + dem_adj_filename : str + File name of thalweg adjusted DEM. + output_table_filename : str + File name of output table. 
+''' def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename,dem_adj_filename,output_table_filename): @@ -71,10 +88,6 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in dem_m_elev = round(list(rasterio.sample.sample_gen(dem_m,shply_referenced_gage.coords))[0].item(),2) dem_adj_elev = round(list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item(),2) - # Print elevations to log file - print(f"post adjusted catchment pixel ID: {dem_m_elev}") - print(f"post adjusted elevation: {dem_adj_elev}") - # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table site_elevations = [gage.site_no, hydro_id, dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev,str_order] gage_data.append(site_elevations) @@ -90,7 +103,7 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in parser = argparse.ArgumentParser(description='Crosswalk USGS sites to HydroID and get elevations') parser.add_argument('-gages','--usgs-gages-filename', help='USGS gages', required=True) - parser.add_argument('-dem_m','--dem-filename',help='Catchment pixel raster',required=True) + parser.add_argument('-dem','--dem-filename',help='DEM',required=True) parser.add_argument('-flows','--input-flows-filename', help='DEM derived streams', required=True) parser.add_argument('-cat','--input-catchment-filename', help='DEM derived catchments', required=True) parser.add_argument('-wbd','--wbd-buffer-filename', help='WBD buffer', required=True) From be7543ca0609b32212f9ee06063dc7f8709a501b Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Thu, 25 Mar 2021 18:16:48 +0000 Subject: [PATCH 26/66] fixing bug in rem.py --- src/rem.py | 20 ++++++++++++++------ src/run_by_unit.sh | 2 +- src/usgs_gage_crosswalk.py | 10 +++++----- tools/rating_curve_comparison.py | 32 ++++++++++++++++++-------------- 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/rem.py b/src/rem.py index 27dd4ad1b..f0cd8fad3 100755 --- a/src/rem.py +++ b/src/rem.py @@ -106,13 +106,21 @@ def make_catchment_min_dict(flat_dem, catchment_min_dict, flat_catchments, thalw gw_catchments_pixels_masked_object.close() thalweg_raster_object.close() + # Merge and export dictionary to to_csv + catchment_min_dict_df = pd.DataFrame.from_dict(catchment_min_dict, orient='index') # convert dict to dataframe + catchment_min_dict_df.columns = ['Median_Thal_Elev_m'] + catchment_hydroid_dict_df = pd.DataFrame.from_dict(catchment_hydroid_dict, orient='index') # convert dict to dataframe + catchment_hydroid_dict_df.columns = ['HydroID'] + merge_df = catchment_hydroid_dict_df.merge(catchment_min_dict_df, left_index=True, right_index=True) + merge_df.index.name = 'pixelcatch_id' + # Merge the HAND reference elevation by HydroID dataframe with the demDerived_reaches layer (add new layer attribute) - min_by_hydroid = merge_df.groupby(['HydroID']).min() # min value of all Median_Thal_Elev_m for pixel catchments in each HydroID reach - min_by_hydroid.columns = ['Min_Thal_Elev_m'] - med_by_hydroid = merge_df.groupby(['HydroID']).median() # median value of all Median_Thal_Elev_m for pixel catchments in each HydroID reach - med_by_hydroid.columns = ['Median_Thal_Elev_m'] - max_by_hydroid = merge_df.groupby(['HydroID']).max() # max value of all Median_Thal_Elev_m for pixel catchments in each HydroID reach - max_by_hydroid.columns = ['Max_Thal_Elev_m'] + min_by_hydroid = merge_df.groupby(['HydroID']).min() # min value of all 
med_thal_elev for pixel catchments in each HydroID reach + min_by_hydroid.columns = ['min_thal_elev'] + med_by_hydroid = merge_df.groupby(['HydroID']).median() # median value of all med_thal_elev for pixel catchments in each HydroID reach + med_by_hydroid.columns = ['med_thal_elev'] + max_by_hydroid = merge_df.groupby(['HydroID']).max() # max value of all med_thal_elev for pixel catchments in each HydroID reach + max_by_hydroid.columns = ['max_thal_elev'] input_reaches = gpd.read_file(dem_reaches_filename) input_reaches = input_reaches.merge(min_by_hydroid, on='HydroID') # merge dataframes by HydroID variable input_reaches = input_reaches.merge(med_by_hydroid, on='HydroID') # merge dataframes by HydroID variable diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 86a409bfe..6805be7e3 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -437,7 +437,7 @@ Tcount echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv date -u Tstart -$srcDir/usgs_catchment_pixel_crosswalk.py -gages $inputDataDir/ahp_sites/evaluated_active_gages.shp -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv +$srcDir/usgs_gage_crosswalk.py -gages $inputDataDir/ahp_sites/evaluated_active_gages.shp -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv Tcount ## CLEANUP OUTPUTS ## diff --git a/src/usgs_gage_crosswalk.py b/src/usgs_gage_crosswalk.py index 296c9cdea..8c45f6b1b 100755 --- a/src/usgs_gage_crosswalk.py +++ b/src/usgs_gage_crosswalk.py @@ -41,11 +41,11 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in # Identify closest HydroID closest_catchment = gpd.sjoin(usgs_gages, input_catchment, how='left', op='within').reset_index(drop=True) - closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID','Min_Thal_Elev_m','Median_Thal_Elev_m','Max_Thal_Elev_m', 'order_']) + closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID','min_thal_elev','med_thal_elev','max_thal_elev', 'order_']) if input_flows.HydroID.dtype != 'int': input_flows.HydroID = input_flows.HydroID.astype(int) - columns = ['usgs_gage_id','HydroID','dem_elevation','dem_adj_elevation','min_thal_elev', 'med_thal_elev','max_thal_elev','str_order'] + columns = ['location_id','HydroID','dem_elevation','dem_adj_elevation','min_thal_elev', 'med_thal_elev','max_thal_elev','str_order'] gage_data = [] # Move USGS gage to stream @@ -59,9 +59,9 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in if not np.isnan(hydro_id): - min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Min_Thal_Elev_m.item(),2) - med_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Median_Thal_Elev_m.item(),2) - max_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].Max_Thal_Elev_m.item(),2) + min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].min_thal_elev.item(),2) + med_thal_elev = 
round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].med_thal_elev.item(),2) + max_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].max_thal_elev.item(),2) # Convert headwater point geometries to WKB representation wkb_gages = dumps(gage.geometry) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index 603f697ed..b3294dc95 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -46,28 +46,32 @@ def generate_rating_curve_metrics(args): # Join rating curves with elevation data hydrotable = hydrotable.merge(elev_table, on="HydroID") - relevant_gages = list(hydrotable.usgs_gage_id.unique()) + relevant_gages = list(hydrotable.location_id.unique()) usgs_gages = usgs_gages[usgs_gages['location_id'].isin(relevant_gages)] usgs_gages = usgs_gages.reset_index(drop=True) if len(usgs_gages) > 0: # Adjust rating curve to elevation - hydrotable['thal_elevation'] = (hydrotable.stage + hydrotable.dem_adj_elevation) * 3.28084 # convert from m to ft + hydrotable['elevation'] = (hydrotable.stage + hydrotable.dem_adj_elevation) * 3.28084 # convert from m to ft # hydrotable['raw_elevation'] = (hydrotable.stage + hydrotable.dem_elevation) * 3.28084 # convert from m to ft hydrotable['discharge_cfs'] = hydrotable.discharge_cms * 35.3147 usgs_gages = usgs_gages.rename(columns={"flow": "discharge_cfs", "elevation_navd88": "elevation"}) hydrotable['Source'] = "FIM" usgs_gages['Source'] = "USGS" - limited_hydrotable = hydrotable.filter(items=['usgs_gage_id','thal_elevation','discharge_cfs','Source']) - limited_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation', 'discharge_cfs','Source']) + limited_hydrotable = hydrotable.filter(items=['location_id','elevation','discharge_cfs','Source']) + select_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation', 'discharge_cfs','Source']) - rating_curves = limited_hydrotable.rename(columns={"usgs_gage_id": "location_id","thal_elevation": "elevation"}) + rating_curves = rating_curves.append(select_usgs_gages) + + # add stream order + stream_order = hydrotable.filter(items=['location_id','str_order']) + rating_curves = rating_curves.merge(stream_order, on='location_id') - rating_curves = rating_curves.append(limited_usgs_gages) rating_curves = rating_curves.rename(columns={"location_id": "USGS Gage"}) + generate_facet_plot(rating_curves, rc_comparison_plot_filename) ## Calculate metrics for NWM reccurence intervals @@ -88,7 +92,7 @@ def generate_rating_curve_metrics(args): nwm_recurr_intervals_all['discharge_cfs'] = nwm_recurr_intervals_all.discharge_cms * 35.3147 nwm_recurr_intervals_all = nwm_recurr_intervals_all.filter(items=['discharge_cfs', 'recurr_interval','feature_id']).drop_duplicates() - usgs_crosswalk = hydrotable.filter(items=['usgs_gage_id', 'feature_id']).drop_duplicates() + usgs_crosswalk = hydrotable.filter(items=['location_id', 'feature_id']).drop_duplicates() nwm_recurr_data_table = pd.DataFrame() columns = ['usgs_gage','NRMSE','mean_abs_y_diff','percent_bias'] @@ -97,7 +101,7 @@ def generate_rating_curve_metrics(args): for index, gage in usgs_crosswalk.iterrows(): ## Interpolate USGS/FIM elevation at NWM recurrence intervals # Interpolate USGS elevation at NWM recurrence intervals - usgs_rc = rating_curves.loc[(rating_curves["USGS Gage"]==gage.usgs_gage_id) & (rating_curves.Source=="USGS")] + usgs_rc = rating_curves.loc[(rating_curves["USGS Gage"]==gage.location_id) & (rating_curves.Source=="USGS")] usgs_pred_elev = 
get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) # handle sites missing data @@ -105,17 +109,17 @@ def generate_rating_curve_metrics(args): continue # clean up data - usgs_pred_elev['usgs_gage'] = gage.usgs_gage_id + usgs_pred_elev['usgs_gage'] = gage.location_id usgs_pred_elev = usgs_pred_elev.filter(items=['usgs_gage','recurr_interval', 'discharge_cfs','pred_elev']) usgs_pred_elev = usgs_pred_elev.rename(columns={"pred_elev": "usgs_pred_elev"}) # Interpolate FIM elevation at NWM recurrence intervals - fim_rc = rating_curves.loc[(rating_curves["USGS Gage"]==gage.usgs_gage_id) & (rating_curves.Source=="FIM")] + fim_rc = rating_curves.loc[(rating_curves["USGS Gage"]==gage.location_id) & (rating_curves.Source=="FIM")] fim_pred_elev = get_reccur_intervals(fim_rc, usgs_crosswalk,nwm_recurr_intervals_all) # handle sites missing data if len(fim_pred_elev) <1: - print(f"missing fim elevation data for usgs station {gage.usgs_gage_id} in huc {huc}") + print(f"missing fim elevation data for usgs station {gage.location_id} in huc {huc}") continue # clean up data @@ -128,11 +132,11 @@ def generate_rating_curve_metrics(args): ## Interpolate FIM elevation at USGS observations # Sort stage in ascending order usgs_rc = usgs_rc.sort_values('elevation',ascending=True) - fim_rc = fim_rc.merge(usgs_crosswalk, left_on="USGS Gage", right_on="usgs_gage_id") + fim_rc = fim_rc.merge(usgs_crosswalk, left_on="USGS Gage", right_on="location_id") usgs_rc['pred_elev'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation'], left = np.nan, right = np.nan) usgs_rc = usgs_rc[usgs_rc['pred_elev'].notna()] - rc_stats_plot_filename = join(dirname(rc_comparison_plot_filename),'rating_curve_stats' + str(gage.usgs_gage_id) +'.png') + rc_stats_plot_filename = join(dirname(rc_comparison_plot_filename),'rating_curve_stats' + str(gage.location_id) +'.png') if not usgs_rc.empty: gage_stats = calculate_rc_stats_stage(usgs_rc,rc_stats_plot_filename) @@ -200,7 +204,7 @@ def generate_facet_plot(rating_curves, rc_comparison_plot_filename): def get_reccur_intervals(site_rc, usgs_crosswalk,nwm_recurr_intervals): - usgs_site = site_rc.merge(usgs_crosswalk, left_on="USGS Gage", right_on="usgs_gage_id") + usgs_site = site_rc.merge(usgs_crosswalk, left_on="USGS Gage", right_on="location_id") nwm_ids = len(usgs_site.feature_id.drop_duplicates()) if nwm_ids > 0: From aab613bf7bb5e6b1aaa109dc12f879f10944cf48 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Fri, 26 Mar 2021 01:53:20 +0000 Subject: [PATCH 27/66] str_order object issue - still not resolved --- src/usgs_gage_crosswalk.py | 2 +- tools/rating_curve_comparison.py | 246 ++++++++++++++++--------------- 2 files changed, 129 insertions(+), 119 deletions(-) diff --git a/src/usgs_gage_crosswalk.py b/src/usgs_gage_crosswalk.py index 8c45f6b1b..6ce172856 100755 --- a/src/usgs_gage_crosswalk.py +++ b/src/usgs_gage_crosswalk.py @@ -55,7 +55,7 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in # Get stream attributes hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() - str_order = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].order_.item() + str_order = str(int(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].order_.item())) if not np.isnan(hydro_id): diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index b3294dc95..fc54fc6d4 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ 
-63,14 +63,13 @@ def generate_rating_curve_metrics(args): limited_hydrotable = hydrotable.filter(items=['location_id','elevation','discharge_cfs','Source']) select_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation', 'discharge_cfs','Source']) - rating_curves = rating_curves.append(select_usgs_gages) + rating_curves = limited_hydrotable.append(select_usgs_gages) # add stream order - stream_order = hydrotable.filter(items=['location_id','str_order']) + stream_order = hydrotable.filter(items=['location_id','str_order']).drop_duplicates() rating_curves = rating_curves.merge(stream_order, on='location_id') - - rating_curves = rating_curves.rename(columns={"location_id": "USGS Gage"}) - + rating_curves['str_order'] = rating_curves['str_order'].astype('int') + rating_curves['str_order'] = rating_curves['str_order'].astype('str') generate_facet_plot(rating_curves, rc_comparison_plot_filename) @@ -95,13 +94,23 @@ def generate_rating_curve_metrics(args): usgs_crosswalk = hydrotable.filter(items=['location_id', 'feature_id']).drop_duplicates() nwm_recurr_data_table = pd.DataFrame() - columns = ['usgs_gage','NRMSE','mean_abs_y_diff','percent_bias'] - usgs_recurr_stats = [] + usgs_recurr_data = pd.DataFrame() for index, gage in usgs_crosswalk.iterrows(): ## Interpolate USGS/FIM elevation at NWM recurrence intervals # Interpolate USGS elevation at NWM recurrence intervals - usgs_rc = rating_curves.loc[(rating_curves["USGS Gage"]==gage.location_id) & (rating_curves.Source=="USGS")] + usgs_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.Source=="USGS")] + try: + # str_order = usgs_rc.str_order.unique().item() + usgs_rc = usgs_rc.set_index('str_order') + str_order = usgs_rc.index.unique() + usgs_rc = usgs_rc.reset_index() + except: + try: + str_order = list(set(usgs_rc.str_order.to_list()))[0] # pandas is unusable sometimes + except: + print(f"something is messed up with this site: huc {huc}, site {gage.location_id}, rating curve shape {rating_curves.shape}, rating curve columns {rating_curves.columns}, rating curve str_order column {rating_curves.str_order.head()}") + usgs_pred_elev = get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) # handle sites missing data @@ -109,12 +118,12 @@ def generate_rating_curve_metrics(args): continue # clean up data - usgs_pred_elev['usgs_gage'] = gage.location_id - usgs_pred_elev = usgs_pred_elev.filter(items=['usgs_gage','recurr_interval', 'discharge_cfs','pred_elev']) - usgs_pred_elev = usgs_pred_elev.rename(columns={"pred_elev": "usgs_pred_elev"}) + usgs_pred_elev['location_id'] = gage.location_id + usgs_pred_elev = usgs_pred_elev.filter(items=['location_id','recurr_interval', 'discharge_cfs','pred_elev']) + usgs_pred_elev = usgs_pred_elev.rename(columns={"pred_elev": "USGS"}) # Interpolate FIM elevation at NWM recurrence intervals - fim_rc = rating_curves.loc[(rating_curves["USGS Gage"]==gage.location_id) & (rating_curves.Source=="FIM")] + fim_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.Source=="FIM")] fim_pred_elev = get_reccur_intervals(fim_rc, usgs_crosswalk,nwm_recurr_intervals_all) # handle sites missing data @@ -123,31 +132,40 @@ def generate_rating_curve_metrics(args): continue # clean up data - fim_pred_elev = fim_pred_elev.rename(columns={"pred_elev": "fim_pred_elev"}) - fim_pred_elev = fim_pred_elev.filter(items=['recurr_interval', 'discharge_cfs','fim_pred_elev']) - usgs_pred_elev = usgs_pred_elev.merge(fim_pred_elev, 
on=['recurr_interval','discharge_cfs']) # str_order + fim_pred_elev = fim_pred_elev.rename(columns={"pred_elev": "FIM"}) + fim_pred_elev = fim_pred_elev.filter(items=['recurr_interval', 'discharge_cfs','FIM']) + usgs_pred_elev = usgs_pred_elev.merge(fim_pred_elev, on=['recurr_interval','discharge_cfs']) + usgs_pred_elev['HUC'] = huc + usgs_pred_elev['str_order'] = str_order + + usgs_pred_elev = pd.melt(usgs_pred_elev, id_vars=['location_id','recurr_interval','discharge_cfs','HUC','str_order'], value_vars=['USGS','FIM'], var_name="Source", value_name='elevation') nwm_recurr_data_table = nwm_recurr_data_table.append(usgs_pred_elev) ## Interpolate FIM elevation at USGS observations # Sort stage in ascending order - usgs_rc = usgs_rc.sort_values('elevation',ascending=True) - fim_rc = fim_rc.merge(usgs_crosswalk, left_on="USGS Gage", right_on="location_id") - usgs_rc['pred_elev'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation'], left = np.nan, right = np.nan) - - usgs_rc = usgs_rc[usgs_rc['pred_elev'].notna()] - rc_stats_plot_filename = join(dirname(rc_comparison_plot_filename),'rating_curve_stats' + str(gage.location_id) +'.png') + usgs_rc = usgs_rc.rename(columns={"elevation": "USGS"}) + usgs_rc = usgs_rc.sort_values('USGS',ascending=True) + fim_rc = fim_rc.merge(usgs_crosswalk, on="location_id") - if not usgs_rc.empty: - gage_stats = calculate_rc_stats_stage(usgs_rc,rc_stats_plot_filename) + usgs_rc['FIM'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation'], left = np.nan, right = np.nan) + usgs_rc = usgs_rc[usgs_rc['FIM'].notna()] + usgs_rc = usgs_rc.drop(columns=["Source"]) - usgs_recurr_stats.append(gage_stats) + usgs_rc = pd.melt(usgs_rc, id_vars=['location_id','discharge_cfs','str_order'], value_vars=['USGS','FIM'], var_name="Source", value_name='elevation') - usgs_recurr_stats_table = pd.DataFrame(usgs_recurr_stats, columns=columns) + if not usgs_rc.empty: + usgs_recurr_data = usgs_recurr_data.append(usgs_rc) - if not usgs_recurr_stats_table.empty: + # Generate stats for all sites in huc + if not usgs_recurr_data.empty: + usgs_recurr_stats_table = calculate_rc_stats_elev(usgs_recurr_data) usgs_recurr_stats_table.to_csv(usgs_recurr_stats_filename,index=False) + # Generate plots + fim_elev_at_USGS_rc_plot_filename = join(dirname(rc_comparison_plot_filename),'FIM_elevations_at_USGS_rc_' + str(huc) +'.png') + generate_facet_plot(usgs_recurr_data, fim_elev_at_USGS_rc_plot_filename) + if not nwm_recurr_data_table.empty: nwm_recurr_data_table.to_csv(nwm_recurr_data_filename,index=False) @@ -178,33 +196,52 @@ def aggregate_metrics(output_dir,procs_list): else: nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False) + agg_stats = pd.read_csv(agg_nwm_recurr_flow_elev) + agg_recurr_stats_table = calculate_rc_stats_elev(agg_stats) + -def generate_facet_plot(rating_curves, rc_comparison_plot_filename): +def generate_facet_plot(rc, plot_filename): # Filter FIM elevation based on USGS data - for gage in rating_curves['USGS Gage'].unique(): + for gage in rc.location_id.unique(): - min_elev = rating_curves.loc[(rating_curves['USGS Gage']==gage) & (rating_curves.Source=='USGS')].elevation.min() - max_elev = rating_curves.loc[(rating_curves['USGS Gage']==gage) & (rating_curves.Source=='USGS')].elevation.max() + min_elev = rc.loc[(rc.location_id==gage) & (rc.Source=='USGS')].elevation.min() + max_elev = rc.loc[(rc.location_id==gage) & (rc.Source=='USGS')].elevation.max() - rating_curves_map = 
rating_curves.drop(rating_curves[(rating_curves['USGS Gage']==gage) & (rating_curves.Source=='FIM') & (rating_curves.elevation > (max_elev + 2))].index) - rating_curves_map = rating_curves.drop(rating_curves[(rating_curves['USGS Gage']==gage) & (rating_curves.Source=='FIM') & (rating_curves.elevation < min_elev - 2)].index) + rc = rc.drop(rc[(rc.location_id==gage) & (rc.Source=='FIM') & (rc.elevation > (max_elev + 2))].index) + rc = rc.drop(rc[(rc.location_id==gage) & (rc.Source=='FIM') & (rc.elevation < min_elev - 2)].index) + + rc = rc.rename(columns={"location_id": "USGS Gage"}) ## Generate rating curve plots sns.set(style="ticks") - g = sns.FacetGrid(rating_curves_map, col="USGS Gage", hue="Source",sharex=False, sharey=False,col_wrap=3) + g = sns.FacetGrid(rc, col="USGS Gage", hue="Source",sharex=False, sharey=False,col_wrap=3) g.map(sns.scatterplot, "discharge_cfs", "elevation", palette="tab20c", marker="o") - g.set_axis_labels(x_var="Discharge (cfs)", y_var="Stage (ft)") + g.set_axis_labels(x_var="Discharge (cfs)", y_var="Elevation (ft)") # Adjust the arrangement of the plots g.fig.tight_layout(w_pad=1) g.add_legend() - plt.savefig(rc_comparison_plot_filename) + plt.savefig(plot_filename) + plt.close() + + + # "Rating Curve Plot ({})\nNRMSE = {}; Mean Abs Diff = {} ft; Bias = {}%".format( + # station, + # round(NRMSE, 2), + # round(mean_abs_y_diff, 2), + # round(percent_bias, 1), + # ) + + ## Change labels + # axes = g.axes.flatten() + # for ax in axes: + # ax.set_xlabel("Percentage Depth") def get_reccur_intervals(site_rc, usgs_crosswalk,nwm_recurr_intervals): - usgs_site = site_rc.merge(usgs_crosswalk, left_on="USGS Gage", right_on="location_id") + usgs_site = site_rc.merge(usgs_crosswalk, on="location_id") nwm_ids = len(usgs_site.feature_id.drop_duplicates()) if nwm_ids > 0: @@ -218,88 +255,61 @@ def get_reccur_intervals(site_rc, usgs_crosswalk,nwm_recurr_intervals): return [] -def calculate_rc_stats_stage(rating_curve, fig_path): - station = rating_curve["USGS Gage"].unique().item() - - # Get the interpolated hand column, for now it is just the last column but THIS NEEDS TO BE BETTER FORMALIZED. 
- usgs_stage = "elevation" - flows = "discharge_cfs" - hand_stage = "pred_elev" - - # Calculate variables for NRMSE - rating_curve["yhat_minus_y"] = rating_curve[hand_stage] - rating_curve[usgs_stage] - rating_curve["yhat_minus_y_squared"] = rating_curve["yhat_minus_y"] ** 2 - sum_y_diff = rating_curve["yhat_minus_y_squared"].sum() - - # determine number of events that are modeled - n = rating_curve[usgs_stage].count() - - # Determine the maximum/minimum USGS stage - y_max = rating_curve[usgs_stage].max() - y_min = rating_curve[usgs_stage].min() - - # Calculate NRMSE - NRMSE_numerator = (sum_y_diff / n) ** 0.5 - NRMSE_denominator = y_max - y_min - NRMSE = NRMSE_numerator / NRMSE_denominator - - # Calculate Mean Absolute Depth Difference - mean_abs_y_diff = abs(rating_curve["yhat_minus_y"]).mean() - - # Calculate Percent Bias - percent_bias = 100 * (rating_curve["yhat_minus_y"].sum() / rating_curve[usgs_stage].sum()) - - ## plot USGS rating curve and HAND rating curve and display statistics - fig, ax = plt.subplots() - rating_curve.plot( - x=flows, - y=usgs_stage, - ax=ax, - legend=False, - style="-", - color="orange", - zorder=2, - ) - rating_curve.plot( - x=flows, - y=usgs_stage, - ax=ax, - legend=False, - kind="scatter", - marker="o", - s=30.0, - color="black", - zorder=3, - ) - rating_curve.plot( - x=flows, y=hand_stage, ax=ax, legend=False, style="--", color="gray", zorder=2 - ) - rating_curve.plot( - x=flows, - y=hand_stage, - ax=ax, - legend=False, - kind="scatter", - marker="x", - s=30.0, - color="blue", - zorder=3, - ) - ax.set_xlabel("Flow (cfs)") - ax.set_ylabel("Elevation (ft)") - ax.legend(["USGS Curve", "HAND Curve"], loc="best") - ax.grid(zorder=1) - fig.suptitle( - "Rating Curve Plot ({})\nNRMSE = {}; Mean Abs Diff = {} ft; Bias = {}%".format( - station, - round(NRMSE, 2), - round(mean_abs_y_diff, 2), - round(percent_bias, 1), - ) - ) - fig.savefig(fig_path) - plt.close(fig) - return [station, NRMSE, mean_abs_y_diff, percent_bias] +def calculate_rc_stats_elev(rc,slice_vars=None): + + stations = rc.location_id.unique() + columns = ['location_id','NRMSE','mean_abs_y_diff','percent_bias'] + rc_stats = [] + + # if slice_vars not None: + + for station in stations: + + station_rc = rc.loc[rc.location_id==station] + + # Collect any extra columns not associated with melt + col_index = list(station_rc.columns) + pivot_vars = ['Source','elevation'] + col_index = [col for col in col_index if col not in pivot_vars] + + # Unmelt elevation/Source + station_rc = (station_rc.set_index(col_index) + .pivot(columns="Source")['elevation'] + .reset_index() + .rename_axis(None, axis=1) + ) + + usgs_elev = "USGS" + src_elev = "FIM" + + # Calculate variables for NRMSE + station_rc["yhat_minus_y"] = station_rc[src_elev] - station_rc[usgs_elev] + station_rc["yhat_minus_y_squared"] = station_rc["yhat_minus_y"] ** 2 + sum_y_diff = station_rc["yhat_minus_y_squared"].sum() + + # determine number of events that are modeled + n = station_rc[usgs_elev].count() + + # Determine the maximum/minimum USGS elevation + y_max = station_rc[usgs_elev].max() + y_min = station_rc[usgs_elev].min() + + # Calculate NRMSE + NRMSE_numerator = (sum_y_diff / n) ** 0.5 + NRMSE_denominator = y_max - y_min + NRMSE = NRMSE_numerator / NRMSE_denominator + + # Calculate Mean Absolute Depth Difference + mean_abs_y_diff = abs(station_rc["yhat_minus_y"]).mean() + + # Calculate Percent Bias + percent_bias = 100 * (station_rc["yhat_minus_y"].sum() / station_rc[usgs_elev].sum()) + + rc_stats.append([station, NRMSE, mean_abs_y_diff, 
percent_bias]) + + rc_stat_table = pd.DataFrame(rc_stats, columns=columns) + + return rc_stat_table if __name__ == '__main__': From d139c1dcca4f59b6b8861cebd81a4395854c508d Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Fri, 26 Mar 2021 09:42:12 -0500 Subject: [PATCH 28/66] switching to numpy to get str_order --- tools/rating_curve_comparison.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index fc54fc6d4..6dee2bbfd 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -100,17 +100,9 @@ def generate_rating_curve_metrics(args): ## Interpolate USGS/FIM elevation at NWM recurrence intervals # Interpolate USGS elevation at NWM recurrence intervals usgs_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.Source=="USGS")] - try: - # str_order = usgs_rc.str_order.unique().item() - usgs_rc = usgs_rc.set_index('str_order') - str_order = usgs_rc.index.unique() - usgs_rc = usgs_rc.reset_index() - except: - try: - str_order = list(set(usgs_rc.str_order.to_list()))[0] # pandas is unusable sometimes - except: - print(f"something is messed up with this site: huc {huc}, site {gage.location_id}, rating curve shape {rating_curves.shape}, rating curve columns {rating_curves.columns}, rating curve str_order column {rating_curves.str_order.head()}") - + + str_order = np.unique(usgs_rc.str_order) + usgs_pred_elev = get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) # handle sites missing data From 5e07adb65bd908f5ac0370a3a96f601ce07ad854 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Fri, 26 Mar 2021 10:28:13 -0500 Subject: [PATCH 29/66] partial update of stats function using slice arg (no VPN right now) --- tools/rating_curve_comparison.py | 135 +++++++++++++++---------------- 1 file changed, 67 insertions(+), 68 deletions(-) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index 6dee2bbfd..69acd72e4 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -65,11 +65,10 @@ def generate_rating_curve_metrics(args): rating_curves = limited_hydrotable.append(select_usgs_gages) - # add stream order + # Add stream order stream_order = hydrotable.filter(items=['location_id','str_order']).drop_duplicates() rating_curves = rating_curves.merge(stream_order, on='location_id') rating_curves['str_order'] = rating_curves['str_order'].astype('int') - rating_curves['str_order'] = rating_curves['str_order'].astype('str') generate_facet_plot(rating_curves, rc_comparison_plot_filename) @@ -105,11 +104,11 @@ def generate_rating_curve_metrics(args): usgs_pred_elev = get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) - # handle sites missing data + # Handle sites missing data if len(usgs_pred_elev) <1: continue - # clean up data + # Clean up data usgs_pred_elev['location_id'] = gage.location_id usgs_pred_elev = usgs_pred_elev.filter(items=['location_id','recurr_interval', 'discharge_cfs','pred_elev']) usgs_pred_elev = usgs_pred_elev.rename(columns={"pred_elev": "USGS"}) @@ -118,12 +117,12 @@ def generate_rating_curve_metrics(args): fim_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.Source=="FIM")] fim_pred_elev = get_reccur_intervals(fim_rc, usgs_crosswalk,nwm_recurr_intervals_all) - # handle sites missing data + # Handle sites missing data if len(fim_pred_elev) <1: print(f"missing fim elevation data for usgs 
station {gage.location_id} in huc {huc}") continue - # clean up data + # Clean up data fim_pred_elev = fim_pred_elev.rename(columns={"pred_elev": "FIM"}) fim_pred_elev = fim_pred_elev.filter(items=['recurr_interval', 'discharge_cfs','FIM']) usgs_pred_elev = usgs_pred_elev.merge(fim_pred_elev, on=['recurr_interval','discharge_cfs']) @@ -164,7 +163,7 @@ def generate_rating_curve_metrics(args): else: print(f"no USGS data for gage(s): {relevant_gages} in huc {huc}") -def aggregate_metrics(output_dir,procs_list): +def aggregate_metrics(output_dir,procs_list,slice): agg_usgs_interp_elev_stats = join(output_dir,'agg_usgs_interp_elev_stats.csv') agg_nwm_recurr_flow_elev = join(output_dir,'agg_nwm_recurr_flow_elevations.csv') @@ -189,7 +188,7 @@ def aggregate_metrics(output_dir,procs_list): nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False) agg_stats = pd.read_csv(agg_nwm_recurr_flow_elev) - agg_recurr_stats_table = calculate_rc_stats_elev(agg_stats) + agg_recurr_stats_table = calculate_rc_stats_elev(agg_stats,slice) def generate_facet_plot(rc, plot_filename): @@ -209,6 +208,16 @@ def generate_facet_plot(rc, plot_filename): g = sns.FacetGrid(rc, col="USGS Gage", hue="Source",sharex=False, sharey=False,col_wrap=3) g.map(sns.scatterplot, "discharge_cfs", "elevation", palette="tab20c", marker="o") g.set_axis_labels(x_var="Discharge (cfs)", y_var="Elevation (ft)") + + ## Change labels + # axes = g.axes.flatten() + # for ax in axes: + # ax.set_xlabel("Rating Curve Plot ({})\nNRMSE = {}; Mean Abs Diff = {} ft; Bias = {}%".format( + # station, + # round(NRMSE, 2), + # round(mean_abs_y_diff, 2), + # round(percent_bias, 1), + # )) # Adjust the arrangement of the plots g.fig.tight_layout(w_pad=1) @@ -218,19 +227,6 @@ def generate_facet_plot(rc, plot_filename): plt.close() - # "Rating Curve Plot ({})\nNRMSE = {}; Mean Abs Diff = {} ft; Bias = {}%".format( - # station, - # round(NRMSE, 2), - # round(mean_abs_y_diff, 2), - # round(percent_bias, 1), - # ) - - ## Change labels - # axes = g.axes.flatten() - # for ax in axes: - # ax.set_xlabel("Percentage Depth") - - def get_reccur_intervals(site_rc, usgs_crosswalk,nwm_recurr_intervals): usgs_site = site_rc.merge(usgs_crosswalk, on="location_id") @@ -252,52 +248,53 @@ def calculate_rc_stats_elev(rc,slice_vars=None): stations = rc.location_id.unique() columns = ['location_id','NRMSE','mean_abs_y_diff','percent_bias'] rc_stats = [] - - # if slice_vars not None: - - for station in stations: - - station_rc = rc.loc[rc.location_id==station] - - # Collect any extra columns not associated with melt - col_index = list(station_rc.columns) - pivot_vars = ['Source','elevation'] - col_index = [col for col in col_index if col not in pivot_vars] - - # Unmelt elevation/Source - station_rc = (station_rc.set_index(col_index) - .pivot(columns="Source")['elevation'] - .reset_index() - .rename_axis(None, axis=1) - ) - - usgs_elev = "USGS" - src_elev = "FIM" - - # Calculate variables for NRMSE - station_rc["yhat_minus_y"] = station_rc[src_elev] - station_rc[usgs_elev] - station_rc["yhat_minus_y_squared"] = station_rc["yhat_minus_y"] ** 2 - sum_y_diff = station_rc["yhat_minus_y_squared"].sum() - - # determine number of events that are modeled - n = station_rc[usgs_elev].count() - - # Determine the maximum/minimum USGS elevation - y_max = station_rc[usgs_elev].max() - y_min = station_rc[usgs_elev].min() - - # Calculate NRMSE - NRMSE_numerator = (sum_y_diff / n) ** 0.5 - NRMSE_denominator = y_max - y_min - NRMSE = NRMSE_numerator / NRMSE_denominator - - # Calculate Mean Absolute 
Depth Difference - mean_abs_y_diff = abs(station_rc["yhat_minus_y"]).mean() - - # Calculate Percent Bias - percent_bias = 100 * (station_rc["yhat_minus_y"].sum() / station_rc[usgs_elev].sum()) - - rc_stats.append([station, NRMSE, mean_abs_y_diff, percent_bias]) + + usgs_elev = "USGS" + src_elev = "FIM" + + # Collect any extra columns not associated with melt + col_index = list(rc.columns) + pivot_vars = ['Source','elevation'] + col_index = [col for col in col_index if col not in pivot_vars] + + # Unmelt elevation/Source + station_rc = (station_rc.set_index(col_index) + .pivot(columns="Source")['elevation'] + .reset_index() + .rename_axis(None, axis=1) + ) + + if not slice_vars not None: + slice_vars = [stations] + + columns = columns + slice_vars + + station_rc = rc.groupby(slice_vars) + + # Calculate variables for NRMSE + station_rc["yhat_minus_y"] = station_rc[src_elev] - station_rc[usgs_elev] + station_rc["yhat_minus_y_squared"] = station_rc["yhat_minus_y"] ** 2 + sum_y_diff = station_rc["yhat_minus_y_squared"].sum() + + # Determine number of events that are modeled + n = station_rc[usgs_elev].count() + + # Determine the maximum/minimum USGS elevation + y_max = station_rc[usgs_elev].max() + y_min = station_rc[usgs_elev].min() + + # Calculate NRMSE + NRMSE_numerator = (sum_y_diff / n) ** 0.5 + NRMSE_denominator = y_max - y_min + NRMSE = NRMSE_numerator / NRMSE_denominator + + # Calculate Mean Absolute Depth Difference + mean_abs_y_diff = abs(station_rc["yhat_minus_y"]).mean() + + # Calculate Percent Bias + percent_bias = 100 * (station_rc["yhat_minus_y"].sum() / station_rc[usgs_elev].sum()) + + # rc_stats.append([station, NRMSE, mean_abs_y_diff, percent_bias]) rc_stat_table = pd.DataFrame(rc_stats, columns=columns) @@ -337,4 +334,6 @@ def calculate_rc_stats_elev(rc,slice_vars=None): pool.map(generate_rating_curve_metrics, procs_list) print(f"Aggregating rating curve metrics for {len(procs_list)} hucs") - aggregate_metrics(output_dir,procs_list) + # slice = ['str_order', 'HUC'] + slice = ['location_id'] + aggregate_metrics(output_dir,procs_list,slice) From 4832246ef67d04286c77d9f27a3220cdd256516d Mon Sep 17 00:00:00 2001 From: Brian Avant <1558017798@mil> Date: Fri, 26 Mar 2021 15:09:56 -0500 Subject: [PATCH 30/66] adding group arg for stat grouping --- tools/rating_curve_comparison.py | 128 ++++++++++++++++--------------- 1 file changed, 68 insertions(+), 60 deletions(-) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index 69acd72e4..1224dcbca 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -12,7 +12,7 @@ from multiprocessing import Pool from os.path import isfile, join, dirname sys.path.append('/foss_fim/src') -from utils.shared_functions import getDriver +# from utils.shared_functions import getDriver """ Plot Rating Curves and Compare to USGS Gages @@ -43,7 +43,7 @@ def generate_rating_curve_metrics(args): elev_table = pd.read_csv(elev_table_filename) hydrotable = pd.read_csv(hydrotable_filename) usgs_gages = pd.read_csv(usgs_gages_filename) - + # Join rating curves with elevation data hydrotable = hydrotable.merge(elev_table, on="HydroID") relevant_gages = list(hydrotable.location_id.unique()) @@ -57,50 +57,51 @@ def generate_rating_curve_metrics(args): # hydrotable['raw_elevation'] = (hydrotable.stage + hydrotable.dem_elevation) * 3.28084 # convert from m to ft hydrotable['discharge_cfs'] = hydrotable.discharge_cms * 35.3147 usgs_gages = usgs_gages.rename(columns={"flow": "discharge_cfs", "elevation_navd88": 
"elevation"}) - + hydrotable['Source'] = "FIM" usgs_gages['Source'] = "USGS" limited_hydrotable = hydrotable.filter(items=['location_id','elevation','discharge_cfs','Source']) select_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation', 'discharge_cfs','Source']) - + rating_curves = limited_hydrotable.append(select_usgs_gages) - + # Add stream order stream_order = hydrotable.filter(items=['location_id','str_order']).drop_duplicates() rating_curves = rating_curves.merge(stream_order, on='location_id') rating_curves['str_order'] = rating_curves['str_order'].astype('int') - + generate_facet_plot(rating_curves, rc_comparison_plot_filename) - + ## Calculate metrics for NWM reccurence intervals # NWM recurr intervals recurr_1_5_yr_filename = join(nwm_flow_dir,'recurr_1_5_cms.csv') recurr_5_yr_filename = join(nwm_flow_dir,'recurr_5_0_cms.csv') recurr_10_yr_filename = join(nwm_flow_dir,'recurr_10_0_cms.csv') - + recurr_1_5_yr = pd.read_csv(recurr_1_5_yr_filename) recurr_1_5_yr = recurr_1_5_yr.rename(columns={"discharge": "1.5"}) recurr_5_yr = pd.read_csv(recurr_5_yr_filename) recurr_5_yr = recurr_5_yr.rename(columns={"discharge": "5.0"}) recurr_10_yr = pd.read_csv(recurr_10_yr_filename) recurr_10_yr = recurr_10_yr.rename(columns={"discharge": "10.0"}) - + nwm_recurr_intervals_all = reduce(lambda x,y: pd.merge(x,y, on='feature_id', how='outer'), [recurr_1_5_yr, recurr_5_yr, recurr_10_yr]) nwm_recurr_intervals_all = pd.melt(nwm_recurr_intervals_all, id_vars=['feature_id'], value_vars=['1.5','5.0','10.0'], var_name='recurr_interval', value_name='discharge_cms') nwm_recurr_intervals_all['discharge_cfs'] = nwm_recurr_intervals_all.discharge_cms * 35.3147 nwm_recurr_intervals_all = nwm_recurr_intervals_all.filter(items=['discharge_cfs', 'recurr_interval','feature_id']).drop_duplicates() - + usgs_crosswalk = hydrotable.filter(items=['location_id', 'feature_id']).drop_duplicates() - + nwm_recurr_data_table = pd.DataFrame() usgs_recurr_data = pd.DataFrame() for index, gage in usgs_crosswalk.iterrows(): + print(gage) ## Interpolate USGS/FIM elevation at NWM recurrence intervals # Interpolate USGS elevation at NWM recurrence intervals usgs_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.Source=="USGS")] - str_order = np.unique(usgs_rc.str_order) + str_order = np.unique(usgs_rc.str_order).item() usgs_pred_elev = get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) @@ -112,7 +113,7 @@ def generate_rating_curve_metrics(args): usgs_pred_elev['location_id'] = gage.location_id usgs_pred_elev = usgs_pred_elev.filter(items=['location_id','recurr_interval', 'discharge_cfs','pred_elev']) usgs_pred_elev = usgs_pred_elev.rename(columns={"pred_elev": "USGS"}) - + # Interpolate FIM elevation at NWM recurrence intervals fim_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.Source=="FIM")] fim_pred_elev = get_reccur_intervals(fim_rc, usgs_crosswalk,nwm_recurr_intervals_all) @@ -126,23 +127,23 @@ def generate_rating_curve_metrics(args): fim_pred_elev = fim_pred_elev.rename(columns={"pred_elev": "FIM"}) fim_pred_elev = fim_pred_elev.filter(items=['recurr_interval', 'discharge_cfs','FIM']) usgs_pred_elev = usgs_pred_elev.merge(fim_pred_elev, on=['recurr_interval','discharge_cfs']) - + usgs_pred_elev['HUC'] = huc usgs_pred_elev['str_order'] = str_order - + usgs_pred_elev = pd.melt(usgs_pred_elev, id_vars=['location_id','recurr_interval','discharge_cfs','HUC','str_order'], value_vars=['USGS','FIM'], var_name="Source", 
value_name='elevation') nwm_recurr_data_table = nwm_recurr_data_table.append(usgs_pred_elev) - + ## Interpolate FIM elevation at USGS observations # Sort stage in ascending order usgs_rc = usgs_rc.rename(columns={"elevation": "USGS"}) usgs_rc = usgs_rc.sort_values('USGS',ascending=True) fim_rc = fim_rc.merge(usgs_crosswalk, on="location_id") - + usgs_rc['FIM'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation'], left = np.nan, right = np.nan) usgs_rc = usgs_rc[usgs_rc['FIM'].notna()] usgs_rc = usgs_rc.drop(columns=["Source"]) - + usgs_rc = pd.melt(usgs_rc, id_vars=['location_id','discharge_cfs','str_order'], value_vars=['USGS','FIM'], var_name="Source", value_name='elevation') if not usgs_rc.empty: @@ -163,7 +164,7 @@ def generate_rating_curve_metrics(args): else: print(f"no USGS data for gage(s): {relevant_gages} in huc {huc}") -def aggregate_metrics(output_dir,procs_list,slice): +def aggregate_metrics(output_dir,procs_list,stat_groups): agg_usgs_interp_elev_stats = join(output_dir,'agg_usgs_interp_elev_stats.csv') agg_nwm_recurr_flow_elev = join(output_dir,'agg_nwm_recurr_flow_elevations.csv') @@ -171,16 +172,16 @@ def aggregate_metrics(output_dir,procs_list,slice): for huc in procs_list: if os.path.isfile(huc[3]): usgs_recurr_stats = pd.read_csv(huc[3]) - + # Write/append usgs_recurr_stats if os.path.isfile(agg_usgs_interp_elev_stats): usgs_recurr_stats.to_csv(agg_usgs_interp_elev_stats,index=False, mode='a',header=False) else: usgs_recurr_stats.to_csv(agg_usgs_interp_elev_stats,index=False) - + if os.path.isfile(huc[4]): nwm_recurr_data = pd.read_csv(huc[4]) - + # Write/append nwm_recurr_data if os.path.isfile(agg_nwm_recurr_flow_elev): nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False, mode='a',header=False) @@ -188,7 +189,8 @@ def aggregate_metrics(output_dir,procs_list,slice): nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False) agg_stats = pd.read_csv(agg_nwm_recurr_flow_elev) - agg_recurr_stats_table = calculate_rc_stats_elev(agg_stats,slice) + + agg_recurr_stats_table = calculate_rc_stats_elev(agg_stats,stat_groups) def generate_facet_plot(rc, plot_filename): @@ -243,11 +245,7 @@ def get_reccur_intervals(site_rc, usgs_crosswalk,nwm_recurr_intervals): return [] -def calculate_rc_stats_elev(rc,slice_vars=None): - - stations = rc.location_id.unique() - columns = ['location_id','NRMSE','mean_abs_y_diff','percent_bias'] - rc_stats = [] +def calculate_rc_stats_elev(rc,stat_groups=None): usgs_elev = "USGS" src_elev = "FIM" @@ -258,45 +256,54 @@ def calculate_rc_stats_elev(rc,slice_vars=None): col_index = [col for col in col_index if col not in pivot_vars] # Unmelt elevation/Source - station_rc = (station_rc.set_index(col_index) + rc_unmelt = (rc.set_index(col_index) .pivot(columns="Source")['elevation'] .reset_index() .rename_axis(None, axis=1) ) - - if not slice_vars not None: - slice_vars = [stations] - columns = columns + slice_vars - - station_rc = rc.groupby(slice_vars) - + if stat_groups is None: + stat_groups = ['location_id'] + # Calculate variables for NRMSE - station_rc["yhat_minus_y"] = station_rc[src_elev] - station_rc[usgs_elev] - station_rc["yhat_minus_y_squared"] = station_rc["yhat_minus_y"] ** 2 - sum_y_diff = station_rc["yhat_minus_y_squared"].sum() + rc_unmelt["yhat_minus_y"] = rc_unmelt[src_elev] - rc_unmelt[usgs_elev] + rc_unmelt["yhat_minus_y_squared"] = rc_unmelt["yhat_minus_y"] ** 2 + + station_rc = rc_unmelt.groupby(stat_groups) + ## Calculate metrics by group + # Calculate variables for NRMSE + sum_y_diff = 
station_rc.apply(lambda x: x["yhat_minus_y_squared"].sum())\ + .reset_index(stat_groups, drop = False).rename({0: "sum_y_diff"}, axis=1) + # Determine number of events that are modeled - n = station_rc[usgs_elev].count() - + n = station_rc.apply(lambda x: x[usgs_elev].count())\ + .reset_index(stat_groups, drop = False).rename({0: "n"}, axis=1) + # Determine the maximum/minimum USGS elevation - y_max = station_rc[usgs_elev].max() - y_min = station_rc[usgs_elev].min() - + y_max = station_rc.apply(lambda x: x[usgs_elev].max())\ + .reset_index(stat_groups, drop = False).rename({0: "y_max"}, axis=1) + y_min = station_rc.apply(lambda x: x[usgs_elev].min())\ + .reset_index(stat_groups, drop = False).rename({0: "y_min"}, axis=1) + + # Collect variables for NRMSE + NRMSE_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [sum_y_diff, n, y_max, y_min]) + NRMSE_table_group = NRMSE_table.groupby(stat_groups) + # Calculate NRMSE - NRMSE_numerator = (sum_y_diff / n) ** 0.5 - NRMSE_denominator = y_max - y_min - NRMSE = NRMSE_numerator / NRMSE_denominator - + NRMSE = NRMSE_table_group.apply(lambda x: ((x['sum_y_diff'] / x['n']) ** 0.5)/x['y_max'] - x['y_min'])\ + .reset_index(stat_groups, drop = False).rename({0: "NRMSE"}, axis=1) + # Calculate Mean Absolute Depth Difference - mean_abs_y_diff = abs(station_rc["yhat_minus_y"]).mean() - + mean_abs_y_diff = station_rc.apply(lambda x: abs(x["yhat_minus_y"]).mean())\ + .reset_index(stat_groups, drop = False).rename({0: "mean_abs_y_diff"}, axis=1) + # Calculate Percent Bias - percent_bias = 100 * (station_rc["yhat_minus_y"].sum() / station_rc[usgs_elev].sum()) - - # rc_stats.append([station, NRMSE, mean_abs_y_diff, percent_bias]) + percent_bias = station_rc.apply(lambda x: 100 * (x["yhat_minus_y"].sum()/x[usgs_elev].sum()))\ + .reset_index(stat_groups, drop = False).rename({0: "percent_bias"}, axis=1) + + rc_stat_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [NRMSE, mean_abs_y_diff, percent_bias]) - rc_stat_table = pd.DataFrame(rc_stats, columns=columns) return rc_stat_table @@ -307,6 +314,7 @@ def calculate_rc_stats_elev(rc,slice_vars=None): parser.add_argument('-gages','--usgs-gages-filename',help='USGS rating curves',required=True) parser.add_argument('-flows','--nwm-flow-dir',help='NWM recurrence flows dir',required=True) parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) + parser.add_argument('-group','--stats-groups',help='column(s) to group stats',required=False,default=['location_id']) args = vars(parser.parse_args()) @@ -314,9 +322,11 @@ def calculate_rc_stats_elev(rc,slice_vars=None): usgs_gages_filename = args['usgs_gages_filename'] nwm_flow_dir = args['nwm_flow_dir'] number_of_jobs = args['number_of_jobs'] - + stat_groups = args['stat_groups'] + + procs_list = [] - + huc_list = os.listdir(output_dir) for huc in huc_list: elev_table_filename = join(output_dir,huc,'usgs_elev_table.csv') @@ -324,7 +334,7 @@ def calculate_rc_stats_elev(rc,slice_vars=None): usgs_recurr_stats_filename = join(output_dir,huc,'usgs_interpolated_elevation_stats.csv') nwm_recurr_data_filename = join(output_dir,huc,'nwm_recurrence_flow_elevations.csv') rc_comparison_plot_filename = join(output_dir,huc,'FIM-USGS_rating_curve_comparison.png') - + if isfile(elev_table_filename): procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir,huc]) @@ -334,6 +344,4 @@ def 
calculate_rc_stats_elev(rc,slice_vars=None): pool.map(generate_rating_curve_metrics, procs_list) print(f"Aggregating rating curve metrics for {len(procs_list)} hucs") - # slice = ['str_order', 'HUC'] - slice = ['location_id'] - aggregate_metrics(output_dir,procs_list,slice) + aggregate_metrics(output_dir,procs_list,stat_groups) From 1fb39fa1ce3414143a83ce9be5360f40b6ce8afa Mon Sep 17 00:00:00 2001 From: Brian Avant <1558017798@mil> Date: Fri, 26 Mar 2021 15:14:20 -0500 Subject: [PATCH 31/66] saving final agg stats --- tools/rating_curve_comparison.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index 1224dcbca..7c5944b82 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -168,6 +168,7 @@ def aggregate_metrics(output_dir,procs_list,stat_groups): agg_usgs_interp_elev_stats = join(output_dir,'agg_usgs_interp_elev_stats.csv') agg_nwm_recurr_flow_elev = join(output_dir,'agg_nwm_recurr_flow_elevations.csv') + agg_nwm_recurr_flow_elev_stats = join(output_dir,'agg_nwm_recurr_flow_elev_stats.csv') for huc in procs_list: if os.path.isfile(huc[3]): @@ -191,6 +192,8 @@ def aggregate_metrics(output_dir,procs_list,stat_groups): agg_stats = pd.read_csv(agg_nwm_recurr_flow_elev) agg_recurr_stats_table = calculate_rc_stats_elev(agg_stats,stat_groups) + + agg_recurr_stats_table.to_csv(agg_nwm_recurr_flow_elev_stats,index=False, header=False) def generate_facet_plot(rc, plot_filename): From d667c4f2980f37d4fe9bc74b1ca62876ea118556 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Mon, 29 Mar 2021 15:52:57 +0000 Subject: [PATCH 32/66] tidy up for PR --- tools/rating_curve_comparison.py | 142 +++++++++++++++---------------- 1 file changed, 68 insertions(+), 74 deletions(-) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index 7c5944b82..c483157a2 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -11,8 +11,6 @@ from functools import reduce from multiprocessing import Pool from os.path import isfile, join, dirname -sys.path.append('/foss_fim/src') -# from utils.shared_functions import getDriver """ Plot Rating Curves and Compare to USGS Gages @@ -25,6 +23,10 @@ File name of USGS rating curves. nwm_flow_dir : str Directory containing NWM recurrence flows files. + number_of_jobs : str + Number of jobs. + stat_groups : str + string of columns to group eval metrics. 
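A note on the grouped statistics introduced in this patch: for each group, the metrics are the same ones computed per station earlier in this series, namely NRMSE = sqrt(sum((FIM - USGS)^2) / n) / (USGS_max - USGS_min), the mean absolute elevation difference, and percent bias. A minimal pandas sketch of that grouped calculation is below (an illustration only, not the committed implementation; column and group names are assumed to match the tables built in this file, and note that the NRMSE denominator (y_max - y_min) needs its own parentheses when the formula is written inline).

    import pandas as pd

    def grouped_rc_stats(rc_unmelt, stat_groups=('location_id',)):
        # rc_unmelt: one row per (group, discharge) with 'USGS' and 'FIM' elevation columns
        rc = rc_unmelt.copy()
        rc['y_diff'] = rc['FIM'] - rc['USGS']
        rc['y_diff_sq'] = rc['y_diff'] ** 2

        def _stats(g):
            n = g['USGS'].count()
            elev_range = g['USGS'].max() - g['USGS'].min()
            return pd.Series({
                'nrmse': ((g['y_diff_sq'].sum() / n) ** 0.5) / elev_range,  # denominator kept parenthesized
                'mean_abs_y_diff_ft': g['y_diff'].abs().mean(),
                'percent_bias': 100 * g['y_diff'].sum() / g['USGS'].sum(),
            })

        return rc.groupby(list(stat_groups)).apply(_stats).reset_index()
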
""" # recurr_intervals = ['recurr_1_5_cms.csv','recurr_5_0_cms.csv','recurr_10_0_cms.csv'] @@ -43,7 +45,7 @@ def generate_rating_curve_metrics(args): elev_table = pd.read_csv(elev_table_filename) hydrotable = pd.read_csv(hydrotable_filename) usgs_gages = pd.read_csv(usgs_gages_filename) - + # Join rating curves with elevation data hydrotable = hydrotable.merge(elev_table, on="HydroID") relevant_gages = list(hydrotable.location_id.unique()) @@ -57,52 +59,54 @@ def generate_rating_curve_metrics(args): # hydrotable['raw_elevation'] = (hydrotable.stage + hydrotable.dem_elevation) * 3.28084 # convert from m to ft hydrotable['discharge_cfs'] = hydrotable.discharge_cms * 35.3147 usgs_gages = usgs_gages.rename(columns={"flow": "discharge_cfs", "elevation_navd88": "elevation"}) - - hydrotable['Source'] = "FIM" - usgs_gages['Source'] = "USGS" - limited_hydrotable = hydrotable.filter(items=['location_id','elevation','discharge_cfs','Source']) - select_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation', 'discharge_cfs','Source']) - + + hydrotable['source'] = "FIM" + usgs_gages['source'] = "USGS" + limited_hydrotable = hydrotable.filter(items=['location_id','elevation','discharge_cfs','source']) + select_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation', 'discharge_cfs','source']) + rating_curves = limited_hydrotable.append(select_usgs_gages) - + # Add stream order stream_order = hydrotable.filter(items=['location_id','str_order']).drop_duplicates() rating_curves = rating_curves.merge(stream_order, on='location_id') rating_curves['str_order'] = rating_curves['str_order'].astype('int') - + generate_facet_plot(rating_curves, rc_comparison_plot_filename) - + ## Calculate metrics for NWM reccurence intervals # NWM recurr intervals recurr_1_5_yr_filename = join(nwm_flow_dir,'recurr_1_5_cms.csv') recurr_5_yr_filename = join(nwm_flow_dir,'recurr_5_0_cms.csv') recurr_10_yr_filename = join(nwm_flow_dir,'recurr_10_0_cms.csv') - + recurr_1_5_yr = pd.read_csv(recurr_1_5_yr_filename) recurr_1_5_yr = recurr_1_5_yr.rename(columns={"discharge": "1.5"}) recurr_5_yr = pd.read_csv(recurr_5_yr_filename) recurr_5_yr = recurr_5_yr.rename(columns={"discharge": "5.0"}) recurr_10_yr = pd.read_csv(recurr_10_yr_filename) recurr_10_yr = recurr_10_yr.rename(columns={"discharge": "10.0"}) - + nwm_recurr_intervals_all = reduce(lambda x,y: pd.merge(x,y, on='feature_id', how='outer'), [recurr_1_5_yr, recurr_5_yr, recurr_10_yr]) nwm_recurr_intervals_all = pd.melt(nwm_recurr_intervals_all, id_vars=['feature_id'], value_vars=['1.5','5.0','10.0'], var_name='recurr_interval', value_name='discharge_cms') nwm_recurr_intervals_all['discharge_cfs'] = nwm_recurr_intervals_all.discharge_cms * 35.3147 nwm_recurr_intervals_all = nwm_recurr_intervals_all.filter(items=['discharge_cfs', 'recurr_interval','feature_id']).drop_duplicates() - + usgs_crosswalk = hydrotable.filter(items=['location_id', 'feature_id']).drop_duplicates() - + nwm_recurr_data_table = pd.DataFrame() usgs_recurr_data = pd.DataFrame() for index, gage in usgs_crosswalk.iterrows(): - print(gage) ## Interpolate USGS/FIM elevation at NWM recurrence intervals # Interpolate USGS elevation at NWM recurrence intervals - usgs_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.Source=="USGS")] - + usgs_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.source=="USGS")] + + if len(usgs_rc) <1: + continue + str_order = np.unique(usgs_rc.str_order).item() - + usgs_pred_elev = 
get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) # Handle sites missing data @@ -113,9 +117,9 @@ def generate_rating_curve_metrics(args): usgs_pred_elev['location_id'] = gage.location_id usgs_pred_elev = usgs_pred_elev.filter(items=['location_id','recurr_interval', 'discharge_cfs','pred_elev']) usgs_pred_elev = usgs_pred_elev.rename(columns={"pred_elev": "USGS"}) - + # Interpolate FIM elevation at NWM recurrence intervals - fim_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.Source=="FIM")] + fim_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.source=="FIM")] fim_pred_elev = get_reccur_intervals(fim_rc, usgs_crosswalk,nwm_recurr_intervals_all) # Handle sites missing data @@ -127,24 +131,24 @@ def generate_rating_curve_metrics(args): fim_pred_elev = fim_pred_elev.rename(columns={"pred_elev": "FIM"}) fim_pred_elev = fim_pred_elev.filter(items=['recurr_interval', 'discharge_cfs','FIM']) usgs_pred_elev = usgs_pred_elev.merge(fim_pred_elev, on=['recurr_interval','discharge_cfs']) - + usgs_pred_elev['HUC'] = huc usgs_pred_elev['str_order'] = str_order - - usgs_pred_elev = pd.melt(usgs_pred_elev, id_vars=['location_id','recurr_interval','discharge_cfs','HUC','str_order'], value_vars=['USGS','FIM'], var_name="Source", value_name='elevation') + + usgs_pred_elev = pd.melt(usgs_pred_elev, id_vars=['location_id','recurr_interval','discharge_cfs','HUC','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation') nwm_recurr_data_table = nwm_recurr_data_table.append(usgs_pred_elev) - + ## Interpolate FIM elevation at USGS observations # Sort stage in ascending order usgs_rc = usgs_rc.rename(columns={"elevation": "USGS"}) usgs_rc = usgs_rc.sort_values('USGS',ascending=True) fim_rc = fim_rc.merge(usgs_crosswalk, on="location_id") - + usgs_rc['FIM'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation'], left = np.nan, right = np.nan) usgs_rc = usgs_rc[usgs_rc['FIM'].notna()] - usgs_rc = usgs_rc.drop(columns=["Source"]) - - usgs_rc = pd.melt(usgs_rc, id_vars=['location_id','discharge_cfs','str_order'], value_vars=['USGS','FIM'], var_name="Source", value_name='elevation') + usgs_rc = usgs_rc.drop(columns=["source"]) + + usgs_rc = pd.melt(usgs_rc, id_vars=['location_id','discharge_cfs','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation') if not usgs_rc.empty: usgs_recurr_data = usgs_recurr_data.append(usgs_rc) @@ -173,16 +177,16 @@ def aggregate_metrics(output_dir,procs_list,stat_groups): for huc in procs_list: if os.path.isfile(huc[3]): usgs_recurr_stats = pd.read_csv(huc[3]) - + # Write/append usgs_recurr_stats if os.path.isfile(agg_usgs_interp_elev_stats): usgs_recurr_stats.to_csv(agg_usgs_interp_elev_stats,index=False, mode='a',header=False) else: usgs_recurr_stats.to_csv(agg_usgs_interp_elev_stats,index=False) - + if os.path.isfile(huc[4]): nwm_recurr_data = pd.read_csv(huc[4]) - + # Write/append nwm_recurr_data if os.path.isfile(agg_nwm_recurr_flow_elev): nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False, mode='a',header=False) @@ -190,39 +194,29 @@ def aggregate_metrics(output_dir,procs_list,stat_groups): nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False) agg_stats = pd.read_csv(agg_nwm_recurr_flow_elev) - + agg_recurr_stats_table = calculate_rc_stats_elev(agg_stats,stat_groups) - - agg_recurr_stats_table.to_csv(agg_nwm_recurr_flow_elev_stats,index=False, header=False) + + 
agg_recurr_stats_table.to_csv(agg_nwm_recurr_flow_elev_stats,index=False) def generate_facet_plot(rc, plot_filename): # Filter FIM elevation based on USGS data for gage in rc.location_id.unique(): - min_elev = rc.loc[(rc.location_id==gage) & (rc.Source=='USGS')].elevation.min() - max_elev = rc.loc[(rc.location_id==gage) & (rc.Source=='USGS')].elevation.max() + min_elev = rc.loc[(rc.location_id==gage) & (rc.source=='USGS')].elevation.min() + max_elev = rc.loc[(rc.location_id==gage) & (rc.source=='USGS')].elevation.max() - rc = rc.drop(rc[(rc.location_id==gage) & (rc.Source=='FIM') & (rc.elevation > (max_elev + 2))].index) - rc = rc.drop(rc[(rc.location_id==gage) & (rc.Source=='FIM') & (rc.elevation < min_elev - 2)].index) + rc = rc.drop(rc[(rc.location_id==gage) & (rc.source=='FIM') & (rc.elevation > (max_elev + 2))].index) + rc = rc.drop(rc[(rc.location_id==gage) & (rc.source=='FIM') & (rc.elevation < min_elev - 2)].index) rc = rc.rename(columns={"location_id": "USGS Gage"}) ## Generate rating curve plots sns.set(style="ticks") - g = sns.FacetGrid(rc, col="USGS Gage", hue="Source",sharex=False, sharey=False,col_wrap=3) + g = sns.FacetGrid(rc, col="USGS Gage", hue="source",sharex=False, sharey=False,col_wrap=3) g.map(sns.scatterplot, "discharge_cfs", "elevation", palette="tab20c", marker="o") g.set_axis_labels(x_var="Discharge (cfs)", y_var="Elevation (ft)") - - ## Change labels - # axes = g.axes.flatten() - # for ax in axes: - # ax.set_xlabel("Rating Curve Plot ({})\nNRMSE = {}; Mean Abs Diff = {} ft; Bias = {}%".format( - # station, - # round(NRMSE, 2), - # round(mean_abs_y_diff, 2), - # round(percent_bias, 1), - # )) # Adjust the arrangement of the plots g.fig.tight_layout(w_pad=1) @@ -249,62 +243,62 @@ def get_reccur_intervals(site_rc, usgs_crosswalk,nwm_recurr_intervals): def calculate_rc_stats_elev(rc,stat_groups=None): - + usgs_elev = "USGS" src_elev = "FIM" - + # Collect any extra columns not associated with melt col_index = list(rc.columns) - pivot_vars = ['Source','elevation'] + pivot_vars = ['source','elevation'] col_index = [col for col in col_index if col not in pivot_vars] - - # Unmelt elevation/Source + + # Unmelt elevation/source rc_unmelt = (rc.set_index(col_index) - .pivot(columns="Source")['elevation'] + .pivot(columns="source")['elevation'] .reset_index() .rename_axis(None, axis=1) ) - + if stat_groups is None: stat_groups = ['location_id'] - + # Calculate variables for NRMSE rc_unmelt["yhat_minus_y"] = rc_unmelt[src_elev] - rc_unmelt[usgs_elev] rc_unmelt["yhat_minus_y_squared"] = rc_unmelt["yhat_minus_y"] ** 2 - - station_rc = rc_unmelt.groupby(stat_groups) + + station_rc = rc_unmelt.groupby(stat_groups) ## Calculate metrics by group # Calculate variables for NRMSE sum_y_diff = station_rc.apply(lambda x: x["yhat_minus_y_squared"].sum())\ .reset_index(stat_groups, drop = False).rename({0: "sum_y_diff"}, axis=1) - + # Determine number of events that are modeled n = station_rc.apply(lambda x: x[usgs_elev].count())\ .reset_index(stat_groups, drop = False).rename({0: "n"}, axis=1) - + # Determine the maximum/minimum USGS elevation y_max = station_rc.apply(lambda x: x[usgs_elev].max())\ .reset_index(stat_groups, drop = False).rename({0: "y_max"}, axis=1) y_min = station_rc.apply(lambda x: x[usgs_elev].min())\ .reset_index(stat_groups, drop = False).rename({0: "y_min"}, axis=1) - + # Collect variables for NRMSE NRMSE_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [sum_y_diff, n, y_max, y_min]) - NRMSE_table_group = NRMSE_table.groupby(stat_groups) - + 
NRMSE_table_group = NRMSE_table.groupby(stat_groups) + # Calculate NRMSE NRMSE = NRMSE_table_group.apply(lambda x: ((x['sum_y_diff'] / x['n']) ** 0.5)/x['y_max'] - x['y_min'])\ .reset_index(stat_groups, drop = False).rename({0: "NRMSE"}, axis=1) - + # Calculate Mean Absolute Depth Difference mean_abs_y_diff = station_rc.apply(lambda x: abs(x["yhat_minus_y"]).mean())\ .reset_index(stat_groups, drop = False).rename({0: "mean_abs_y_diff"}, axis=1) - + # Calculate Percent Bias percent_bias = station_rc.apply(lambda x: 100 * (x["yhat_minus_y"].sum()/x[usgs_elev].sum()))\ .reset_index(stat_groups, drop = False).rename({0: "percent_bias"}, axis=1) - + rc_stat_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [NRMSE, mean_abs_y_diff, percent_bias]) @@ -317,7 +311,7 @@ def calculate_rc_stats_elev(rc,stat_groups=None): parser.add_argument('-gages','--usgs-gages-filename',help='USGS rating curves',required=True) parser.add_argument('-flows','--nwm-flow-dir',help='NWM recurrence flows dir',required=True) parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) - parser.add_argument('-group','--stats-groups',help='column(s) to group stats',required=False,default=['location_id']) + parser.add_argument('-group','--stat-groups',help='column(s) to group stats',required=False) args = vars(parser.parse_args()) @@ -326,10 +320,10 @@ def calculate_rc_stats_elev(rc,stat_groups=None): nwm_flow_dir = args['nwm_flow_dir'] number_of_jobs = args['number_of_jobs'] stat_groups = args['stat_groups'] - - + + stat_groups = stat_groups.split() procs_list = [] - + huc_list = os.listdir(output_dir) for huc in huc_list: elev_table_filename = join(output_dir,huc,'usgs_elev_table.csv') @@ -337,7 +331,7 @@ def calculate_rc_stats_elev(rc,stat_groups=None): usgs_recurr_stats_filename = join(output_dir,huc,'usgs_interpolated_elevation_stats.csv') nwm_recurr_data_filename = join(output_dir,huc,'nwm_recurrence_flow_elevations.csv') rc_comparison_plot_filename = join(output_dir,huc,'FIM-USGS_rating_curve_comparison.png') - + if isfile(elev_table_filename): procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir,huc]) From f76f9779d277e4a3842ce35ed15b66464bca5eba Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Mon, 29 Mar 2021 16:16:56 +0000 Subject: [PATCH 33/66] adding back tools/generate_categorical_fim.py - thought that was an old file --- tools/generate_categorical_fim.py | 112 ++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100755 tools/generate_categorical_fim.py diff --git a/tools/generate_categorical_fim.py b/tools/generate_categorical_fim.py new file mode 100755 index 000000000..f51bf5aa8 --- /dev/null +++ b/tools/generate_categorical_fim.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +import subprocess +import argparse +import time +from pathlib import Path +import geopandas as gpd +import pandas as pd +from datetime import date + +def update_mapping_status(output_mapping_dir, output_flows_dir): + ''' + Updates the status for nws_lids from the flows subdirectory. Status + is updated for sites where the inundation.py routine was not able to + produce inundation for the supplied flow files. It is assumed that if + an error occured in inundation.py that all flow files for a given site + experienced the error as they all would have the same nwm segments. 
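The "did_it_map" bookkeeping described here reduces to finding ahps_lid folders that exist under the mapping output but contain nothing. A standalone sketch of that check, assuming the same directory layout as below (illustrative only):

    from pathlib import Path

    def empty_lid_dirs(output_mapping_dir):
        # Directories with no files or subfolders are treated as sites that failed to map
        return [d.name for d in Path(output_mapping_dir).rglob('*')
                if d.is_dir() and not any(d.iterdir())]
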
+ + Parameters + ---------- + output_mapping_dir : STR + Path to the output directory of all inundation maps. + output_flows_dir : STR + Path to the directory containing all flows. + + Returns + ------- + None. + + ''' + #Find all LIDs with empty mapping output folders + subdirs = [str(i) for i in Path(output_mapping_dir).rglob('**/*') if i.is_dir()] + empty_nws_lids = [Path(directory).name for directory in subdirs if not list(Path(directory).iterdir())] + + #Write list of empty nws_lids to DataFrame, these are sites that failed in inundation.py + mapping_df = pd.DataFrame({'nws_lid':empty_nws_lids}) + mapping_df['did_it_map'] = 'no' + mapping_df['map_status'] = ' and all categories failed to map' + + #Import shapefile output from flows creation + shapefile = Path(output_flows_dir)/'nws_lid_flows_sites.shp' + flows_df = gpd.read_file(shapefile) + + #Join failed sites to flows df + flows_df = flows_df.merge(mapping_df, how = 'left', on = 'nws_lid') + + #Switch mapped column to no for failed sites and update status + flows_df.loc[flows_df['did_it_map'] == 'no', 'mapped'] = 'no' + flows_df.loc[flows_df['did_it_map']=='no','status'] = flows_df['status'] + flows_df['map_status'] + + #Perform pass for HUCs where mapping was skipped due to missing data. + flows_hucs = [i.stem for i in Path(output_flows_dir).iterdir() if i.is_dir()] + mapping_hucs = [i.stem for i in Path(output_mapping_dir).iterdir() if i.is_dir()] + missing_mapping_hucs = list(set(flows_hucs) - set(mapping_hucs)) + #Update status for nws_lid in missing hucs and change mapped attribute to 'no' + flows_df.loc[flows_df.eval('HUC8 in @missing_mapping_hucs & mapped == "yes"'), 'status'] = flows_df['status'] + ' and all categories failed to map because missing HUC information' + flows_df.loc[flows_df.eval('HUC8 in @missing_mapping_hucs & mapped == "yes"'), 'mapped'] = 'no' + + #Clean up GeoDataFrame and rename columns for consistency. + flows_df = flows_df.drop(columns = ['did_it_map','map_status']) + flows_df = flows_df.rename(columns = {'nws_lid':'ahps_lid'}) + + #Write out to file + nws_lid_path = Path(output_mapping_dir) / 'nws_lid_sites.shp' + flows_df.to_file(nws_lid_path) + +if __name__ == '__main__': + + #Parse arguments + parser = argparse.ArgumentParser(description = 'Run Categorical FIM') + parser.add_argument('-f','--fim_version',help='Name of directory containing outputs of fim_run.sh',required=True) + parser.add_argument('-j','--number_of_jobs',help='Number of processes to use. Default is 1.',required=False, default="1",type=int) + args = vars(parser.parse_args()) + + #Get arguments + fim_version = args['fim_version'] + number_of_jobs = args['number_of_jobs'] + + #################################################################### + #Define default arguments. Modify these if necessary. + today = date.today().strftime('%m%d%Y') + fim_run_dir = Path(f'/data/previous_fim/{fim_version}') + output_flows_dir = Path(f'/data/catfim/{fim_version}/{today}/flows') + output_mapping_dir = Path(f'/data/catfim/{fim_version}/{today}/mapping') + nwm_us_search = '10' + nwm_ds_search = '10' + write_depth_tiff = False + #################################################################### + + #################################################################### + #Run CatFIM scripts in sequence + #################################################################### + #Generate CatFIM flow files. 
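Both stages below share one pattern: announce the step, run the stage script with subprocess, and report the elapsed minutes. A small helper capturing that pattern (a sketch only; the script names and flags are those used in the calls below):

    import subprocess
    import time

    def run_step(label, cmd):
        # cmd is a list of strings, e.g. ['python3', 'generate_categorical_fim_flows.py', '-w', str(output_flows_dir)]
        print(label)
        start = time.time()
        subprocess.call(cmd)
        elapsed_minutes = (time.time() - start) / 60
        print(f'Finished {label.lower()} in {elapsed_minutes} minutes')
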
+ print('Creating flow files') + start = time.time() + subprocess.call(['python3','generate_categorical_fim_flows.py', '-w' , str(output_flows_dir), '-u', nwm_us_search, '-d', nwm_ds_search]) + end = time.time() + elapsed_time = (end-start)/60 + print(f'Finished creating flow files in {elapsed_time} minutes') + + #Generate CatFIM mapping. + print('Begin mapping') + start = time.time() + subprocess.call(['python3','generate_categorical_fim_mapping.py', '-r' , str(fim_run_dir), '-s', str(output_flows_dir), '-o', str(output_mapping_dir), '-j', str(number_of_jobs)]) + end = time.time() + elapsed_time = (end-start)/60 + print(f'Finished mapping in {elapsed_time} minutes') + + #Updating Mapping Status + print('Updating mapping status') + update_mapping_status(str(output_mapping_dir), str(output_flows_dir)) + + \ No newline at end of file From 090339ab5cb2cbe860124988fadcec24d72cea9d Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Tue, 30 Mar 2021 22:10:16 +0000 Subject: [PATCH 34/66] addressing comments in PR review --- src/usgs_gage_crosswalk.py | 10 ++-- tools/rating_curve_comparison.py | 97 +++++++++++++++++++------------- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/src/usgs_gage_crosswalk.py b/src/usgs_gage_crosswalk.py index 6ce172856..8a8275028 100755 --- a/src/usgs_gage_crosswalk.py +++ b/src/usgs_gage_crosswalk.py @@ -39,20 +39,22 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in input_catchment = gpd.read_file(input_catchment_filename) dem_adj = rasterio.open(dem_adj_filename,'r') + if input_flows.HydroID.dtype != 'int': input_flows.HydroID = input_flows.HydroID.astype(int) + # Identify closest HydroID closest_catchment = gpd.sjoin(usgs_gages, input_catchment, how='left', op='within').reset_index(drop=True) closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID','min_thal_elev','med_thal_elev','max_thal_elev', 'order_']) + closest_hydro_id = closest_hydro_id.dropna() - if input_flows.HydroID.dtype != 'int': input_flows.HydroID = input_flows.HydroID.astype(int) + # Get USGS gages that are within catchment boundaries + usgs_gages = usgs_gages.loc[usgs_gages.site_no.isin(list(closest_hydro_id.site_no))] columns = ['location_id','HydroID','dem_elevation','dem_adj_elevation','min_thal_elev', 'med_thal_elev','max_thal_elev','str_order'] gage_data = [] # Move USGS gage to stream for index, gage in usgs_gages.iterrows(): - print (f"usgs gage: {gage.site_no}") - # Get stream attributes hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() str_order = str(int(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].order_.item())) @@ -89,7 +91,7 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in dem_adj_elev = round(list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item(),2) # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table - site_elevations = [gage.site_no, hydro_id, dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev,str_order] + site_elevations = [str(gage.site_no), str(hydro_id), dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev,str(str_order)] gage_data.append(site_elevations) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index c483157a2..d1498abdf 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -11,6 +11,8 @@ from functools import reduce from multiprocessing import Pool from os.path import isfile, 
join, dirname +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) """ Plot Rating Curves and Compare to USGS Gages @@ -42,9 +44,9 @@ def generate_rating_curve_metrics(args): nwm_flow_dir = args[6] huc = args[7] - elev_table = pd.read_csv(elev_table_filename) - hydrotable = pd.read_csv(hydrotable_filename) - usgs_gages = pd.read_csv(usgs_gages_filename) + elev_table = pd.read_csv(elev_table_filename,dtype={'location_id': str}) + hydrotable = pd.read_csv(hydrotable_filename,dtype={'HUC': str,'feature_id': str}) + usgs_gages = pd.read_csv(usgs_gages_filename,dtype={'location_id': str}) # Join rating curves with elevation data hydrotable = hydrotable.merge(elev_table, on="HydroID") @@ -55,21 +57,21 @@ def generate_rating_curve_metrics(args): if len(usgs_gages) > 0: # Adjust rating curve to elevation - hydrotable['elevation'] = (hydrotable.stage + hydrotable.dem_adj_elevation) * 3.28084 # convert from m to ft + hydrotable['elevation_ft'] = (hydrotable.stage + hydrotable.dem_adj_elevation) * 3.28084 # convert from m to ft # hydrotable['raw_elevation'] = (hydrotable.stage + hydrotable.dem_elevation) * 3.28084 # convert from m to ft hydrotable['discharge_cfs'] = hydrotable.discharge_cms * 35.3147 - usgs_gages = usgs_gages.rename(columns={"flow": "discharge_cfs", "elevation_navd88": "elevation"}) + usgs_gages = usgs_gages.rename(columns={"flow": "discharge_cfs", "elevation_navd88": "elevation_ft"}) hydrotable['source'] = "FIM" usgs_gages['source'] = "USGS" - limited_hydrotable = hydrotable.filter(items=['location_id','elevation','discharge_cfs','source']) - select_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation', 'discharge_cfs','source']) + limited_hydrotable = hydrotable.filter(items=['location_id','elevation_ft','discharge_cfs','source']) + select_usgs_gages = usgs_gages.filter(items=['location_id', 'elevation_ft', 'discharge_cfs','source']) rating_curves = limited_hydrotable.append(select_usgs_gages) # Add stream order - stream_order = hydrotable.filter(items=['location_id','str_order']).drop_duplicates() - rating_curves = rating_curves.merge(stream_order, on='location_id') + stream_orders = hydrotable.filter(items=['location_id','str_order']).drop_duplicates() + rating_curves = rating_curves.merge(stream_orders, on='location_id') rating_curves['str_order'] = rating_curves['str_order'].astype('int') generate_facet_plot(rating_curves, rc_comparison_plot_filename) @@ -106,6 +108,10 @@ def generate_rating_curve_metrics(args): continue str_order = np.unique(usgs_rc.str_order).item() + try: + feature_id = str(gage.feature_id) + except: + print(f"huc: {huc}; gage: {gage.location_id}") usgs_pred_elev = get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) @@ -133,22 +139,24 @@ def generate_rating_curve_metrics(args): usgs_pred_elev = usgs_pred_elev.merge(fim_pred_elev, on=['recurr_interval','discharge_cfs']) usgs_pred_elev['HUC'] = huc + usgs_pred_elev['HUC4'] = huc[0:4] usgs_pred_elev['str_order'] = str_order + usgs_pred_elev['feature_id'] = feature_id - usgs_pred_elev = pd.melt(usgs_pred_elev, id_vars=['location_id','recurr_interval','discharge_cfs','HUC','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation') + usgs_pred_elev = pd.melt(usgs_pred_elev, id_vars=['location_id','feature_id','recurr_interval','discharge_cfs','HUC','HUC4','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation_ft') nwm_recurr_data_table = nwm_recurr_data_table.append(usgs_pred_elev) ## Interpolate FIM 
elevation at USGS observations # Sort stage in ascending order - usgs_rc = usgs_rc.rename(columns={"elevation": "USGS"}) + usgs_rc = usgs_rc.rename(columns={"elevation_ft": "USGS"}) usgs_rc = usgs_rc.sort_values('USGS',ascending=True) fim_rc = fim_rc.merge(usgs_crosswalk, on="location_id") - usgs_rc['FIM'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation'], left = np.nan, right = np.nan) + usgs_rc['FIM'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation_ft'], left = np.nan, right = np.nan) usgs_rc = usgs_rc[usgs_rc['FIM'].notna()] usgs_rc = usgs_rc.drop(columns=["source"]) - usgs_rc = pd.melt(usgs_rc, id_vars=['location_id','discharge_cfs','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation') + usgs_rc = pd.melt(usgs_rc, id_vars=['location_id','discharge_cfs','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation_ft') if not usgs_rc.empty: usgs_recurr_data = usgs_recurr_data.append(usgs_rc) @@ -158,11 +166,13 @@ def generate_rating_curve_metrics(args): usgs_recurr_stats_table = calculate_rc_stats_elev(usgs_recurr_data) usgs_recurr_stats_table.to_csv(usgs_recurr_stats_filename,index=False) - # Generate plots - fim_elev_at_USGS_rc_plot_filename = join(dirname(rc_comparison_plot_filename),'FIM_elevations_at_USGS_rc_' + str(huc) +'.png') - generate_facet_plot(usgs_recurr_data, fim_elev_at_USGS_rc_plot_filename) + # Generate plots (not currently being used) + # fim_elev_at_USGS_rc_plot_filename = join(dirname(rc_comparison_plot_filename),'FIM_elevations_at_USGS_rc_' + str(huc) +'.png') + # generate_facet_plot(usgs_recurr_data, fim_elev_at_USGS_rc_plot_filename) if not nwm_recurr_data_table.empty: + nwm_recurr_data_table.discharge_cfs = np.round(nwm_recurr_data_table.discharge_cfs,2) + nwm_recurr_data_table.elevation_ft = np.round(nwm_recurr_data_table.elevation_ft,2) nwm_recurr_data_table.to_csv(nwm_recurr_data_filename,index=False) else: @@ -204,18 +214,24 @@ def generate_facet_plot(rc, plot_filename): # Filter FIM elevation based on USGS data for gage in rc.location_id.unique(): - min_elev = rc.loc[(rc.location_id==gage) & (rc.source=='USGS')].elevation.min() - max_elev = rc.loc[(rc.location_id==gage) & (rc.source=='USGS')].elevation.max() + min_elev = rc.loc[(rc.location_id==gage) & (rc.source=='USGS')].elevation_ft.min() + max_elev = rc.loc[(rc.location_id==gage) & (rc.source=='USGS')].elevation_ft.max() - rc = rc.drop(rc[(rc.location_id==gage) & (rc.source=='FIM') & (rc.elevation > (max_elev + 2))].index) - rc = rc.drop(rc[(rc.location_id==gage) & (rc.source=='FIM') & (rc.elevation < min_elev - 2)].index) + rc = rc.drop(rc[(rc.location_id==gage) & (rc.source=='FIM') & (rc.elevation_ft > (max_elev + 2))].index) + rc = rc.drop(rc[(rc.location_id==gage) & (rc.source=='FIM') & (rc.elevation_ft < min_elev - 2)].index) rc = rc.rename(columns={"location_id": "USGS Gage"}) ## Generate rating curve plots + num_plots = len(rc["USGS Gage"].unique()) + if num_plots > 3: + columns = num_plots // 3 + else: + columns = 1 + sns.set(style="ticks") - g = sns.FacetGrid(rc, col="USGS Gage", hue="source",sharex=False, sharey=False,col_wrap=3) - g.map(sns.scatterplot, "discharge_cfs", "elevation", palette="tab20c", marker="o") + g = sns.FacetGrid(rc, col="USGS Gage", hue="source",sharex=False, sharey=False,col_wrap=columns) + g.map(sns.scatterplot, "discharge_cfs", "elevation_ft", palette="tab20c", marker="o") g.set_axis_labels(x_var="Discharge (cfs)", y_var="Elevation (ft)") # Adjust 
the arrangement of the plots @@ -234,7 +250,7 @@ def get_reccur_intervals(site_rc, usgs_crosswalk,nwm_recurr_intervals): if nwm_ids > 0: nwm_recurr_intervals = nwm_recurr_intervals.copy().loc[nwm_recurr_intervals.feature_id==usgs_site.feature_id.drop_duplicates().item()] - nwm_recurr_intervals['pred_elev'] = np.interp(nwm_recurr_intervals.discharge_cfs.values, usgs_site['discharge_cfs'], usgs_site['elevation'], left = np.nan, right = np.nan) + nwm_recurr_intervals['pred_elev'] = np.interp(nwm_recurr_intervals.discharge_cfs.values, usgs_site['discharge_cfs'], usgs_site['elevation_ft'], left = np.nan, right = np.nan) return nwm_recurr_intervals @@ -249,12 +265,12 @@ def calculate_rc_stats_elev(rc,stat_groups=None): # Collect any extra columns not associated with melt col_index = list(rc.columns) - pivot_vars = ['source','elevation'] + pivot_vars = ['source','elevation_ft'] col_index = [col for col in col_index if col not in pivot_vars] # Unmelt elevation/source rc_unmelt = (rc.set_index(col_index) - .pivot(columns="source")['elevation'] + .pivot(columns="source")['elevation_ft'] .reset_index() .rename_axis(None, axis=1) ) @@ -284,22 +300,22 @@ def calculate_rc_stats_elev(rc,stat_groups=None): .reset_index(stat_groups, drop = False).rename({0: "y_min"}, axis=1) # Collect variables for NRMSE - NRMSE_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [sum_y_diff, n, y_max, y_min]) - NRMSE_table_group = NRMSE_table.groupby(stat_groups) + nrmse_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [sum_y_diff, n, y_max, y_min]) + nrmse_table_group = nrmse_table.groupby(stat_groups) - # Calculate NRMSE - NRMSE = NRMSE_table_group.apply(lambda x: ((x['sum_y_diff'] / x['n']) ** 0.5)/x['y_max'] - x['y_min'])\ - .reset_index(stat_groups, drop = False).rename({0: "NRMSE"}, axis=1) + # Calculate nrmse + nrmse = nrmse_table_group.apply(lambda x: ((x['sum_y_diff'] / x['n']) ** 0.5)/x['y_max'] - x['y_min'])\ + .reset_index(stat_groups, drop = False).rename({0: "nrmse"}, axis=1) # Calculate Mean Absolute Depth Difference mean_abs_y_diff = station_rc.apply(lambda x: abs(x["yhat_minus_y"]).mean())\ - .reset_index(stat_groups, drop = False).rename({0: "mean_abs_y_diff"}, axis=1) + .reset_index(stat_groups, drop = False).rename({0: "mean_abs_y_diff_ft"}, axis=1) # Calculate Percent Bias percent_bias = station_rc.apply(lambda x: 100 * (x["yhat_minus_y"].sum()/x[usgs_elev].sum()))\ .reset_index(stat_groups, drop = False).rename({0: "percent_bias"}, axis=1) - rc_stat_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [NRMSE, mean_abs_y_diff, percent_bias]) + rc_stat_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [nrmse, mean_abs_y_diff, percent_bias]) return rc_stat_table @@ -326,14 +342,15 @@ def calculate_rc_stats_elev(rc,stat_groups=None): huc_list = os.listdir(output_dir) for huc in huc_list: - elev_table_filename = join(output_dir,huc,'usgs_elev_table.csv') - hydrotable_filename = join(output_dir,huc,'hydroTable.csv') - usgs_recurr_stats_filename = join(output_dir,huc,'usgs_interpolated_elevation_stats.csv') - nwm_recurr_data_filename = join(output_dir,huc,'nwm_recurrence_flow_elevations.csv') - rc_comparison_plot_filename = join(output_dir,huc,'FIM-USGS_rating_curve_comparison.png') - - if isfile(elev_table_filename): - procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir,huc]) + if huc != 'logs': + 
elev_table_filename = join(output_dir,huc,'usgs_elev_table.csv') + hydrotable_filename = join(output_dir,huc,'hydroTable.csv') + usgs_recurr_stats_filename = join(output_dir,huc,'usgs_interpolated_elevation_stats.csv') + nwm_recurr_data_filename = join(output_dir,huc,'nwm_recurrence_flow_elevations.csv') + rc_comparison_plot_filename = join(output_dir,huc,'FIM-USGS_rating_curve_comparison.png') + + if isfile(elev_table_filename): + procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir,huc]) # Initiate multiprocessing print(f"Generating rating curve metrics for {len(procs_list)} hucs using {number_of_jobs} jobs") From 437db29b546e5ce7b002b3cc734f56fc0b9791d8 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 31 Mar 2021 17:13:07 +0000 Subject: [PATCH 35/66] addressing comments in PR review --- src/run_by_unit.sh | 2 +- src/usgs_gage_crosswalk.py | 55 ++++++++------- tools/rating_curve_comparison.py | 114 ++++++++++++++++++++++--------- 3 files changed, 111 insertions(+), 60 deletions(-) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 6805be7e3..1242768dc 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -437,7 +437,7 @@ Tcount echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv date -u Tstart -$srcDir/usgs_gage_crosswalk.py -gages $inputDataDir/ahp_sites/evaluated_active_gages.shp -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv +$srcDir/usgs_gage_crosswalk.py -gages $inputDataDir/usgs_gages/usgs_gages.gpkg -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv Tcount ## CLEANUP OUTPUTS ## diff --git a/src/usgs_gage_crosswalk.py b/src/usgs_gage_crosswalk.py index 8a8275028..29ef7b592 100755 --- a/src/usgs_gage_crosswalk.py +++ b/src/usgs_gage_crosswalk.py @@ -8,6 +8,8 @@ import argparse import pygeos from shapely.wkb import dumps, loads +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) ''' Get elevation at adjusted USGS gages locations @@ -54,45 +56,42 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in # Move USGS gage to stream for index, gage in usgs_gages.iterrows(): - print (f"usgs gage: {gage.site_no}") + # Get stream attributes hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() str_order = str(int(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].order_.item())) + min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].min_thal_elev.item(),2) + med_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].med_thal_elev.item(),2) + max_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].max_thal_elev.item(),2) - if not np.isnan(hydro_id): - - min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].min_thal_elev.item(),2) - med_thal_elev = 
round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].med_thal_elev.item(),2) - max_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].max_thal_elev.item(),2) - - # Convert headwater point geometries to WKB representation - wkb_gages = dumps(gage.geometry) + # Convert headwater point geometries to WKB representation + wkb_gages = dumps(gage.geometry) - # Create pygeos headwater point geometries from WKB representation - gage_bin_geom = pygeos.io.from_wkb(wkb_gages) + # Create pygeos headwater point geometries from WKB representation + gage_bin_geom = pygeos.io.from_wkb(wkb_gages) - # Closest segment to headwater - closest_stream = input_flows.loc[input_flows.HydroID==hydro_id] - wkb_closest_stream = dumps(closest_stream.geometry.item()) - stream_bin_geom = pygeos.io.from_wkb(wkb_closest_stream) + # Closest segment to headwater + closest_stream = input_flows.loc[input_flows.HydroID==hydro_id] + wkb_closest_stream = dumps(closest_stream.geometry.item()) + stream_bin_geom = pygeos.io.from_wkb(wkb_closest_stream) - # Linear reference headwater to closest stream segment - gage_distance_to_line = pygeos.linear.line_locate_point(stream_bin_geom, gage_bin_geom) - referenced_gage = pygeos.linear.line_interpolate_point(stream_bin_geom, gage_distance_to_line) + # Linear reference headwater to closest stream segment + gage_distance_to_line = pygeos.linear.line_locate_point(stream_bin_geom, gage_bin_geom) + referenced_gage = pygeos.linear.line_interpolate_point(stream_bin_geom, gage_distance_to_line) - # Convert geometries to wkb representation - bin_referenced_gage = pygeos.io.to_wkb(referenced_gage) + # Convert geometries to wkb representation + bin_referenced_gage = pygeos.io.to_wkb(referenced_gage) - # Convert to shapely geometries - shply_referenced_gage = loads(bin_referenced_gage) + # Convert to shapely geometries + shply_referenced_gage = loads(bin_referenced_gage) - # Sample rasters at adjusted gage - dem_m_elev = round(list(rasterio.sample.sample_gen(dem_m,shply_referenced_gage.coords))[0].item(),2) - dem_adj_elev = round(list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item(),2) + # Sample rasters at adjusted gage + dem_m_elev = round(list(rasterio.sample.sample_gen(dem_m,shply_referenced_gage.coords))[0].item(),2) + dem_adj_elev = round(list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item(),2) - # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table - site_elevations = [str(gage.site_no), str(hydro_id), dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev,str(str_order)] - gage_data.append(site_elevations) + # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table + site_elevations = [str(gage.site_no), str(hydro_id), dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev,str(str_order)] + gage_data.append(site_elevations) elev_table = pd.DataFrame(gage_data, columns=columns) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index d1498abdf..0f61080da 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -11,6 +11,7 @@ from functools import reduce from multiprocessing import Pool from os.path import isfile, join, dirname +import shutil import warnings warnings.simplefilter(action='ignore', category=FutureWarning) @@ -19,8 +20,10 @@ Parameters ---------- - output_dir : str + fim_dir : str Directory containing FIM output folders. 
+ output_dir : str + Directory containing rating curve plots and tables. usgs_gages_filename : str File name of USGS rating curves. nwm_flow_dir : str @@ -58,7 +61,7 @@ def generate_rating_curve_metrics(args): # Adjust rating curve to elevation hydrotable['elevation_ft'] = (hydrotable.stage + hydrotable.dem_adj_elevation) * 3.28084 # convert from m to ft - # hydrotable['raw_elevation'] = (hydrotable.stage + hydrotable.dem_elevation) * 3.28084 # convert from m to ft + # hydrotable['raw_elevation_ft'] = (hydrotable.stage + hydrotable.dem_elevation) * 3.28084 # convert from m to ft hydrotable['discharge_cfs'] = hydrotable.discharge_cms * 35.3147 usgs_gages = usgs_gages.rename(columns={"flow": "discharge_cfs", "elevation_navd88": "elevation_ft"}) @@ -74,49 +77,53 @@ def generate_rating_curve_metrics(args): rating_curves = rating_curves.merge(stream_orders, on='location_id') rating_curves['str_order'] = rating_curves['str_order'].astype('int') + # plot rating curves generate_facet_plot(rating_curves, rc_comparison_plot_filename) - ## Calculate metrics for NWM reccurence intervals # NWM recurr intervals recurr_1_5_yr_filename = join(nwm_flow_dir,'recurr_1_5_cms.csv') recurr_5_yr_filename = join(nwm_flow_dir,'recurr_5_0_cms.csv') recurr_10_yr_filename = join(nwm_flow_dir,'recurr_10_0_cms.csv') - recurr_1_5_yr = pd.read_csv(recurr_1_5_yr_filename) + # Update column names + recurr_1_5_yr = pd.read_csv(recurr_1_5_yr_filename,dtype={'feature_id': str}) recurr_1_5_yr = recurr_1_5_yr.rename(columns={"discharge": "1.5"}) - recurr_5_yr = pd.read_csv(recurr_5_yr_filename) + recurr_5_yr = pd.read_csv(recurr_5_yr_filename,dtype={'feature_id': str}) recurr_5_yr = recurr_5_yr.rename(columns={"discharge": "5.0"}) - recurr_10_yr = pd.read_csv(recurr_10_yr_filename) + recurr_10_yr = pd.read_csv(recurr_10_yr_filename,dtype={'feature_id': str}) recurr_10_yr = recurr_10_yr.rename(columns={"discharge": "10.0"}) + # Merge NWM recurr intervals into a single layer nwm_recurr_intervals_all = reduce(lambda x,y: pd.merge(x,y, on='feature_id', how='outer'), [recurr_1_5_yr, recurr_5_yr, recurr_10_yr]) nwm_recurr_intervals_all = pd.melt(nwm_recurr_intervals_all, id_vars=['feature_id'], value_vars=['1.5','5.0','10.0'], var_name='recurr_interval', value_name='discharge_cms') nwm_recurr_intervals_all['discharge_cfs'] = nwm_recurr_intervals_all.discharge_cms * 35.3147 nwm_recurr_intervals_all = nwm_recurr_intervals_all.filter(items=['discharge_cfs', 'recurr_interval','feature_id']).drop_duplicates() + + # Identify unique gages usgs_crosswalk = hydrotable.filter(items=['location_id', 'feature_id']).drop_duplicates() nwm_recurr_data_table = pd.DataFrame() usgs_recurr_data = pd.DataFrame() + # Interpolate USGS/FIM elevation at each gage for index, gage in usgs_crosswalk.iterrows(): - ## Interpolate USGS/FIM elevation at NWM recurrence intervals + # Interpolate USGS elevation at NWM recurrence intervals usgs_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.source=="USGS")] if len(usgs_rc) <1: + print(f"missing USGS rating curve data for usgs station {gage.location_id} in huc {huc}") continue str_order = np.unique(usgs_rc.str_order).item() - try: - feature_id = str(gage.feature_id) - except: - print(f"huc: {huc}; gage: {gage.location_id}") + feature_id = str(gage.feature_id) usgs_pred_elev = get_reccur_intervals(usgs_rc, usgs_crosswalk,nwm_recurr_intervals_all) # Handle sites missing data if len(usgs_pred_elev) <1: + print(f"missing USGS elevation data for usgs station {gage.location_id} in huc 
{huc}") continue # Clean up data @@ -126,11 +133,16 @@ def generate_rating_curve_metrics(args): # Interpolate FIM elevation at NWM recurrence intervals fim_rc = rating_curves.loc[(rating_curves.location_id==gage.location_id) & (rating_curves.source=="FIM")] + + if len(fim_rc) <1: + print(f"missing FIM rating curve data for usgs station {gage.location_id} in huc {huc}") + continue + fim_pred_elev = get_reccur_intervals(fim_rc, usgs_crosswalk,nwm_recurr_intervals_all) # Handle sites missing data if len(fim_pred_elev) <1: - print(f"missing fim elevation data for usgs station {gage.location_id} in huc {huc}") + print(f"missing FIM elevation data for usgs station {gage.location_id} in huc {huc}") continue # Clean up data @@ -138,24 +150,29 @@ def generate_rating_curve_metrics(args): fim_pred_elev = fim_pred_elev.filter(items=['recurr_interval', 'discharge_cfs','FIM']) usgs_pred_elev = usgs_pred_elev.merge(fim_pred_elev, on=['recurr_interval','discharge_cfs']) + # Add attributes usgs_pred_elev['HUC'] = huc usgs_pred_elev['HUC4'] = huc[0:4] usgs_pred_elev['str_order'] = str_order usgs_pred_elev['feature_id'] = feature_id + # Melt dataframe usgs_pred_elev = pd.melt(usgs_pred_elev, id_vars=['location_id','feature_id','recurr_interval','discharge_cfs','HUC','HUC4','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation_ft') nwm_recurr_data_table = nwm_recurr_data_table.append(usgs_pred_elev) - ## Interpolate FIM elevation at USGS observations - # Sort stage in ascending order + # Interpolate FIM elevation at USGS observations (not currently being used) + fim_rc = fim_rc.merge(usgs_crosswalk, on="location_id") usgs_rc = usgs_rc.rename(columns={"elevation_ft": "USGS"}) + + # Sort stage in ascending order usgs_rc = usgs_rc.sort_values('USGS',ascending=True) - fim_rc = fim_rc.merge(usgs_crosswalk, on="location_id") + # Interpolate FIM elevation at USGS observations usgs_rc['FIM'] = np.interp(usgs_rc.discharge_cfs.values, fim_rc['discharge_cfs'], fim_rc['elevation_ft'], left = np.nan, right = np.nan) usgs_rc = usgs_rc[usgs_rc['FIM'].notna()] usgs_rc = usgs_rc.drop(columns=["source"]) + # Melt dataframe usgs_rc = pd.melt(usgs_rc, id_vars=['location_id','discharge_cfs','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation_ft') if not usgs_rc.empty: @@ -166,7 +183,7 @@ def generate_rating_curve_metrics(args): usgs_recurr_stats_table = calculate_rc_stats_elev(usgs_recurr_data) usgs_recurr_stats_table.to_csv(usgs_recurr_stats_filename,index=False) - # Generate plots (not currently being used) + # # Generate plots # fim_elev_at_USGS_rc_plot_filename = join(dirname(rc_comparison_plot_filename),'FIM_elevations_at_USGS_rc_' + str(huc) +'.png') # generate_facet_plot(usgs_recurr_data, fim_elev_at_USGS_rc_plot_filename) @@ -182,7 +199,14 @@ def aggregate_metrics(output_dir,procs_list,stat_groups): agg_usgs_interp_elev_stats = join(output_dir,'agg_usgs_interp_elev_stats.csv') agg_nwm_recurr_flow_elev = join(output_dir,'agg_nwm_recurr_flow_elevations.csv') - agg_nwm_recurr_flow_elev_stats = join(output_dir,'agg_nwm_recurr_flow_elev_stats.csv') + agg_nwm_recurr_flow_elev_stats = join(output_dir,f"agg_nwm_recurr_flow_elev_stats_{'_'.join(stat_groups)}.csv") + + if os.path.isfile(agg_usgs_interp_elev_stats): + os.remove(agg_usgs_interp_elev_stats) + if os.path.isfile(agg_nwm_recurr_flow_elev): + os.remove(agg_nwm_recurr_flow_elev) + if os.path.isfile(agg_nwm_recurr_flow_elev_stats): + os.remove(agg_nwm_recurr_flow_elev_stats) for huc in procs_list: if 
os.path.isfile(huc[3]): @@ -195,7 +219,8 @@ def aggregate_metrics(output_dir,procs_list,stat_groups): usgs_recurr_stats.to_csv(agg_usgs_interp_elev_stats,index=False) if os.path.isfile(huc[4]): - nwm_recurr_data = pd.read_csv(huc[4]) + nwm_recurr_data = pd.read_csv(huc[4],dtype={'location_id': str, + 'feature_id': str}) # Write/append nwm_recurr_data if os.path.isfile(agg_nwm_recurr_flow_elev): @@ -203,7 +228,8 @@ def aggregate_metrics(output_dir,procs_list,stat_groups): else: nwm_recurr_data.to_csv(agg_nwm_recurr_flow_elev,index=False) - agg_stats = pd.read_csv(agg_nwm_recurr_flow_elev) + agg_stats = pd.read_csv(agg_nwm_recurr_flow_elev,dtype={'location_id': str, + 'feature_id': str}) agg_recurr_stats_table = calculate_rc_stats_elev(agg_stats,stat_groups) @@ -282,9 +308,9 @@ def calculate_rc_stats_elev(rc,stat_groups=None): rc_unmelt["yhat_minus_y"] = rc_unmelt[src_elev] - rc_unmelt[usgs_elev] rc_unmelt["yhat_minus_y_squared"] = rc_unmelt["yhat_minus_y"] ** 2 + # Calculate metrics by group station_rc = rc_unmelt.groupby(stat_groups) - ## Calculate metrics by group # Calculate variables for NRMSE sum_y_diff = station_rc.apply(lambda x: x["yhat_minus_y_squared"].sum())\ .reset_index(stat_groups, drop = False).rename({0: "sum_y_diff"}, axis=1) @@ -304,26 +330,27 @@ def calculate_rc_stats_elev(rc,stat_groups=None): nrmse_table_group = nrmse_table.groupby(stat_groups) # Calculate nrmse - nrmse = nrmse_table_group.apply(lambda x: ((x['sum_y_diff'] / x['n']) ** 0.5)/x['y_max'] - x['y_min'])\ + nrmse = nrmse_table_group.apply(lambda x: ((x['sum_y_diff'] / x['n']) ** 0.5) / (x['y_max'] - x['y_min']))\ .reset_index(stat_groups, drop = False).rename({0: "nrmse"}, axis=1) # Calculate Mean Absolute Depth Difference - mean_abs_y_diff = station_rc.apply(lambda x: abs(x["yhat_minus_y"]).mean())\ + mean_abs_y_diff = station_rc.apply(lambda x: (abs(x["yhat_minus_y"]).mean() / x["location_id"].count()))\ .reset_index(stat_groups, drop = False).rename({0: "mean_abs_y_diff_ft"}, axis=1) # Calculate Percent Bias - percent_bias = station_rc.apply(lambda x: 100 * (x["yhat_minus_y"].sum()/x[usgs_elev].sum()))\ + percent_bias = station_rc.apply(lambda x: 100 * (x["yhat_minus_y"].sum() / x[usgs_elev].sum()))\ .reset_index(stat_groups, drop = False).rename({0: "percent_bias"}, axis=1) rc_stat_table = reduce(lambda x,y: pd.merge(x,y, on=stat_groups, how='outer'), [nrmse, mean_abs_y_diff, percent_bias]) - return rc_stat_table + if __name__ == '__main__': parser = argparse.ArgumentParser(description='generate rating curve plots and tables for FIM and USGS gages') - parser.add_argument('-output_dir','--output-dir', help='FIM output dir', required=True) + parser.add_argument('-fim_dir','--fim-dir', help='FIM output dir', required=True) + parser.add_argument('-output_dir','--output-dir', help='rating curves output folder', required=True) parser.add_argument('-gages','--usgs-gages-filename',help='USGS rating curves',required=True) parser.add_argument('-flows','--nwm-flow-dir',help='NWM recurrence flows dir',required=True) parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) @@ -331,23 +358,41 @@ def calculate_rc_stats_elev(rc,stat_groups=None): args = vars(parser.parse_args()) + fim_dir = args['fim_dir'] output_dir = args['output_dir'] usgs_gages_filename = args['usgs_gages_filename'] nwm_flow_dir = args['nwm_flow_dir'] number_of_jobs = args['number_of_jobs'] stat_groups = args['stat_groups'] + # fim_dir= 'data/outputs/dev-usgs-crosswalk_PR_ms_c' + # output_dir= 
'data/tools/rating_curve_comparison/dev-usgs-crosswalk_PR_ms_c' + # usgs_gages_filename= 'data/temp/tsg/usgs_rating_curve/usgs_rating_curves.csv' + # nwm_flow_dir= '/data/inundation_review/inundation_nwm_recurr/nwm_recurr_flow_data' + # stat_groups= 'recurr_interval' + + # Open log file + sys.__stdout__ = sys.stdout + log_file = open(join(output_dir,'rating_curve_comparison.log'),"w") + sys.stdout = log_file + stat_groups = stat_groups.split() procs_list = [] - huc_list = os.listdir(output_dir) + plots_dir = join(output_dir,'plots') + os.makedirs(plots_dir, exist_ok=True) + tables_dir = join(output_dir,'tables') + os.makedirs(tables_dir, exist_ok=True) + + huc_list = os.listdir(fim_dir) for huc in huc_list: + if huc != 'logs': - elev_table_filename = join(output_dir,huc,'usgs_elev_table.csv') - hydrotable_filename = join(output_dir,huc,'hydroTable.csv') - usgs_recurr_stats_filename = join(output_dir,huc,'usgs_interpolated_elevation_stats.csv') - nwm_recurr_data_filename = join(output_dir,huc,'nwm_recurrence_flow_elevations.csv') - rc_comparison_plot_filename = join(output_dir,huc,'FIM-USGS_rating_curve_comparison.png') + elev_table_filename = join(fim_dir,huc,'usgs_elev_table.csv') + hydrotable_filename = join(fim_dir,huc,'hydroTable.csv') + usgs_recurr_stats_filename = join(tables_dir,f"usgs_interpolated_elevation_stats_{huc}.csv") + nwm_recurr_data_filename = join(tables_dir,f"nwm_recurrence_flow_elevations_{huc}.csv") + rc_comparison_plot_filename = join(plots_dir,f"FIM-USGS_rating_curve_comparison_{huc}.png") if isfile(elev_table_filename): procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir,huc]) @@ -359,3 +404,10 @@ def calculate_rc_stats_elev(rc,stat_groups=None): print(f"Aggregating rating curve metrics for {len(procs_list)} hucs") aggregate_metrics(output_dir,procs_list,stat_groups) + + print('Delete intermediate tables') + shutil.rmtree(tables_dir, ignore_errors=True) + + # Close log file + sys.stdout = sys.__stdout__ + log_file.close() From 1f33dc9a6d2976500abb115bca275c88c4f2b4fc Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 31 Mar 2021 17:29:12 +0000 Subject: [PATCH 36/66] removing comments --- tools/rating_curve_comparison.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index 0f61080da..6cd232ada 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -160,7 +160,7 @@ def generate_rating_curve_metrics(args): usgs_pred_elev = pd.melt(usgs_pred_elev, id_vars=['location_id','feature_id','recurr_interval','discharge_cfs','HUC','HUC4','str_order'], value_vars=['USGS','FIM'], var_name="source", value_name='elevation_ft') nwm_recurr_data_table = nwm_recurr_data_table.append(usgs_pred_elev) - # Interpolate FIM elevation at USGS observations (not currently being used) + # Interpolate FIM elevation at USGS observations fim_rc = fim_rc.merge(usgs_crosswalk, on="location_id") usgs_rc = usgs_rc.rename(columns={"elevation_ft": "USGS"}) @@ -183,7 +183,7 @@ def generate_rating_curve_metrics(args): usgs_recurr_stats_table = calculate_rc_stats_elev(usgs_recurr_data) usgs_recurr_stats_table.to_csv(usgs_recurr_stats_filename,index=False) - # # Generate plots + # # Generate plots (not currently being used) # fim_elev_at_USGS_rc_plot_filename = join(dirname(rc_comparison_plot_filename),'FIM_elevations_at_USGS_rc_' + str(huc) +'.png') 
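# Reference sketch (not part of the patch): rating_curve_comparison.py above now
# computes grouped error metrics where
#   NRMSE        = sqrt( sum((yhat - y)^2) / n ) / (y_max - y_min)
#   percent bias = 100 * sum(yhat - y) / sum(y)
# The toy dataframe and column names below are assumptions used only to illustrate
# that grouped calculation with pandas.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'group':   ['a', 'a', 'a', 'b', 'b'],
    'usgs_ft': [10.0, 12.0, 15.0, 8.0, 9.5],
    'fim_ft':  [10.5, 11.0, 16.0, 8.2, 9.0],
})
toy['yhat_minus_y'] = toy.fim_ft - toy.usgs_ft

def group_metrics(g):
    # Root-mean-square error normalized by the observed elevation range,
    # plus percent bias, computed per group.
    rmse = np.sqrt((g['yhat_minus_y'] ** 2).mean())
    nrmse = rmse / (g.usgs_ft.max() - g.usgs_ft.min())
    percent_bias = 100 * g['yhat_minus_y'].sum() / g.usgs_ft.sum()
    return pd.Series({'nrmse': nrmse, 'percent_bias': percent_bias})

print(toy.groupby('group').apply(group_metrics))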
# generate_facet_plot(usgs_recurr_data, fim_elev_at_USGS_rc_plot_filename) @@ -237,6 +237,7 @@ def aggregate_metrics(output_dir,procs_list,stat_groups): def generate_facet_plot(rc, plot_filename): + # Filter FIM elevation based on USGS data for gage in rc.location_id.unique(): @@ -365,12 +366,6 @@ def calculate_rc_stats_elev(rc,stat_groups=None): number_of_jobs = args['number_of_jobs'] stat_groups = args['stat_groups'] - # fim_dir= 'data/outputs/dev-usgs-crosswalk_PR_ms_c' - # output_dir= 'data/tools/rating_curve_comparison/dev-usgs-crosswalk_PR_ms_c' - # usgs_gages_filename= 'data/temp/tsg/usgs_rating_curve/usgs_rating_curves.csv' - # nwm_flow_dir= '/data/inundation_review/inundation_nwm_recurr/nwm_recurr_flow_data' - # stat_groups= 'recurr_interval' - # Open log file sys.__stdout__ = sys.stdout log_file = open(join(output_dir,'rating_curve_comparison.log'),"w") From 92f315f10953a41b7414afadae4eb60c3147575c Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Wed, 31 Mar 2021 19:42:54 +0000 Subject: [PATCH 37/66] commenting out local headwater; refactoring pre-processing --- src/adjust_headwater_streams.py | 26 ++-- src/aggregate_vector_inputs.py | 216 +++++++++++++++++-------------- src/clip_vectors_to_wbd.py | 30 ++--- src/reduce_nhd_stream_density.py | 47 ++++--- src/utils/shared_variables.py | 22 ++++ 5 files changed, 199 insertions(+), 142 deletions(-) diff --git a/src/adjust_headwater_streams.py b/src/adjust_headwater_streams.py index e08bf3352..dd84f729d 100644 --- a/src/adjust_headwater_streams.py +++ b/src/adjust_headwater_streams.py @@ -15,7 +15,7 @@ def adjust_headwaters(huc,nhd_streams,headwaters,headwater_id): - # identify true headwater segments + # Identify true headwater segments if nhd_streams['headwaters_id'].dtype=='int': nhd_streams_adj = nhd_streams.loc[(nhd_streams.headwaters_id > 0) & (nhd_streams.downstream_of_headwater == False),:].copy() if headwaters[headwater_id].dtype != 'int': headwaters[headwater_id] = headwaters[headwater_id].astype(int) @@ -32,16 +32,16 @@ def adjust_headwaters(huc,nhd_streams,headwaters,headwater_id): for index, point in headwater_limited.iterrows(): - # convert headwaterpoint geometries to WKB representation + # Convert headwaterpoint geometries to WKB representation wkb_points = dumps(point.geometry) - # create pygeos headwaterpoint geometries from WKB representation + # Create pygeos headwaterpoint geometries from WKB representation pointbin_geom = pygeos.io.from_wkb(wkb_points) # Closest segment to headwater closest_stream = nhd_streams_adj.loc[nhd_streams_adj["headwaters_id"]==point[headwater_id]] - try: # seeing inconsistent geometry objects even after exploding nhd_streams_adj; not sure why this is + try: # Seeing inconsistent geometry objects even after exploding nhd_streams_adj; not sure why this is closest_stream =closest_stream.explode() except: pass @@ -56,39 +56,41 @@ def adjust_headwaters(huc,nhd_streams,headwaters,headwater_id): pointdistancetoline = pygeos.linear.line_locate_point(streambin_geom, pointbin_geom) referencedpoint = pygeos.linear.line_interpolate_point(streambin_geom, pointdistancetoline) - # convert geometries to wkb representation + # Convert geometries to wkb representation bin_referencedpoint = pygeos.io.to_wkb(referencedpoint) - # convert to shapely geometries + # Convert to shapely geometries shply_referencedpoint = loads(bin_referencedpoint) shply_linestring = loads(wkb_closest_stream) headpoint = Point(shply_referencedpoint.coords) cumulative_line = [] relativedistlst = [] - # collect all nhd stream 
segment linestring verticies + # Collect all nhd stream segment linestring verticies for point in zip(*shply_linestring.coords.xy): cumulative_line = cumulative_line + [point] relativedist = shply_linestring.project(Point(point)) relativedistlst = relativedistlst + [relativedist] - # add linear referenced headwater point to closest nhd stream segment + # Add linear referenced headwater point to closest nhd stream segment if not headpoint in cumulative_line: cumulative_line = cumulative_line + [headpoint] relativedist = shply_linestring.project(headpoint) relativedistlst = relativedistlst + [relativedist] - # sort by relative line distance to place headwater point in linestring + # Sort by relative line distance to place headwater point in linestring sortline = pd.DataFrame({'geom' : cumulative_line, 'dist' : relativedistlst}).sort_values('dist') shply_linestring = LineString(sortline.geom.tolist()) referencedpoints = referencedpoints + [headpoint] - # split the new linestring at the new headwater point + # Split the new linestring at the new headwater point try: + line1,line2 = split(shply_linestring, headpoint) headwaterstreams = headwaterstreams + [LineString(line1)] nhd_streams.loc[nhd_streams.NHDPlusID==closest_stream.NHDPlusID.values[0],'geometry'] = LineString(line1) except: + line1 = split(shply_linestring, headpoint) headwaterstreams = headwaterstreams + [LineString(line1[0])] nhd_streams.loc[nhd_streams.NHDPlusID==closest_stream.NHDPlusID.values[0],'geometry'] = LineString(line1[0]) @@ -98,9 +100,9 @@ def adjust_headwaters(huc,nhd_streams,headwaters,headwater_id): try: del nhd_streams_adj, headwaters, headwater_limited, headwaterstreams, referencedpoints, cumulative_line, relativedistlst except: - print ('issue deleting adjusted stream variables for huc ' + str(huc)) + print (f"issue deleting adjusted stream variables for huc {str(huc)}") - ## identify ajusted nhd headwaters + # Identify ajusted nhd headwaters # print('Identify NHD headwater points',flush=True) nhd_headwater_streams_adj = nhd_streams.loc[nhd_streams['is_headwater'],:] nhd_headwater_streams_adj = nhd_headwater_streams_adj.explode() diff --git a/src/aggregate_vector_inputs.py b/src/aggregate_vector_inputs.py index eb4d3e4f5..a33f2f144 100755 --- a/src/aggregate_vector_inputs.py +++ b/src/aggregate_vector_inputs.py @@ -1,12 +1,8 @@ #!/usr/bin/env python3 import os +import sys import geopandas as gpd -from utils.shared_variables import PREP_PROJECTION -from utils.shared_functions import getDriver -from derive_headwaters import findHeadWaterPoints -from reduce_nhd_stream_density import subset_nhd_network -from adjust_headwater_streams import adjust_headwaters from tqdm import tqdm from os.path import splitext from shapely.geometry import Point @@ -15,6 +11,14 @@ import numpy as np from shapely.wkb import dumps, loads import pygeos +sys.path.append('/foss_fim/src') +from utils.shared_variables import PREP_PROJECTION +from utils.shared_functions import getDriver +from derive_headwaters import findHeadWaterPoints +from reduce_nhd_stream_density import subset_nhd_network +from adjust_headwater_streams import adjust_headwaters +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) in_dir ='data/inputs/nhdplus_vectors' nwm_dir = 'data/inputs/nwm_hydrofabric' @@ -24,34 +28,39 @@ wbd_filename = os.path.join(wbd_dir, 'WBD_National.gpkg') nwm_streams_fr_filename = os.path.join(nwm_dir,'nwm_flows.gpkg') -nwm_streams_ms_filename = os.path.join(nwm_dir,'nwm_flows_ms.gpkg') nwm_headwaters_filename = 
os.path.join(nwm_dir,'nwm_headwaters.gpkg') -nwm_huc4_intersections_ms_filename = os.path.join(nwm_dir,'nwm_ms_huc4_intersections.gpkg') -nwm_huc4_intersections_fr_filename = os.path.join(nwm_dir,'nwm_fr_huc4_intersections.gpkg') +nwm_huc4_intersections_filename = os.path.join(nwm_dir,'nwm_huc4_intersections_NEW.gpkg') +nwm_huc8_intersections_filename = os.path.join(nwm_dir,'nwm_huc8_intersections.gpkg') +nhd_streams_ms_adjusted_fileName = os.path.join(agg_dir,'NHDPlusBurnLineEvent_ms_adjusted_NEW.gpkg') +# nhd_ms_adj_headwater_subset = os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms_NEW.gpkg') +nhd_streams_fr_adjusted_fileName = os.path.join(agg_dir,'NHDPlusBurnLineEvent_fr_adjusted_NEW.gpkg') + +def identify_nwm_ms_streams(args): -def subset_nwm_ms_streams(args): nwm_streams_filename = args[0] in_dir = args[1] ahps_dir = args[2] - output_filename = args[3] - # subset nwm network to ms - ahps_headwaters_filename = os.path.join(ahps_dir,'bed_lids.gpkg') + # Subset nwm network to ms + ahps_headwaters_filename = os.path.join(ahps_dir,'nws_lid.gpkg') ahps_headwaters = gpd.read_file(ahps_headwaters_filename) nwm_streams = gpd.read_file(nwm_streams_filename) + # Remove mainstem column if it already exists + nwm_streams = nwm_streams.drop(['mainstem'], axis=1, errors='ignore') + nwm_streams['is_headwater'] = False nwm_streams['downstream_of_headwater'] = False nwm_streams.loc[nwm_streams.ID.isin(list(ahps_headwaters.nwm_featur)),'is_headwater'] = True - ## subset NHDPlus HR + # Subset NHDPlus HR nwm_streams['is_relevant_stream'] = nwm_streams['is_headwater'].copy() nwm_streams = nwm_streams.explode() - # trace down from headwaters + # Trace down from headwaters nwm_streams.set_index('ID',inplace=True,drop=False) Q = deque(nwm_streams.loc[nwm_streams['is_headwater'],'ID'].tolist()) @@ -61,35 +70,38 @@ def subset_nwm_ms_streams(args): q = Q.popleft() if q in visited: continue - # + visited.add(q) toNode = nwm_streams.loc[q,'to'] - # + if not toNode == 0: - # + nwm_streams.loc[nwm_streams.ID==toNode,'is_relevant_stream'] = True - # + if toNode not in visited: Q.append(toNode) - nwm_streams = nwm_streams.loc[nwm_streams['is_relevant_stream'],:] + nwm_streams_ms = nwm_streams.loc[nwm_streams['is_relevant_stream'],:] + + ms_segments = nwm_streams_ms.ID.to_list() nwm_streams.reset_index(drop=True,inplace=True) - nwm_streams.to_file(output_filename,driver=getDriver(output_filename),index=False) + # Add column to FR nwm layer to indicate MS segments + nwm_streams['mainstem'] = np.where(nwm_streams.ID.isin(ms_segments), 1, 0) -def find_nwm_incoming_streams(args): + nwm_streams.to_file(nwm_streams_filename,driver=getDriver(nwm_streams_filename),index=False) - nwm_streams_filename = args[0] - wbd_filename = args[1] - in_dir = args[2] - output_filename = args[3] - wbd = gpd.read_file(wbd_filename, layer='WBDHU4') +def find_nwm_incoming_streams(nwm_streams_filename,wbd_filename,huc_unit,in_dir,output_filename): + + layer = "WBDHU" + str(huc_unit) + wbd = gpd.read_file(wbd_filename, layer=layer) intersecting_points = [] + mainstem = [] for index, row in tqdm(wbd.iterrows(),total=len(wbd)): - col_name = 'HUC4' + col_name = 'HUC' + str(huc_unit) huc = row[col_name] huc_mask = wbd.loc[wbd[col_name]==str(huc)] @@ -97,6 +109,7 @@ def find_nwm_incoming_streams(args): huc_mask = huc_mask.reset_index(drop=True) nwm_streams = gpd.read_file(nwm_streams_filename, mask=huc_mask) + nwm_streams = nwm_streams.explode() nwm_streams = nwm_streams.reset_index(drop=True) @@ -105,25 +118,28 @@ def 
find_nwm_incoming_streams(args): nwm_streams_subset =nwm_streams[crosses] nwm_streams_subset = nwm_streams_subset.reset_index(drop=True) - for index, linestring in enumerate(nwm_streams_subset.geometry): + for index, segment in nwm_streams_subset.iterrows(): distances = [] - # distance to each stream segment + is_mainstem = segment.mainstem + linestring = segment.geometry + + # Distance to each stream segment for point in zip(*linestring.coords.xy): distance = Point(point).distance(polygon.exterior) distances = distances + [distance] - # find minimum distance + # Find minimum distance min_index = np.argmin(distances) # Closest segment to headwater closest_point = list(linestring.coords)[min_index] last_node = Point(closest_point) - # convert geometries to WKB representation + # Convert geometries to WKB representation wkb_point = dumps(last_node) wkb_poly = dumps(polygon.exterior) - # create pygeos geometries from WKB representation + # Create pygeos geometries from WKB representation stream_point_geom = pygeos.io.from_wkb(wkb_point) polybin_geom = pygeos.io.from_wkb(wkb_poly) @@ -131,22 +147,24 @@ def find_nwm_incoming_streams(args): pointdistancetoline = pygeos.linear.line_locate_point(polybin_geom,stream_point_geom) referencedpoint = pygeos.linear.line_interpolate_point(polybin_geom, pointdistancetoline) - # convert geometries to wkb representation + # Convert geometries to wkb representation bin_referencedpoint = pygeos.io.to_wkb(referencedpoint) - # convert to shapely geometries + # Convert to shapely geometries shply_referencedpoint = loads(bin_referencedpoint) - # collect all nhd stream segment linestring verticies + # Collect all nhd stream segment linestring verticies intersecting_points = intersecting_points + [shply_referencedpoint] + mainstem = mainstem + [is_mainstem] - huc_intersection = gpd.GeoDataFrame({'geometry' : intersecting_points},crs=nwm_streams.crs,geometry='geometry') + huc_intersection = gpd.GeoDataFrame({'geometry': intersecting_points, 'mainstem': mainstem},crs=nwm_streams.crs,geometry='geometry') huc_intersection = huc_intersection.drop_duplicates() huc_intersection.to_file(output_filename,driver=getDriver(output_filename)) def collect_stream_attributes(args, huc): - print ('Starting huc: ' + str(huc)) + + print (f"Starting huc: {str(huc)}") in_dir = args[0] nwm_dir = args[1] ahps_dir = args[2] @@ -177,15 +195,16 @@ def collect_stream_attributes(args, huc): nhd_streams = nhd_streams.loc[nhd_streams.geometry!=None,:] # special case: remove segments without geometries nhd_streams['HUC4'] = str(huc) - # write out NHDPlus HR aggregated + # Write out NHDPlus HR aggregated nhd_streams_agg_fileName = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') nhd_streams.to_file(nhd_streams_agg_fileName,driver=getDriver(nhd_streams_agg_fileName),index=False) del nhd_streams - print ('finished huc: ' + str(huc)) + print (f"finished huc: {str(huc)}") else: - print ('missing data for huc ' + str(huc)) + print (f"missing data for huc {str(huc)}") + def subset_stream_networks(args, huc): @@ -194,17 +213,16 @@ def subset_stream_networks(args, huc): wbd4 = args[2] wbd8 = args[3] in_dir = args[4] - nwm_huc4_intersect_fr_filename = args[5] - nwm_huc4_intersect_ms_filename = args[6] + nwm_huc4_intersect_filename = args[5] - print("starting HUC " + str(huc),flush=True) + print(f"starting HUC {str(huc)}",flush=True) nwm_headwater_id = 'ID' nwm_headwaters_filename = os.path.join(nwm_dir,'nwm_headwaters.gpkg') ahps_headwater_id = 'nws_lid' ahps_headwaters_filename = 
os.path.join(ahps_dir,'nws_lid.gpkg') nhd_streams_filename = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') - # subset to reduce footprint + # Subset to reduce footprint selected_wbd4 = wbd4.loc[wbd4.HUC4.str.startswith(str(huc))] del wbd4 selected_wbd8 = wbd8.loc[wbd8.HUC8.str.startswith(huc)] @@ -217,10 +235,10 @@ def subset_stream_networks(args, huc): if len(selected_wbd8.HUC8) > 0: selected_wbd8 = selected_wbd8.reset_index(drop=True) - # identify FR/NWM headwaters - nhd_streams_fr = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersect_fr_filename) + # Identify FR/NWM headwaters + nhd_streams_fr = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersect_filename) - ## adjust FR/NWM headwater segments + # Adjust FR/NWM headwater segments nwm_headwaters = gpd.read_file(nwm_headwaters_filename, mask=huc_mask) if len(nwm_headwaters) > 0: @@ -230,20 +248,20 @@ def subset_stream_networks(args, huc): nhd_streams_fr_adjusted_fileName=os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') adj_nhd_headwaters_fr_fileName=os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') - # write out FR adjusted + # Write out FR adjusted adj_nhd_streams_fr.to_file(nhd_streams_fr_adjusted_fileName,driver=getDriver(nhd_streams_fr_adjusted_fileName),index=False) adj_nhd_headwater_points_fr.to_file(adj_nhd_headwaters_fr_fileName,driver=getDriver(adj_nhd_headwaters_fr_fileName),index=False) del adj_nhd_streams_fr, adj_nhd_headwater_points_fr else: - print ('skipping FR headwater adjustments for HUC: ' + str(huc)) + print (f"skipping FR headwater adjustments for HUC: {str(huc)}") del nhd_streams_fr - ## identify MS/AHPs headwaters - nhd_streams_ms = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,ahps_headwaters_filename,ahps_headwater_id,nwm_huc4_intersect_ms_filename) + # Identify MS/AHPs headwaters + nhd_streams_ms = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,ahps_headwaters_filename,ahps_headwater_id,nwm_huc4_intersect_filename,True) - ## adjust MS/AHPs headwater segments + # Adjust MS/AHPs headwater segments ahps_headwaters = gpd.read_file(ahps_headwaters_filename, mask=huc_mask) if len(ahps_headwaters) > 0: @@ -253,86 +271,89 @@ def subset_stream_networks(args, huc): nhd_streams_ms_adjusted_fileName=os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') adj_nhd_headwaters_ms_fileName=os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') - # write out MS adjusted + # Write out MS adjusted adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False) adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False) del adj_nhd_streams_ms, adj_nhd_headwater_points_ms else: - print ('skipping MS headwater adjustments for HUC: ' + str(huc)) + print (f"skipping MS headwater adjustments for HUC: {str(huc)}") del nhd_streams_ms + def aggregate_stream_networks(in_dir,agg_dir, huc_list): for huc in huc_list: - ## FR adjusted - adj_nhd_headwaters_fr_fileName=os.path.join(agg_dir,'nhd_headwaters_adjusted_fr.gpkg') + # FR adjusted + adj_nhd_headwaters_fr_fileName=os.path.join(agg_dir,'nhd_headwaters_adjusted_fr_NEW.gpkg') nhd_fr_adj_huc_subset = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + 
str(huc) + '_fr_adjusted.gpkg') - nhd_streams_fr_adjusted_fileName=os.path.join(agg_dir,'NHDPlusBurnLineEvent_fr_adjusted.gpkg') + nhd_streams_fr_adjusted_fileName=os.path.join(agg_dir,'NHDPlusBurnLineEvent_fr_adjusted_NEW.gpkg') nhd_fr_adj_headwaters_subset = os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') if os.path.isfile(nhd_fr_adj_huc_subset): adj_nhd_streams_fr = gpd.read_file(nhd_fr_adj_huc_subset) - # write out FR adjusted + # Write out FR adjusted if os.path.isfile(nhd_streams_fr_adjusted_fileName): adj_nhd_streams_fr.to_file(nhd_streams_fr_adjusted_fileName,driver=getDriver(nhd_streams_fr_adjusted_fileName),index=False, mode='a') else: adj_nhd_streams_fr.to_file(nhd_streams_fr_adjusted_fileName,driver=getDriver(nhd_streams_fr_adjusted_fileName),index=False) - del adj_nhd_streams_fr if os.path.isfile(nhd_fr_adj_headwaters_subset): adj_nhd_headwater_points_fr = gpd.read_file(nhd_fr_adj_headwaters_subset) - # write out FR adjusted + # Write out FR adjusted if os.path.isfile(adj_nhd_headwaters_fr_fileName): adj_nhd_headwater_points_fr.to_file(adj_nhd_headwaters_fr_fileName,driver=getDriver(adj_nhd_headwaters_fr_fileName),index=False, mode='a') else: adj_nhd_headwater_points_fr.to_file(adj_nhd_headwaters_fr_fileName,driver=getDriver(adj_nhd_headwaters_fr_fileName),index=False) - del adj_nhd_headwater_points_fr - ## MS adjusted - adj_nhd_headwaters_ms_fileName=os.path.join(agg_dir,'nhd_headwaters_adjusted_ms.gpkg') + # MS adjusted + adj_nhd_headwaters_ms_fileName=os.path.join(agg_dir,'nhd_headwaters_adjusted_ms_NEW.gpkg') nhd_ms_adj_huc_subset = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') - nhd_streams_ms_adjusted_fileName=os.path.join(agg_dir,'NHDPlusBurnLineEvent_ms_adjusted.gpkg') + nhd_streams_ms_adjusted_fileName=os.path.join(agg_dir,'NHDPlusBurnLineEvent_ms_adjusted_NEW.gpkg') nhd_ms_adj_headwater_subset = os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') if os.path.isfile(nhd_ms_adj_huc_subset): adj_nhd_streams_ms = gpd.read_file(nhd_ms_adj_huc_subset) - # write out ms adjusted + # Write out ms adjusted if os.path.isfile(nhd_streams_ms_adjusted_fileName): adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False, mode='a') else: adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False) - del adj_nhd_streams_ms if os.path.isfile(nhd_ms_adj_headwater_subset): adj_nhd_headwater_points_ms = gpd.read_file(nhd_ms_adj_headwater_subset) - # write out ms adjusted + # Write out ms adjusted if os.path.isfile(adj_nhd_headwaters_ms_fileName): adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False, mode='a') else: adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False) - del adj_nhd_headwater_points_ms + def clean_up_intermediate_files(in_dir): for huc in os.listdir(in_dir): + agg_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') + fr_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr.gpkg') fr_adj_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') + ms_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms.gpkg') ms_adj_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') + ms_headwater_adj_path= 
os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') fr_headwater_adj_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') + ms_headwater_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_ms.gpkg') fr_headwater_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_fr.gpkg') @@ -364,49 +385,46 @@ def clean_up_intermediate_files(in_dir): os.remove(fr_headwater_path) - if(__name__=='__main__'): - ## generate NWM Headwaters - # print ('deriving nwm headwater points') - # nwm_headwaters = findHeadWaterPoints(nwm_streams_fr_filename) - # nwm_headwaters['ID'] = nwm_headwaters.index + 1 - # nwm_headwaters.to_file(nwm_headwaters_filename,driver=getDriver(nwm_headwaters_filename),index=False) - - # del nwm_headwaters, nwm_streams - - ## subset NWM MS Streams - # nwm_subset_ms_args = (nwm_streams_fr_filename,in_dir,ahps_dir,nwm_streams_ms_filename) - # print ('deriving nwm ms streams') - # subset_nwm_ms_streams(nwm_subset_ms_args) - - ## generate NWM intersection points with WBD4 boundaries - # ms_nwm_intersect_args = (nwm_streams_ms_filename,wbd_filename,in_dir,nwm_huc4_intersections_ms_filename) - # fr_nwm_intersect_args = (nwm_streams_fr_filename,wbd_filename,in_dir,nwm_huc4_intersections_fr_filename) - # print ('deriving nwm ms intersection points') - # find_nwm_incoming_streams(ms_nwm_intersect_args) - # print ('deriving nwm fr intersection points') - # find_nwm_incoming_streams(fr_nwm_intersect_args) + # Generate NWM Headwaters + print ('deriving nwm headwater points') + nwm_headwaters = findHeadWaterPoints(nwm_streams_fr_filename) + nwm_headwaters['ID'] = nwm_headwaters.index + 1 + nwm_headwaters.to_file(nwm_headwaters_filename,driver=getDriver(nwm_headwaters_filename),index=False) + + del nwm_headwaters, nwm_streams + + # Identify NWM MS Streams + identify_nwm_ms_args = (nwm_streams_fr_filename,in_dir,ahps_dir) + print ('identifing nwm ms streams') + identify_nwm_ms_streams(identify_nwm_ms_args) + + # Generate NWM intersection points with WBD4 boundaries + print ('deriving NWM fr/ms intersection points') + find_nwm_incoming_streams(nwm_streams_fr_filename,wbd_filename,4,in_dir,nwm_huc4_intersections_filename) print ('loading wb4') wbd4 = gpd.read_file(wbd_filename, layer='WBDHU4') print ('loading wb8') wbd8 = gpd.read_file(wbd_filename, layer='WBDHU8') - subset_arg_list = (nwm_dir,ahps_dir,wbd4,wbd8,in_dir,nwm_huc4_intersections_fr_filename,nwm_huc4_intersections_ms_filename) collect_arg_list = (in_dir,nwm_dir,ahps_dir) + subset_arg_list = (nwm_dir,ahps_dir,wbd4,wbd8,in_dir,nwm_huc4_intersections_filename) - num_workers=9 + num_workers = 11 - with ProcessPoolExecutor(max_workers=num_workers) as executor: - ## preprocess nhd hr and add attributes - collect_attributes = [executor.submit(collect_stream_attributes, collect_arg_list, str(huc)) for huc in os.listdir(in_dir)] - ## subset nhd hr network - subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in os.listdir(in_dir)] +with ProcessPoolExecutor(max_workers=num_workers) as executor: + # Preprocess NHD HR and add attributes + collect_attributes = [executor.submit(collect_stream_attributes, collect_arg_list, str(huc)) for huc in os.listdir(in_dir)] + # Subset NHD HR network + subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in os.listdir(in_dir)] + # Generate NWM intersection points with WBD8 boundaries using subset_stream_networks + # 
find_nwm_incoming_streams(nhd_streams_fr_adjusted_fileName,wbd_filename,8,in_dir,nwm_huc8_intersections_filename) - ## aggregate fr and ms nhd netowrks for entire nwm domain + # Aggregate fr and ms nhd netowrks for entire nwm domain aggregate_stream_networks(in_dir,agg_dir, os.listdir(in_dir)) - ## remove intermediate files - # clean_up_intermediate_files(in_dir) + # Remove intermediate files + clean_up_intermediate_files(in_dir) diff --git a/src/clip_vectors_to_wbd.py b/src/clip_vectors_to_wbd.py index 654fe6e4f..3ae82306f 100755 --- a/src/clip_vectors_to_wbd.py +++ b/src/clip_vectors_to_wbd.py @@ -58,21 +58,21 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l nhd_streams = gpd.read_file(nhd_streams_filename, mask = wbd_buffer) ## identify local headwater stream segments - nhd_streams_subset = gpd.read_file(nhd_streams_filename, mask = wbd) - nhd_streams_subset = nhd_streams_subset.loc[~nhd_streams_subset.FromNode.isin(list(set(nhd_streams_subset.ToNode) & set(nhd_streams_subset.FromNode)))] - nhd_streams_subset = nhd_streams_subset[~nhd_streams_subset['is_headwater']] - - if not nhd_streams_subset.empty: - nhd_streams_subset = nhd_streams_subset.reset_index(drop=True) - start_coords = [] - NHDPlusIDs = [] - for index, linestring in enumerate(nhd_streams_subset.geometry): - start_coords = start_coords + [linestring.coords[-1]] - NHDPlusIDs = NHDPlusIDs + [nhd_streams_subset.iloc[index].NHDPlusID] - - start_geoms = [Point(point) for point in start_coords] - local_headwaters = gpd.GeoDataFrame({'NHDPlusID': NHDPlusIDs,'geometry': start_geoms}, crs=projection, geometry='geometry') - nhd_headwaters = nhd_headwaters.append(local_headwaters) + # nhd_streams_subset = gpd.read_file(nhd_streams_filename, mask = wbd) + # nhd_streams_subset = nhd_streams_subset.loc[~nhd_streams_subset.FromNode.isin(list(set(nhd_streams_subset.ToNode) & set(nhd_streams_subset.FromNode)))] + # nhd_streams_subset = nhd_streams_subset[~nhd_streams_subset['is_headwater']] + # + # if not nhd_streams_subset.empty: + # nhd_streams_subset = nhd_streams_subset.reset_index(drop=True) + # start_coords = [] + # NHDPlusIDs = [] + # for index, linestring in enumerate(nhd_streams_subset.geometry): + # start_coords = start_coords + [linestring.coords[-1]] + # NHDPlusIDs = NHDPlusIDs + [nhd_streams_subset.iloc[index].NHDPlusID] + # + # start_geoms = [Point(point) for point in start_coords] + # local_headwaters = gpd.GeoDataFrame({'NHDPlusID': NHDPlusIDs,'geometry': start_geoms}, crs=projection, geometry='geometry') + # nhd_headwaters = nhd_headwaters.append(local_headwaters) # nhd_streams = nhd_streams.loc[~nhd_streams.NHDPlusID.isin(NHDPlusIDs)] diff --git a/src/reduce_nhd_stream_density.py b/src/reduce_nhd_stream_density.py index cce2fa7ca..c17effacf 100644 --- a/src/reduce_nhd_stream_density.py +++ b/src/reduce_nhd_stream_density.py @@ -11,7 +11,7 @@ from shapely.wkb import dumps from utils.shared_functions import getDriver -def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwaters_filename,headwater_id,nwm_intersections_filename): +def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwaters_filename,headwater_id,nwm_intersections_filename,mainstem_flag=False): headwater_streams = pd.DataFrame() @@ -37,7 +37,7 @@ def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwat for index, linestring in enumerate(streams_subset.geometry): streams_subset.at[index, 'b_geom'] = dumps(linestring) - # create pygeos nhd stream geometries 
from WKB representation + # Create pygeos nhd stream geometries from WKB representation streambin_geom = pygeos.io.from_wkb(streams_subset['b_geom']) streams_subset.loc[:,'HUC8'] = str(huc) @@ -49,19 +49,19 @@ def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwat streams_subset.loc[:,'headwaters_id'] = n - # find stream segment closest to headwater point + # Find stream segment closest to headwater point for index, point in headwaters_mask.iterrows(): - # convert headwaterpoint geometries to WKB representation + # Convert headwaterpoint geometries to WKB representation wkb_points = dumps(point.geometry) - # create pygeos headwaterpoint geometries from WKB representation + # Create pygeos headwaterpoint geometries from WKB representation pointbin_geom = pygeos.io.from_wkb(wkb_points) - # distance to each stream segment + # Distance to each stream segment distances = pygeos.measurement.distance(streambin_geom, pointbin_geom) - # find minimum distance + # Find minimum distance min_index = np.argmin(distances) # Closest segment to headwater @@ -77,30 +77,34 @@ def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwat huc4_mask_buffer = huc4_mask.buffer(10) - # identify inflowing streams + # Identify inflowing streams nwm_intersections = gpd.read_file(nwm_intersections_filename, mask=huc4_mask_buffer) + if mainstem_flag == True: + nwm_intersections = nwm_intersections.loc[nwm_intersections.mainstem==True] + nhd_streams['mainstem'] = True + nhd_streams['downstream_of_headwater'] = False nhd_streams = nhd_streams.explode() nhd_streams = nhd_streams.reset_index(drop=True) - # find stream segment closest to nwm intersection point + # Find stream segment closest to nwm intersection point for index, point in nwm_intersections.iterrows(): - # distance to each stream segment + # Distance to each stream segment distances = nhd_streams.distance(point.geometry) - # find minimum distance + # Find minimum distance min_index = np.argmin(distances) - # update attributes for incoming stream + # Update attributes for incoming stream nhd_streams.loc[min_index,'is_headwater'] = True nhd_streams.loc[min_index,'downstream_of_headwater'] = True - ## subset NHDPlus HR + # Subset NHDPlus HR nhd_streams['is_relevant_stream'] = nhd_streams['is_headwater'].copy() - # trace down from headwaters + # Trace down from headwaters nhd_streams.set_index('NHDPlusID',inplace=True,drop=False) nhd_streams = get_downstream_segments(nhd_streams, 'is_headwater') @@ -156,10 +160,21 @@ def get_downstream_segments(streams, attribute): parser.add_argument('-s','--subset-nhd-streams-fileName',help='Output streams layer name',required=False,type=str,default=None) parser.add_argument('-i','--headwater-id',help='Headwater points ID column',required=True) parser.add_argument('-i','--nwm-intersections-filename',help='NWM HUC4 intersection points',required=True) + parser.add_argument('-ms','--mainstem-flag',help='flag for mainstem network',required=False,default=False) args = vars(parser.parse_args()) - subset_streams_gdf = subset_nhd_network(huc_number,huc4_mask,selected_wbd8,nhd_streams,headwaters_filename,headwater_id) + huc_number = args['huc_number'] + huc4_mask = args['huc4_mask'] + selected_wbd8 = args['selected_wbd8'] + nhd_streams = args['nhd_streams'] + headwaters_filename = args['headwaters_filename'] + subset_nhd_streams_fileName = args['subset_nhd_streams_fileName'] + headwater_id = args['headwater_id'] + nwm_intersections_filename = args['nwm_intersections_filename'] + mainstem_flag 
= args['mainstem_flag'] + + subset_streams_gdf = subset_nhd_network(huc_number,huc4_mask,selected_wbd8,nhd_streams,headwaters_filename,headwater_id,nwm_intersections_filename,mainstem_flag) if subset_nhd_streams_fileName is not None: - subset_streams_gdf.to_file(args['subset_nhd_streams_fileName'],driver=getDriver(args['subset_nhd_streams_fileName']),index=False) + subset_streams_gdf.to_file(subset_nhd_streams_fileName,driver=getDriver(subset_nhd_streams_fileName),index=False) diff --git a/src/utils/shared_variables.py b/src/utils/shared_variables.py index 244a12d2b..d353d2a8f 100644 --- a/src/utils/shared_variables.py +++ b/src/utils/shared_variables.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 +import os + # Projections. #PREP_PROJECTION = "+proj=aea +datum=NAD83 +x_0=0.0 +y_0=0.0 +lon_0=96dW +lat_0=23dN +lat_1=29d30'N +lat_2=45d30'N +towgs84=-0.9956000824677655,1.901299877314078,0.5215002840524426,0.02591500053005733,0.009425998542707753,0.01159900118427752,-0.00062000005129903 +no_defs +units=m" PREP_PROJECTION = 'PROJCS["USA_Contiguous_Albers_Equal_Area_Conic_USGS_version",GEOGCS["NAD83",DATUM["North_American_Datum_1983",SPHEROID["GRS 1980",6378137,298.2572221010042,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6269"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433],AUTHORITY["EPSG","4269"]],PROJECTION["Albers_Conic_Equal_Area"],PARAMETER["standard_parallel_1",29.5],PARAMETER["standard_parallel_2",45.5],PARAMETER["latitude_of_center",23],PARAMETER["longitude_of_center",-96],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]]]' @@ -33,3 +35,23 @@ OVERWRITE_WBD = 'OVERWRITE_WBD' OVERWRITE_NHD = 'OVERWRITE_NHD' OVERWRITE_ALL = 'OVERWRITE_ALL' + +## Input Paths and Directories +# Directories +src_dir = '/foss_fim/src' +input_dir ='data/inputs' +nhdplus_rasters_dir = os.path.join(input_dir,'nhdplus_rasters') +nhdplus_vectors_dir = os.path.join(input_dir,'nhdplus_vectors') +nwm_hydrofabric_dir = os.path.join(input_dir,'nwm_hydrofabric') +wbd_dir = os.path.join(input_dir,'wbd') +ahps_dir = os.path.join(input_dir,'ahp_sites') +nhdplus_vectors_aggregate_dir = os.path.join(input_dir,'nhdplus_vectors_aggregate') + +# File Paths +wbd_filename = os.path.join(wbd_dir, 'WBD_National.gpkg') +nwm_streams_fr_filename = os.path.join(nwm_hydrofabric_dir,'nwm_flows.gpkg') +nwm_streams_ms_filename = os.path.join(nwm_hydrofabric_dir,'nwm_flows_ms.gpkg') +nwm_headwaters_filename = os.path.join(nwm_hydrofabric_dir,'nwm_headwaters.gpkg') +nwm_huc4_intersections_ms_filename = os.path.join(nwm_hydrofabric_dir,'nwm_ms_huc4_intersections.gpkg') +nwm_huc4_intersections_fr_filename = os.path.join(nwm_hydrofabric_dir,'nwm_fr_huc4_intersections.gpkg') +ahps_headwaters_filename = os.path.join(ahps_dir,'nws_lid.gpkg') From 7bce6631860930477fc1f225a09ba03e919c55e6 Mon Sep 17 00:00:00 2001 From: "brian.avant" Date: Tue, 13 Apr 2021 14:12:43 +0000 Subject: [PATCH 38/66] check_dem_data scratch file --- fim_run.sh | 8 +- src/adjust_headwater_streams.py | 2 +- src/aggregate_fim_outputs.py | 18 ++-- src/aggregate_vector_inputs.py | 112 ++++++++++++------------ src/agreedem.py | 2 - src/check_dem_nodata.py | 141 +++++++++++++++++++++++++++++++ src/clip_vectors_to_wbd.py | 6 +- src/reduce_nhd_stream_density.py | 16 +++- src/run_by_unit.sh | 29 ++++--- src/utils/shared_functions.py | 65 ++++++++++++++ tools/rating_curve_comparison.py | 10 +-- 11 files changed, 313 insertions(+), 96 deletions(-) create mode 100755 src/check_dem_nodata.py diff --git a/fim_run.sh b/fim_run.sh index 
8d1875e5f..c467d47b0 100755 --- a/fim_run.sh +++ b/fim_run.sh @@ -117,10 +117,10 @@ export input_NWM_Catchments_ms=$inputDataDir/nwm_hydrofabric/nwm_catchments_ms.g export input_NWM_Flows_fr=$inputDataDir/nwm_hydrofabric/nwm_flows.gpkg export input_NWM_Flows_ms=$inputDataDir/nwm_hydrofabric/nwm_flows_ms.gpkg export input_NWM_Headwaters=$inputDataDir/nwm_hydrofabric/nwm_headwaters.gpkg -export input_nhd_flowlines_fr=$inputDataDir/nhdplus_vectors_aggregate/NHDPlusBurnLineEvent_fr_adjusted.gpkg -export input_nhd_flowlines_ms=$inputDataDir/nhdplus_vectors_aggregate/NHDPlusBurnLineEvent_ms_adjusted.gpkg -export input_nhd_headwaters_fr=$inputDataDir/nhdplus_vectors_aggregate/nhd_headwaters_adjusted_fr.gpkg -export input_nhd_headwaters_ms=$inputDataDir/nhdplus_vectors_aggregate/nhd_headwaters_adjusted_ms.gpkg +export input_nhd_flowlines_fr=$inputDataDir/nhdplus_vectors_aggregate/NHDPlusBurnLineEvent_fr_adjusted_NEW.gpkg +export input_nhd_flowlines_ms=$inputDataDir/nhdplus_vectors_aggregate/NHDPlusBurnLineEvent_ms_adjusted_NEW.gpkg +export input_nhd_headwaters_fr=$inputDataDir/nhdplus_vectors_aggregate/nhd_headwaters_adjusted_fr_NEW.gpkg +export input_nhd_headwaters_ms=$inputDataDir/nhdplus_vectors_aggregate/nhd_headwaters_adjusted_ms_NEW.gpkg ## Input handling ## $srcDir/check_huc_inputs.py -u "$hucList" diff --git a/src/adjust_headwater_streams.py b/src/adjust_headwater_streams.py index dd84f729d..bc12939bf 100644 --- a/src/adjust_headwater_streams.py +++ b/src/adjust_headwater_streams.py @@ -117,7 +117,7 @@ def adjust_headwaters(huc,nhd_streams,headwaters,headwater_id): del nhd_headwater_streams_adj - return(nhd_streams, nhd_headwater_points_adj) + return nhd_streams, nhd_headwater_points_adj if __name__ == '__main__': diff --git a/src/aggregate_fim_outputs.py b/src/aggregate_fim_outputs.py index 9d8676364..b149a3d56 100644 --- a/src/aggregate_fim_outputs.py +++ b/src/aggregate_fim_outputs.py @@ -88,8 +88,8 @@ def aggregate_fim_outputs(args): ## aggregate rasters # aggregate file paths - rem_mosaic = os.path.join(huc6_dir,f'hand_grid_{huc6}_unprj.tif') - catchment_mosaic = os.path.join(huc6_dir,f'catchments_{huc6}_unprj.tif') + rem_mosaic = os.path.join(huc6_dir,f'hand_grid_{huc6}_prepprj.tif') + catchment_mosaic = os.path.join(huc6_dir,f'catchments_{huc6}_prepprj.tif') if huc6 not in huc_list: @@ -155,28 +155,28 @@ def aggregate_fim_outputs(args): shutil.copy(catchment_filename, catchment_mosaic) ## reproject rasters - reproject_raster(rem_mosaic) + reproject_raster(rem_mosaic,VIZ_PROJECTION) os.remove(rem_mosaic) - reproject_raster(catchment_mosaic) + reproject_raster(catchment_mosaic,VIZ_PROJECTION) os.remove(catchment_mosaic) -def reproject_raster(raster_name): +def reproject_raster(raster_name,reprojection): with rasterio.open(raster_name) as src: transform, width, height = calculate_default_transform( - src.crs, VIZ_PROJECTION, src.width, src.height, *src.bounds) + src.crs, reprojection, src.width, src.height, *src.bounds) kwargs = src.meta.copy() kwargs.update({ - 'crs': VIZ_PROJECTION, + 'crs': reprojection, 'transform': transform, 'width': width, 'height': height, 'compress': 'lzw' }) - raster_proj_rename = os.path.split(raster_name)[1].replace('_unprj.tif', '.tif') + raster_proj_rename = os.path.split(raster_name)[1].replace('_prepprj.tif', '.tif') raster_proj_dir = os.path.join(os.path.dirname(raster_name), raster_proj_rename) with rasterio.open(raster_proj_dir, 'w', **kwargs, tiled=True, blockxsize=1024, blockysize=1024, BIGTIFF='YES') as dst: @@ -187,7 +187,7 @@ def 
reproject_raster(raster_name): src_transform=src.transform, src_crs=src.crs, dst_transform=transform, - dst_crs=VIZ_PROJECTION, + dst_crs=reprojection, resampling=Resampling.nearest) del src, dst diff --git a/src/aggregate_vector_inputs.py b/src/aggregate_vector_inputs.py index a33f2f144..933d20235 100755 --- a/src/aggregate_vector_inputs.py +++ b/src/aggregate_vector_inputs.py @@ -32,7 +32,6 @@ nwm_huc4_intersections_filename = os.path.join(nwm_dir,'nwm_huc4_intersections_NEW.gpkg') nwm_huc8_intersections_filename = os.path.join(nwm_dir,'nwm_huc8_intersections.gpkg') nhd_streams_ms_adjusted_fileName = os.path.join(agg_dir,'NHDPlusBurnLineEvent_ms_adjusted_NEW.gpkg') -# nhd_ms_adj_headwater_subset = os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms_NEW.gpkg') nhd_streams_fr_adjusted_fileName = os.path.join(agg_dir,'NHDPlusBurnLineEvent_fr_adjusted_NEW.gpkg') def identify_nwm_ms_streams(args): @@ -93,34 +92,48 @@ def identify_nwm_ms_streams(args): nwm_streams.to_file(nwm_streams_filename,driver=getDriver(nwm_streams_filename),index=False) -def find_nwm_incoming_streams(nwm_streams_filename,wbd_filename,huc_unit,in_dir,output_filename): +def find_nwm_incoming_streams(nwm_streams,wbd,huc_unit,in_dir): - layer = "WBDHU" + str(huc_unit) - wbd = gpd.read_file(wbd_filename, layer=layer) + # input wbd + if isinstance(wbd,str): + layer = "WBDHU" + str(huc_unit) + wbd = gpd.read_file(wbd, layer=layer) + elif isinstance(wbd,gpd.GeoDataFrame): + pass + else: + raise TypeError("Pass dataframe or filepath for wbd") intersecting_points = [] - mainstem = [] + nhdplus_ids = [] for index, row in tqdm(wbd.iterrows(),total=len(wbd)): + col_name = 'HUC' + str(huc_unit) huc = row[col_name] - huc_mask = wbd.loc[wbd[col_name]==str(huc)] huc_mask = huc_mask.explode() huc_mask = huc_mask.reset_index(drop=True) - nwm_streams = gpd.read_file(nwm_streams_filename, mask=huc_mask) + # input nwm streams + if isinstance(nwm_streams,str): + nwm_streams = gpd.read_file(nwm_streams_filename, mask=huc_mask) + elif isinstance(nwm_streams,gpd.GeoDataFrame): + pass + else: + raise TypeError("Pass dataframe or filepath for nwm streams") nwm_streams = nwm_streams.explode() nwm_streams = nwm_streams.reset_index(drop=True) for index, polygon in enumerate(huc_mask.geometry): + crosses=nwm_streams.crosses(polygon.exterior) nwm_streams_subset =nwm_streams[crosses] nwm_streams_subset = nwm_streams_subset.reset_index(drop=True) for index, segment in nwm_streams_subset.iterrows(): + distances = [] - is_mainstem = segment.mainstem + nhdplus_id = segment.NHDPlusID linestring = segment.geometry # Distance to each stream segment @@ -155,11 +168,14 @@ def find_nwm_incoming_streams(nwm_streams_filename,wbd_filename,huc_unit,in_dir, # Collect all nhd stream segment linestring verticies intersecting_points = intersecting_points + [shply_referencedpoint] - mainstem = mainstem + [is_mainstem] - huc_intersection = gpd.GeoDataFrame({'geometry': intersecting_points, 'mainstem': mainstem},crs=nwm_streams.crs,geometry='geometry') + nhdplus_ids = nhdplus_ids + [nhdplus_id] + + huc_intersection = gpd.GeoDataFrame({'geometry': intersecting_points, 'NHDPlusID': nhdplus_ids},crs=nwm_streams.crs,geometry='geometry') huc_intersection = huc_intersection.drop_duplicates() - huc_intersection.to_file(output_filename,driver=getDriver(output_filename)) + + return huc_intersection + def collect_stream_attributes(args, huc): @@ -207,76 +223,67 @@ def collect_stream_attributes(args, huc): def subset_stream_networks(args, huc): - nwm_dir = args[0] 
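# Reference sketch (not part of the patch): aggregate_fim_outputs.py earlier in this
# commit generalizes reproject_raster() to take the target CRS as an argument. The
# condensed rasterio warp pattern it relies on looks like the following; the paths
# and function name here are illustrative assumptions.
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling

def reproject_to(src_path, dst_path, dst_crs):
    # Reproject a single-band raster to dst_crs with nearest-neighbor resampling.
    with rasterio.open(src_path) as src:
        transform, width, height = calculate_default_transform(
            src.crs, dst_crs, src.width, src.height, *src.bounds)
        kwargs = src.meta.copy()
        kwargs.update(crs=dst_crs, transform=transform, width=width,
                      height=height, compress='lzw')
        with rasterio.open(dst_path, 'w', **kwargs) as dst:
            reproject(source=rasterio.band(src, 1),
                      destination=rasterio.band(dst, 1),
                      src_transform=src.transform, src_crs=src.crs,
                      dst_transform=transform, dst_crs=dst_crs,
                      resampling=Resampling.nearest)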
ahps_dir = args[1] wbd4 = args[2] wbd8 = args[3] in_dir = args[4] - nwm_huc4_intersect_filename = args[5] - + nwm_huc4_intersections_filename = args[5] print(f"starting HUC {str(huc)}",flush=True) nwm_headwater_id = 'ID' nwm_headwaters_filename = os.path.join(nwm_dir,'nwm_headwaters.gpkg') ahps_headwater_id = 'nws_lid' ahps_headwaters_filename = os.path.join(ahps_dir,'nws_lid.gpkg') nhd_streams_filename = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') - # Subset to reduce footprint selected_wbd4 = wbd4.loc[wbd4.HUC4.str.startswith(str(huc))] del wbd4 selected_wbd8 = wbd8.loc[wbd8.HUC8.str.startswith(huc)] del wbd8 - huc_mask = selected_wbd4.loc[selected_wbd4.HUC4.str.startswith(str(huc))] huc_mask = huc_mask.explode() huc_mask = huc_mask.reset_index(drop=True) - if len(selected_wbd8.HUC8) > 0: selected_wbd8 = selected_wbd8.reset_index(drop=True) - # Identify FR/NWM headwaters - nhd_streams_fr = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersect_filename) - + nhd_streams_fr = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersections_filename) + nwm_huc8_intersections_fr = find_nwm_incoming_streams(nhd_streams_fr,selected_wbd8,8,in_dir) + nwm_huc8_intersections_fr['intersection'] = True # Adjust FR/NWM headwater segments nwm_headwaters = gpd.read_file(nwm_headwaters_filename, mask=huc_mask) - if len(nwm_headwaters) > 0: - adj_nhd_streams_fr, adj_nhd_headwater_points_fr = adjust_headwaters(str(huc),nhd_streams_fr,nwm_headwaters,nwm_headwater_id) - + adj_nhd_headwater_points_fr['intersection'] = False + adj_nhd_headwater_points_fr = adj_nhd_headwater_points_fr.append(nwm_huc8_intersections_fr) nhd_streams_fr_adjusted_fileName=os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') adj_nhd_headwaters_fr_fileName=os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') - # Write out FR adjusted adj_nhd_streams_fr.to_file(nhd_streams_fr_adjusted_fileName,driver=getDriver(nhd_streams_fr_adjusted_fileName),index=False) adj_nhd_headwater_points_fr.to_file(adj_nhd_headwaters_fr_fileName,driver=getDriver(adj_nhd_headwaters_fr_fileName),index=False) - del adj_nhd_streams_fr, adj_nhd_headwater_points_fr else: print (f"skipping FR headwater adjustments for HUC: {str(huc)}") - del nhd_streams_fr - # Identify MS/AHPs headwaters - nhd_streams_ms = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,ahps_headwaters_filename,ahps_headwater_id,nwm_huc4_intersect_filename,True) - + nhd_streams_ms = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,ahps_headwaters_filename,ahps_headwater_id,nwm_huc4_intersections_filename,True) + nwm_huc8_intersections_ms = find_nwm_incoming_streams(nhd_streams_ms,selected_wbd8,8,in_dir) + nwm_huc8_intersections_ms['intersection'] = True + nwm_huc8_intersections_ms['mainstem'] = True # Adjust MS/AHPs headwater segments ahps_headwaters = gpd.read_file(ahps_headwaters_filename, mask=huc_mask) - if len(ahps_headwaters) > 0: - adj_nhd_streams_ms, adj_nhd_headwater_points_ms = adjust_headwaters(str(huc),nhd_streams_ms,ahps_headwaters,ahps_headwater_id) - nhd_streams_ms_adjusted_fileName=os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') adj_nhd_headwaters_ms_fileName=os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') - # Write out MS adjusted 
adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False) + adj_nhd_headwater_points_ms['intersection'] = False + ahps_headwaters = ahps_headwaters.drop(['name','nwm_featur'], axis=1, errors='ignore') + ahps_headwaters['NHDPlusID'] = 0 + nwm_huc8_intersections_ms['nws_lid'] = 'FR' + adj_nhd_headwater_points_ms = adj_nhd_headwater_points_ms.append(nwm_huc8_intersections_ms) adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False) - del adj_nhd_streams_ms, adj_nhd_headwater_points_ms - else: print (f"skipping MS headwater adjustments for HUC: {str(huc)}") del nhd_streams_ms @@ -326,6 +333,7 @@ def aggregate_stream_networks(in_dir,agg_dir, huc_list): adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False, mode='a') else: adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False) + del adj_nhd_streams_ms if os.path.isfile(nhd_ms_adj_headwater_subset): @@ -336,6 +344,7 @@ def aggregate_stream_networks(in_dir,agg_dir, huc_list): adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False, mode='a') else: adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False) + del adj_nhd_headwater_points_ms @@ -343,32 +352,21 @@ def clean_up_intermediate_files(in_dir): for huc in os.listdir(in_dir): - agg_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') + # agg_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') - fr_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr.gpkg') fr_adj_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') - ms_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms.gpkg') ms_adj_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') ms_headwater_adj_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') fr_headwater_adj_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') - ms_headwater_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_ms.gpkg') - fr_headwater_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_fr.gpkg') - - if os.path.exists(agg_path): - os.remove(agg_path) - - if os.path.exists(fr_path): - os.remove(fr_path) + # if os.path.exists(agg_path): + # os.remove(agg_path) if os.path.exists(fr_adj_path): os.remove(fr_adj_path) - if os.path.exists(ms_path): - os.remove(ms_path) - if os.path.exists(ms_adj_path): os.remove(ms_adj_path) @@ -378,12 +376,6 @@ def clean_up_intermediate_files(in_dir): if os.path.exists(fr_headwater_adj_path): os.remove(fr_headwater_adj_path) - if os.path.exists(ms_headwater_path): - os.remove(ms_headwater_path) - - if os.path.exists(fr_headwater_path): - os.remove(fr_headwater_path) - if(__name__=='__main__'): @@ -402,7 +394,8 @@ def clean_up_intermediate_files(in_dir): # Generate NWM intersection points with WBD4 boundaries print ('deriving NWM fr/ms intersection points') - find_nwm_incoming_streams(nwm_streams_fr_filename,wbd_filename,4,in_dir,nwm_huc4_intersections_filename) + huc_intersection = find_nwm_incoming_streams(nwm_streams_fr_filename,wbd_filename,4,in_dir) + 
huc_intersection.to_file(nwm_huc4_intersections_filename,driver=getDriver(nwm_huc4_intersections_filename)) print ('loading wb4') wbd4 = gpd.read_file(wbd_filename, layer='WBDHU4') @@ -412,16 +405,17 @@ def clean_up_intermediate_files(in_dir): collect_arg_list = (in_dir,nwm_dir,ahps_dir) subset_arg_list = (nwm_dir,ahps_dir,wbd4,wbd8,in_dir,nwm_huc4_intersections_filename) - num_workers = 11 + num_workers = 14 with ProcessPoolExecutor(max_workers=num_workers) as executor: # Preprocess NHD HR and add attributes - collect_attributes = [executor.submit(collect_stream_attributes, collect_arg_list, str(huc)) for huc in os.listdir(in_dir)] + # collect_attributes = [executor.submit(collect_stream_attributes, collect_arg_list, str(huc)) for huc in os.listdir(in_dir)] # Subset NHD HR network subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in os.listdir(in_dir)] # Generate NWM intersection points with WBD8 boundaries using subset_stream_networks - # find_nwm_incoming_streams(nhd_streams_fr_adjusted_fileName,wbd_filename,8,in_dir,nwm_huc8_intersections_filename) + # huc_intersection = find_nwm_incoming_streams(nhd_streams_fr_adjusted_fileName,wbd_filename,8,in_dir) + # huc_intersection.to_file(nwm_huc8_intersections_filename,driver=getDriver(nwm_huc8_intersections_filename)) # Aggregate fr and ms nhd netowrks for entire nwm domain aggregate_stream_networks(in_dir,agg_dir, os.listdir(in_dir)) diff --git a/src/agreedem.py b/src/agreedem.py index 15ae40c4c..dbff2d2d4 100755 --- a/src/agreedem.py +++ b/src/agreedem.py @@ -45,8 +45,6 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff # Import dem layer and river layer and get dem profile. elev = rasterio.open(dem) dem_profile = elev.profile - if elev.nodata == 0.0: - dem_profile.update(nodata = -999) rivers = rasterio.open(rivers_raster) diff --git a/src/check_dem_nodata.py b/src/check_dem_nodata.py new file mode 100755 index 000000000..971adcf77 --- /dev/null +++ b/src/check_dem_nodata.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 + +import sys +sys.path.append('/foss_fim/src') +import rasterio +import numpy as np +from utils.shared_variables import PREP_PROJECTION,VIZ_PROJECTION,PREP_PROJECTION_CM +import argparse + + + +with rasterio.open(input_raster_name) as src: + # check projection + if src.crs.to_string() != reprojection: + if src.crs.to_string().startswith('EPSG'): + epsg = src.crs.to_epsg() + proj_crs = CRS.from_epsg(epsg) + rio_crs = rasterio.crs.CRS.from_user_input(proj_crs).to_string() + else: + rio_crs = src.crs.to_string() + if rio_crs != reprojection: + print(f"{input_raster_name} not projected") + # print(f"Reprojecting from {rio_crs} to {reprojection}") + + # dem_dir = '/data/inputs/nhdplus_rasters' + +raster_dir = '/data/inputs/nhdplus_rasters' +m_proj_count = 0 +for huc in os.listdir(raster_dir): + # elev_m_tif = os.path.join(raster_dir,huc, 'elev_m.tif') + # elev_cm_OG = os.path.join(raster_dir,huc, 'elev_cm_orig.tif') + elev_cm_proj_tif = os.path.join(raster_dir,huc, 'elev_cm_proj.tif') + elev_m_tif = os.path.join(raster_dir,huc, 'elev_m.tif') + if os.path.exists(elev_m_tif): + os.remove(elev_cm_proj_tif) + if not os.path.exists(elev_m_tif): + # print(f"missubg huc {elev_cm_proj_tif}") + m_proj_count = m_proj_count + 1 + + + +################################################################################ + # Windowed reading/calculating/writing + with rasterio.open(elev_cm_filename) as dem_cm: + no_data = dem_cm.nodata + for block_index, window in 
dem_cm.block_windows(1): + block_data = dem_cm.read(window=window) + dem_m = np.where(block_data == int(no_data), nodata_val, (block_data/100).astype(rasterio.float32)) + + dem_m_profile = dem_cm.profile.copy() + + dem_m_profile.update(driver='GTiff',tiled=True,nodata=nodata_val, + blockxsize=blocksize, blockysize=blocksize, + dtype='float32',crs=projection,compress='lzw',interleave='band') + write_window = Window.from_slices((30, 269), (50, 313)) + # write_window.height = 239, write_window.width = 263 + + with rasterio.open( + elev_m_filename, 'w', + driver='GTiff', width=500, height=300, count=3, + dtype=r.dtype) as dst: + for k, arr in [(1, b), (2, g), (3, r)]: + dst.write(arr, indexes=k, window=write_window) +################################################################################ + + + + + + +raster_dir = '/data/inputs/nhdplus_rasters' +cm_proj_count = 0 +m_proj_count = 0 +other_proj_hucs = [] +for huc in os.listdir(raster_dir): + # elev_cm_tif = os.path.join(raster_dir,huc, 'elev_cm.tif') + # elev_cm_OG = os.path.join(raster_dir,huc, 'elev_cm_orig.tif') + # elev_cm_proj_tif = os.path.join(raster_dir,huc, 'elev_cm_proj.tif') + elev_m_tif = os.path.join(raster_dir,huc, 'elev_m.tif') + src = rasterio.open(elev_cm_tif) + # check projection + if src.crs.to_string() == PREP_PROJECTION_CM: + cm_proj_count = cm_proj_count + 1 + elif src.crs.to_string() == PREP_PROJECTION: + m_proj_count = m_proj_count + 1 + else: + other_proj_hucs = other_proj_hucs + [huc] + tot_proj_count = cm_proj_count + m_proj_count + if src.crs.to_string().startswith('EPSG'): + epsg = src.crs.to_epsg() + proj_crs = CRS.from_epsg(epsg) + rio_crs = rasterio.crs.CRS.from_user_input(proj_crs).to_string() + else: + rio_crs = src.crs.to_string() + if rio_crs != PREP_PROJECTION: + print(f"{elev_cm_tif} not projected") + # print(f"{rio_crs}") + + + + + if not os.path.exists(elev_m_tif): + print(f"missubg huc {elev_m_tif}") + if os.path.exists(elev_cm_OG): + reproject_raster(elev_cm_OG,PREP_PROJECTION_CM,512,elev_cm_proj_tif) + if os.path.exists(elev_cm_proj_tif): + print(f"reprojected huc {huc}") + # update_raster_profile(elev_cm_tif,elev_m_tif) + + +def update_raster_profile(elev_cm_filename,elev_m_filename): + + # Update nodata value and convert from cm to meters + dem_cm = rasterio.open(elev_cm_filename) + no_data = dem_cm.nodata + data = dem_cm.read(1) + dem_m = np.where(dem_cm == int(no_data), -9999.0, (dem_cm/100).astype(rasterio.float32)) + + dem_m_profile = dem_cm.profile.copy() + dem_m_profile.update(driver='GTiff',tiled=True,nodata=-9999.0,dtype='float32',compress='lzw',interleave='band') + + with rasterio.open(elev_m_filename, "w", **dem_m_profile, BIGTIFF='YES') as dest: + dest.write(dem_m, indexes = 1) + + dem_cm.close() + + + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Update nodata value') + parser.add_argument('-in_dem','--in-dem-filename', help='DEM filename', required=True,type=str) + parser.add_argument('-out_dem','--out-dem-filename', help='out DEM filename', required=True,type=str) + + args = vars(parser.parse_args()) + + in_dem_filename = args['in_dem_filename'] + out_dem_filename = args['out_dem_filename'] + + update_raster_profile(in_dem_filename,out_dem_filename) diff --git a/src/clip_vectors_to_wbd.py b/src/clip_vectors_to_wbd.py index 3ae82306f..ca13b5e78 100755 --- a/src/clip_vectors_to_wbd.py +++ b/src/clip_vectors_to_wbd.py @@ -55,13 +55,13 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l # subset nhd streams 
print("Querying NHD Streams for HUC{} {}".format(hucUnitLength,hucCode),flush=True) - nhd_streams = gpd.read_file(nhd_streams_filename, mask = wbd_buffer) + nhd_streams = gpd.read_file(nhd_streams_filename, mask = wbd) - ## identify local headwater stream segments + # identify local headwater stream segments # nhd_streams_subset = gpd.read_file(nhd_streams_filename, mask = wbd) # nhd_streams_subset = nhd_streams_subset.loc[~nhd_streams_subset.FromNode.isin(list(set(nhd_streams_subset.ToNode) & set(nhd_streams_subset.FromNode)))] # nhd_streams_subset = nhd_streams_subset[~nhd_streams_subset['is_headwater']] - # + # if not nhd_streams_subset.empty: # nhd_streams_subset = nhd_streams_subset.reset_index(drop=True) # start_coords = [] diff --git a/src/reduce_nhd_stream_density.py b/src/reduce_nhd_stream_density.py index c17effacf..62b23db1a 100644 --- a/src/reduce_nhd_stream_density.py +++ b/src/reduce_nhd_stream_density.py @@ -11,28 +11,37 @@ from shapely.wkb import dumps from utils.shared_functions import getDriver -def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwaters_filename,headwater_id,nwm_intersections_filename,mainstem_flag=False): +''' + +''' + +def identify_headwater_streams(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwaters_filename,headwater_id,nwm_intersections_filename,mainstem_flag=False): headwater_streams = pd.DataFrame() nhd_streams = gpd.read_file(nhd_streams_filename) + # Locate the closest NHDPlus HR stream segment to NWM headwater points. Done by HUC8 to reduce processing time and to contain NWM headwater in the same HUC for index, row in selected_wbd8.iterrows(): huc = row["HUC8"] + # Double check that this is a nested HUC (probably overkill) if huc.startswith(str(huc4)): huc8_mask = selected_wbd8.loc[selected_wbd8.HUC8.str.startswith(huc)] huc8_mask = huc8_mask.reset_index(drop=True) + # Masking headwaters by HUC8 headwaters_mask = gpd.read_file(headwaters_filename, mask = huc8_mask) headwaters_mask = headwaters_mask.reset_index(drop=True) + # Masking subset FR streams by HUC8 streams_subset = gpd.read_file(nhd_streams_filename, mask = huc8_mask) if not streams_subset.empty: streams_subset.loc[:,'is_headwater'] = False streams_subset = streams_subset.reset_index(drop=True) + # Create WKB geometry column streams_subset['b_geom'] = None for index, linestring in enumerate(streams_subset.geometry): streams_subset.at[index, 'b_geom'] = dumps(linestring) @@ -40,13 +49,16 @@ def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwat # Create pygeos nhd stream geometries from WKB representation streambin_geom = pygeos.io.from_wkb(streams_subset['b_geom']) + # Add HUC8 column streams_subset.loc[:,'HUC8'] = str(huc) + # Assign default headwater ID (nwm_headwater_id = int; ahps_headwater_id = str) if headwaters_mask[headwater_id].dtype=='int': n = -1 else: n = '' + # Add headwaters_id column streams_subset.loc[:,'headwaters_id'] = n # Find stream segment closest to headwater point @@ -112,7 +124,7 @@ def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwat nhd_streams = nhd_streams.loc[nhd_streams['is_relevant_stream'],:] nhd_streams.reset_index(drop=True,inplace=True) - return(nhd_streams) + return nhd_streams def get_downstream_segments(streams, attribute): diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 1242768dc..bb87bf6ae 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -83,7 +83,7 @@ Tcount if [ "$extent" = "MS" ]; then if [[ ! 
-f $outputHucDataDir/nhd_headwater_points_subset.gpkg ]] ; then echo "No AHPs point(s) within HUC $hucNumber boundaries. Aborting run_by_unit.sh" - rm -rf $outputHucDataDir + # rm -rf $outputHucDataDir exit 0 fi fi @@ -103,11 +103,18 @@ Tstart gdalwarp -cutline $outputHucDataDir/wbd_buffered.gpkg -crop_to_cutline -ot Int32 -r bilinear -of "GTiff" -overwrite -co "BLOCKXSIZE=512" -co "BLOCKYSIZE=512" -co "TILED=YES" -co "COMPRESS=LZW" -co "BIGTIFF=YES" $input_DEM $outputHucDataDir/dem.tif Tcount +## CHECK DEM NODATA +echo -e $startDiv"Check DEM Nodata $hucNumber"$stopDiv +date -u +Tstart +$srcDir/check_dem_nodata.py -in_dem $outputHucDataDir/dem.tif -out_dem $outputHucDataDir/dem_nodata.tif +Tcount + ## GET RASTER METADATA echo -e $startDiv"Get DEM Metadata $hucNumber"$stopDiv date -u Tstart -read fsize ncols nrows ndv xmin ymin xmax ymax cellsize_resx cellsize_resy<<<$($srcDir/getRasterInfoNative.py $outputHucDataDir/dem.tif) +read fsize ncols nrows ndv xmin ymin xmax ymax cellsize_resx cellsize_resy<<<$($srcDir/getRasterInfoNative.py $outputHucDataDir/dem_nodata.tif) ## RASTERIZE NLD MULTILINES ## echo -e $startDiv"Rasterize all NLD multilines using zelev vertices"$stopDiv @@ -122,7 +129,7 @@ echo -e $startDiv"Convert DEM to Meters $hucNumber"$stopDiv date -u Tstart [ ! -f $outputHucDataDir/dem_meters.tif ] && \ -gdal_calc.py --quiet --type=Float32 --co "BLOCKXSIZE=512" --co "BLOCKYSIZE=512" --co "TILED=YES" --co "COMPRESS=LZW" --co "BIGTIFF=YES" -A $outputHucDataDir/dem.tif --outfile="$outputHucDataDir/dem_meters.tif" --calc="A/100" --NoDataValue=$ndv +gdal_calc.py --quiet --type=Float32 --co "BLOCKXSIZE=512" --co "BLOCKYSIZE=512" --co "TILED=YES" --co "COMPRESS=LZW" --co "BIGTIFF=YES" -A $outputHucDataDir/dem_nodata.tif --outfile="$outputHucDataDir/dem_meters.tif" --calc="A/100" --NoDataValue=$ndv Tcount ## RASTERIZE REACH BOOLEAN (1 & 0) ## @@ -263,7 +270,7 @@ Tcount if [[ ! -f $outputHucDataDir/demDerived_reaches_split.gpkg ]] ; then echo "No AHPs point(s) within HUC $hucNumber boundaries. Aborting run_by_unit.sh" - rm -rf $outputHucDataDir + # rm -rf $outputHucDataDir exit 0 fi @@ -277,7 +284,7 @@ if [ "$extent" = "MS" ]; then if [[ ! -f $outputHucDataDir/dem_thalwegCond_MS.tif ]] ; then echo "No AHPs point(s) within HUC $hucNumber boundaries. Aborting run_by_unit.sh" - rm -rf $outputHucDataDir + # rm -rf $outputHucDataDir exit 0 fi @@ -357,7 +364,7 @@ $srcDir/filter_catchments_and_add_attributes.py $outputHucDataDir/gw_catchments_ if [[ ! -f $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes.gpkg ]] ; then echo "No relevant streams within HUC $hucNumber boundaries. 
Aborting run_by_unit.sh" - rm -rf $outputHucDataDir + # rm -rf $outputHucDataDir exit 0 fi Tcount @@ -434,11 +441,11 @@ Tcount ## USGS CROSSWALK ## -echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv -date -u -Tstart -$srcDir/usgs_gage_crosswalk.py -gages $inputDataDir/usgs_gages/usgs_gages.gpkg -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv -Tcount +# echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv +# date -u +# Tstart +# $srcDir/usgs_gage_crosswalk.py -gages $inputDataDir/usgs_gages/usgs_gages.gpkg -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv +# Tcount ## CLEANUP OUTPUTS ## echo -e $startDiv"Cleaning up outputs $hucNumber"$stopDiv diff --git a/src/utils/shared_functions.py b/src/utils/shared_functions.py index 6ea7b0a74..72e11a068 100644 --- a/src/utils/shared_functions.py +++ b/src/utils/shared_functions.py @@ -83,3 +83,68 @@ def subset_wbd_gpkg(wbd_gpkg, multilayer_wbd_geopackage): layer_name = os.path.split(wbd_gpkg)[1].strip('.gpkg') gdf.crs = PREP_PROJECTION gdf.to_file(multilayer_wbd_geopackage, layer=layer_name,driver='GPKG',index=False) + + +def update_raster_profile(elev_cm_filename,elev_m_filename): + + # Update nodata value and convert from cm to meters + dem_cm = rasterio.open(elev_cm_filename) + no_data = dem_cm.nodata + data = dem_cm.read(1) + dem_m = np.where(dem_cm == int(no_data), -9999.0, (dem_cm/100).astype(rasterio.float32)) + + dem_m_profile = dem_cm.profile.copy() + dem_m_profile.update(driver='GTiff',tiled=True,nodata=-9999.0,dtype='float32',compress='lzw',interleave='band') + + with rasterio.open(elev_m_filename, "w", **dem_m_profile, BIGTIFF='YES') as dest: + dest.write(dem_m, indexes = 1) + + dem_cm.close() + + + +# raster_list = ['2002','2003','2004','2005','2006','2007','2008','2101','2102','2201','2202','2203','0430'] +def reproject_raster(input_raster_name,reprojection,blocksize=None,reprojected_raster_name=None): + + if blocksize is not None: + if isinstance(blocksize, int): + pass + elif isinstance(blocksize,str): + blocksize = int(blocksize) + elif isinstance(blocksize,float): + + blocksize = int(blocksize) + else: + raise TypeError("Pass integer for blocksize") + else: + blocksize = 256 + + assert input_raster_name.endswith('.tif'), "input raster needs to be a tif" + + with rasterio.open(input_raster_name) as src: + transform, width, height = calculate_default_transform( + src.crs, reprojection, src.width, src.height, *src.bounds) + kwargs = src.meta.copy() + kwargs.update({ + 'crs': reprojection, + 'transform': transform, + 'width': width, + 'height': height, + 'compress': 'lzw' + }) + + if reprojected_raster_name is None: + reprojected_raster_name = input_raster_name + + assert reprojected_raster_name.endswith('.tif'), "output raster needs to be a tif" + + with rasterio.open(reprojected_raster_name, 'w', **kwargs, tiled=True, blockxsize=blocksize, blockysize=blocksize, BIGTIFF='YES') as dst: + reproject( + source=rasterio.band(src, 1), + destination=rasterio.band(dst, 1), + 
src_transform=src.transform, + src_crs=src.crs, + dst_transform=transform, + dst_crs=reprojection, + resampling=Resampling.nearest) + del src, dst diff --git a/tools/rating_curve_comparison.py b/tools/rating_curve_comparison.py index 6cd232ada..8ef04a0c3 100755 --- a/tools/rating_curve_comparison.py +++ b/tools/rating_curve_comparison.py @@ -366,11 +366,6 @@ def calculate_rc_stats_elev(rc,stat_groups=None): number_of_jobs = args['number_of_jobs'] stat_groups = args['stat_groups'] - # Open log file - sys.__stdout__ = sys.stdout - log_file = open(join(output_dir,'rating_curve_comparison.log'),"w") - sys.stdout = log_file - stat_groups = stat_groups.split() procs_list = [] @@ -379,6 +374,11 @@ def calculate_rc_stats_elev(rc,stat_groups=None): tables_dir = join(output_dir,'tables') os.makedirs(tables_dir, exist_ok=True) + # Open log file + sys.__stdout__ = sys.stdout + log_file = open(join(output_dir,'rating_curve_comparison.log'),"w") + sys.stdout = log_file + huc_list = os.listdir(fim_dir) for huc in huc_list: From e780468836fe4453dce4ffdea7ea802cb71fd61e Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Mon, 19 Apr 2021 19:50:10 +0000 Subject: [PATCH 39/66] cleaning up scratch code --- src/check_dem_nodata.py | 141 ------------------------------------- src/run_by_unit.sh | 16 ++--- src/usgs_gage_crosswalk.py | 38 +--------- 3 files changed, 6 insertions(+), 189 deletions(-) delete mode 100755 src/check_dem_nodata.py diff --git a/src/check_dem_nodata.py b/src/check_dem_nodata.py deleted file mode 100755 index 971adcf77..000000000 --- a/src/check_dem_nodata.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 - -import sys -sys.path.append('/foss_fim/src') -import rasterio -import numpy as np -from utils.shared_variables import PREP_PROJECTION,VIZ_PROJECTION,PREP_PROJECTION_CM -import argparse - - - -with rasterio.open(input_raster_name) as src: - # check projection - if src.crs.to_string() != reprojection: - if src.crs.to_string().startswith('EPSG'): - epsg = src.crs.to_epsg() - proj_crs = CRS.from_epsg(epsg) - rio_crs = rasterio.crs.CRS.from_user_input(proj_crs).to_string() - else: - rio_crs = src.crs.to_string() - if rio_crs != reprojection: - print(f"{input_raster_name} not projected") - # print(f"Reprojecting from {rio_crs} to {reprojection}") - - # dem_dir = '/data/inputs/nhdplus_rasters' - -raster_dir = '/data/inputs/nhdplus_rasters' -m_proj_count = 0 -for huc in os.listdir(raster_dir): - # elev_m_tif = os.path.join(raster_dir,huc, 'elev_m.tif') - # elev_cm_OG = os.path.join(raster_dir,huc, 'elev_cm_orig.tif') - elev_cm_proj_tif = os.path.join(raster_dir,huc, 'elev_cm_proj.tif') - elev_m_tif = os.path.join(raster_dir,huc, 'elev_m.tif') - if os.path.exists(elev_m_tif): - os.remove(elev_cm_proj_tif) - if not os.path.exists(elev_m_tif): - # print(f"missubg huc {elev_cm_proj_tif}") - m_proj_count = m_proj_count + 1 - - - -################################################################################ - # Windowed reading/calculating/writing - with rasterio.open(elev_cm_filename) as dem_cm: - no_data = dem_cm.nodata - for block_index, window in dem_cm.block_windows(1): - block_data = dem_cm.read(window=window) - dem_m = np.where(block_data == int(no_data), nodata_val, (block_data/100).astype(rasterio.float32)) - - dem_m_profile = dem_cm.profile.copy() - - dem_m_profile.update(driver='GTiff',tiled=True,nodata=nodata_val, - blockxsize=blocksize, blockysize=blocksize, - dtype='float32',crs=projection,compress='lzw',interleave='band') - write_window = Window.from_slices((30, 269), (50, 
313)) - # write_window.height = 239, write_window.width = 263 - - with rasterio.open( - elev_m_filename, 'w', - driver='GTiff', width=500, height=300, count=3, - dtype=r.dtype) as dst: - for k, arr in [(1, b), (2, g), (3, r)]: - dst.write(arr, indexes=k, window=write_window) -################################################################################ - - - - - - -raster_dir = '/data/inputs/nhdplus_rasters' -cm_proj_count = 0 -m_proj_count = 0 -other_proj_hucs = [] -for huc in os.listdir(raster_dir): - # elev_cm_tif = os.path.join(raster_dir,huc, 'elev_cm.tif') - # elev_cm_OG = os.path.join(raster_dir,huc, 'elev_cm_orig.tif') - # elev_cm_proj_tif = os.path.join(raster_dir,huc, 'elev_cm_proj.tif') - elev_m_tif = os.path.join(raster_dir,huc, 'elev_m.tif') - src = rasterio.open(elev_cm_tif) - # check projection - if src.crs.to_string() == PREP_PROJECTION_CM: - cm_proj_count = cm_proj_count + 1 - elif src.crs.to_string() == PREP_PROJECTION: - m_proj_count = m_proj_count + 1 - else: - other_proj_hucs = other_proj_hucs + [huc] - tot_proj_count = cm_proj_count + m_proj_count - if src.crs.to_string().startswith('EPSG'): - epsg = src.crs.to_epsg() - proj_crs = CRS.from_epsg(epsg) - rio_crs = rasterio.crs.CRS.from_user_input(proj_crs).to_string() - else: - rio_crs = src.crs.to_string() - if rio_crs != PREP_PROJECTION: - print(f"{elev_cm_tif} not projected") - # print(f"{rio_crs}") - - - - - if not os.path.exists(elev_m_tif): - print(f"missubg huc {elev_m_tif}") - if os.path.exists(elev_cm_OG): - reproject_raster(elev_cm_OG,PREP_PROJECTION_CM,512,elev_cm_proj_tif) - if os.path.exists(elev_cm_proj_tif): - print(f"reprojected huc {huc}") - # update_raster_profile(elev_cm_tif,elev_m_tif) - - -def update_raster_profile(elev_cm_filename,elev_m_filename): - - # Update nodata value and convert from cm to meters - dem_cm = rasterio.open(elev_cm_filename) - no_data = dem_cm.nodata - data = dem_cm.read(1) - dem_m = np.where(dem_cm == int(no_data), -9999.0, (dem_cm/100).astype(rasterio.float32)) - - dem_m_profile = dem_cm.profile.copy() - dem_m_profile.update(driver='GTiff',tiled=True,nodata=-9999.0,dtype='float32',compress='lzw',interleave='band') - - with rasterio.open(elev_m_filename, "w", **dem_m_profile, BIGTIFF='YES') as dest: - dest.write(dem_m, indexes = 1) - - dem_cm.close() - - - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Update nodata value') - parser.add_argument('-in_dem','--in-dem-filename', help='DEM filename', required=True,type=str) - parser.add_argument('-out_dem','--out-dem-filename', help='out DEM filename', required=True,type=str) - - args = vars(parser.parse_args()) - - in_dem_filename = args['in_dem_filename'] - out_dem_filename = args['out_dem_filename'] - - update_raster_profile(in_dem_filename,out_dem_filename) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 1c2046c8f..792811748 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -83,7 +83,7 @@ Tcount if [ "$extent" = "MS" ]; then if [[ ! -f $outputHucDataDir/nhd_headwater_points_subset.gpkg ]] ; then echo "No AHPs point(s) within HUC $hucNumber boundaries. 
Aborting run_by_unit.sh" - # rm -rf $outputHucDataDir + rm -rf $outputHucDataDir exit 0 fi fi @@ -103,13 +103,6 @@ Tstart gdalwarp -cutline $outputHucDataDir/wbd_buffered.gpkg -crop_to_cutline -ot Float32 -r bilinear -of "GTiff" -overwrite -co "BLOCKXSIZE=512" -co "BLOCKYSIZE=512" -co "TILED=YES" -co "COMPRESS=LZW" -co "BIGTIFF=YES" $input_DEM $outputHucDataDir/dem_meters.tif Tcount -## CHECK DEM NODATA -echo -e $startDiv"Check DEM Nodata $hucNumber"$stopDiv -date -u -Tstart -$srcDir/check_dem_nodata.py -in_dem $outputHucDataDir/dem.tif -out_dem $outputHucDataDir/dem_nodata.tif -Tcount - ## GET RASTER METADATA echo -e $startDiv"Get DEM Metadata $hucNumber"$stopDiv date -u @@ -121,7 +114,6 @@ echo -e $startDiv"Rasterize all NLD multilines using zelev vertices"$stopDiv date -u Tstart [ ! -f $outputHucDataDir/nld_rasterized_elev.tif ] && [ -f $outputHucDataDir/nld_subset_levees.gpkg ] && \ -<<<<<<< HEAD gdal_rasterize -l nld_subset_levees -3d -at -init -9999 -a_nodata $ndv -te $xmin $ymin $xmax $ymax -ts $ncols $nrows -ot Float32 -of GTiff -co "COMPRESS=LZW" -co "BIGTIFF=YES" -co "TILED=YES" $outputHucDataDir/nld_subset_levees.gpkg $outputHucDataDir/nld_rasterized_elev.tif Tcount @@ -263,7 +255,7 @@ Tcount if [[ ! -f $outputHucDataDir/demDerived_reaches_split.gpkg ]] ; then echo "No AHPs point(s) within HUC $hucNumber boundaries. Aborting run_by_unit.sh" - # rm -rf $outputHucDataDir + rm -rf $outputHucDataDir exit 0 fi @@ -277,7 +269,7 @@ if [ "$extent" = "MS" ]; then if [[ ! -f $outputHucDataDir/dem_thalwegCond_MS.tif ]] ; then echo "No AHPs point(s) within HUC $hucNumber boundaries. Aborting run_by_unit.sh" - # rm -rf $outputHucDataDir + rm -rf $outputHucDataDir exit 0 fi @@ -357,7 +349,7 @@ $srcDir/filter_catchments_and_add_attributes.py $outputHucDataDir/gw_catchments_ if [[ ! -f $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes.gpkg ]] ; then echo "No relevant streams within HUC $hucNumber boundaries. 
Aborting run_by_unit.sh" - # rm -rf $outputHucDataDir + rm -rf $outputHucDataDir exit 0 fi Tcount diff --git a/src/usgs_gage_crosswalk.py b/src/usgs_gage_crosswalk.py index d330506ab..c85cdbe32 100755 --- a/src/usgs_gage_crosswalk.py +++ b/src/usgs_gage_crosswalk.py @@ -32,11 +32,7 @@ ''' -<<<<<<< HEAD -def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename,dem_adj_filename,output_table_filename): -======= def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename,dem_adj_filename,output_table_filename,extent): ->>>>>>> dev wbd_buffer = gpd.read_file(wbd_buffer_filename) usgs_gages = gpd.read_file(usgs_gages_filename, mask=wbd_buffer) @@ -45,8 +41,7 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in input_catchment = gpd.read_file(input_catchment_filename) dem_adj = rasterio.open(dem_adj_filename,'r') -<<<<<<< HEAD -======= + #MS extent use gages that are mainstem if extent == "MS": usgs_gages = usgs_gages.query('curve == "yes" & mainstem == "yes"') @@ -54,24 +49,15 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in if extent == "FR": usgs_gages = usgs_gages.query('curve == "yes" & mainstem == "no"') ->>>>>>> dev if input_flows.HydroID.dtype != 'int': input_flows.HydroID = input_flows.HydroID.astype(int) # Identify closest HydroID closest_catchment = gpd.sjoin(usgs_gages, input_catchment, how='left', op='within').reset_index(drop=True) -<<<<<<< HEAD - closest_hydro_id = closest_catchment.filter(items=['site_no','HydroID','min_thal_elev','med_thal_elev','max_thal_elev', 'order_']) - closest_hydro_id = closest_hydro_id.dropna() - - # Get USGS gages that are within catchment boundaries - usgs_gages = usgs_gages.loc[usgs_gages.site_no.isin(list(closest_hydro_id.site_no))] -======= closest_hydro_id = closest_catchment.filter(items=['location_id','HydroID','min_thal_elev','med_thal_elev','max_thal_elev', 'order_']) closest_hydro_id = closest_hydro_id.dropna() # Get USGS gages that are within catchment boundaries usgs_gages = usgs_gages.loc[usgs_gages.location_id.isin(list(closest_hydro_id.location_id))] ->>>>>>> dev columns = ['location_id','HydroID','dem_elevation','dem_adj_elevation','min_thal_elev', 'med_thal_elev','max_thal_elev','str_order'] gage_data = [] @@ -80,19 +66,11 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in for index, gage in usgs_gages.iterrows(): # Get stream attributes -<<<<<<< HEAD - hydro_id = closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].HydroID.item() - str_order = str(int(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].order_.item())) - min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].min_thal_elev.item(),2) - med_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].med_thal_elev.item(),2) - max_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.site_no==gage.site_no].max_thal_elev.item(),2) -======= hydro_id = closest_hydro_id.loc[closest_hydro_id.location_id==gage.location_id].HydroID.item() str_order = str(int(closest_hydro_id.loc[closest_hydro_id.location_id==gage.location_id].order_.item())) min_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.location_id==gage.location_id].min_thal_elev.item(),2) med_thal_elev = round(closest_hydro_id.loc[closest_hydro_id.location_id==gage.location_id].med_thal_elev.item(),2) max_thal_elev = 
round(closest_hydro_id.loc[closest_hydro_id.location_id==gage.location_id].max_thal_elev.item(),2) ->>>>>>> dev # Convert headwater point geometries to WKB representation wkb_gages = dumps(gage.geometry) @@ -120,11 +98,7 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in dem_adj_elev = round(list(rasterio.sample.sample_gen(dem_adj,shply_referenced_gage.coords))[0].item(),2) # Append dem_m_elev, dem_adj_elev, hydro_id, and gage number to table -<<<<<<< HEAD - site_elevations = [str(gage.site_no), str(hydro_id), dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev,str(str_order)] -======= site_elevations = [str(gage.location_id), str(hydro_id), dem_m_elev, dem_adj_elev, min_thal_elev, med_thal_elev, max_thal_elev,str(str_order)] ->>>>>>> dev gage_data.append(site_elevations) @@ -144,11 +118,8 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in parser.add_argument('-wbd','--wbd-buffer-filename', help='WBD buffer', required=True) parser.add_argument('-dem_adj','--dem-adj-filename', help='Thalweg adjusted DEM', required=True) parser.add_argument('-outtable','--output-table-filename', help='Table to append data', required=True) -<<<<<<< HEAD - -======= parser.add_argument('-e', '--extent', help="extent configuration entered by user when running fim_run.sh", required = True) ->>>>>>> dev + args = vars(parser.parse_args()) usgs_gages_filename = args['usgs_gages_filename'] @@ -158,11 +129,6 @@ def crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,in wbd_buffer_filename = args['wbd_buffer_filename'] dem_adj_filename = args['dem_adj_filename'] output_table_filename = args['output_table_filename'] -<<<<<<< HEAD - - crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename, dem_adj_filename,output_table_filename) -======= extent = args['extent'] crosswalk_usgs_gage(usgs_gages_filename,dem_filename,input_flows_filename,input_catchment_filename,wbd_buffer_filename, dem_adj_filename,output_table_filename, extent) ->>>>>>> dev From 4f21b3ff5df837dc13a759e1179c97d2a8e16db6 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 21 Apr 2021 17:06:14 +0000 Subject: [PATCH 40/66] converting to env variables --- src/aggregate_vector_inputs.py | 286 +++++++++++++++++---------------- src/run_by_unit.sh | 10 +- src/utils/shared_variables.py | 33 ++-- 3 files changed, 170 insertions(+), 159 deletions(-) diff --git a/src/aggregate_vector_inputs.py b/src/aggregate_vector_inputs.py index 933d20235..d610f18a5 100755 --- a/src/aggregate_vector_inputs.py +++ b/src/aggregate_vector_inputs.py @@ -2,47 +2,41 @@ import os import sys -import geopandas as gpd -from tqdm import tqdm -from os.path import splitext -from shapely.geometry import Point -from concurrent.futures import ProcessPoolExecutor,as_completed -from collections import deque -import numpy as np -from shapely.wkb import dumps, loads -import pygeos sys.path.append('/foss_fim/src') +import geopandas as gpd from utils.shared_variables import PREP_PROJECTION from utils.shared_functions import getDriver from derive_headwaters import findHeadWaterPoints from reduce_nhd_stream_density import subset_nhd_network from adjust_headwater_streams import adjust_headwaters -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -in_dir ='data/inputs/nhdplus_vectors' -nwm_dir = 'data/inputs/nwm_hydrofabric' -wbd_dir = 'data/inputs/wbd' -ahps_dir = 'data/inputs/ahp_sites' -agg_dir = 
'data/inputs/nhdplus_vectors_aggregate' - -wbd_filename = os.path.join(wbd_dir, 'WBD_National.gpkg') -nwm_streams_fr_filename = os.path.join(nwm_dir,'nwm_flows.gpkg') -nwm_headwaters_filename = os.path.join(nwm_dir,'nwm_headwaters.gpkg') -nwm_huc4_intersections_filename = os.path.join(nwm_dir,'nwm_huc4_intersections_NEW.gpkg') -nwm_huc8_intersections_filename = os.path.join(nwm_dir,'nwm_huc8_intersections.gpkg') -nhd_streams_ms_adjusted_fileName = os.path.join(agg_dir,'NHDPlusBurnLineEvent_ms_adjusted_NEW.gpkg') -nhd_streams_fr_adjusted_fileName = os.path.join(agg_dir,'NHDPlusBurnLineEvent_fr_adjusted_NEW.gpkg') +from shapely.geometry import Point +from concurrent.futures import ProcessPoolExecutor +from collections import deque +import numpy as np +from shapely.wkb import dumps, loads +import pygeos + +nhdplus_vectors_dir = os.environ.get('nhdplus_vectors_dir') +wbd_filename = os.environ.get('wbd_filename') +nwm_streams_orig_filename = os.environ.get('nwm_streams_orig_filename') +nwm_streams_all_filename = os.environ.get('nwm_streams_all_filename') +nwm_headwaters_filename = os.environ.get('nwm_headwaters_filename') +ahps_filename = os.environ.get('ahps_filename') +nwm_huc4_intersections_filename = os.environ.get('nwm_huc4_intersections_filename') +nwm_huc8_intersections_filename = os.environ.get('nwm_huc8_intersections_filename') +agg_nhd_headwaters_adj_fileName = os.environ['agg_nhd_headwaters_adj_fileName'] +agg_nhd_streams_adj_fileName = os.environ['agg_nhd_streams_adj_fileName'] + + def identify_nwm_ms_streams(args): - nwm_streams_filename = args[0] - in_dir = args[1] - ahps_dir = args[2] + nwm_streams_filename = args[0] + ahps_filename = args[1] + nwm_streams_all_filename = args[2] # Subset nwm network to ms - ahps_headwaters_filename = os.path.join(ahps_dir,'nws_lid.gpkg') - ahps_headwaters = gpd.read_file(ahps_headwaters_filename) + ahps_headwaters = gpd.read_file(ahps_filename) nwm_streams = gpd.read_file(nwm_streams_filename) @@ -50,7 +44,6 @@ def identify_nwm_ms_streams(args): nwm_streams = nwm_streams.drop(['mainstem'], axis=1, errors='ignore') nwm_streams['is_headwater'] = False - nwm_streams['downstream_of_headwater'] = False nwm_streams.loc[nwm_streams.ID.isin(list(ahps_headwaters.nwm_featur)),'is_headwater'] = True @@ -81,7 +74,6 @@ def identify_nwm_ms_streams(args): Q.append(toNode) nwm_streams_ms = nwm_streams.loc[nwm_streams['is_relevant_stream'],:] - ms_segments = nwm_streams_ms.ID.to_list() nwm_streams.reset_index(drop=True,inplace=True) @@ -89,14 +81,16 @@ def identify_nwm_ms_streams(args): # Add column to FR nwm layer to indicate MS segments nwm_streams['mainstem'] = np.where(nwm_streams.ID.isin(ms_segments), 1, 0) - nwm_streams.to_file(nwm_streams_filename,driver=getDriver(nwm_streams_filename),index=False) + nwm_streams = nwm_streams.drop(['is_relevant_stream','is_headwater'], axis=1, errors='ignore') + + nwm_streams.to_file(nwm_streams_all_filename,driver=getDriver(nwm_streams_all_filename),index=False) -def find_nwm_incoming_streams(nwm_streams,wbd,huc_unit,in_dir): +def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): - # input wbd + # Input wbd if isinstance(wbd,str): - layer = "WBDHU" + str(huc_unit) + layer = f"WBDHU{huc_unit}" wbd = gpd.read_file(wbd, layer=layer) elif isinstance(wbd,gpd.GeoDataFrame): pass @@ -105,19 +99,20 @@ def find_nwm_incoming_streams(nwm_streams,wbd,huc_unit,in_dir): intersecting_points = [] nhdplus_ids = [] - for index, row in tqdm(wbd.iterrows(),total=len(wbd)): + mainstem_flag = [] + for index, row in wbd.iterrows(): - 
col_name = 'HUC' + str(huc_unit) + col_name = f"HUC{huc_unit}" huc = row[col_name] huc_mask = wbd.loc[wbd[col_name]==str(huc)] huc_mask = huc_mask.explode() huc_mask = huc_mask.reset_index(drop=True) - # input nwm streams - if isinstance(nwm_streams,str): - nwm_streams = gpd.read_file(nwm_streams_filename, mask=huc_mask) - elif isinstance(nwm_streams,gpd.GeoDataFrame): - pass + # Input nwm streams + if isinstance(nwm_streams_,str): + nwm_streams = gpd.read_file(nwm_streams_, mask=huc_mask) + elif isinstance(nwm_streams_,gpd.GeoDataFrame): + nwm_streams = nwm_streams_.copy() else: raise TypeError("Pass dataframe or filepath for nwm streams") @@ -133,8 +128,13 @@ def find_nwm_incoming_streams(nwm_streams,wbd,huc_unit,in_dir): for index, segment in nwm_streams_subset.iterrows(): distances = [] - nhdplus_id = segment.NHDPlusID + try: + nhdplus_id = segment.ID + except: + nhdplus_id = segment.NHDPlusID + linestring = segment.geometry + mainstem = segment.mainstem # Distance to each stream segment for point in zip(*linestring.coords.xy): @@ -168,29 +168,26 @@ def find_nwm_incoming_streams(nwm_streams,wbd,huc_unit,in_dir): # Collect all nhd stream segment linestring verticies intersecting_points = intersecting_points + [shply_referencedpoint] - nhdplus_ids = nhdplus_ids + [nhdplus_id] + mainstem_flag = mainstem_flag + [mainstem] - huc_intersection = gpd.GeoDataFrame({'geometry': intersecting_points, 'NHDPlusID': nhdplus_ids},crs=nwm_streams.crs,geometry='geometry') + huc_intersection = gpd.GeoDataFrame({'geometry': intersecting_points, 'NHDPlusID': nhdplus_ids,'mainstem': mainstem_flag},crs=nwm_streams.crs,geometry='geometry') huc_intersection = huc_intersection.drop_duplicates() return huc_intersection - def collect_stream_attributes(args, huc): - print (f"Starting huc: {str(huc)}") - in_dir = args[0] - nwm_dir = args[1] - ahps_dir = args[2] + print ('Starting huc: ' + str(huc)) + nhdplus_vectors_dir = args[0] - print ('Collecting NHDPlus HR attributes') - burnline_filename = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '.gpkg') - vaa_filename = os.path.join(in_dir,huc,'NHDPlusFlowLineVAA' + str(huc) + '.gpkg') - flowline_filename = os.path.join(in_dir,huc,'NHDFlowline' + str(huc) + '.gpkg') + # Collecting NHDPlus HR attributes + burnline_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '.gpkg') + vaa_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusFlowLineVAA' + str(huc) + '.gpkg') + flowline_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDFlowline' + str(huc) + '.gpkg') - if os.path.exists(os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '.gpkg')): + if os.path.exists(os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '.gpkg')): burnline = gpd.read_file(burnline_filename) burnline = burnline[['NHDPlusID','ReachCode','geometry']] @@ -212,118 +209,125 @@ def collect_stream_attributes(args, huc): nhd_streams['HUC4'] = str(huc) # Write out NHDPlus HR aggregated - nhd_streams_agg_fileName = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') + nhd_streams_agg_fileName = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') nhd_streams.to_file(nhd_streams_agg_fileName,driver=getDriver(nhd_streams_agg_fileName),index=False) del nhd_streams - print (f"finished huc: {str(huc)}") + print ('finished huc: ' + str(huc)) else: - print (f"missing data for huc {str(huc)}") + print ('missing data for huc ' + str(huc)) def subset_stream_networks(args, huc): - 
nwm_dir = args[0] - ahps_dir = args[1] + + nwm_headwaters_filename = args[0] + ahps_filename = args[1] wbd4 = args[2] wbd8 = args[3] - in_dir = args[4] - nwm_huc4_intersections_filename = args[5] - print(f"starting HUC {str(huc)}",flush=True) + nhdplus_vectors_dir = args[4] + nwm_huc4_intersect_fr_filename = args[5] + nwm_huc4_intersect_ms_filename = args[6] + + print("starting HUC " + str(huc),flush=True) nwm_headwater_id = 'ID' - nwm_headwaters_filename = os.path.join(nwm_dir,'nwm_headwaters.gpkg') ahps_headwater_id = 'nws_lid' - ahps_headwaters_filename = os.path.join(ahps_dir,'nws_lid.gpkg') - nhd_streams_filename = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') + nhd_streams_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') + # Subset to reduce footprint selected_wbd4 = wbd4.loc[wbd4.HUC4.str.startswith(str(huc))] del wbd4 selected_wbd8 = wbd8.loc[wbd8.HUC8.str.startswith(huc)] del wbd8 + huc_mask = selected_wbd4.loc[selected_wbd4.HUC4.str.startswith(str(huc))] huc_mask = huc_mask.explode() huc_mask = huc_mask.reset_index(drop=True) + if len(selected_wbd8.HUC8) > 0: selected_wbd8 = selected_wbd8.reset_index(drop=True) + # Identify FR/NWM headwaters - nhd_streams_fr = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersections_filename) - nwm_huc8_intersections_fr = find_nwm_incoming_streams(nhd_streams_fr,selected_wbd8,8,in_dir) - nwm_huc8_intersections_fr['intersection'] = True + nhd_streams_fr = identify_headwater_streams(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersect_fr_filename) + # Adjust FR/NWM headwater segments nwm_headwaters = gpd.read_file(nwm_headwaters_filename, mask=huc_mask) + nwm_huc4_intersect_fr = gpd.read_file(nwm_huc4_intersect_fr_filename, mask=huc_mask) + if len(nwm_headwaters) > 0: + adj_nhd_streams_fr, adj_nhd_headwater_points_fr = adjust_headwaters(str(huc),nhd_streams_fr,nwm_headwaters,nwm_headwater_id) - adj_nhd_headwater_points_fr['intersection'] = False - adj_nhd_headwater_points_fr = adj_nhd_headwater_points_fr.append(nwm_huc8_intersections_fr) - nhd_streams_fr_adjusted_fileName=os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') - adj_nhd_headwaters_fr_fileName=os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') + + nhd_streams_fr_adjusted_fileName=os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') + adj_nhd_headwaters_fr_fileName=os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') + # Write out FR adjusted adj_nhd_streams_fr.to_file(nhd_streams_fr_adjusted_fileName,driver=getDriver(nhd_streams_fr_adjusted_fileName),index=False) adj_nhd_headwater_points_fr.to_file(adj_nhd_headwaters_fr_fileName,driver=getDriver(adj_nhd_headwaters_fr_fileName),index=False) + del adj_nhd_streams_fr, adj_nhd_headwater_points_fr else: - print (f"skipping FR headwater adjustments for HUC: {str(huc)}") + print ('skipping FR headwater adjustments for HUC: ' + str(huc)) + del nhd_streams_fr + # Identify MS/AHPs headwaters - nhd_streams_ms = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,ahps_headwaters_filename,ahps_headwater_id,nwm_huc4_intersections_filename,True) - nwm_huc8_intersections_ms = find_nwm_incoming_streams(nhd_streams_ms,selected_wbd8,8,in_dir) - nwm_huc8_intersections_ms['intersection'] = True - 
nwm_huc8_intersections_ms['mainstem'] = True + nhd_streams_ms = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,ahps_filename,ahps_headwater_id,nwm_huc4_intersect_ms_filename) + # Adjust MS/AHPs headwater segments - ahps_headwaters = gpd.read_file(ahps_headwaters_filename, mask=huc_mask) + ahps_headwaters = gpd.read_file(ahps_filename, mask=huc_mask) + if len(ahps_headwaters) > 0: + adj_nhd_streams_ms, adj_nhd_headwater_points_ms = adjust_headwaters(str(huc),nhd_streams_ms,ahps_headwaters,ahps_headwater_id) - nhd_streams_ms_adjusted_fileName=os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') - adj_nhd_headwaters_ms_fileName=os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') + + nhd_streams_ms_adjusted_fileName=os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') + adj_nhd_headwaters_ms_fileName=os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') + # Write out MS adjusted adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False) - adj_nhd_headwater_points_ms['intersection'] = False - ahps_headwaters = ahps_headwaters.drop(['name','nwm_featur'], axis=1, errors='ignore') - ahps_headwaters['NHDPlusID'] = 0 - nwm_huc8_intersections_ms['nws_lid'] = 'FR' - adj_nhd_headwater_points_ms = adj_nhd_headwater_points_ms.append(nwm_huc8_intersections_ms) adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False) + del adj_nhd_streams_ms, adj_nhd_headwater_points_ms + else: - print (f"skipping MS headwater adjustments for HUC: {str(huc)}") + print ('skipping MS headwater adjustments for HUC: ' + str(huc)) del nhd_streams_ms -def aggregate_stream_networks(in_dir,agg_dir, huc_list): +def aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileName,agg_nhd_streams_adj_fileName,huc_list): for huc in huc_list: # FR adjusted - adj_nhd_headwaters_fr_fileName=os.path.join(agg_dir,'nhd_headwaters_adjusted_fr_NEW.gpkg') - nhd_fr_adj_huc_subset = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') - nhd_streams_fr_adjusted_fileName=os.path.join(agg_dir,'NHDPlusBurnLineEvent_fr_adjusted_NEW.gpkg') - nhd_fr_adj_headwaters_subset = os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') + nhd_fr_adj_huc_subset = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') + nhd_fr_adj_headwaters_subset = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') if os.path.isfile(nhd_fr_adj_huc_subset): adj_nhd_streams_fr = gpd.read_file(nhd_fr_adj_huc_subset) # Write out FR adjusted - if os.path.isfile(nhd_streams_fr_adjusted_fileName): - adj_nhd_streams_fr.to_file(nhd_streams_fr_adjusted_fileName,driver=getDriver(nhd_streams_fr_adjusted_fileName),index=False, mode='a') + if os.path.isfile(agg_nhd_streams_adj_fileName): + adj_nhd_streams_fr.to_file(agg_nhd_streams_adj_fileName,driver=getDriver(agg_nhd_streams_adj_fileName),index=False, mode='a') else: - adj_nhd_streams_fr.to_file(nhd_streams_fr_adjusted_fileName,driver=getDriver(nhd_streams_fr_adjusted_fileName),index=False) + adj_nhd_streams_fr.to_file(agg_nhd_streams_adj_fileName,driver=getDriver(agg_nhd_streams_adj_fileName),index=False) + del adj_nhd_streams_fr if os.path.isfile(nhd_fr_adj_headwaters_subset): adj_nhd_headwater_points_fr = 
gpd.read_file(nhd_fr_adj_headwaters_subset) # Write out FR adjusted - if os.path.isfile(adj_nhd_headwaters_fr_fileName): - adj_nhd_headwater_points_fr.to_file(adj_nhd_headwaters_fr_fileName,driver=getDriver(adj_nhd_headwaters_fr_fileName),index=False, mode='a') + if os.path.isfile(agg_nhd_headwaters_adj_fileName): + adj_nhd_headwater_points_fr.to_file(agg_nhd_headwaters_adj_fileName,driver=getDriver(agg_nhd_headwaters_adj_fileName),index=False, mode='a') else: - adj_nhd_headwater_points_fr.to_file(adj_nhd_headwaters_fr_fileName,driver=getDriver(adj_nhd_headwaters_fr_fileName),index=False) + adj_nhd_headwater_points_fr.to_file(agg_nhd_headwaters_adj_fileName,driver=getDriver(agg_nhd_headwaters_adj_fileName),index=False) + del adj_nhd_headwater_points_fr - # MS adjusted - adj_nhd_headwaters_ms_fileName=os.path.join(agg_dir,'nhd_headwaters_adjusted_ms_NEW.gpkg') - nhd_ms_adj_huc_subset = os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') - nhd_streams_ms_adjusted_fileName=os.path.join(agg_dir,'NHDPlusBurnLineEvent_ms_adjusted_NEW.gpkg') - nhd_ms_adj_headwater_subset = os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') + ## MS adjusted + nhd_ms_adj_huc_subset = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') + nhd_ms_adj_headwater_subset = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') if os.path.isfile(nhd_ms_adj_huc_subset): adj_nhd_streams_ms = gpd.read_file(nhd_ms_adj_huc_subset) @@ -348,25 +352,31 @@ def aggregate_stream_networks(in_dir,agg_dir, huc_list): del adj_nhd_headwater_points_ms -def clean_up_intermediate_files(in_dir): - - for huc in os.listdir(in_dir): - - # agg_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') +def clean_up_intermediate_files(nhdplus_vectors_dir): - fr_adj_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') + for huc in os.listdir(nhdplus_vectors_dir): + agg_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') + fr_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr.gpkg') + fr_adj_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') + ms_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms.gpkg') + ms_adj_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') + ms_headwater_adj_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') + fr_headwater_adj_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') + ms_headwater_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_ms.gpkg') + fr_headwater_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_fr.gpkg') - ms_adj_path= os.path.join(in_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') + if os.path.exists(agg_path): + os.remove(agg_path) - ms_headwater_adj_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') - fr_headwater_adj_path= os.path.join(in_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') - - # if os.path.exists(agg_path): - # os.remove(agg_path) + if os.path.exists(fr_path): + os.remove(fr_path) if os.path.exists(fr_adj_path): os.remove(fr_adj_path) + if os.path.exists(ms_path): + os.remove(ms_path) + if os.path.exists(ms_adj_path): 
os.remove(ms_adj_path) @@ -376,49 +386,55 @@ def clean_up_intermediate_files(in_dir): if os.path.exists(fr_headwater_adj_path): os.remove(fr_headwater_adj_path) + if os.path.exists(ms_headwater_path): + os.remove(ms_headwater_path) + + if os.path.exists(fr_headwater_path): + os.remove(fr_headwater_path) + if(__name__=='__main__'): # Generate NWM Headwaters print ('deriving nwm headwater points') - nwm_headwaters = findHeadWaterPoints(nwm_streams_fr_filename) + nwm_headwaters = findHeadWaterPoints(nwm_streams_orig_filename) nwm_headwaters['ID'] = nwm_headwaters.index + 1 nwm_headwaters.to_file(nwm_headwaters_filename,driver=getDriver(nwm_headwaters_filename),index=False) del nwm_headwaters, nwm_streams # Identify NWM MS Streams - identify_nwm_ms_args = (nwm_streams_fr_filename,in_dir,ahps_dir) + identify_nwm_ms_args = (nwm_streams_orig_filename,ahps_filename,nwm_streams_all_filename) print ('identifing nwm ms streams') identify_nwm_ms_streams(identify_nwm_ms_args) # Generate NWM intersection points with WBD4 boundaries print ('deriving NWM fr/ms intersection points') - huc_intersection = find_nwm_incoming_streams(nwm_streams_fr_filename,wbd_filename,4,in_dir) + huc_intersection = find_nwm_incoming_streams(nwm_streams_all_filename,wbd_filename,4) huc_intersection.to_file(nwm_huc4_intersections_filename,driver=getDriver(nwm_huc4_intersections_filename)) - print ('loading wb4') + del huc_intersection + + print ('loading HUC4s') wbd4 = gpd.read_file(wbd_filename, layer='WBDHU4') - print ('loading wb8') + print ('loading HUC8s') wbd8 = gpd.read_file(wbd_filename, layer='WBDHU8') - collect_arg_list = (in_dir,nwm_dir,ahps_dir) - subset_arg_list = (nwm_dir,ahps_dir,wbd4,wbd8,in_dir,nwm_huc4_intersections_filename) - - num_workers = 14 + collect_arg_list = (nhdplus_vectors_dir) + subset_arg_list = (nwm_headwaters_filename,ahps_filename,wbd4,wbd8,nhdplus_vectors_dir,nwm_huc4_intersections_filename) + huc_list = os.listdir(nhdplus_vectors_dir) + num_workers=11 -with ProcessPoolExecutor(max_workers=num_workers) as executor: - # Preprocess NHD HR and add attributes - # collect_attributes = [executor.submit(collect_stream_attributes, collect_arg_list, str(huc)) for huc in os.listdir(in_dir)] - # Subset NHD HR network - subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in os.listdir(in_dir)] + with ProcessPoolExecutor(max_workers=num_workers) as executor: + # Preprocess nhd hr and add attributes + collect_attributes = [executor.submit(collect_stream_attributes, collect_arg_list, str(huc)) for huc in huc_list] + # Subset nhd hr network + subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in huc_list] - # Generate NWM intersection points with WBD8 boundaries using subset_stream_networks - # huc_intersection = find_nwm_incoming_streams(nhd_streams_fr_adjusted_fileName,wbd_filename,8,in_dir) - # huc_intersection.to_file(nwm_huc8_intersections_filename,driver=getDriver(nwm_huc8_intersections_filename)) + del wbd4,wbd8 # Aggregate fr and ms nhd netowrks for entire nwm domain - aggregate_stream_networks(in_dir,agg_dir, os.listdir(in_dir)) + aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileName,agg_nhd_streams_adj_fileName,huc_list) # Remove intermediate files - clean_up_intermediate_files(in_dir) + clean_up_intermediate_files(nhdplus_vectors_dir) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 792811748..c8f490696 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -114,7 +114,7 @@ echo 
-e $startDiv"Rasterize all NLD multilines using zelev vertices"$stopDiv date -u Tstart [ ! -f $outputHucDataDir/nld_rasterized_elev.tif ] && [ -f $outputHucDataDir/nld_subset_levees.gpkg ] && \ -gdal_rasterize -l nld_subset_levees -3d -at -init -9999 -a_nodata $ndv -te $xmin $ymin $xmax $ymax -ts $ncols $nrows -ot Float32 -of GTiff -co "COMPRESS=LZW" -co "BIGTIFF=YES" -co "TILED=YES" $outputHucDataDir/nld_subset_levees.gpkg $outputHucDataDir/nld_rasterized_elev.tif +gdal_rasterize -l nld_subset_levees -3d -at -a_nodata $ndv -te $xmin $ymin $xmax $ymax -ts $ncols $nrows -ot Float32 -of GTiff -co "BLOCKXSIZE=512" -co "BLOCKYSIZE=512" -co "COMPRESS=LZW" -co "BIGTIFF=YES" -co "TILED=YES" $outputHucDataDir/nld_subset_levees.gpkg $outputHucDataDir/nld_rasterized_elev.tif Tcount ## RASTERIZE REACH BOOLEAN (1 & 0) ## @@ -431,14 +431,6 @@ Tstart $srcDir/usgs_gage_crosswalk.py -gages $inputDataDir/usgs_gages/usgs_gages.gpkg -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv -e $extent Tcount - -## USGS CROSSWALK ## -# echo -e $startDiv"USGS Crosswalk $hucNumber"$stopDiv -# date -u -# Tstart -# $srcDir/usgs_gage_crosswalk.py -gages $inputDataDir/usgs_gages/usgs_gages.gpkg -dem $outputHucDataDir/dem_meters.tif -flows $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -cat $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -wbd $outputHucDataDir/wbd_buffered.gpkg -dem_adj $dem_thalwegCond -outtable $outputHucDataDir/usgs_elev_table.csv -# Tcount - ## CLEANUP OUTPUTS ## echo -e $startDiv"Cleaning up outputs $hucNumber"$stopDiv args=() diff --git a/src/utils/shared_variables.py b/src/utils/shared_variables.py index a004a4842..b8f156205 100644 --- a/src/utils/shared_variables.py +++ b/src/utils/shared_variables.py @@ -39,20 +39,23 @@ ## Input Paths and Directories # Directories -src_dir = '/foss_fim/src' -input_dir ='data/inputs' -nhdplus_rasters_dir = os.path.join(input_dir,'nhdplus_rasters') -nhdplus_vectors_dir = os.path.join(input_dir,'nhdplus_vectors') -nwm_hydrofabric_dir = os.path.join(input_dir,'nwm_hydrofabric') -wbd_dir = os.path.join(input_dir,'wbd') -ahps_dir = os.path.join(input_dir,'ahp_sites') -nhdplus_vectors_aggregate_dir = os.path.join(input_dir,'nhdplus_vectors_aggregate') +os.environ['src_dir'] = '/foss_fim/src' +os.environ['input_dir'] = 'data/inputs' + +os.environ['nhdplus_rasters_dir'] = os.path.join(os.environ.get('input_dir'),'nhdplus_rasters') +os.environ['nhdplus_vectors_dir'] = os.path.join(os.environ.get('input_dir'),'nhdplus_vectors') +os.environ['nwm_dir'] = os.path.join(os.environ.get('input_dir'),'nwm_hydrofabric') +os.environ['wbd_dir'] = os.path.join(os.environ.get('input_dir'),'wbd') +os.environ['ahps_dir'] = os.path.join(os.environ.get('input_dir'),'ahp_sites') +os.environ['nhdplus_aggregate_dir'] = os.path.join(os.environ.get('input_dir'),'nhdplus_vectors_aggregate') # File Paths -wbd_filename = os.path.join(wbd_dir, 'WBD_National.gpkg') -nwm_streams_fr_filename = os.path.join(nwm_hydrofabric_dir,'nwm_flows.gpkg') -nwm_streams_ms_filename = os.path.join(nwm_hydrofabric_dir,'nwm_flows_ms.gpkg') -nwm_headwaters_filename = os.path.join(nwm_hydrofabric_dir,'nwm_headwaters.gpkg') -nwm_huc4_intersections_ms_filename = 
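The shared_variables.py hunk above replaces module-level path variables with os.environ entries so that downstream scripts can read them with os.environ.get(). A minimal sketch of that publish/consume pattern, with illustrative paths:

import os

# publish shared paths once (environment values must be strings)
os.environ['input_dir'] = os.environ.get('input_dir', 'data/inputs')
os.environ['nwm_dir'] = os.path.join(os.environ['input_dir'], 'nwm_hydrofabric')
os.environ['nwm_headwaters_filename'] = os.path.join(os.environ['nwm_dir'], 'nwm_headwaters.gpkg')

# ...and consume them elsewhere; .get() returns None instead of raising
nwm_headwaters_filename = os.environ.get('nwm_headwaters_filename')
if nwm_headwaters_filename is None:
    raise RuntimeError('nwm_headwaters_filename is not set; import shared_variables first')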
os.path.join(nwm_hydrofabric_dir,'nwm_ms_huc4_intersections.gpkg') -nwm_huc4_intersections_fr_filename = os.path.join(nwm_hydrofabric_dir,'nwm_fr_huc4_intersections.gpkg') -ahps_headwaters_filename = os.path.join(ahps_dir,'nws_lid.gpkg') +os.environ['wbd_filename'] = os.path.join(os.environ.get('wbd_dir'),'WBD_National.gpkg') +os.environ['nwm_streams_orig_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_flows_original.gpkg') +os.environ['nwm_streams_all_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_flows.gpkg') +os.environ['nwm_headwaters_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_headwaters.gpkg') +os.environ['nwm_huc4_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_huc4_intersections.gpkg') +os.environ['nwm_huc8_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_huc8_intersections.gpkg') +os.environ['ahps_filename'] = os.path.join(os.environ.get('ahps_dir'),'nws_lid.gpkg') +os.environ['agg_nhd_headwaters_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_headwaters_adjusted.gpkg') +os.environ['agg_nhd_streams_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_streams_adj.gpkg') From 780b09c90075f58f8069da82f57c996ad6b7ebcc Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Fri, 23 Apr 2021 03:12:26 +0000 Subject: [PATCH 41/66] consolidating fr and ms input layers --- fim_run.sh | 15 +-- src/add_crosswalk.py | 1 + src/adjust_headwater_streams.py | 202 ++++++++++++++++------------ src/aggregate_vector_inputs.py | 224 ++++++++++++------------------- src/clip_vectors_to_wbd.py | 34 ++--- src/reduce_nhd_stream_density.py | 88 ++++++------ src/run_by_unit.sh | 17 +-- src/utils/shared_variables.py | 4 +- 8 files changed, 276 insertions(+), 309 deletions(-) diff --git a/fim_run.sh b/fim_run.sh index c467d47b0..569606a9a 100755 --- a/fim_run.sh +++ b/fim_run.sh @@ -111,16 +111,11 @@ logFile=$outputRunDataDir/logs/summary.log ## Define inputs export input_WBD_gdb=$inputDataDir/wbd/WBD_National.gpkg -export input_NWM_Lakes=$inputDataDir/nwm_hydrofabric/nwm_lakes.gpkg -export input_NWM_Catchments_fr=$inputDataDir/nwm_hydrofabric/nwm_catchments.gpkg -export input_NWM_Catchments_ms=$inputDataDir/nwm_hydrofabric/nwm_catchments_ms.gpkg -export input_NWM_Flows_fr=$inputDataDir/nwm_hydrofabric/nwm_flows.gpkg -export input_NWM_Flows_ms=$inputDataDir/nwm_hydrofabric/nwm_flows_ms.gpkg -export input_NWM_Headwaters=$inputDataDir/nwm_hydrofabric/nwm_headwaters.gpkg -export input_nhd_flowlines_fr=$inputDataDir/nhdplus_vectors_aggregate/NHDPlusBurnLineEvent_fr_adjusted_NEW.gpkg -export input_nhd_flowlines_ms=$inputDataDir/nhdplus_vectors_aggregate/NHDPlusBurnLineEvent_ms_adjusted_NEW.gpkg -export input_nhd_headwaters_fr=$inputDataDir/nhdplus_vectors_aggregate/nhd_headwaters_adjusted_fr_NEW.gpkg -export input_nhd_headwaters_ms=$inputDataDir/nhdplus_vectors_aggregate/nhd_headwaters_adjusted_ms_NEW.gpkg +export input_nwm_lakes=$inputDataDir/nwm_hydrofabric/nwm_lakes.gpkg +export input_nwm_catchments=$inputDataDir/nwm_hydrofabric/nwm_catchments.gpkg +export input_nwm_flows=$inputDataDir/nwm_hydrofabric/nwm_flows.gpkg +export input_nhd_flowlines=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_streams_adj.gpkg +export input_nhd_headwaters=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_headwaters_adjusted.gpkg ## Input handling ## $srcDir/check_huc_inputs.py -u "$hucList" diff --git a/src/add_crosswalk.py b/src/add_crosswalk.py index 2e7fbccbd..ef21ea23a 100755 --- a/src/add_crosswalk.py +++ 
b/src/add_crosswalk.py @@ -52,6 +52,7 @@ def add_crosswalk(input_catchments_fileName,input_flows_fileName,input_srcbase_f elif extent == 'MS': ## crosswalk using stream segment midpoint method input_nwmcat = gpd.read_file(input_nwmcat_fileName, mask=input_huc) + input_nwmcat = input_nwmcat.loc[input_nwmcat.mainstem==1] input_nwmcat = input_nwmcat.rename(columns={'ID':'feature_id'}) if input_nwmcat.feature_id.dtype != 'int': input_nwmcat.feature_id = input_nwmcat.feature_id.astype(int) input_nwmcat=input_nwmcat.set_index('feature_id') diff --git a/src/adjust_headwater_streams.py b/src/adjust_headwater_streams.py index bc12939bf..7b7d6156d 100644 --- a/src/adjust_headwater_streams.py +++ b/src/adjust_headwater_streams.py @@ -12,112 +12,138 @@ from shapely.wkb import dumps, loads from utils.shared_variables import PREP_PROJECTION from utils.shared_functions import getDriver +import warnings +warnings.simplefilter("ignore") -def adjust_headwaters(huc,nhd_streams,headwaters,headwater_id): +def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id): # Identify true headwater segments - if nhd_streams['headwaters_id'].dtype=='int': - nhd_streams_adj = nhd_streams.loc[(nhd_streams.headwaters_id > 0) & (nhd_streams.downstream_of_headwater == False),:].copy() - if headwaters[headwater_id].dtype != 'int': headwaters[headwater_id] = headwaters[headwater_id].astype(int) - else: - nhd_streams_adj = nhd_streams.loc[(nhd_streams.headwaters_id.notna()) & (nhd_streams.downstream_of_headwater == False),:].copy() - + nhd_streams_adj = nhd_streams.loc[(nhd_streams.headwaters_id > 0) & (nhd_streams.downstream_of_headwater == False),:].copy() nhd_streams_adj = nhd_streams_adj.explode() nhd_streams_adj = nhd_streams_adj.reset_index(drop=True) - headwater_limited = headwaters.merge(nhd_streams_adj["headwaters_id"],left_on=headwater_id, right_on="headwaters_id",how='right') - - headwaterstreams = [] - referencedpoints = [] - - for index, point in headwater_limited.iterrows(): - - # Convert headwaterpoint geometries to WKB representation - wkb_points = dumps(point.geometry) - - # Create pygeos headwaterpoint geometries from WKB representation - pointbin_geom = pygeos.io.from_wkb(wkb_points) - - # Closest segment to headwater - closest_stream = nhd_streams_adj.loc[nhd_streams_adj["headwaters_id"]==point[headwater_id]] + if nwm_headwaters["site_id"].dtype != 'int': nwm_headwaters["site_id"] = nwm_headwaters["site_id"].astype(int) + headwater_limited = nwm_headwaters.merge(nhd_streams_adj[["headwaters_id","mainstem"]],left_on="site_id", right_on="headwaters_id",how='right') + headwater_limited = headwater_limited.drop(columns=['headwaters_id']) + + # Combine NWM headwaters and AHPS sites to be snapped to NHDPlus HR segments + headwater_pts = headwater_limited.append(nws_lids) + + if headwater_pts is not None: + + headwaterstreams = [] + referencedpoints = [] + snapped_ahps = [] + nws_lid = [] + for index, point in headwater_pts.iterrows(): + + # Convert headwaterpoint geometries to WKB representation + wkb_points = dumps(point.geometry) + + # Create pygeos headwaterpoint geometries from WKB representation + pointbin_geom = pygeos.io.from_wkb(wkb_points) + + if point.pt_type == 'nwm_headwater': + # Closest segment to headwater + closest_stream = nhd_streams_adj.loc[nhd_streams_adj["headwaters_id"]==point[headwater_id]] + else: + # Closest segment to ahps site + closest_stream = nhd_streams.loc[nhd_streams["nws_lid"]==point[headwater_id]] + + try: # Seeing inconsistent geometry objects even after 
exploding nhd_streams_adj; not sure why this is + closest_stream =closest_stream.explode() + except: + pass + + try: + wkb_closest_stream = dumps(closest_stream.geometry[0]) + except: + wkb_closest_stream = dumps(closest_stream.geometry[0][0]) + + streambin_geom = pygeos.io.from_wkb(wkb_closest_stream) + + # Linear reference headwater to closest stream segment + pointdistancetoline = pygeos.linear.line_locate_point(streambin_geom, pointbin_geom) + referencedpoint = pygeos.linear.line_interpolate_point(streambin_geom, pointdistancetoline) + + # Convert geometries to wkb representation + bin_referencedpoint = pygeos.io.to_wkb(referencedpoint) + + # Convert to shapely geometries + shply_referencedpoint = loads(bin_referencedpoint) + shply_linestring = loads(wkb_closest_stream) + headpoint = Point(shply_referencedpoint.coords) + + if point.pt_type == 'nwm_headwater': + + cumulative_line = [] + relativedistlst = [] + # Collect all nhd stream segment linestring verticies + for point in zip(*shply_linestring.coords.xy): + cumulative_line = cumulative_line + [point] + relativedist = shply_linestring.project(Point(point)) + relativedistlst = relativedistlst + [relativedist] + + # Add linear referenced headwater point to closest nhd stream segment + if not headpoint in cumulative_line: + cumulative_line = cumulative_line + [headpoint] + relativedist = shply_linestring.project(headpoint) + relativedistlst = relativedistlst + [relativedist] + + # Sort by relative line distance to place headwater point in linestring + sortline = pd.DataFrame({'geom' : cumulative_line, 'dist' : relativedistlst}).sort_values('dist') + shply_linestring = LineString(sortline.geom.tolist()) + referencedpoints = referencedpoints + [headpoint] + + # Split the new linestring at the new headwater point + try: + line1,line2 = split(shply_linestring, headpoint) + headwaterstreams = headwaterstreams + [LineString(line1)] + nhd_streams.loc[nhd_streams.NHDPlusID==closest_stream.NHDPlusID.values[0],'geometry'] = LineString(line1) + except: + line1 = split(shply_linestring, headpoint) + headwaterstreams = headwaterstreams + [LineString(line1[0])] + nhd_streams.loc[nhd_streams.NHDPlusID==closest_stream.NHDPlusID.values[0],'geometry'] = LineString(line1[0]) + else: + snapped_ahps = snapped_ahps + [headpoint] + nws_lid = nws_lid + [point[headwater_id]] + + nhd_streams = nhd_streams.drop(columns=['is_relevant_stream', 'headwaters_id', 'downstream_of_headwater']) - try: # Seeing inconsistent geometry objects even after exploding nhd_streams_adj; not sure why this is - closest_stream =closest_stream.explode() - except: - pass try: - wkb_closest_stream = dumps(closest_stream.geometry[0]) + del nhd_streams_adj, headwaters, headwater_limited, headwaterstreams, referencedpoints, cumulative_line, relativedistlst except: - wkb_closest_stream = dumps(closest_stream.geometry[0][0]) - - streambin_geom = pygeos.io.from_wkb(wkb_closest_stream) - - # Linear reference headwater to closest stream segment - pointdistancetoline = pygeos.linear.line_locate_point(streambin_geom, pointbin_geom) - referencedpoint = pygeos.linear.line_interpolate_point(streambin_geom, pointdistancetoline) - - # Convert geometries to wkb representation - bin_referencedpoint = pygeos.io.to_wkb(referencedpoint) - - # Convert to shapely geometries - shply_referencedpoint = loads(bin_referencedpoint) - shply_linestring = loads(wkb_closest_stream) - headpoint = Point(shply_referencedpoint.coords) - cumulative_line = [] - relativedistlst = [] - - # Collect all nhd stream segment 
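The loop above linearly references each headwater (or AHPS) point onto its closest NHDPlus segment and then splits that segment at the snapped location. A simplified, shapely-only sketch of the same idea follows; the patch does the linear referencing with pygeos for speed, and the geometries here are made up.

from shapely.geometry import Point, LineString
from shapely.ops import split

stream = LineString([(0, 0), (5, 0), (10, 0)])
raw_headwater = Point(4, 2)                  # off-channel headwater point

# snap the point onto the line by linear referencing
distance_along = stream.project(raw_headwater)
snapped = stream.interpolate(distance_along)

# insert the snapped vertex into the coordinate sequence, ordered by distance
# along the line, so split() has an exact vertex to cut at
coords = list(stream.coords) + [tuple(snapped.coords)[0]]
coords = sorted(coords, key=lambda xy: stream.project(Point(xy)))
stream_with_vertex = LineString(coords)

pieces = split(stream_with_vertex, snapped)
upstream_piece = list(pieces.geoms)[0]       # piece kept as the headwater segment
print(snapped, upstream_piece)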
linestring verticies - for point in zip(*shply_linestring.coords.xy): - cumulative_line = cumulative_line + [point] - relativedist = shply_linestring.project(Point(point)) - relativedistlst = relativedistlst + [relativedist] - - # Add linear referenced headwater point to closest nhd stream segment - if not headpoint in cumulative_line: - cumulative_line = cumulative_line + [headpoint] - relativedist = shply_linestring.project(headpoint) - relativedistlst = relativedistlst + [relativedist] - - # Sort by relative line distance to place headwater point in linestring - sortline = pd.DataFrame({'geom' : cumulative_line, 'dist' : relativedistlst}).sort_values('dist') - shply_linestring = LineString(sortline.geom.tolist()) - referencedpoints = referencedpoints + [headpoint] - - # Split the new linestring at the new headwater point - try: + print (f"issue deleting adjusted stream variables for huc {str(huc)}") - line1,line2 = split(shply_linestring, headpoint) - headwaterstreams = headwaterstreams + [LineString(line1)] - nhd_streams.loc[nhd_streams.NHDPlusID==closest_stream.NHDPlusID.values[0],'geometry'] = LineString(line1) - except: - line1 = split(shply_linestring, headpoint) - headwaterstreams = headwaterstreams + [LineString(line1[0])] - nhd_streams.loc[nhd_streams.NHDPlusID==closest_stream.NHDPlusID.values[0],'geometry'] = LineString(line1[0]) + # Create snapped ahps sites + if len(snapped_ahps) > 0: + snapped_ahps_points = gpd.GeoDataFrame({'pt_type': 'nws_lid', headwater_id: nws_lid, 'mainstem': True, + 'geometry': snapped_ahps},geometry='geometry',crs=PREP_PROJECTION) - nhd_streams = nhd_streams.drop(columns=['is_relevant_stream', 'headwaters_id', 'downstream_of_headwater']) + # Identify ajusted nhd headwaters + nhd_headwater_streams_adj = nhd_streams.loc[nhd_streams['is_headwater'],:] + nhd_headwater_streams_adj = nhd_headwater_streams_adj.explode() - try: - del nhd_streams_adj, headwaters, headwater_limited, headwaterstreams, referencedpoints, cumulative_line, relativedistlst - except: - print (f"issue deleting adjusted stream variables for huc {str(huc)}") + hw_points = np.zeros(len(nhd_headwater_streams_adj),dtype=object) + for index,lineString in enumerate(nhd_headwater_streams_adj.geometry): + hw_point = [point for point in zip(*lineString.coords.xy)][-1] + hw_points[index] = Point(*hw_point) - # Identify ajusted nhd headwaters - # print('Identify NHD headwater points',flush=True) - nhd_headwater_streams_adj = nhd_streams.loc[nhd_streams['is_headwater'],:] - nhd_headwater_streams_adj = nhd_headwater_streams_adj.explode() - hw_points = np.zeros(len(nhd_headwater_streams_adj),dtype=object) - for index,lineString in enumerate(nhd_headwater_streams_adj.geometry): - hw_point = [point for point in zip(*lineString.coords.xy)][-1] - hw_points[index] = Point(*hw_point) + nhd_headwater_points_adj = gpd.GeoDataFrame({'pt_type': 'NHDPlusID', headwater_id: nhd_headwater_streams_adj['NHDPlusID'], + 'mainstem': False, 'geometry': hw_points},geometry='geometry',crs=PREP_PROJECTION) - nhd_headwater_points_adj = gpd.GeoDataFrame({'NHDPlusID' : nhd_headwater_streams_adj['NHDPlusID'], - 'geometry' : hw_points},geometry='geometry',crs=PREP_PROJECTION) + nhd_headwater_points_adj = nhd_headwater_points_adj.reset_index(drop=True) - del nhd_headwater_streams_adj + del nhd_headwater_streams_adj + + try: + combined_pts = snapped_ahps_points.append(nhd_headwater_points_adj) + except: + combined_pts = nhd_headwater_points_adj.copy() - return nhd_streams, nhd_headwater_points_adj + return nhd_streams, 
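adjust_headwaters() finishes by taking the last vertex of every headwater flowline as the adjusted headwater point and packaging the result as a GeoDataFrame. A small sketch of that endpoint extraction with made-up geometries and a placeholder CRS (the patch uses PREP_PROJECTION):

import geopandas as gpd
from shapely.geometry import Point, LineString

headwater_streams = gpd.GeoDataFrame(
    {'NHDPlusID': [101, 102]},
    geometry=[LineString([(0, 0), (1, 1)]), LineString([(5, 5), (6, 7)])],
    crs='EPSG:5070')

# last coordinate of each (single-part) linestring becomes the headwater point
endpoints = [Point(list(geom.coords)[-1]) for geom in headwater_streams.geometry]

headwater_points = gpd.GeoDataFrame(
    {'pt_type': 'NHDPlusID',
     'site_id': headwater_streams['NHDPlusID'].values,
     'mainstem': False,
     'geometry': endpoints},
    geometry='geometry', crs=headwater_streams.crs)
print(headwater_points)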
combined_pts if __name__ == '__main__': diff --git a/src/aggregate_vector_inputs.py b/src/aggregate_vector_inputs.py index d610f18a5..60decd906 100755 --- a/src/aggregate_vector_inputs.py +++ b/src/aggregate_vector_inputs.py @@ -15,25 +15,23 @@ import numpy as np from shapely.wkb import dumps, loads import pygeos +from tqdm import tqdm nhdplus_vectors_dir = os.environ.get('nhdplus_vectors_dir') wbd_filename = os.environ.get('wbd_filename') nwm_streams_orig_filename = os.environ.get('nwm_streams_orig_filename') nwm_streams_all_filename = os.environ.get('nwm_streams_all_filename') nwm_headwaters_filename = os.environ.get('nwm_headwaters_filename') +nwm_catchments_orig_filename = os.environ.get('nwm_catchments_orig_filename') +nwm_catchments_all_filename = os.environ.get('nwm_catchments_all_filename') ahps_filename = os.environ.get('ahps_filename') nwm_huc4_intersections_filename = os.environ.get('nwm_huc4_intersections_filename') -nwm_huc8_intersections_filename = os.environ.get('nwm_huc8_intersections_filename') +nhd_huc8_intersections_filename = os.environ.get('nhd_huc8_intersections_filename') agg_nhd_headwaters_adj_fileName = os.environ['agg_nhd_headwaters_adj_fileName'] agg_nhd_streams_adj_fileName = os.environ['agg_nhd_streams_adj_fileName'] - -def identify_nwm_ms_streams(args): - - nwm_streams_filename = args[0] - ahps_filename = args[1] - nwm_streams_all_filename = args[2] +def identify_nwm_ms_streams(nwm_streams_filename,ahps_filename,nwm_streams_all_filename): # Subset nwm network to ms ahps_headwaters = gpd.read_file(ahps_filename) @@ -83,7 +81,9 @@ def identify_nwm_ms_streams(args): nwm_streams = nwm_streams.drop(['is_relevant_stream','is_headwater'], axis=1, errors='ignore') - nwm_streams.to_file(nwm_streams_all_filename,driver=getDriver(nwm_streams_all_filename),index=False) + nwm_streams.to_file(nwm_streams_all_filename,driver=getDriver(nwm_streams_all_filename),index=False,layer='nwm_streams') + + return ms_segments def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): @@ -100,7 +100,7 @@ def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): intersecting_points = [] nhdplus_ids = [] mainstem_flag = [] - for index, row in wbd.iterrows(): + for index, row in tqdm(wbd.iterrows(),total=len(wbd)): col_name = f"HUC{huc_unit}" huc = row[col_name] @@ -128,6 +128,7 @@ def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): for index, segment in nwm_streams_subset.iterrows(): distances = [] + try: nhdplus_id = segment.ID except: @@ -171,17 +172,19 @@ def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): nhdplus_ids = nhdplus_ids + [nhdplus_id] mainstem_flag = mainstem_flag + [mainstem] + del huc_mask + huc_intersection = gpd.GeoDataFrame({'geometry': intersecting_points, 'NHDPlusID': nhdplus_ids,'mainstem': mainstem_flag},crs=nwm_streams.crs,geometry='geometry') huc_intersection = huc_intersection.drop_duplicates() + del nwm_streams,wbd + return huc_intersection -def collect_stream_attributes(args, huc): +def collect_stream_attributes(nhdplus_vectors_dir, huc): print ('Starting huc: ' + str(huc)) - nhdplus_vectors_dir = args[0] - # Collecting NHDPlus HR attributes burnline_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '.gpkg') vaa_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusFlowLineVAA' + str(huc) + '.gpkg') @@ -226,12 +229,13 @@ def subset_stream_networks(args, huc): wbd4 = args[2] wbd8 = args[3] nhdplus_vectors_dir = args[4] - nwm_huc4_intersect_fr_filename = args[5] - nwm_huc4_intersect_ms_filename = args[6] + 
nwm_huc4_intersections_filename = args[5] print("starting HUC " + str(huc),flush=True) nwm_headwater_id = 'ID' ahps_headwater_id = 'nws_lid' + headwater_pts_id = 'site_id' + column_order = ['pt_type', headwater_pts_id, 'mainstem', 'geometry'] nhd_streams_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') # Subset to reduce footprint @@ -247,187 +251,137 @@ def subset_stream_networks(args, huc): if len(selected_wbd8.HUC8) > 0: selected_wbd8 = selected_wbd8.reset_index(drop=True) - # Identify FR/NWM headwaters - nhd_streams_fr = identify_headwater_streams(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersect_fr_filename) + # Identify FR/NWM headwaters and subset HR network + nhd_streams_fr = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersections_filename) + + # Identify nhd mainstem streams + nhd_streams_all = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_fr,ahps_filename,ahps_headwater_id,nwm_huc4_intersections_filename,True) - # Adjust FR/NWM headwater segments + # Identify HUC8 intersection points + nhd_huc8_intersections = find_nwm_incoming_streams(nhd_streams_all,selected_wbd8,8) + nhd_huc8_intersections['pt_type'] = 'nhd_huc8_intersections' + nhd_huc8_intersections = nhd_huc8_intersections.rename(columns={"NHDPlusID": headwater_pts_id}) + nhd_huc8_intersections = nhd_huc8_intersections[column_order] + + # Load nwm headwaters nwm_headwaters = gpd.read_file(nwm_headwaters_filename, mask=huc_mask) - nwm_huc4_intersect_fr = gpd.read_file(nwm_huc4_intersect_fr_filename, mask=huc_mask) + nwm_headwaters['pt_type'] = 'nwm_headwater' + nwm_headwaters = nwm_headwaters.rename(columns={"ID": headwater_pts_id}) + + # Load nws lids + nws_lids = gpd.read_file(ahps_filename, mask=huc_mask) + nws_lids = nws_lids.drop(columns=['name','nwm_featur']) + nws_lids = nws_lids.rename(columns={"nws_lid": headwater_pts_id}) + nws_lids['pt_type'] = 'nws_lid' - if len(nwm_headwaters) > 0: + if (len(nwm_headwaters) > 0) or (len(nws_lids) > 0): + # Adjust FR/NWM headwater segments + adj_nhd_streams_all, adj_nhd_headwater_points = adjust_headwaters(huc,nhd_streams_all,nwm_headwaters,nws_lids,headwater_pts_id) - adj_nhd_streams_fr, adj_nhd_headwater_points_fr = adjust_headwaters(str(huc),nhd_streams_fr,nwm_headwaters,nwm_headwater_id) + adj_nhd_headwater_points = adj_nhd_headwater_points[column_order] + adj_nhd_headwater_points_all = adj_nhd_headwater_points.append(nhd_huc8_intersections) - nhd_streams_fr_adjusted_fileName=os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') - adj_nhd_headwaters_fr_fileName=os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') + adj_nhd_streams_all_fileName = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_adj.gpkg') + adj_nhd_headwaters_all_fileName = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adj.gpkg') # Write out FR adjusted - adj_nhd_streams_fr.to_file(nhd_streams_fr_adjusted_fileName,driver=getDriver(nhd_streams_fr_adjusted_fileName),index=False) - adj_nhd_headwater_points_fr.to_file(adj_nhd_headwaters_fr_fileName,driver=getDriver(adj_nhd_headwaters_fr_fileName),index=False) + adj_nhd_streams_all.to_file(adj_nhd_streams_all_fileName,driver=getDriver(adj_nhd_streams_all_fileName),index=False) + 
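subset_stream_networks() above normalizes the NWM headwater points and the AHPS (nws_lid) points to one schema — pt_type, site_id, mainstem, geometry — before handing them to adjust_headwaters(). A rough sketch of that normalization; the mainstem values here are placeholders (in the patch the flag comes from the merge with the subset network), and the input column names beyond ID and nws_lid are illustrative.

import geopandas as gpd
import pandas as pd

def normalize_points(nwm_headwaters: gpd.GeoDataFrame,
                     nws_lids: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    column_order = ['pt_type', 'site_id', 'mainstem', 'geometry']

    nwm = nwm_headwaters.rename(columns={'ID': 'site_id'}).copy()
    nwm['pt_type'] = 'nwm_headwater'
    nwm['mainstem'] = False   # placeholder; set from the network merge in practice

    lids = nws_lids.rename(columns={'nws_lid': 'site_id'}).copy()
    lids['pt_type'] = 'nws_lid'
    lids['mainstem'] = True   # placeholder

    combined = pd.concat([nwm[column_order], lids[column_order]], ignore_index=True)
    return gpd.GeoDataFrame(combined, geometry='geometry', crs=nwm_headwaters.crs)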
adj_nhd_headwater_points_all.to_file(adj_nhd_headwaters_all_fileName,driver=getDriver(adj_nhd_headwaters_all_fileName),index=False) - del adj_nhd_streams_fr, adj_nhd_headwater_points_fr + del adj_nhd_streams_all, adj_nhd_headwater_points_all else: - print ('skipping FR headwater adjustments for HUC: ' + str(huc)) + print ('skipping headwater adjustments for HUC: ' + str(huc)) del nhd_streams_fr - # Identify MS/AHPs headwaters - nhd_streams_ms = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,ahps_filename,ahps_headwater_id,nwm_huc4_intersect_ms_filename) - - # Adjust MS/AHPs headwater segments - ahps_headwaters = gpd.read_file(ahps_filename, mask=huc_mask) - - if len(ahps_headwaters) > 0: - - adj_nhd_streams_ms, adj_nhd_headwater_points_ms = adjust_headwaters(str(huc),nhd_streams_ms,ahps_headwaters,ahps_headwater_id) - - nhd_streams_ms_adjusted_fileName=os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') - adj_nhd_headwaters_ms_fileName=os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') - - # Write out MS adjusted - adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False) - adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False) - - del adj_nhd_streams_ms, adj_nhd_headwater_points_ms - - else: - print ('skipping MS headwater adjustments for HUC: ' + str(huc)) - del nhd_streams_ms - def aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileName,agg_nhd_streams_adj_fileName,huc_list): for huc in huc_list: # FR adjusted - nhd_fr_adj_huc_subset = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') - nhd_fr_adj_headwaters_subset = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') + nhd_fr_adj_huc_subset = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_adj.gpkg') + nhd_fr_adj_headwaters_subset = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adj.gpkg') if os.path.isfile(nhd_fr_adj_huc_subset): - adj_nhd_streams_fr = gpd.read_file(nhd_fr_adj_huc_subset) + adj_nhd_streams_all = gpd.read_file(nhd_fr_adj_huc_subset) # Write out FR adjusted if os.path.isfile(agg_nhd_streams_adj_fileName): - adj_nhd_streams_fr.to_file(agg_nhd_streams_adj_fileName,driver=getDriver(agg_nhd_streams_adj_fileName),index=False, mode='a') + adj_nhd_streams_all.to_file(agg_nhd_streams_adj_fileName,driver=getDriver(agg_nhd_streams_adj_fileName),index=False, mode='a') else: - adj_nhd_streams_fr.to_file(agg_nhd_streams_adj_fileName,driver=getDriver(agg_nhd_streams_adj_fileName),index=False) + adj_nhd_streams_all.to_file(agg_nhd_streams_adj_fileName,driver=getDriver(agg_nhd_streams_adj_fileName),index=False) - del adj_nhd_streams_fr + del adj_nhd_streams_all if os.path.isfile(nhd_fr_adj_headwaters_subset): - adj_nhd_headwater_points_fr = gpd.read_file(nhd_fr_adj_headwaters_subset) + adj_nhd_headwater_points_all = gpd.read_file(nhd_fr_adj_headwaters_subset) # Write out FR adjusted if os.path.isfile(agg_nhd_headwaters_adj_fileName): - adj_nhd_headwater_points_fr.to_file(agg_nhd_headwaters_adj_fileName,driver=getDriver(agg_nhd_headwaters_adj_fileName),index=False, mode='a') + adj_nhd_headwater_points_all.to_file(agg_nhd_headwaters_adj_fileName,driver=getDriver(agg_nhd_headwaters_adj_fileName),index=False, mode='a') else: - 
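aggregate_stream_networks() grows one national GeoPackage by appending each HUC's adjusted layer with mode='a' once the file exists. A condensed sketch of that append-or-create pattern:

import os
import geopandas as gpd

def append_to_aggregate(per_huc_path, aggregate_path, driver='GPKG'):
    if not os.path.isfile(per_huc_path):
        return  # this HUC produced no adjusted layer
    layer = gpd.read_file(per_huc_path)
    if os.path.isfile(aggregate_path):
        # append to the existing aggregate layer
        layer.to_file(aggregate_path, driver=driver, index=False, mode='a')
    else:
        # first HUC creates the aggregate file
        layer.to_file(aggregate_path, driver=driver, index=False)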
adj_nhd_headwater_points_fr.to_file(agg_nhd_headwaters_adj_fileName,driver=getDriver(agg_nhd_headwaters_adj_fileName),index=False) + adj_nhd_headwater_points_all.to_file(agg_nhd_headwaters_adj_fileName,driver=getDriver(agg_nhd_headwaters_adj_fileName),index=False) del adj_nhd_headwater_points_fr - ## MS adjusted - nhd_ms_adj_huc_subset = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') - nhd_ms_adj_headwater_subset = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') - - if os.path.isfile(nhd_ms_adj_huc_subset): - adj_nhd_streams_ms = gpd.read_file(nhd_ms_adj_huc_subset) - - # Write out ms adjusted - if os.path.isfile(nhd_streams_ms_adjusted_fileName): - adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False, mode='a') - else: - adj_nhd_streams_ms.to_file(nhd_streams_ms_adjusted_fileName,driver=getDriver(nhd_streams_ms_adjusted_fileName),index=False) - - del adj_nhd_streams_ms - - if os.path.isfile(nhd_ms_adj_headwater_subset): - adj_nhd_headwater_points_ms = gpd.read_file(nhd_ms_adj_headwater_subset) - - # Write out ms adjusted - if os.path.isfile(adj_nhd_headwaters_ms_fileName): - adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False, mode='a') - else: - adj_nhd_headwater_points_ms.to_file(adj_nhd_headwaters_ms_fileName,driver=getDriver(adj_nhd_headwaters_ms_fileName),index=False) - - del adj_nhd_headwater_points_ms - def clean_up_intermediate_files(nhdplus_vectors_dir): for huc in os.listdir(nhdplus_vectors_dir): agg_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') - fr_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr.gpkg') - fr_adj_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_fr_adjusted.gpkg') - ms_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms.gpkg') - ms_adj_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_ms_adjusted.gpkg') - ms_headwater_adj_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_ms.gpkg') - fr_headwater_adj_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adjusted_fr.gpkg') - ms_headwater_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_ms.gpkg') - fr_headwater_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_fr.gpkg') + streams_adj_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_adj.gpkg') + headwater_adj_path= os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adj.gpkg') if os.path.exists(agg_path): os.remove(agg_path) - if os.path.exists(fr_path): - os.remove(fr_path) - - if os.path.exists(fr_adj_path): - os.remove(fr_adj_path) + if os.path.exists(streams_adj_path): + os.remove(streams_adj_path) - if os.path.exists(ms_path): - os.remove(ms_path) - - if os.path.exists(ms_adj_path): - os.remove(ms_adj_path) - - if os.path.exists(ms_headwater_adj_path): - os.remove(ms_headwater_adj_path) - - if os.path.exists(fr_headwater_adj_path): - os.remove(fr_headwater_adj_path) - - if os.path.exists(ms_headwater_path): - os.remove(ms_headwater_path) - - if os.path.exists(fr_headwater_path): - os.remove(fr_headwater_path) + if os.path.exists(headwater_adj_path): + os.remove(headwater_adj_path) if(__name__=='__main__'): - 
# Generate NWM Headwaters - print ('deriving nwm headwater points') - nwm_headwaters = findHeadWaterPoints(nwm_streams_orig_filename) - nwm_headwaters['ID'] = nwm_headwaters.index + 1 - nwm_headwaters.to_file(nwm_headwaters_filename,driver=getDriver(nwm_headwaters_filename),index=False) - - del nwm_headwaters, nwm_streams - - # Identify NWM MS Streams - identify_nwm_ms_args = (nwm_streams_orig_filename,ahps_filename,nwm_streams_all_filename) - print ('identifing nwm ms streams') - identify_nwm_ms_streams(identify_nwm_ms_args) - - # Generate NWM intersection points with WBD4 boundaries - print ('deriving NWM fr/ms intersection points') - huc_intersection = find_nwm_incoming_streams(nwm_streams_all_filename,wbd_filename,4) - huc_intersection.to_file(nwm_huc4_intersections_filename,driver=getDriver(nwm_huc4_intersections_filename)) - - del huc_intersection + # # Generate NWM Headwaters + # print ('deriving nwm headwater points') + # nwm_headwaters = findHeadWaterPoints(nwm_streams_orig_filename) + # nwm_headwaters['ID'] = nwm_headwaters.index + 1 + # nwm_headwaters.to_file(nwm_headwaters_filename,driver=getDriver(nwm_headwaters_filename),index=False,layer='nwm_headwaters') + # del nwm_headwaters, nwm_streams + # + # # Identify NWM MS Streams + # print ('identifing nwm ms streams') + # ms_segments = identify_nwm_ms_streams(nwm_streams_orig_filename,ahps_filename,nwm_streams_all_filename) + # + # # Identify NWM MS Catchments + # print ('identifing nwm ms catchments') + # nwm_catchments = gpd.read_file(nwm_catchments_orig_filename) + # # Add column to FR nwm layer to indicate MS segments + # nwm_catchments['mainstem'] = np.where(nwm_catchments.ID.isin(ms_segments), 1, 0) + # nwm_catchments.to_file(nwm_catchments_all_filename,driver=getDriver(nwm_catchments_all_filename),index=False,layer='nwm_catchments') + # del nwm_catchments, ms_segments + + # # Generate NWM intersection points with WBD4 boundaries + # print ('deriving NWM fr/ms intersection points') + # huc4_intersection = find_nwm_incoming_streams(nwm_streams_all_filename,wbd_filename,4) + # huc4_intersection.to_file(nwm_huc4_intersections_filename,driver=getDriver(nwm_huc4_intersections_filename),layer='huc4_intersection') + # del huc4_intersection print ('loading HUC4s') wbd4 = gpd.read_file(wbd_filename, layer='WBDHU4') print ('loading HUC8s') wbd8 = gpd.read_file(wbd_filename, layer='WBDHU8') - collect_arg_list = (nhdplus_vectors_dir) subset_arg_list = (nwm_headwaters_filename,ahps_filename,wbd4,wbd8,nhdplus_vectors_dir,nwm_huc4_intersections_filename) huc_list = os.listdir(nhdplus_vectors_dir) num_workers=11 with ProcessPoolExecutor(max_workers=num_workers) as executor: # Preprocess nhd hr and add attributes - collect_attributes = [executor.submit(collect_stream_attributes, collect_arg_list, str(huc)) for huc in huc_list] + # collect_attributes = [executor.submit(collect_stream_attributes, nhdplus_vectors_dir, str(huc)) for huc in huc_list] # Subset nhd hr network subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in huc_list] diff --git a/src/clip_vectors_to_wbd.py b/src/clip_vectors_to_wbd.py index ca13b5e78..dc19309a2 100755 --- a/src/clip_vectors_to_wbd.py +++ b/src/clip_vectors_to_wbd.py @@ -7,7 +7,7 @@ from shapely.geometry import MultiPolygon,Polygon,Point from utils.shared_functions import getDriver -def 
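The (commented-out) preprocessing block above tags NWM catchments whose ID falls in the mainstem segment list. A standalone sketch of that flagging step, with the file paths passed explicitly rather than read from the environment variables:

import geopandas as gpd
import numpy as np

def flag_mainstem_catchments(catchments_path, ms_segment_ids, out_path):
    nwm_catchments = gpd.read_file(catchments_path)
    # 1 where the catchment ID is a mainstem segment, 0 otherwise
    nwm_catchments['mainstem'] = np.where(nwm_catchments.ID.isin(ms_segment_ids), 1, 0)
    nwm_catchments.to_file(out_path, driver='GPKG', index=False, layer='nwm_catchments')
    return nwm_catchments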
subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,dissolveLinks=False): +def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,extent,dissolveLinks=False): hucUnitLength = len(str(hucCode)) @@ -46,35 +46,22 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l # find intersecting nwm_catchments print("Subsetting NWM Catchments for HUC{} {}".format(hucUnitLength,hucCode),flush=True) nwm_catchments = gpd.read_file(nwm_catchments_filename, mask = wbd_buffer) + if extent == 'MS': + nwm_catchments = nwm_catchments.loc[nwm_catchments.mainstem==1] nwm_catchments.to_file(subset_nwm_catchments_filename,driver=getDriver(subset_nwm_catchments_filename),index=False) del nwm_catchments # subset nhd headwaters print("Subsetting NHD Headwater Points for HUC{} {}".format(hucUnitLength,hucCode),flush=True) nhd_headwaters = gpd.read_file(nhd_headwaters_filename, mask = wbd_buffer) + if extent == 'MS': + nhd_headwaters = nhd_headwaters.loc[nhd_headwaters.mainstem==1] # subset nhd streams print("Querying NHD Streams for HUC{} {}".format(hucUnitLength,hucCode),flush=True) nhd_streams = gpd.read_file(nhd_streams_filename, mask = wbd) - - # identify local headwater stream segments - # nhd_streams_subset = gpd.read_file(nhd_streams_filename, mask = wbd) - # nhd_streams_subset = nhd_streams_subset.loc[~nhd_streams_subset.FromNode.isin(list(set(nhd_streams_subset.ToNode) & set(nhd_streams_subset.FromNode)))] - # nhd_streams_subset = nhd_streams_subset[~nhd_streams_subset['is_headwater']] - - # if not nhd_streams_subset.empty: - # nhd_streams_subset = nhd_streams_subset.reset_index(drop=True) - # start_coords = [] - # NHDPlusIDs = [] - # for index, linestring in enumerate(nhd_streams_subset.geometry): - # start_coords = start_coords + [linestring.coords[-1]] - # NHDPlusIDs = NHDPlusIDs + [nhd_streams_subset.iloc[index].NHDPlusID] - # - # start_geoms = [Point(point) for point in start_coords] - # local_headwaters = gpd.GeoDataFrame({'NHDPlusID': NHDPlusIDs,'geometry': start_geoms}, crs=projection, geometry='geometry') - # nhd_headwaters = nhd_headwaters.append(local_headwaters) - - # nhd_streams = nhd_streams.loc[~nhd_streams.NHDPlusID.isin(NHDPlusIDs)] + if extent == 'MS': + nhd_streams = nhd_streams.loc[nhd_streams.mainstem==1] if len(nhd_streams) > 0: nhd_streams.to_file(subset_nhd_streams_filename,driver=getDriver(subset_nhd_streams_filename),index=False) @@ -92,6 +79,8 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l # subset nwm streams print("Subsetting NWM Streams and deriving headwaters for HUC{} {}".format(hucUnitLength,hucCode),flush=True) nwm_streams = gpd.read_file(nwm_streams_filename, mask = wbd_buffer) + if extent == 'MS': + nwm_streams = nwm_streams.loc[nwm_streams.mainstem==1] 
nwm_streams.to_file(subset_nwm_streams_filename,driver=getDriver(subset_nwm_streams_filename),index=False) del nwm_streams @@ -115,8 +104,10 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l parser.add_argument('-e','--subset-nhd-headwaters',help='NHD headwaters subset',required=True,default=None) parser.add_argument('-b','--subset-nwm-streams',help='NWM streams subset',required=True) parser.add_argument('-x','--subset-landsea',help='LandSea subset',required=True) + parser.add_argument('-extent','--extent',help='FIM extent',required=True) parser.add_argument('-o','--dissolve-links',help='remove multi-line strings',action="store_true",default=False) + args = vars(parser.parse_args()) hucCode = args['hucCode'] @@ -136,6 +127,7 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l subset_nhd_headwaters_filename = args['subset_nhd_headwaters'] subset_nwm_streams_filename = args['subset_nwm_streams'] subset_landsea_filename = args['subset_landsea'] + extent = args['extent'] dissolveLinks = args['dissolve_links'] - subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,dissolveLinks) + subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,extent,dissolveLinks) diff --git a/src/reduce_nhd_stream_density.py b/src/reduce_nhd_stream_density.py index 62b23db1a..2fd9b5143 100644 --- a/src/reduce_nhd_stream_density.py +++ b/src/reduce_nhd_stream_density.py @@ -3,23 +3,27 @@ import geopandas as gpd import pandas as pd import numpy as np -from os.path import splitext -from tqdm import tqdm from collections import deque import argparse import pygeos from shapely.wkb import dumps from utils.shared_functions import getDriver -''' -''' - -def identify_headwater_streams(huc4,huc4_mask,selected_wbd8,nhd_streams_filename,headwaters_filename,headwater_id,nwm_intersections_filename,mainstem_flag=False): +def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_,headwaters_filename,headwater_id,nwm_intersections_filename,mainstem_flag=False): headwater_streams = pd.DataFrame() - nhd_streams = gpd.read_file(nhd_streams_filename) + if mainstem_flag == False: + nhd_streams = gpd.read_file(nhd_streams_) + headwater_col = 'is_headwater' + id_col = 'headwaters_id' + n = -1 + else: + nhd_streams = nhd_streams_.copy() + headwater_col = 'mainstem' + id_col = 'nws_lid' + n = '' # Locate the closest NHDPlus HR stream segment to NWM headwater points. 
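clip_vectors_to_wbd.py now reads each national layer with the buffered WBD polygon as a mask and, when the run is MS extent, keeps only features flagged mainstem==1. A reduced sketch of that pattern for a single layer (function and argument names are illustrative):

import geopandas as gpd

def subset_layer(layer_path, wbd_buffer, extent, out_path):
    # a masked read only loads features intersecting the HUC buffer
    layer = gpd.read_file(layer_path, mask=wbd_buffer)
    if extent == 'MS':
        layer = layer.loc[layer.mainstem == 1]
    if len(layer) > 0:
        layer.to_file(out_path, driver='GPKG', index=False)
    return layer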
Done by HUC8 to reduce processing time and to contain NWM headwater in the same HUC for index, row in selected_wbd8.iterrows(): @@ -27,18 +31,22 @@ def identify_headwater_streams(huc4,huc4_mask,selected_wbd8,nhd_streams_filename # Double check that this is a nested HUC (probably overkill) if huc.startswith(str(huc4)): - huc8_mask = selected_wbd8.loc[selected_wbd8.HUC8.str.startswith(huc)] + + huc8_mask = selected_wbd8.loc[selected_wbd8.HUC8==huc] huc8_mask = huc8_mask.reset_index(drop=True) # Masking headwaters by HUC8 headwaters_mask = gpd.read_file(headwaters_filename, mask = huc8_mask) headwaters_mask = headwaters_mask.reset_index(drop=True) - # Masking subset FR streams by HUC8 - streams_subset = gpd.read_file(nhd_streams_filename, mask = huc8_mask) + # Masking subset streams by HUC8 + if mainstem_flag == False: + streams_subset = gpd.read_file(nhd_streams_, mask = huc8_mask) + else: + streams_subset = nhd_streams.loc[nhd_streams.HUC8==huc].copy() if not streams_subset.empty: - streams_subset.loc[:,'is_headwater'] = False + streams_subset[headwater_col] = False streams_subset = streams_subset.reset_index(drop=True) # Create WKB geometry column @@ -50,16 +58,10 @@ def identify_headwater_streams(huc4,huc4_mask,selected_wbd8,nhd_streams_filename streambin_geom = pygeos.io.from_wkb(streams_subset['b_geom']) # Add HUC8 column - streams_subset.loc[:,'HUC8'] = str(huc) - - # Assign default headwater ID (nwm_headwater_id = int; ahps_headwater_id = str) - if headwaters_mask[headwater_id].dtype=='int': - n = -1 - else: - n = '' + streams_subset['HUC8'] = str(huc) # Add headwaters_id column - streams_subset.loc[:,'headwaters_id'] = n + streams_subset[id_col] = n # Find stream segment closest to headwater point for index, point in headwaters_mask.iterrows(): @@ -77,13 +79,20 @@ def identify_headwater_streams(huc4,huc4_mask,selected_wbd8,nhd_streams_filename min_index = np.argmin(distances) # Closest segment to headwater - streams_subset.loc[min_index,'is_headwater'] = True - streams_subset.loc[min_index,'headwaters_id'] = point[headwater_id] + streams_subset.loc[min_index,headwater_col] = True + streams_subset.loc[min_index,id_col] = point[headwater_id] - headwater_streams = headwater_streams.append(streams_subset[['NHDPlusID','is_headwater','headwaters_id','HUC8']]) + headwater_streams = headwater_streams.append(streams_subset[['NHDPlusID',headwater_col,id_col,'HUC8']]) - headwater_streams = headwater_streams.sort_values('is_headwater', ascending=False).drop_duplicates('NHDPlusID') # keeps headwater=True for conflicting duplicates - nhd_streams = nhd_streams.merge(headwater_streams,on='NHDPlusID',how='inner') + headwater_streams = headwater_streams.sort_values(headwater_col, ascending=False).drop_duplicates('NHDPlusID') # keeps headwater=True for conflicting duplicates + + if mainstem_flag == False: + nhd_streams = nhd_streams.merge(headwater_streams,on='NHDPlusID',how='inner') + else: + headwater_streams = headwater_streams.drop(columns=['HUC8']) + nhd_streams = nhd_streams.merge(headwater_streams,on='NHDPlusID',how='outer') + nhd_streams[id_col] = nhd_streams[id_col].fillna(n) + nhd_streams[headwater_col] = nhd_streams[headwater_col].fillna(0) del selected_wbd8, streams_subset, headwater_streams @@ -92,11 +101,11 @@ def identify_headwater_streams(huc4,huc4_mask,selected_wbd8,nhd_streams_filename # Identify inflowing streams nwm_intersections = gpd.read_file(nwm_intersections_filename, mask=huc4_mask_buffer) - if mainstem_flag == True: - nwm_intersections = 
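subset_nhd_network() marks, for each headwater point, the single closest stream segment in the HUC8 by measuring point-to-segment distances and taking the argmin. A shapely-based sketch of that step with toy geometries; the patch computes the distances with pygeos over a WKB column for speed.

import numpy as np
import geopandas as gpd
from shapely.geometry import Point, LineString

streams = gpd.GeoDataFrame(
    {'NHDPlusID': [1, 2, 3]},
    geometry=[LineString([(0, 0), (10, 0)]),
              LineString([(0, 5), (10, 5)]),
              LineString([(0, 9), (10, 9)])])
streams['is_headwater'] = False
streams['headwaters_id'] = -1

headwaters = gpd.GeoDataFrame({'site_id': [501]}, geometry=[Point(3, 6)])

for _, point in headwaters.iterrows():
    distances = streams.geometry.distance(point.geometry).values
    min_index = int(np.argmin(distances))
    streams.loc[min_index, 'is_headwater'] = True
    streams.loc[min_index, 'headwaters_id'] = point['site_id']

print(streams[['NHDPlusID', 'is_headwater', 'headwaters_id']])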
nwm_intersections.loc[nwm_intersections.mainstem==True] - nhd_streams['mainstem'] = True + if mainstem_flag == False: + nhd_streams['downstream_of_headwater'] = False + else: + nwm_intersections = nwm_intersections.loc[nwm_intersections.mainstem==1] - nhd_streams['downstream_of_headwater'] = False nhd_streams = nhd_streams.explode() nhd_streams = nhd_streams.reset_index(drop=True) @@ -110,23 +119,23 @@ def identify_headwater_streams(huc4,huc4_mask,selected_wbd8,nhd_streams_filename min_index = np.argmin(distances) # Update attributes for incoming stream - nhd_streams.loc[min_index,'is_headwater'] = True - nhd_streams.loc[min_index,'downstream_of_headwater'] = True + nhd_streams.loc[min_index,headwater_col] = True - # Subset NHDPlus HR - nhd_streams['is_relevant_stream'] = nhd_streams['is_headwater'].copy() + if mainstem_flag == False: + nhd_streams.loc[min_index,'downstream_of_headwater'] = True + nhd_streams['is_relevant_stream'] = nhd_streams[headwater_col].copy() # Trace down from headwaters nhd_streams.set_index('NHDPlusID',inplace=True,drop=False) - nhd_streams = get_downstream_segments(nhd_streams, 'is_headwater') + nhd_streams = get_downstream_segments(nhd_streams,headwater_col,mainstem_flag) nhd_streams = nhd_streams.loc[nhd_streams['is_relevant_stream'],:] nhd_streams.reset_index(drop=True,inplace=True) return nhd_streams -def get_downstream_segments(streams, attribute): +def get_downstream_segments(streams, attribute,mainstem_flag): Q = deque(streams.loc[streams[attribute],'NHDPlusID'].tolist()) visited = set() @@ -152,8 +161,11 @@ def get_downstream_segments(streams, attribute): else: relevant_ids = downstream_ids - streams.loc[relevant_ids,'is_relevant_stream'] = True - streams.loc[relevant_ids,'downstream_of_headwater'] = True + if mainstem_flag == False: + streams.loc[relevant_ids,'is_relevant_stream'] = True + streams.loc[relevant_ids,'downstream_of_headwater'] = True + else: + streams.loc[relevant_ids,'mainstem'] = True for i in relevant_ids: if i not in visited: @@ -172,7 +184,6 @@ def get_downstream_segments(streams, attribute): parser.add_argument('-s','--subset-nhd-streams-fileName',help='Output streams layer name',required=False,type=str,default=None) parser.add_argument('-i','--headwater-id',help='Headwater points ID column',required=True) parser.add_argument('-i','--nwm-intersections-filename',help='NWM HUC4 intersection points',required=True) - parser.add_argument('-ms','--mainstem-flag',help='flag for mainstem network',required=False,default=False) args = vars(parser.parse_args()) @@ -184,9 +195,8 @@ def get_downstream_segments(streams, attribute): subset_nhd_streams_fileName = args['subset_nhd_streams_fileName'] headwater_id = args['headwater_id'] nwm_intersections_filename = args['nwm_intersections_filename'] - mainstem_flag = args['mainstem_flag'] - subset_streams_gdf = subset_nhd_network(huc_number,huc4_mask,selected_wbd8,nhd_streams,headwaters_filename,headwater_id,nwm_intersections_filename,mainstem_flag) + subset_streams_gdf = subset_nhd_network(huc_number,huc4_mask,selected_wbd8,nhd_streams,headwaters_filename,headwater_id,nwm_intersections_filename) if subset_nhd_streams_fileName is not None: subset_streams_gdf.to_file(subset_nhd_streams_fileName,driver=getDriver(subset_nhd_streams_fileName),index=False) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index c8f490696..0c5e65cf5 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -43,19 +43,6 @@ else input_LANDSEA=$inputDataDir/landsea/water_polygons_us.gpkg fi -# Define streams and headwaters 
based on extent # -if [ "$extent" = "MS" ]; then - input_nhd_flowlines=$input_nhd_flowlines_ms - input_nhd_headwaters=$input_nhd_headwaters_ms - input_NWM_Flows=$input_NWM_Flows_ms - input_NWM_Catchments=$input_NWM_Catchments_ms -else - input_nhd_flowlines=$input_nhd_flowlines_fr - input_nhd_headwaters=$input_nhd_headwaters_fr - input_NWM_Flows=$input_NWM_Flows_fr - input_NWM_Catchments=$input_NWM_Catchments_fr -fi - ## GET WBD ## echo -e $startDiv"Get WBD $hucNumber"$stopDiv date -u @@ -77,7 +64,7 @@ echo -e $startDiv"Get Vector Layers and Subset $hucNumber"$stopDiv date -u Tstart [ ! -f $outputHucDataDir/NHDPlusBurnLineEvent_subset.gpkg ] && \ -$srcDir/clip_vectors_to_wbd.py -d $hucNumber -w $input_NWM_Flows -s $input_nhd_flowlines -l $input_NWM_Lakes -r $input_NLD -g $outputHucDataDir/wbd.gpkg -f $outputHucDataDir/wbd_buffered.gpkg -m $input_NWM_Catchments -y $input_nhd_headwaters -v $input_LANDSEA -c $outputHucDataDir/NHDPlusBurnLineEvent_subset.gpkg -z $outputHucDataDir/nld_subset_levees.gpkg -a $outputHucDataDir/nwm_lakes_proj_subset.gpkg -n $outputHucDataDir/nwm_catchments_proj_subset.gpkg -e $outputHucDataDir/nhd_headwater_points_subset.gpkg -b $outputHucDataDir/nwm_subset_streams.gpkg -x $outputHucDataDir/LandSea_subset.gpkg +$srcDir/clip_vectors_to_wbd.py -d $hucNumber -w $input_nwm_flows -s $input_nhd_flowlines -l $input_nwm_lakes -r $input_NLD -g $outputHucDataDir/wbd.gpkg -f $outputHucDataDir/wbd_buffered.gpkg -m $input_nwm_catchments -y $input_nhd_headwaters -v $input_LANDSEA -c $outputHucDataDir/NHDPlusBurnLineEvent_subset.gpkg -z $outputHucDataDir/nld_subset_levees.gpkg -a $outputHucDataDir/nwm_lakes_proj_subset.gpkg -n $outputHucDataDir/nwm_catchments_proj_subset.gpkg -e $outputHucDataDir/nhd_headwater_points_subset.gpkg -b $outputHucDataDir/nwm_subset_streams.gpkg -x $outputHucDataDir/LandSea_subset.gpkg -extent $extent Tcount if [ "$extent" = "MS" ]; then @@ -421,7 +408,7 @@ echo -e $startDiv"Finalize catchments and model streams $hucNumber"$stopDiv outp date -u Tstart [ ! 
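get_downstream_segments() walks downstream from every flagged segment with a deque-based breadth-first traversal, marking everything it reaches as relevant (or as mainstem, when the mainstem flag is set). The topology lookup is not shown in the hunks above, so the FromNode/ToNode join in this self-contained sketch is an assumption about how downstream neighbors are found.

from collections import deque
import pandas as pd

# toy topology: 1 -> 2 -> 4, 3 -> 4, 4 -> 5
streams = pd.DataFrame({
    'NHDPlusID':    [1, 2, 3, 4, 5],
    'FromNode':     [10, 11, 12, 13, 14],
    'ToNode':       [11, 13, 13, 14, 15],
    'is_headwater': [True, False, False, False, False],
}).set_index('NHDPlusID', drop=False)
streams['is_relevant_stream'] = streams['is_headwater']

Q = deque(streams.loc[streams['is_headwater'], 'NHDPlusID'].tolist())
visited = set()
while Q:
    q = Q.popleft()
    if q in visited:
        continue
    visited.add(q)
    to_node = streams.loc[q, 'ToNode']
    # segments whose FromNode equals this segment's ToNode are downstream
    downstream_ids = streams.loc[streams['FromNode'] == to_node, 'NHDPlusID'].tolist()
    streams.loc[downstream_ids, 'is_relevant_stream'] = True
    Q.extend(i for i in downstream_ids if i not in visited)

print(streams[['NHDPlusID', 'is_relevant_stream']])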
-f $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg ] && \ -$srcDir/add_crosswalk.py -d $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes.gpkg -a $outputHucDataDir/demDerived_reaches_split_filtered.gpkg -s $outputHucDataDir/src_base.csv -u $inputDataDir/bathymetry/BANKFULL_CONUS.txt -v $outputHucDataDir/bathy_crosswalk_calcs.csv -e $outputHucDataDir/bathy_stream_order_calcs.csv -g $outputHucDataDir/bathy_thalweg_flag.csv -i $outputHucDataDir/bathy_xs_area_hydroid_lookup.csv -l $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -f $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -r $outputHucDataDir/src_full_crosswalked.csv -j $outputHucDataDir/src.json -x $outputHucDataDir/crosswalk_table.csv -t $outputHucDataDir/hydroTable.csv -w $outputHucDataDir/wbd8_clp.gpkg -b $outputHucDataDir/nwm_subset_streams.gpkg -y $outputHucDataDir/nwm_catchments_proj_subset.tif -m $manning_n -z $input_NWM_Catchments -p $extent -k $outputHucDataDir/small_segments.csv +$srcDir/add_crosswalk.py -d $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes.gpkg -a $outputHucDataDir/demDerived_reaches_split_filtered.gpkg -s $outputHucDataDir/src_base.csv -u $inputDataDir/bathymetry/BANKFULL_CONUS.txt -v $outputHucDataDir/bathy_crosswalk_calcs.csv -e $outputHucDataDir/bathy_stream_order_calcs.csv -g $outputHucDataDir/bathy_thalweg_flag.csv -i $outputHucDataDir/bathy_xs_area_hydroid_lookup.csv -l $outputHucDataDir/gw_catchments_reaches_filtered_addedAttributes_crosswalked.gpkg -f $outputHucDataDir/demDerived_reaches_split_filtered_addedAttributes_crosswalked.gpkg -r $outputHucDataDir/src_full_crosswalked.csv -j $outputHucDataDir/src.json -x $outputHucDataDir/crosswalk_table.csv -t $outputHucDataDir/hydroTable.csv -w $outputHucDataDir/wbd8_clp.gpkg -b $outputHucDataDir/nwm_subset_streams.gpkg -y $outputHucDataDir/nwm_catchments_proj_subset.tif -m $manning_n -z $input_nwm_catchments -p $extent -k $outputHucDataDir/small_segments.csv Tcount ## USGS CROSSWALK ## diff --git a/src/utils/shared_variables.py b/src/utils/shared_variables.py index b8f156205..6f28e7180 100644 --- a/src/utils/shared_variables.py +++ b/src/utils/shared_variables.py @@ -55,7 +55,9 @@ os.environ['nwm_streams_all_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_flows.gpkg') os.environ['nwm_headwaters_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_headwaters.gpkg') os.environ['nwm_huc4_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_huc4_intersections.gpkg') -os.environ['nwm_huc8_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_huc8_intersections.gpkg') +os.environ['nhd_huc8_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nhd_huc8_intersections.gpkg') os.environ['ahps_filename'] = os.path.join(os.environ.get('ahps_dir'),'nws_lid.gpkg') os.environ['agg_nhd_headwaters_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_headwaters_adjusted.gpkg') os.environ['agg_nhd_streams_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_streams_adj.gpkg') +os.environ['nwm_catchments_orig_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_catchments_original.gpkg') +os.environ['nwm_catchments_all_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_catchments.gpkg') From def4944eb588605600f328ea3fb22d5684bb8cc5 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Sun, 25 Apr 2021 23:55:59 +0000 Subject: 
[PATCH 42/66] handling case where two nws lids exist near a single stream segment --- fim_run.sh | 2 +- src/adjust_headwater_streams.py | 43 +++++++++++++++++--------- src/aggregate_vector_inputs.py | 53 +++++++++++++++++++------------- src/reduce_nhd_stream_density.py | 6 ++-- src/utils/shared_variables.py | 2 +- 5 files changed, 66 insertions(+), 40 deletions(-) diff --git a/fim_run.sh b/fim_run.sh index 569606a9a..2cfc744e2 100755 --- a/fim_run.sh +++ b/fim_run.sh @@ -115,7 +115,7 @@ export input_nwm_lakes=$inputDataDir/nwm_hydrofabric/nwm_lakes.gpkg export input_nwm_catchments=$inputDataDir/nwm_hydrofabric/nwm_catchments.gpkg export input_nwm_flows=$inputDataDir/nwm_hydrofabric/nwm_flows.gpkg export input_nhd_flowlines=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_streams_adj.gpkg -export input_nhd_headwaters=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_headwaters_adjusted.gpkg +export input_nhd_headwaters=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_headwaters_adj.gpkg ## Input handling ## $srcDir/check_huc_inputs.py -u "$hucList" diff --git a/src/adjust_headwater_streams.py b/src/adjust_headwater_streams.py index 7b7d6156d..66ec29d84 100644 --- a/src/adjust_headwater_streams.py +++ b/src/adjust_headwater_streams.py @@ -3,8 +3,6 @@ import geopandas as gpd import pandas as pd import numpy as np -from os.path import splitext -from tqdm import tqdm import argparse import pygeos from shapely.geometry import Point,LineString @@ -15,6 +13,7 @@ import warnings warnings.simplefilter("ignore") + def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id): # Identify true headwater segments @@ -26,8 +25,18 @@ def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id): headwater_limited = nwm_headwaters.merge(nhd_streams_adj[["headwaters_id","mainstem"]],left_on="site_id", right_on="headwaters_id",how='right') headwater_limited = headwater_limited.drop(columns=['headwaters_id']) + nws_lid_limited = nws_lids.merge(nhd_streams[["nws_lid"]],left_on="site_id", right_on="nws_lid",how='right') + nws_lid_limited = nws_lid_limited.loc[nws_lid_limited.nws_lid!=''] + nws_lid_limited = nws_lid_limited.drop(columns=['nws_lid']) + + # Check for issues in nws_lid layer + if len(nws_lid_limited) < len(nws_lids): + missing_nws_lids = list(set(nws_lids.site_id) - set(nws_lid_limited.site_id)) + print (f"nws lid(s) {missing_nws_lids} missing from aggregare dataset") + # Combine NWM headwaters and AHPS sites to be snapped to NHDPlus HR segments - headwater_pts = headwater_limited.append(nws_lids) + headwater_pts = headwater_limited.append(nws_lid_limited) + headwater_pts = headwater_pts.reset_index(drop=True) if headwater_pts is not None: @@ -75,9 +84,9 @@ def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id): headpoint = Point(shply_referencedpoint.coords) if point.pt_type == 'nwm_headwater': - cumulative_line = [] relativedistlst = [] + # Collect all nhd stream segment linestring verticies for point in zip(*shply_linestring.coords.xy): cumulative_line = cumulative_line + [point] @@ -104,6 +113,12 @@ def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id): line1 = split(shply_linestring, headpoint) headwaterstreams = headwaterstreams + [LineString(line1[0])] nhd_streams.loc[nhd_streams.NHDPlusID==closest_stream.NHDPlusID.values[0],'geometry'] = LineString(line1[0]) + + try: + del cumulative_line, relativedistlst + except: + print (f"issue deleting adjusted stream variables for huc {huc}") + else: snapped_ahps = snapped_ahps + 
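PATCH 42 guards against AHPS lids that never matched a segment in the aggregated network: it merges the lid points against the streams' nws_lid column and reports the set difference. A compact sketch of that check; it simplifies the patch (which uses a right merge and filters empty nws_lid values), and the function name is illustrative.

import pandas as pd

def report_missing_lids(nws_lids: pd.DataFrame, nhd_streams: pd.DataFrame):
    # keep only lids that actually occur on a stream segment
    matched = nws_lids.merge(nhd_streams[['nws_lid']],
                             left_on='site_id', right_on='nws_lid', how='inner')
    missing = sorted(set(nws_lids['site_id']) - set(matched['site_id']))
    if missing:
        print(f"nws lid(s) {missing} missing from aggregate dataset")
    return matched.drop(columns=['nws_lid'])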
[headpoint] nws_lid = nws_lid + [point[headwater_id]] @@ -111,10 +126,9 @@ def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id): nhd_streams = nhd_streams.drop(columns=['is_relevant_stream', 'headwaters_id', 'downstream_of_headwater']) try: - del nhd_streams_adj, headwaters, headwater_limited, headwaterstreams, referencedpoints, cumulative_line, relativedistlst + del nhd_streams_adj, headwater_limited, referencedpoints, headwaterstreams except: - print (f"issue deleting adjusted stream variables for huc {str(huc)}") - + print (f"issue deleting adjusted stream variables for huc {huc}") # Create snapped ahps sites if len(snapped_ahps) > 0: @@ -150,21 +164,22 @@ def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id): parser = argparse.ArgumentParser(description='adjust headwater stream geometery based on headwater start points') parser.add_argument('-f','--huc',help='huc number',required=True) parser.add_argument('-l','--nhd-streams',help='NHDPlus HR geodataframe',required=True) - parser.add_argument('-p','--headwaters',help='Headwater points layer',required=True,type=str) + parser.add_argument('-p','--nwm-headwaters',help='Headwater points layer',required=True,type=str) parser.add_argument('-s','--subset-nhd-streams-fileName',help='Output streams layer name',required=False,type=str,default=None) - parser.add_argument('-s','--adj-headwater-points-fileName',help='Output adj headwater points layer name',required=False,type=str,default=None) + parser.add_argument('-a','--adj-headwater-points-fileName',help='Output adj headwater points layer name',required=False,type=str,default=None) parser.add_argument('-g','--headwater-points-fileName',help='Output headwater points layer name',required=False,type=str,default=None) - parser.add_argument('-i','--headwater-id',help='Output headwaters points',required=True) + parser.add_argument('-b','--nws-lids',help='NWS lid points',required=True) + parser.add_argument('-i','--headwater-id',help='Headwater id column name',required=True) args = vars(parser.parse_args()) - adj_streams_gdf,adj_headwaters_gdf = adjust_headwaters(huc,nhd_streams,headwaters,headwater_id) + adj_streams_gdf, adj_headwaters_gdf = adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id) if subset_nhd_streams_fileName is not None: - adj_streams_gdf.to_file(args['subset_nhd_streams_fileName'],driver=getDriver(args['subset_nhd_streams_fileName']),index=False) + adj_streams_gdf.to_file(args['subset_nhd_streams_fileName'],driver=getDriver(args['subset_nhd_streams_fileName'])) if headwater_points_fileName is not None: - headwater_points_fileName.to_file(args['headwater_points_fileName'],driver=getDriver(args['headwater_points_fileName']),index=False) + headwater_points_fileName.to_file(args['headwater_points_fileName'],driver=getDriver(args['headwater_points_fileName'])) if adj_headwater_points_fileName is not None: - adj_headwaters_gdf.to_file(args['adj_headwater_points_fileName'],driver=getDriver(args['adj_headwater_points_fileName']),index=False) + adj_headwaters_gdf.to_file(args['adj_headwater_points_fileName'],driver=getDriver(args['adj_headwater_points_fileName'])) diff --git a/src/aggregate_vector_inputs.py b/src/aggregate_vector_inputs.py index 60decd906..2c1081989 100755 --- a/src/aggregate_vector_inputs.py +++ b/src/aggregate_vector_inputs.py @@ -2,7 +2,7 @@ import os import sys -sys.path.append('/foss_fim/src') +# sys.path.append('/foss_fim/src') import geopandas as gpd from utils.shared_variables import PREP_PROJECTION from 
utils.shared_functions import getDriver @@ -15,7 +15,6 @@ import numpy as np from shapely.wkb import dumps, loads import pygeos -from tqdm import tqdm nhdplus_vectors_dir = os.environ.get('nhdplus_vectors_dir') wbd_filename = os.environ.get('wbd_filename') @@ -100,7 +99,8 @@ def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): intersecting_points = [] nhdplus_ids = [] mainstem_flag = [] - for index, row in tqdm(wbd.iterrows(),total=len(wbd)): + print (f"iterating through {len(wbd)} hucs") + for index, row in wbd.iterrows(): col_name = f"HUC{huc_unit}" huc = row[col_name] @@ -126,7 +126,6 @@ def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): nwm_streams_subset = nwm_streams_subset.reset_index(drop=True) for index, segment in nwm_streams_subset.iterrows(): - distances = [] try: @@ -176,7 +175,6 @@ def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): huc_intersection = gpd.GeoDataFrame({'geometry': intersecting_points, 'NHDPlusID': nhdplus_ids,'mainstem': mainstem_flag},crs=nwm_streams.crs,geometry='geometry') huc_intersection = huc_intersection.drop_duplicates() - del nwm_streams,wbd return huc_intersection @@ -239,12 +237,12 @@ def subset_stream_networks(args, huc): nhd_streams_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_agg.gpkg') # Subset to reduce footprint - selected_wbd4 = wbd4.loc[wbd4.HUC4.str.startswith(str(huc))] + selected_wbd4 = wbd4.loc[wbd4.HUC4.str.startswith(huc)] del wbd4 selected_wbd8 = wbd8.loc[wbd8.HUC8.str.startswith(huc)] del wbd8 - huc_mask = selected_wbd4.loc[selected_wbd4.HUC4.str.startswith(str(huc))] + huc_mask = selected_wbd4.loc[selected_wbd4.HUC4.str.startswith(huc)] huc_mask = huc_mask.explode() huc_mask = huc_mask.reset_index(drop=True) @@ -259,9 +257,6 @@ def subset_stream_networks(args, huc): # Identify HUC8 intersection points nhd_huc8_intersections = find_nwm_incoming_streams(nhd_streams_all,selected_wbd8,8) - nhd_huc8_intersections['pt_type'] = 'nhd_huc8_intersections' - nhd_huc8_intersections = nhd_huc8_intersections.rename(columns={"NHDPlusID": headwater_pts_id}) - nhd_huc8_intersections = nhd_huc8_intersections[column_order] # Load nwm headwaters nwm_headwaters = gpd.read_file(nwm_headwaters_filename, mask=huc_mask) @@ -273,13 +268,19 @@ def subset_stream_networks(args, huc): nws_lids = nws_lids.drop(columns=['name','nwm_featur']) nws_lids = nws_lids.rename(columns={"nws_lid": headwater_pts_id}) nws_lids['pt_type'] = 'nws_lid' + nws_lids['mainstem'] = True if (len(nwm_headwaters) > 0) or (len(nws_lids) > 0): # Adjust FR/NWM headwater segments adj_nhd_streams_all, adj_nhd_headwater_points = adjust_headwaters(huc,nhd_streams_all,nwm_headwaters,nws_lids,headwater_pts_id) adj_nhd_headwater_points = adj_nhd_headwater_points[column_order] + + nhd_huc8_intersections['pt_type'] = 'nhd_huc8_intersections' + nhd_huc8_intersections = nhd_huc8_intersections.rename(columns={"NHDPlusID": headwater_pts_id}) + nhd_huc8_intersections = nhd_huc8_intersections[column_order] adj_nhd_headwater_points_all = adj_nhd_headwater_points.append(nhd_huc8_intersections) + adj_nhd_headwater_points_all = adj_nhd_headwater_points_all.reset_index(drop=True) adj_nhd_streams_all_fileName = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_adj.gpkg') adj_nhd_headwaters_all_fileName = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adj.gpkg') @@ -290,7 +291,7 @@ def subset_stream_networks(args, huc): del adj_nhd_streams_all, adj_nhd_headwater_points_all else: - print ('skipping 
headwater adjustments for HUC: ' + str(huc)) + print (f"skipping headwater adjustments for HUC: {huc}") del nhd_streams_fr @@ -299,12 +300,12 @@ def aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileNam for huc in huc_list: - # FR adjusted - nhd_fr_adj_huc_subset = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_adj.gpkg') - nhd_fr_adj_headwaters_subset = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adj.gpkg') + # aggregated final filenames + nhd_agg_adj_huc_subset = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_adj.gpkg') + nhd_agg_adj_headwaters_subset = os.path.join(nhdplus_vectors_dir,huc,'nhd' + str(huc) + '_headwaters_adj.gpkg') - if os.path.isfile(nhd_fr_adj_huc_subset): - adj_nhd_streams_all = gpd.read_file(nhd_fr_adj_huc_subset) + if os.path.isfile(nhd_agg_adj_huc_subset): + adj_nhd_streams_all = gpd.read_file(nhd_agg_adj_huc_subset) # Write out FR adjusted if os.path.isfile(agg_nhd_streams_adj_fileName): @@ -314,8 +315,8 @@ def aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileNam del adj_nhd_streams_all - if os.path.isfile(nhd_fr_adj_headwaters_subset): - adj_nhd_headwater_points_all = gpd.read_file(nhd_fr_adj_headwaters_subset) + if os.path.isfile(nhd_agg_adj_headwaters_subset): + adj_nhd_headwater_points_all = gpd.read_file(nhd_agg_adj_headwaters_subset) # Write out FR adjusted if os.path.isfile(agg_nhd_headwaters_adj_fileName): @@ -323,7 +324,7 @@ def aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileNam else: adj_nhd_headwater_points_all.to_file(agg_nhd_headwaters_adj_fileName,driver=getDriver(agg_nhd_headwaters_adj_fileName),index=False) - del adj_nhd_headwater_points_fr + del adj_nhd_headwater_points_all def clean_up_intermediate_files(nhdplus_vectors_dir): @@ -377,18 +378,26 @@ def clean_up_intermediate_files(nhdplus_vectors_dir): subset_arg_list = (nwm_headwaters_filename,ahps_filename,wbd4,wbd8,nhdplus_vectors_dir,nwm_huc4_intersections_filename) huc_list = os.listdir(nhdplus_vectors_dir) + + missing_subsets = [] + for huc in os.listdir(nhdplus_vectors_dir): + streams_adj_path= os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '_adj.gpkg') + if not os.path.isfile(streams_adj_path): + missing_subsets = missing_subsets + [huc] + + print (f"running subset_results on {len(missing_subsets)} HUC4s") num_workers=11 with ProcessPoolExecutor(max_workers=num_workers) as executor: # Preprocess nhd hr and add attributes # collect_attributes = [executor.submit(collect_stream_attributes, nhdplus_vectors_dir, str(huc)) for huc in huc_list] # Subset nhd hr network - subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in huc_list] + subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in missing_subsets] - del wbd4,wbd8 + # del wbd4,wbd8 # Aggregate fr and ms nhd netowrks for entire nwm domain aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileName,agg_nhd_streams_adj_fileName,huc_list) # Remove intermediate files - clean_up_intermediate_files(nhdplus_vectors_dir) + # clean_up_intermediate_files(nhdplus_vectors_dir) diff --git a/src/reduce_nhd_stream_density.py b/src/reduce_nhd_stream_density.py index 2fd9b5143..9614bfe32 100644 --- a/src/reduce_nhd_stream_density.py +++ b/src/reduce_nhd_stream_density.py @@ -183,7 +183,8 @@ def get_downstream_segments(streams, attribute,mainstem_flag): 
parser.add_argument('-a','--headwaters-filename',help='Headwaters points layer name',required=True,type=str) parser.add_argument('-s','--subset-nhd-streams-fileName',help='Output streams layer name',required=False,type=str,default=None) parser.add_argument('-i','--headwater-id',help='Headwater points ID column',required=True) - parser.add_argument('-i','--nwm-intersections-filename',help='NWM HUC4 intersection points',required=True) + parser.add_argument('-c','--nwm-intersections-filename',help='NWM HUC4 intersection points',required=True) + parser.add_argument('-d','--mainstem-flag',help='flag for mainstems network',required=False,default=False) args = vars(parser.parse_args()) @@ -195,8 +196,9 @@ def get_downstream_segments(streams, attribute,mainstem_flag): subset_nhd_streams_fileName = args['subset_nhd_streams_fileName'] headwater_id = args['headwater_id'] nwm_intersections_filename = args['nwm_intersections_filename'] + mainstem_flag = args['mainstem_flag'] - subset_streams_gdf = subset_nhd_network(huc_number,huc4_mask,selected_wbd8,nhd_streams,headwaters_filename,headwater_id,nwm_intersections_filename) + subset_streams_gdf = subset_nhd_network(huc_number,huc4_mask,selected_wbd8,nhd_streams,headwaters_filename,headwater_id,nwm_intersections_filename,mainstem_flag=False) if subset_nhd_streams_fileName is not None: subset_streams_gdf.to_file(subset_nhd_streams_fileName,driver=getDriver(subset_nhd_streams_fileName),index=False) diff --git a/src/utils/shared_variables.py b/src/utils/shared_variables.py index 6f28e7180..fefad3cfa 100644 --- a/src/utils/shared_variables.py +++ b/src/utils/shared_variables.py @@ -57,7 +57,7 @@ os.environ['nwm_huc4_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_huc4_intersections.gpkg') os.environ['nhd_huc8_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nhd_huc8_intersections.gpkg') os.environ['ahps_filename'] = os.path.join(os.environ.get('ahps_dir'),'nws_lid.gpkg') -os.environ['agg_nhd_headwaters_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_headwaters_adjusted.gpkg') +os.environ['agg_nhd_headwaters_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_headwaters_adj.gpkg') os.environ['agg_nhd_streams_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_streams_adj.gpkg') os.environ['nwm_catchments_orig_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_catchments_original.gpkg') os.environ['nwm_catchments_all_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_catchments.gpkg') From 267136ea2ca137698360a4fc0d81ed4003a1ad73 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Mon, 26 Apr 2021 23:19:32 +0000 Subject: [PATCH 43/66] fixing issue with sythesize_test_case.py parallelization --- src/adjust_headwater_streams.py | 2 +- tools/inundation.py | 217 ++++++++++++++++---------------- tools/run_test_case.py | 59 ++++----- 3 files changed, 144 insertions(+), 134 deletions(-) diff --git a/src/adjust_headwater_streams.py b/src/adjust_headwater_streams.py index 66ec29d84..71f73186e 100644 --- a/src/adjust_headwater_streams.py +++ b/src/adjust_headwater_streams.py @@ -32,7 +32,7 @@ def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id): # Check for issues in nws_lid layer if len(nws_lid_limited) < len(nws_lids): missing_nws_lids = list(set(nws_lids.site_id) - set(nws_lid_limited.site_id)) - print (f"nws lid(s) {missing_nws_lids} missing from aggregare dataset") + print (f"nws lid(s) {missing_nws_lids} 
missing from aggregate dataset in huc {huc}") # Combine NWM headwaters and AHPS sites to be snapped to NHDPlus HR segments headwater_pts = headwater_limited.append(nws_lid_limited) diff --git a/tools/inundation.py b/tools/inundation.py index d093385b8..d105ea52c 100755 --- a/tools/inundation.py +++ b/tools/inundation.py @@ -156,59 +156,63 @@ def inundate( else: raise TypeError("Pass hydro table csv") - # make windows generator - window_gen = __make_windows_generator(rem,catchments,catchment_poly,mask_type,catchmentStagesDict,inundation_raster,inundation_polygon, - depths,out_raster_profile,out_vector_profile,quiet,hucs=hucs,hucSet=hucSet) + if catchmentStagesDict is not None: - # start up thread pool - executor = ThreadPoolExecutor(max_workers=num_workers) + # make windows generator + window_gen = __make_windows_generator(rem,catchments,catchment_poly,mask_type,catchmentStagesDict,inundation_raster,inundation_polygon, + depths,out_raster_profile,out_vector_profile,quiet,hucs=hucs,hucSet=hucSet) - # submit jobs - results = {executor.submit(__inundate_in_huc,*wg) : wg[6] for wg in window_gen} + # start up thread pool + executor = ThreadPoolExecutor(max_workers=num_workers) - inundation_rasters = [] ; depth_rasters = [] ; inundation_polys = [] - for future in as_completed(results): - try: - future.result() - except Exception as exc: - __vprint("Exception {} for {}".format(exc,results[future]),not quiet) - else: + # submit jobs + results = {executor.submit(__inundate_in_huc,*wg) : wg[6] for wg in window_gen} - if results[future] is not None: - __vprint("... {} complete".format(results[future]),not quiet) + inundation_rasters = [] ; depth_rasters = [] ; inundation_polys = [] + for future in as_completed(results): + try: + future.result() + except Exception as exc: + __vprint("Exception {} for {}".format(exc,results[future]),not quiet) else: - __vprint("... complete",not quiet) - - inundation_rasters += [future.result()[0]] - depth_rasters += [future.result()[1]] - inundation_polys += [future.result()[2]] - - # power down pool - executor.shutdown(wait=True) - - # optional aggregation - if (aggregate) & (hucs is not None): - # inun grid vrt - if inundation_raster is not None: - inun_vrt = BuildVRT(splitext(inundation_raster)[0]+'.vrt',inundation_rasters) - inun_vrt = None - #_ = run('gdalbuildvrt -q -overwrite {} {}'.format(splitext(inundation_raster)[0]+'.vrt'," ".join(inundation_rasters)),shell=True) - # depths vrt - if depths is not None: - depths_vrt = BuildVRT(splitext(depths)[0]+'.vrt',depth_rasters,resampleAlg='bilinear') - depths_vrt = None - #_ = run('gdalbuildvrt -q -overwrite -r bilinear {} {}'.format(splitext(depths)[0]+'.vrt'," ".join(depth_rasters)),shell=True) - # concat inun poly - if inundation_polygon is not None: - _ = run('ogrmerge.py -o {} {} -f GPKG -single -overwrite_ds'.format(inundation_polygon," ".join(inundation_polys)),shell=True) - - # close datasets - rem.close() - catchments.close() - - return(0) + if results[future] is not None: + __vprint("... {} complete".format(results[future]),not quiet) + else: + __vprint("... 
complete",not quiet) + + inundation_rasters += [future.result()[0]] + depth_rasters += [future.result()[1]] + inundation_polys += [future.result()[2]] + + # power down pool + executor.shutdown(wait=True) + + # optional aggregation + if (aggregate) & (hucs is not None): + # inun grid vrt + if inundation_raster is not None: + inun_vrt = BuildVRT(splitext(inundation_raster)[0]+'.vrt',inundation_rasters) + inun_vrt = None + #_ = run('gdalbuildvrt -q -overwrite {} {}'.format(splitext(inundation_raster)[0]+'.vrt'," ".join(inundation_rasters)),shell=True) + # depths vrt + if depths is not None: + depths_vrt = BuildVRT(splitext(depths)[0]+'.vrt',depth_rasters,resampleAlg='bilinear') + depths_vrt = None + #_ = run('gdalbuildvrt -q -overwrite -r bilinear {} {}'.format(splitext(depths)[0]+'.vrt'," ".join(depth_rasters)),shell=True) + + # concat inun poly + if inundation_polygon is not None: + _ = run('ogrmerge.py -o {} {} -f GPKG -single -overwrite_ds'.format(inundation_polygon," ".join(inundation_polys)),shell=True) + + # close datasets + rem.close() + catchments.close() + + return(0) + else: + return(1) def __inundate_in_huc(rem_array,catchments_array,crs,window_transform,rem_profile,catchments_profile,hucCode, catchmentStagesDict,depths,inundation_raster,inundation_polygon, @@ -328,6 +332,7 @@ def __inundate_in_huc(rem_array,catchments_array,crs,window_transform,rem_profil if isinstance(depths,DatasetWriter): depths.close() if isinstance(inundation_raster,DatasetWriter): inundation_raster.close() if isinstance(inundation_polygon,fiona.Collection): inundation_polygon.close() + if isinstance(hucs,fiona.Collection): inundation_polygon.close() # return file names of outputs for aggregation. Handle Nones try: @@ -414,6 +419,7 @@ def __return_huc_in_hucSet(hucCode,hucSet): rem_array,window_transform = mask(rem,catchment_poly['geometry'],crop=True,indexes=1) catchments_array,_ = mask(catchments,catchment_poly['geometry'],crop=True,indexes=1) + del catchment_poly else: print ("invalid mask type. Options are 'huc' or 'filter'") except ValueError: # shape doesn't overlap raster @@ -458,78 +464,79 @@ def __subset_hydroTable_to_forecast(hydroTable,forecast,subset_hucs=None): huc_error = hydroTable.HUC.unique() hydroTable.set_index(['HUC','feature_id','HydroID'],inplace=True) - hydroTable = hydroTable[hydroTable["LakeID"] == -999] # Subset hydroTable to include only non-lake catchments. 
- - if hydroTable.empty: - print(f"All stream segments in HUC(s): {huc_error} are within lake boundaries.") - sys.exit(0) - elif isinstance(hydroTable,pd.DataFrame): pass #consider checking for correct dtypes, indices, and columns else: raise TypeError("Pass path to hydro-table csv or Pandas DataFrame") - if isinstance(forecast,str): - forecast = pd.read_csv( - forecast, - dtype={'feature_id' : str , 'discharge' : float} - ) - forecast.set_index('feature_id',inplace=True) - elif isinstance(forecast,pd.DataFrame): - pass # consider checking for dtypes, indices, and columns - else: - raise TypeError("Pass path to forecast file csv or Pandas DataFrame") - - - # susbset hucs if passed - if subset_hucs is not None: - if isinstance(subset_hucs,list): - if len(subset_hucs) == 1: - try: - subset_hucs = open(subset_hucs[0]).read().split('\n') - except FileNotFoundError: - pass - elif isinstance(subset_hucs,str): - try: - subset_hucs = open(subset_hucs).read().split('\n') - except FileNotFoundError: - subset_hucs = [subset_hucs] - - # subsets HUCS - subset_hucs_orig = subset_hucs.copy() ; subset_hucs = [] - for huc in np.unique(hydroTable.index.get_level_values('HUC')): - for sh in subset_hucs_orig: - if huc.startswith(sh): - subset_hucs += [huc] - - hydroTable = hydroTable[np.in1d(hydroTable.index.get_level_values('HUC'), subset_hucs)] - - # join tables - try: - hydroTable = hydroTable.join(forecast,on=['feature_id'],how='inner') - except AttributeError: - print (f"No matching feature IDs between forecast and hydrotable for HUC(s): {subset_hucs}") - sys.exit(0) + hydroTable = hydroTable[hydroTable["LakeID"] == -999] # Subset hydroTable to include only non-lake catchments. - # initialize dictionary - catchmentStagesDict = typed.Dict.empty(types.int32,types.float64) + if not hydroTable.empty: - # interpolate stages - for hid,sub_table in hydroTable.groupby(level='HydroID'): + if isinstance(forecast,str): + forecast = pd.read_csv( + forecast, + dtype={'feature_id' : str , 'discharge' : float} + ) + forecast.set_index('feature_id',inplace=True) + elif isinstance(forecast,pd.DataFrame): + pass # consider checking for dtypes, indices, and columns + else: + raise TypeError("Pass path to forecast file csv or Pandas DataFrame") + + # susbset hucs if passed + if subset_hucs is not None: + if isinstance(subset_hucs,list): + if len(subset_hucs) == 1: + try: + subset_hucs = open(subset_hucs[0]).read().split('\n') + except FileNotFoundError: + pass + elif isinstance(subset_hucs,str): + try: + subset_hucs = open(subset_hucs).read().split('\n') + except FileNotFoundError: + subset_hucs = [subset_hucs] + + # subsets HUCS + subset_hucs_orig = subset_hucs.copy() ; subset_hucs = [] + for huc in np.unique(hydroTable.index.get_level_values('HUC')): + for sh in subset_hucs_orig: + if huc.startswith(sh): + subset_hucs += [huc] + + hydroTable = hydroTable[np.in1d(hydroTable.index.get_level_values('HUC'), subset_hucs)] + + # join tables + try: + hydroTable = hydroTable.join(forecast,on=['feature_id'],how='inner') + + + # initialize dictionary + catchmentStagesDict = typed.Dict.empty(types.int32,types.float64) - interpolated_stage = np.interp(sub_table.loc[:,'discharge'].unique(),sub_table.loc[:,'discharge_cms'],sub_table.loc[:,'stage']) + # interpolate stages + for hid,sub_table in hydroTable.groupby(level='HydroID'): - # add this interpolated stage to catchment stages dict - h = round(interpolated_stage[0],4) + interpolated_stage = 
np.interp(sub_table.loc[:,'discharge'].unique(),sub_table.loc[:,'discharge_cms'],sub_table.loc[:,'stage']) - hid = types.int32(hid) ; h = types.float32(h) - catchmentStagesDict[hid] = h + # add this interpolated stage to catchment stages dict + h = round(interpolated_stage[0],4) - # huc set - hucSet = [str(i) for i in hydroTable.index.get_level_values('HUC').unique().to_list()] + hid = types.int32(hid) ; h = types.float32(h) + catchmentStagesDict[hid] = h - return(catchmentStagesDict,hucSet) + # huc set + hucSet = [str(i) for i in hydroTable.index.get_level_values('HUC').unique().to_list()] + return(catchmentStagesDict,hucSet) + + except AttributeError: + print (f"No matching feature IDs between forecast and hydrotable for HUC(s): {subset_hucs}") + return(None,None) + else: + print(f"All stream segments in HUC(s): {huc_error} are within lake boundaries.") + return(None,None) def __vprint(message,verbose): if verbose: diff --git a/tools/run_test_case.py b/tools/run_test_case.py index e3168a422..3b0f2ff1f 100755 --- a/tools/run_test_case.py +++ b/tools/run_test_case.py @@ -132,38 +132,41 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous # Run inundate. print("-----> Running inundate() to produce modeled inundation extent for the " + magnitude + " magnitude...") try: - inundate( + inundate_test = inundate( rem,catchments,catchment_poly,hydro_table,forecast,mask_type,hucs=hucs,hucs_layerName=hucs_layerName, subset_hucs=current_huc,num_workers=1,aggregate=False,inundation_raster=inundation_raster,inundation_polygon=None, depths=None,out_raster_profile=None,out_vector_profile=None,quiet=True ) - - print("-----> Inundation mapping complete.") - predicted_raster_path = os.path.join(os.path.split(inundation_raster)[0], os.path.split(inundation_raster)[1].replace('.tif', '_' + current_huc + '.tif')) # The inundate adds the huc to the name so I account for that here. - - # Define outputs for agreement_raster, stats_json, and stats_csv. - if benchmark_category in AHPS_BENCHMARK_CATEGORIES: - agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, lid + 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') - else: - agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') - - compute_contingency_stats_from_rasters(predicted_raster_path, - benchmark_raster_path, - agreement_raster, - stats_csv=stats_csv, - stats_json=stats_json, - mask_values=[], - stats_modes_list=stats_modes_list, - test_id=test_id, - mask_dict=mask_dict, - ) - - if benchmark_category in AHPS_BENCHMARK_CATEGORIES: - del mask_dict[ahps_lid] - - print(" ") - print("Evaluation complete. All metrics for " + test_id + ", " + version + ", " + magnitude + " are available at " + CYAN_BOLD + version_test_case_dir + ENDC) - print(" ") + if inundate_test == 0: + print("-----> Inundation mapping complete.") + predicted_raster_path = os.path.join(os.path.split(inundation_raster)[0], os.path.split(inundation_raster)[1].replace('.tif', '_' + current_huc + '.tif')) # The inundate adds the huc to the name so I account for that here. + + # Define outputs for agreement_raster, stats_json, and stats_csv. 
+ if benchmark_category in AHPS_BENCHMARK_CATEGORIES: + agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, lid + 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') + else: + agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') + + compute_contingency_stats_from_rasters(predicted_raster_path, + benchmark_raster_path, + agreement_raster, + stats_csv=stats_csv, + stats_json=stats_json, + mask_values=[], + stats_modes_list=stats_modes_list, + test_id=test_id, + mask_dict=mask_dict, + ) + + if benchmark_category in AHPS_BENCHMARK_CATEGORIES: + del mask_dict[ahps_lid] + + print(" ") + print("Evaluation complete. All metrics for " + test_id + ", " + version + ", " + magnitude + " are available at " + CYAN_BOLD + version_test_case_dir + ENDC) + print(" ") + elif inundate_test == 1: + print (f"No matching feature IDs between forecast and hydrotable for magnitude: {magnitude}") + return except Exception as e: print(e) From 1d0cf00de330b86182fe1aa29f254910754e758d Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Tue, 27 Apr 2021 13:05:50 +0000 Subject: [PATCH 44/66] fixing bug where synthesize_test_case.py gets hung up in multiprocessing --- tools/inundation.py | 217 +++++++++++++++++++++-------------------- tools/run_test_case.py | 59 +++++------ 2 files changed, 143 insertions(+), 133 deletions(-) diff --git a/tools/inundation.py b/tools/inundation.py index d093385b8..d105ea52c 100755 --- a/tools/inundation.py +++ b/tools/inundation.py @@ -156,59 +156,63 @@ def inundate( else: raise TypeError("Pass hydro table csv") - # make windows generator - window_gen = __make_windows_generator(rem,catchments,catchment_poly,mask_type,catchmentStagesDict,inundation_raster,inundation_polygon, - depths,out_raster_profile,out_vector_profile,quiet,hucs=hucs,hucSet=hucSet) + if catchmentStagesDict is not None: - # start up thread pool - executor = ThreadPoolExecutor(max_workers=num_workers) + # make windows generator + window_gen = __make_windows_generator(rem,catchments,catchment_poly,mask_type,catchmentStagesDict,inundation_raster,inundation_polygon, + depths,out_raster_profile,out_vector_profile,quiet,hucs=hucs,hucSet=hucSet) - # submit jobs - results = {executor.submit(__inundate_in_huc,*wg) : wg[6] for wg in window_gen} + # start up thread pool + executor = ThreadPoolExecutor(max_workers=num_workers) - inundation_rasters = [] ; depth_rasters = [] ; inundation_polys = [] - for future in as_completed(results): - try: - future.result() - except Exception as exc: - __vprint("Exception {} for {}".format(exc,results[future]),not quiet) - else: + # submit jobs + results = {executor.submit(__inundate_in_huc,*wg) : wg[6] for wg in window_gen} - if results[future] is not None: - __vprint("... {} complete".format(results[future]),not quiet) + inundation_rasters = [] ; depth_rasters = [] ; inundation_polys = [] + for future in as_completed(results): + try: + future.result() + except Exception as exc: + __vprint("Exception {} for {}".format(exc,results[future]),not quiet) else: - __vprint("... 
complete",not quiet) - - inundation_rasters += [future.result()[0]] - depth_rasters += [future.result()[1]] - inundation_polys += [future.result()[2]] - - # power down pool - executor.shutdown(wait=True) - - # optional aggregation - if (aggregate) & (hucs is not None): - # inun grid vrt - if inundation_raster is not None: - inun_vrt = BuildVRT(splitext(inundation_raster)[0]+'.vrt',inundation_rasters) - inun_vrt = None - #_ = run('gdalbuildvrt -q -overwrite {} {}'.format(splitext(inundation_raster)[0]+'.vrt'," ".join(inundation_rasters)),shell=True) - # depths vrt - if depths is not None: - depths_vrt = BuildVRT(splitext(depths)[0]+'.vrt',depth_rasters,resampleAlg='bilinear') - depths_vrt = None - #_ = run('gdalbuildvrt -q -overwrite -r bilinear {} {}'.format(splitext(depths)[0]+'.vrt'," ".join(depth_rasters)),shell=True) - # concat inun poly - if inundation_polygon is not None: - _ = run('ogrmerge.py -o {} {} -f GPKG -single -overwrite_ds'.format(inundation_polygon," ".join(inundation_polys)),shell=True) - - # close datasets - rem.close() - catchments.close() - - return(0) + if results[future] is not None: + __vprint("... {} complete".format(results[future]),not quiet) + else: + __vprint("... complete",not quiet) + + inundation_rasters += [future.result()[0]] + depth_rasters += [future.result()[1]] + inundation_polys += [future.result()[2]] + + # power down pool + executor.shutdown(wait=True) + + # optional aggregation + if (aggregate) & (hucs is not None): + # inun grid vrt + if inundation_raster is not None: + inun_vrt = BuildVRT(splitext(inundation_raster)[0]+'.vrt',inundation_rasters) + inun_vrt = None + #_ = run('gdalbuildvrt -q -overwrite {} {}'.format(splitext(inundation_raster)[0]+'.vrt'," ".join(inundation_rasters)),shell=True) + # depths vrt + if depths is not None: + depths_vrt = BuildVRT(splitext(depths)[0]+'.vrt',depth_rasters,resampleAlg='bilinear') + depths_vrt = None + #_ = run('gdalbuildvrt -q -overwrite -r bilinear {} {}'.format(splitext(depths)[0]+'.vrt'," ".join(depth_rasters)),shell=True) + + # concat inun poly + if inundation_polygon is not None: + _ = run('ogrmerge.py -o {} {} -f GPKG -single -overwrite_ds'.format(inundation_polygon," ".join(inundation_polys)),shell=True) + + # close datasets + rem.close() + catchments.close() + + return(0) + else: + return(1) def __inundate_in_huc(rem_array,catchments_array,crs,window_transform,rem_profile,catchments_profile,hucCode, catchmentStagesDict,depths,inundation_raster,inundation_polygon, @@ -328,6 +332,7 @@ def __inundate_in_huc(rem_array,catchments_array,crs,window_transform,rem_profil if isinstance(depths,DatasetWriter): depths.close() if isinstance(inundation_raster,DatasetWriter): inundation_raster.close() if isinstance(inundation_polygon,fiona.Collection): inundation_polygon.close() + if isinstance(hucs,fiona.Collection): inundation_polygon.close() # return file names of outputs for aggregation. Handle Nones try: @@ -414,6 +419,7 @@ def __return_huc_in_hucSet(hucCode,hucSet): rem_array,window_transform = mask(rem,catchment_poly['geometry'],crop=True,indexes=1) catchments_array,_ = mask(catchments,catchment_poly['geometry'],crop=True,indexes=1) + del catchment_poly else: print ("invalid mask type. 
Options are 'huc' or 'filter'") except ValueError: # shape doesn't overlap raster @@ -458,78 +464,79 @@ def __subset_hydroTable_to_forecast(hydroTable,forecast,subset_hucs=None): huc_error = hydroTable.HUC.unique() hydroTable.set_index(['HUC','feature_id','HydroID'],inplace=True) - hydroTable = hydroTable[hydroTable["LakeID"] == -999] # Subset hydroTable to include only non-lake catchments. - - if hydroTable.empty: - print(f"All stream segments in HUC(s): {huc_error} are within lake boundaries.") - sys.exit(0) - elif isinstance(hydroTable,pd.DataFrame): pass #consider checking for correct dtypes, indices, and columns else: raise TypeError("Pass path to hydro-table csv or Pandas DataFrame") - if isinstance(forecast,str): - forecast = pd.read_csv( - forecast, - dtype={'feature_id' : str , 'discharge' : float} - ) - forecast.set_index('feature_id',inplace=True) - elif isinstance(forecast,pd.DataFrame): - pass # consider checking for dtypes, indices, and columns - else: - raise TypeError("Pass path to forecast file csv or Pandas DataFrame") - - - # susbset hucs if passed - if subset_hucs is not None: - if isinstance(subset_hucs,list): - if len(subset_hucs) == 1: - try: - subset_hucs = open(subset_hucs[0]).read().split('\n') - except FileNotFoundError: - pass - elif isinstance(subset_hucs,str): - try: - subset_hucs = open(subset_hucs).read().split('\n') - except FileNotFoundError: - subset_hucs = [subset_hucs] - - # subsets HUCS - subset_hucs_orig = subset_hucs.copy() ; subset_hucs = [] - for huc in np.unique(hydroTable.index.get_level_values('HUC')): - for sh in subset_hucs_orig: - if huc.startswith(sh): - subset_hucs += [huc] - - hydroTable = hydroTable[np.in1d(hydroTable.index.get_level_values('HUC'), subset_hucs)] - - # join tables - try: - hydroTable = hydroTable.join(forecast,on=['feature_id'],how='inner') - except AttributeError: - print (f"No matching feature IDs between forecast and hydrotable for HUC(s): {subset_hucs}") - sys.exit(0) + hydroTable = hydroTable[hydroTable["LakeID"] == -999] # Subset hydroTable to include only non-lake catchments. 
- # initialize dictionary - catchmentStagesDict = typed.Dict.empty(types.int32,types.float64) + if not hydroTable.empty: - # interpolate stages - for hid,sub_table in hydroTable.groupby(level='HydroID'): + if isinstance(forecast,str): + forecast = pd.read_csv( + forecast, + dtype={'feature_id' : str , 'discharge' : float} + ) + forecast.set_index('feature_id',inplace=True) + elif isinstance(forecast,pd.DataFrame): + pass # consider checking for dtypes, indices, and columns + else: + raise TypeError("Pass path to forecast file csv or Pandas DataFrame") + + # susbset hucs if passed + if subset_hucs is not None: + if isinstance(subset_hucs,list): + if len(subset_hucs) == 1: + try: + subset_hucs = open(subset_hucs[0]).read().split('\n') + except FileNotFoundError: + pass + elif isinstance(subset_hucs,str): + try: + subset_hucs = open(subset_hucs).read().split('\n') + except FileNotFoundError: + subset_hucs = [subset_hucs] + + # subsets HUCS + subset_hucs_orig = subset_hucs.copy() ; subset_hucs = [] + for huc in np.unique(hydroTable.index.get_level_values('HUC')): + for sh in subset_hucs_orig: + if huc.startswith(sh): + subset_hucs += [huc] + + hydroTable = hydroTable[np.in1d(hydroTable.index.get_level_values('HUC'), subset_hucs)] + + # join tables + try: + hydroTable = hydroTable.join(forecast,on=['feature_id'],how='inner') + + + # initialize dictionary + catchmentStagesDict = typed.Dict.empty(types.int32,types.float64) - interpolated_stage = np.interp(sub_table.loc[:,'discharge'].unique(),sub_table.loc[:,'discharge_cms'],sub_table.loc[:,'stage']) + # interpolate stages + for hid,sub_table in hydroTable.groupby(level='HydroID'): - # add this interpolated stage to catchment stages dict - h = round(interpolated_stage[0],4) + interpolated_stage = np.interp(sub_table.loc[:,'discharge'].unique(),sub_table.loc[:,'discharge_cms'],sub_table.loc[:,'stage']) - hid = types.int32(hid) ; h = types.float32(h) - catchmentStagesDict[hid] = h + # add this interpolated stage to catchment stages dict + h = round(interpolated_stage[0],4) - # huc set - hucSet = [str(i) for i in hydroTable.index.get_level_values('HUC').unique().to_list()] + hid = types.int32(hid) ; h = types.float32(h) + catchmentStagesDict[hid] = h - return(catchmentStagesDict,hucSet) + # huc set + hucSet = [str(i) for i in hydroTable.index.get_level_values('HUC').unique().to_list()] + return(catchmentStagesDict,hucSet) + + except AttributeError: + print (f"No matching feature IDs between forecast and hydrotable for HUC(s): {subset_hucs}") + return(None,None) + else: + print(f"All stream segments in HUC(s): {huc_error} are within lake boundaries.") + return(None,None) def __vprint(message,verbose): if verbose: diff --git a/tools/run_test_case.py b/tools/run_test_case.py index e3168a422..3b0f2ff1f 100755 --- a/tools/run_test_case.py +++ b/tools/run_test_case.py @@ -132,38 +132,41 @@ def run_alpha_test(fim_run_dir, version, test_id, magnitude, compare_to_previous # Run inundate. 
print("-----> Running inundate() to produce modeled inundation extent for the " + magnitude + " magnitude...") try: - inundate( + inundate_test = inundate( rem,catchments,catchment_poly,hydro_table,forecast,mask_type,hucs=hucs,hucs_layerName=hucs_layerName, subset_hucs=current_huc,num_workers=1,aggregate=False,inundation_raster=inundation_raster,inundation_polygon=None, depths=None,out_raster_profile=None,out_vector_profile=None,quiet=True ) - - print("-----> Inundation mapping complete.") - predicted_raster_path = os.path.join(os.path.split(inundation_raster)[0], os.path.split(inundation_raster)[1].replace('.tif', '_' + current_huc + '.tif')) # The inundate adds the huc to the name so I account for that here. - - # Define outputs for agreement_raster, stats_json, and stats_csv. - if benchmark_category in AHPS_BENCHMARK_CATEGORIES: - agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, lid + 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') - else: - agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') - - compute_contingency_stats_from_rasters(predicted_raster_path, - benchmark_raster_path, - agreement_raster, - stats_csv=stats_csv, - stats_json=stats_json, - mask_values=[], - stats_modes_list=stats_modes_list, - test_id=test_id, - mask_dict=mask_dict, - ) - - if benchmark_category in AHPS_BENCHMARK_CATEGORIES: - del mask_dict[ahps_lid] - - print(" ") - print("Evaluation complete. All metrics for " + test_id + ", " + version + ", " + magnitude + " are available at " + CYAN_BOLD + version_test_case_dir + ENDC) - print(" ") + if inundate_test == 0: + print("-----> Inundation mapping complete.") + predicted_raster_path = os.path.join(os.path.split(inundation_raster)[0], os.path.split(inundation_raster)[1].replace('.tif', '_' + current_huc + '.tif')) # The inundate adds the huc to the name so I account for that here. + + # Define outputs for agreement_raster, stats_json, and stats_csv. + if benchmark_category in AHPS_BENCHMARK_CATEGORIES: + agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, lid + 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') + else: + agreement_raster, stats_json, stats_csv = os.path.join(version_test_case_dir, 'total_area_agreement.tif'), os.path.join(version_test_case_dir, 'stats.json'), os.path.join(version_test_case_dir, 'stats.csv') + + compute_contingency_stats_from_rasters(predicted_raster_path, + benchmark_raster_path, + agreement_raster, + stats_csv=stats_csv, + stats_json=stats_json, + mask_values=[], + stats_modes_list=stats_modes_list, + test_id=test_id, + mask_dict=mask_dict, + ) + + if benchmark_category in AHPS_BENCHMARK_CATEGORIES: + del mask_dict[ahps_lid] + + print(" ") + print("Evaluation complete. 
All metrics for " + test_id + ", " + version + ", " + magnitude + " are available at " + CYAN_BOLD + version_test_case_dir + ENDC) + print(" ") + elif inundate_test == 1: + print (f"No matching feature IDs between forecast and hydrotable for magnitude: {magnitude}") + return except Exception as e: print(e) From 6f111269d060e9d9b5a89ed0a696b52c7dca08bd Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Tue, 27 Apr 2021 21:12:41 +0000 Subject: [PATCH 45/66] removing incoming segments to wbd buffer boundary so they will not be routed as outflow in hydroconditioning --- src/clip_vectors_to_wbd.py | 58 +++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/src/clip_vectors_to_wbd.py b/src/clip_vectors_to_wbd.py index dc19309a2..3a5585045 100755 --- a/src/clip_vectors_to_wbd.py +++ b/src/clip_vectors_to_wbd.py @@ -22,61 +22,75 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l landsea.to_file(subset_landsea_filename,driver=getDriver(subset_landsea_filename),index=False) del landsea - # find intersecting lakes and writeout + # Find intersecting lakes and writeout print("Subsetting NWM Lakes for HUC{} {}".format(hucUnitLength,hucCode),flush=True) nwm_lakes = gpd.read_file(nwm_lakes_filename, mask = wbd_buffer) if not nwm_lakes.empty: - # perform fill process to remove holes/islands in the NWM lake polygons + # Perform fill process to remove holes/islands in the NWM lake polygons nwm_lakes = nwm_lakes.explode() nwm_lakes_fill_holes=MultiPolygon(Polygon(p.exterior) for p in nwm_lakes['geometry']) # remove donut hole geometries - # loop through the filled polygons and insert the new geometry + # Loop through the filled polygons and insert the new geometry for i in range(len(nwm_lakes_fill_holes)): nwm_lakes.loc[i,'geometry'] = nwm_lakes_fill_holes[i] nwm_lakes.to_file(subset_nwm_lakes_filename,driver=getDriver(subset_nwm_lakes_filename),index=False) del nwm_lakes - # find intersecting levee lines + # Find intersecting levee lines print("Subsetting NLD levee lines for HUC{} {}".format(hucUnitLength,hucCode),flush=True) nld_lines = gpd.read_file(nld_lines_filename, mask = wbd_buffer) if not nld_lines.empty: nld_lines.to_file(subset_nld_lines_filename,driver=getDriver(subset_nld_lines_filename),index=False) del nld_lines - # find intersecting nwm_catchments - print("Subsetting NWM Catchments for HUC{} {}".format(hucUnitLength,hucCode),flush=True) - nwm_catchments = gpd.read_file(nwm_catchments_filename, mask = wbd_buffer) - if extent == 'MS': - nwm_catchments = nwm_catchments.loc[nwm_catchments.mainstem==1] - nwm_catchments.to_file(subset_nwm_catchments_filename,driver=getDriver(subset_nwm_catchments_filename),index=False) - del nwm_catchments - - # subset nhd headwaters + # Subset nhd headwaters print("Subsetting NHD Headwater Points for HUC{} {}".format(hucUnitLength,hucCode),flush=True) nhd_headwaters = gpd.read_file(nhd_headwaters_filename, mask = wbd_buffer) if extent == 'MS': nhd_headwaters = nhd_headwaters.loc[nhd_headwaters.mainstem==1] - # subset nhd streams + if len(nhd_headwaters) > 0: + nhd_headwaters.to_file(subset_nhd_headwaters_filename,driver=getDriver(subset_nhd_headwaters_filename),index=False) + del nhd_headwaters, nhd_streams + else: + print ("No headwater point(s) within HUC " + str(hucCode) + " boundaries.") + sys.exit(0) + + # Subset nhd streams print("Querying NHD Streams for HUC{} {}".format(hucUnitLength,hucCode),flush=True) - nhd_streams = gpd.read_file(nhd_streams_filename, mask = wbd) + nhd_streams 
= gpd.read_file(nhd_streams_filename, mask = wbd_buffer) if extent == 'MS': nhd_streams = nhd_streams.loc[nhd_streams.mainstem==1] if len(nhd_streams) > 0: + # Find incoming stream segments (to WBD buffer) and identify which are upstream + threshold_segments = gpd.overlay(nhd_streams, wbd_buffer, how='symmetric_difference') + from_list = threshold_segments.FromNode.to_list() + to_list = nhd_streams.ToNode.to_list() + missing_segments = list(set(from_list) - set(to_list)) + + # Remove incoming stream segment so it won't be routed as outflow during hydroconditioning + nhd_streams = nhd_streams.loc[~nhd_streams.FromNode.isin(missing_segments)] + nhd_streams.to_file(subset_nhd_streams_filename,driver=getDriver(subset_nhd_streams_filename),index=False) else: print ("No NHD streams within HUC " + str(hucCode) + " boundaries.") sys.exit(0) - if len(nhd_headwaters) > 0: - nhd_headwaters.to_file(subset_nhd_headwaters_filename,driver=getDriver(subset_nhd_headwaters_filename),index=False) - del nhd_headwaters, nhd_streams - else: - print ("No headwater point(s) within HUC " + str(hucCode) + " boundaries.") - sys.exit(0) + # Find intersecting nwm_catchments + print("Subsetting NWM Catchments for HUC{} {}".format(hucUnitLength,hucCode),flush=True) + nwm_catchments = gpd.read_file(nwm_catchments_filename, mask = wbd_buffer) + if extent == 'MS': + nwm_catchments = nwm_catchments.loc[nwm_catchments.mainstem==1] + + if len(nwm_catchments) > 0: + nwm_catchments.to_file(subset_nwm_catchments_filename,driver=getDriver(subset_nwm_catchments_filename),index=False) + else: + print ("No NHD catchments within HUC " + str(hucCode) + " boundaries.") + sys.exit(0) + del nwm_catchments - # subset nwm streams + # Subset nwm streams print("Subsetting NWM Streams and deriving headwaters for HUC{} {}".format(hucUnitLength,hucCode),flush=True) nwm_streams = gpd.read_file(nwm_streams_filename, mask = wbd_buffer) if extent == 'MS': From b748ca1118251c96c0971c43b254d826c898b7bc Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Tue, 27 Apr 2021 21:22:58 +0000 Subject: [PATCH 46/66] fixing indentation --- src/clip_vectors_to_wbd.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/clip_vectors_to_wbd.py b/src/clip_vectors_to_wbd.py index 3a5585045..f29217e76 100755 --- a/src/clip_vectors_to_wbd.py +++ b/src/clip_vectors_to_wbd.py @@ -51,10 +51,10 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l if len(nhd_headwaters) > 0: nhd_headwaters.to_file(subset_nhd_headwaters_filename,driver=getDriver(subset_nhd_headwaters_filename),index=False) - del nhd_headwaters, nhd_streams else: print ("No headwater point(s) within HUC " + str(hucCode) + " boundaries.") sys.exit(0) + del nhd_headwaters # Subset nhd streams print("Querying NHD Streams for HUC{} {}".format(hucUnitLength,hucCode),flush=True) @@ -76,19 +76,20 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l else: print ("No NHD streams within HUC " + str(hucCode) + " boundaries.") sys.exit(0) + del nhd_streams - # Find intersecting nwm_catchments - print("Subsetting NWM Catchments for HUC{} {}".format(hucUnitLength,hucCode),flush=True) - nwm_catchments = gpd.read_file(nwm_catchments_filename, mask = wbd_buffer) - if extent == 'MS': - nwm_catchments = nwm_catchments.loc[nwm_catchments.mainstem==1] - - if len(nwm_catchments) > 0: - nwm_catchments.to_file(subset_nwm_catchments_filename,driver=getDriver(subset_nwm_catchments_filename),index=False) - else: - print ("No NHD 
catchments within HUC " + str(hucCode) + " boundaries.") - sys.exit(0) - del nwm_catchments + # Find intersecting nwm_catchments + print("Subsetting NWM Catchments for HUC{} {}".format(hucUnitLength,hucCode),flush=True) + nwm_catchments = gpd.read_file(nwm_catchments_filename, mask = wbd_buffer) + if extent == 'MS': + nwm_catchments = nwm_catchments.loc[nwm_catchments.mainstem==1] + + if len(nwm_catchments) > 0: + nwm_catchments.to_file(subset_nwm_catchments_filename,driver=getDriver(subset_nwm_catchments_filename),index=False) + else: + print ("No NHD catchments within HUC " + str(hucCode) + " boundaries.") + sys.exit(0) + del nwm_catchments # Subset nwm streams print("Subsetting NWM Streams and deriving headwaters for HUC{} {}".format(hucUnitLength,hucCode),flush=True) From 12ef27fe4ab57c50fa5cc083cf18413bc3caddf5 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 28 Apr 2021 11:25:34 -0500 Subject: [PATCH 47/66] Update CHANGELOG.md --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cc4cc499..307586fab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ All notable changes to this project will be documented in this file. We follow the [Semantic Versioning 2.0.0](http://semver.org/) format. +## v3.0.15.7 - 2021-04-28 - [PR #367](https://github.com/NOAA-OWP/cahaba/pull/367) + +Refactor synthesize_test_case.py to handle exceptions during multiprocessing. Resolves issue #351 + +## Changes +- refactored `inundation.py` and `run_test_case.py` to handle exceptions without using `sys.exit()`. + ## v3.0.15.6 - 2021-04-23 - [PR #365](https://github.com/NOAA-OWP/cahaba/pull/365) Implement CatFIM threshold flows to Sierra test and add AHPS benchmark preprocessing scripts. From 18d08225e94307657562ad5d30d6549cff48ca6c Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 28 Apr 2021 11:26:34 -0500 Subject: [PATCH 48/66] Update CHANGELOG.md --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 307586fab..00089e85e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Refactor synthesize_test_case.py to handle exceptions during multiprocessing. Re ## Changes - refactored `inundation.py` and `run_test_case.py` to handle exceptions without using `sys.exit()`. +
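The pattern, as reflected in the `inundation.py` and `run_test_case.py` hunks in the patches above, is to report failure through a return value rather than killing the interpreter, so pooled worker processes can fail gracefully. A minimal sketch of that pattern with simplified, hypothetical names:

```python
# Minimal sketch of the refactor: the worker returns a status code
# (previously it called sys.exit()), and the caller branches on it.
# Function and variable names here are illustrative only.
def inundate_sketch(hydro_table_is_empty, forecast_matches_hydrotable):
    if hydro_table_is_empty:
        print("All stream segments are within lake boundaries.")
        return 1
    if not forecast_matches_hydrotable:
        print("No matching feature IDs between forecast and hydrotable.")
        return 1
    # ... windowed inundation mapping would happen here ...
    return 0

inundate_test = inundate_sketch(False, True)
if inundate_test == 0:
    print("-----> Inundation mapping complete.")
elif inundate_test == 1:
    print("Skipping evaluation for this magnitude.")
```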

## v3.0.15.6 - 2021-04-23 - [PR #365](https://github.com/NOAA-OWP/cahaba/pull/365) Implement CatFIM threshold flows to Sierra test and add AHPS benchmark preprocessing scripts. @@ -27,11 +28,12 @@ Prevent eval_plots.py from erroring out when spatial argument enabled if certain ## Changes - Add check to make sure analyzed dataset is available prior to creating spatial dataset. -

+

## v3.0.15.4 - 2021-04-20 - [PR #356](https://github.com/NOAA-OWP/cahaba/pull/356) Closing all multiprocessing Pool objects in repo. +
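For reference, a generic illustration of the Pool cleanup pattern this entry refers to (not code from the repo):

```python
from multiprocessing import Pool

def work(huc):
    # stand-in task; real workers run per-HUC processing
    return int(huc)

if __name__ == "__main__":
    # Either close and join explicitly ...
    pool = Pool(processes=4)
    results = pool.map(work, ["12090301", "02020005"])
    pool.close()
    pool.join()

    # ... or use the context manager, which tears the pool down on exit.
    with Pool(processes=4) as pool:
        results = pool.map(work, ["12090301", "02020005"])
```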

## v3.0.15.3 - 2021-04-19 - [PR #358](https://github.com/NOAA-OWP/cahaba/pull/358) From 554efc5efef0fae83a48aebaa39d3b93effbd594 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 28 Apr 2021 11:30:15 -0500 Subject: [PATCH 49/66] Update CHANGELOG.md --- CHANGELOG.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00089e85e..82b0f0cd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ All notable changes to this project will be documented in this file. We follow the [Semantic Versioning 2.0.0](http://semver.org/) format. +

## v3.0.15.7 - 2021-04-28 - [PR #367](https://github.com/NOAA-OWP/cahaba/pull/367) Refactor synthesize_test_case.py to handle exceptions during multiprocessing. Resolves issue #351 @@ -53,6 +54,7 @@ Preprocess NHDPlus HR rasters for consistent projections, nodata values, and con ## v3.0.15.2 - 2021-04-16 - [PR #359](https://github.com/NOAA-OWP/cahaba/pull/359) Hotfix to preserve desired files when production flag used in `fim_run.sh`. + ## Changes - Fixed production whitelisted files. @@ -61,6 +63,7 @@ Hotfix to preserve desired files when production flag used in `fim_run.sh`. ## v3.0.15.1 - 2021-04-13 - [PR #355](https://github.com/NOAA-OWP/cahaba/pull/355) Sierra test considered all USGS gage locations to be mainstems even though many actually occurred with tributaries. This resulted in unrealistic comparisons as incorrect gages were assigned to mainstems segments. This feature branch identifies gages that are on mainstems via attribute field. + ## Changes - Modifies `usgs_gage_crosswalk.py` to filter out gages from the `usgs_gages.gpkg` layer such that for a "MS" run, only consider gages that contain rating curve information (via `curve` attribute) and are also mainstems gages (via `mainstems` attribute). @@ -74,7 +77,6 @@ Sierra test considered all USGS gage locations to be mainstems even though many - Adds the `extent` argument specified by user when running `fim_run.sh` to `usgs_gage_crosswalk.py`.
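A hedged sketch of the MS-run gage filter described in this entry; the `curve` and `mainstems` column names come from the text above, while the attribute values and file path are assumptions:

```python
import geopandas as gpd

# Sketch only: keep gages with rating curve data, and for MS runs also
# require that they sit on mainstems segments. The "yes" values are assumed.
usgs_gages = gpd.read_file("usgs_gages.gpkg")

extent = "MS"
if extent == "MS":
    usgs_gages = usgs_gages[(usgs_gages.curve == "yes") & (usgs_gages.mainstems == "yes")]
else:
    # FR runs: any gage with rating curve information
    usgs_gages = usgs_gages[usgs_gages.curve == "yes"]
```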

- ## v3.0.15.0 - 2021-04-08 - [PR #340](https://github.com/NOAA-OWP/cahaba/pull/340) Implementing a prototype technique to estimate the missing bathymetric component in the HAND-derived synthetic rating curves. The new Bathymetric Adjusted Rating Curve (BARC) function is built within the `fim_run.sh` workflow and will ingest bankfull geometry estimates provided by the user to modify the cross section area used in the synthetic rating curve generation. @@ -90,6 +92,7 @@ Implementing a prototype technique to estimate the missing bathymetric component - Imports the existing synthetic rating curve table and the bankfull geometry input data (topwidth and cross section area per COMID). - Performs new synthetic rating curve calculations with bathymetry estimation modifications. - Flags issues with the thalweg-notch artifact. +
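A conceptual sketch of the adjustment described above, not the BARC implementation itself: a bankfull cross section area estimate per COMID is added to the HAND-derived area before re-deriving discharge with Manning's equation. Column names and the bankfull table below are assumptions.

```python
import pandas as pd

# Sketch only. src_full_crosswalked.csv is the synthetic rating curve table;
# the bankfull geometry table (e.g. derived from BANKFULL_CONUS.txt) is a
# stand-in. xs_area, wetted_perimeter, slope and mannings_n are assumed names.
src = pd.read_csv("src_full_crosswalked.csv")
bankfull = pd.read_csv("bankfull_geometry.csv")   # COMID, bankfull_xs_area, topwidth

src = src.merge(bankfull[["COMID", "bankfull_xs_area"]],
                left_on="feature_id", right_on="COMID", how="left")

src["xs_area_adj"] = src["xs_area"] + src["bankfull_xs_area"].fillna(0)
src["hyd_radius_adj"] = src["xs_area_adj"] / src["wetted_perimeter"]
src["discharge_adj"] = (src["xs_area_adj"] * src["hyd_radius_adj"] ** (2 / 3)
                        * src["slope"] ** 0.5 / src["mannings_n"])
```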

## v3.0.14.0 - 2021-04-05 - [PR #338](https://github.com/NOAA-OWP/cahaba/pull/338) @@ -106,6 +109,7 @@ Create tool to retrieve rating curves from USGS sites and convert to elevation ( 1) `usgs_rating_curves.csv`: A csv file that contains rating curves (including converted to NAVD88 elevation) for USGS gages in a format that is compatible with `rating_curve_comparisons.py`. As it is is currently configured, only gages within CONUS will have rating curve data. 2) `log.csv`: A log file that records status for each gage and includes error messages. 3) `usgs_gages.gpkg`: A geospatial layer (in FIM projection) of all active USGS gages that meet a predefined criteria. Additionally, the `curve` attribute indicates whether a rating curve is found in the `usgs_rating_curves.csv`. This spatial file is only generated if the `all` option is passed with the `-l` argument. +
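A hedged sketch of the stage-to-elevation conversion implied by this entry: USGS ratings give stage (feet, relative to the gage datum) versus discharge (cfs), so NAVD88 elevation comes from adding a datum already expressed in NAVD88 and converting units. The datum value and column names below are illustrative assumptions.

```python
import pandas as pd

FT_TO_M = 0.3048

rating = pd.DataFrame({"stage_ft": [1.0, 2.5, 4.0],
                       "discharge_cfs": [120.0, 640.0, 1800.0]})
datum_navd88_ft = 712.35  # hypothetical gage datum (NAVD88, feet)

rating["elevation_navd88_m"] = (rating["stage_ft"] + datum_navd88_ft) * FT_TO_M
rating["discharge_cms"] = rating["discharge_cfs"] * FT_TO_M ** 3
```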

## v3.0.13.0 - 2021-04-01 - [PR #332](https://github.com/NOAA-OWP/cahaba/pull/332) @@ -119,8 +123,8 @@ Created tool to compare synthetic rating curve with benchmark rating curve (Sier ### Additions - `usgs_gage_crosswalk.py`: generates `usgs_elev_table.csv` in `run_by_unit.py` with elevation and additional attributes at USGS gages. - `rating_curve_comparison.py`: post-processing script to plot and calculate metrics between synthetic rating curves and USGS rating curve data. -

+

## v3.0.12.1 - 2021-03-31 - [PR #336](https://github.com/NOAA-OWP/cahaba/pull/336) Fix spatial option in `eval_plots.py` when creating plots and spatial outputs. @@ -133,8 +137,8 @@ Fix spatial option in `eval_plots.py` when creating plots and spatial outputs. ### Additions - Creates `fim_performance_points.shp`: this layer consists of all evaluated ahps points (with metrics). Spatial data retrieved from WRDS on the fly. - Creates `fim_performance_polys.shp`: this layer consists of all evaluated huc8s (with metrics). Spatial data retrieved from WBD layer. -
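A hedged sketch of how the HUC8 spatial layer could be assembled, assuming a metrics table keyed on HUC8; file names and columns are illustrative, not the script's actual interface:

```python
import geopandas as gpd
import pandas as pd

# Join evaluation metrics to WBD HUC8 polygons and write them out.
wbd_huc8 = gpd.read_file("WBD_National.gpkg", layer="WBDHU8")
metrics = pd.read_csv("aggregated_metrics.csv")   # one row per HUC8: CSI, POD, FAR, ...

performance_polys = wbd_huc8.merge(metrics, left_on="HUC8", right_on="huc8", how="inner")
performance_polys.to_file("fim_performance_polys.shp")
```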

+

## v3.0.12.0 - 2021-03-26 - [PR #327](https://github.com/NOAA-OWP/cahaba/pull/237) Add more detail/information to plotting capabilities. @@ -146,8 +150,8 @@ Add more detail/information to plotting capabilities. ### Additions - Optional argument to create barplots of CSI for each individual site. - Create a csv containing the data used to create the scatterplots. -

+

## v3.0.11.0 - 2021-03-22 - [PR #319](https://github.com/NOAA-OWP/cahaba/pull/298) Improvements to CatFIM service source data generation. @@ -160,16 +164,16 @@ Improvements to CatFIM service source data generation. ### Additions - Added `generate_categorical_fim.py` to wrap `generate_categorical_fim_flows.py` and `generate_categorical_fim_mapping.py`. - Create new `nws_lid_sites` shapefile located in same directory as the `catfim_library` shapefile. -

+

## v3.0.10.1 - 2021-03-24 - [PR #320](https://github.com/NOAA-OWP/cahaba/pull/320) Patch to synthesize_test_cases.py. ### Changes - Bug fix to `synthesize_test_cases.py` to allow comparison between `testing` version and `official` versions. -

+

## v3.0.10.0 - 2021-03-12 - [PR #298](https://github.com/NOAA-OWP/cahaba/pull/298) Preprocessing of flow files for Categorical FIM. @@ -183,8 +187,8 @@ Preprocessing of flow files for Categorical FIM. ### Changes - Stability fixes to `generate_categorical_fim.py`. -

+

## v3.0.9.0 - 2021-03-12 - [PR #297](https://github.com/NOAA-OWP/cahaba/pull/297) Enhancements to FIM API. From 958a0d960f6992627a0a4904039213813ac7840e Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 28 Apr 2021 13:39:44 -0500 Subject: [PATCH 50/66] Update CHANGELOG.md --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82b0f0cd3..4c6e3ade4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ All notable changes to this project will be documented in this file. We follow the [Semantic Versioning 2.0.0](http://semver.org/) format. +

+## v3.0.15.8 - 2021-04-29 - [PR #371](https://github.com/NOAA-OWP/cahaba/pull/371)
+
+Refactor NHDPlus HR preprocessing workflow. Resolves issue #238.
+
+### Changes
+- Consolidate the NHD streams, NWM catchments, and headwaters MS and FR layers with a `mainstem` column.
+- HUC8 intersections are included in the input headwaters layer.
+- `clip_vectors_to_wbd.py` removes incoming stream segments from the selected layers.
+
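The consolidation described in the first bullet can be sketched as flagging the mainstem layer and concatenating it with the full-resolution layer. The file paths, the `NHDPlusID` de-duplication key, and the function below are assumptions used for illustration, not the code in this PR.

```python
import geopandas as gpd
import pandas as pd

def consolidate_ms_fr(ms_path, fr_path, out_path):
    """Illustrative only: merge mainstem (MS) and full-resolution (FR) stream
    layers into one layer, flagging MS features with a boolean `mainstem`
    column. Paths and the de-duplication key are assumptions."""
    ms = gpd.read_file(ms_path)
    fr = gpd.read_file(fr_path)
    ms['mainstem'] = True
    fr['mainstem'] = False
    combined = pd.concat([ms, fr], ignore_index=True)
    # Keep one copy of features present in both layers, preferring the MS flag.
    combined = combined.sort_values('mainstem', ascending=False)
    combined = combined.drop_duplicates(subset='NHDPlusID', keep='first')
    gpd.GeoDataFrame(combined, crs=ms.crs).to_file(out_path, driver='GPKG')

# consolidate_ms_fr('nhd_streams_ms.gpkg', 'nhd_streams_fr.gpkg', 'nhd_streams.gpkg')
```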

## v3.0.15.7 - 2021-04-28 - [PR #367](https://github.com/NOAA-OWP/cahaba/pull/367) From 86064580dd4ddc6506ad35079b3163a35cfeae53 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Mon, 3 May 2021 15:46:57 +0000 Subject: [PATCH 51/66] initial elevation profile tools --- src/agreedem.py | 171 ++++++++-------- src/reduce_nhd_stream_density.py | 7 + src/run_by_unit.sh | 13 +- src/thalweg_drop_check.py | 330 +++++++++++++++++++++++++++++++ tools/thalweg_comparison.py | 225 +++++++++++++++++++++ 5 files changed, 664 insertions(+), 82 deletions(-) create mode 100755 src/thalweg_drop_check.py create mode 100755 tools/thalweg_comparison.py diff --git a/src/agreedem.py b/src/agreedem.py index dbff2d2d4..1abeef552 100755 --- a/src/agreedem.py +++ b/src/agreedem.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + import rasterio import numpy as np import os @@ -36,13 +37,15 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff None. ''' - #------------------------------------------------------------------ - # 1. From Hellweger documentation: Compute the vector grid - # (vectgrid). The cells in the vector grid corresponding to the - # lines in the vector coverage have data. All other cells have no - # data. - # Import dem layer and river layer and get dem profile. + ''' + ------------------------------------------------------------------ + 1. From Hellweger documentation: Compute the vector grid (vectgrid). + The cells in the vector grid corresponding to the lines in the vector + coverage have data. All other cells have no data. + ''' + + # Import dem layer and river layer and get dem profile elev = rasterio.open(dem) dem_profile = elev.profile @@ -66,15 +69,17 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff river_raw_data_window = rivers.read(1, window = window) river_data_window = np.where(elev_mask_window == True, river_raw_data_window, 0) - #--------------------------------------------------------------- - # 2. From Hellweger documentation: Compute the smooth drop/raise - # grid (smogrid). The cells in the smooth drop/raise grid - # corresponding to the vector lines have an elevation equal to that - # of the original DEM (oelevgrid) plus a certain distance - # (smoothdist). All other cells have no data. - - # Assign smooth distance and calculate the smogrid. - smooth_dist = -1 * smooth_drop # in meters. + ''' + --------------------------------------------------------------- + 2. From Hellweger documentation: Compute the smooth drop/raise + grid (smogrid). The cells in the smooth drop/raise grid + corresponding to the vector lines have an elevation equal to that + of the original DEM (oelevgrid) plus a certain distance + (smoothdist). All other cells have no data. + ''' + + # Assign smooth distance and calculate the smogrid + smooth_dist = -1 * smooth_drop # in meters smogrid_window = river_data_window*(elev_data_window + smooth_dist) # Write out raster @@ -83,23 +88,26 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff elev.close() rivers.close() raster.close() - #------------------------------------------------------------------ - # 3. From Hellweger documentation: Compute the vector distance grids - # (vectdist and vectallo). The cells in the vector distance grid - # (vectdist) store the distance to the closest vector cell. The - # cells in vector allocation grid (vectallo) store the elevation of - # the closest vector cell. - - # Compute allocation and proximity grid using GRASS gis - # r.grow.distance tool. 
Output distance grid in meters. Set datatype - # for output allocation and proximity grids to float32. + + ''' + ------------------------------------------------------------------ + 3. From Hellweger documentation: Compute the vector distance grids + (vectdist and vectallo). The cells in the vector distance grid + (vectdist) store the distance to the closest vector cell. The + cells in vector allocation grid (vectallo) store the elevation of + the closest vector cell. + ''' + # Compute allocation and proximity grid using GRASS gis r.grow.distance tool. + # Output distance grid in meters. Set datatype for output allocation and proximity grids to float32. vectdist_grid, vectallo_grid = r_grow_distance(smo_output, grass_workspace, 'Float32', 'Float32') - #------------------------------------------------------------------ - # 4. From Hellweger documentation: Compute the buffer grid - # (bufgrid2). The cells in the buffer grid outside the buffer - # distance (buffer) store the original elevation. The cells in the - # buffer grid inside the buffer distance have no data. + ''' + ------------------------------------------------------------------ + 4. From Hellweger documentation: Compute the buffer grid + (bufgrid2). The cells in the buffer grid outside the buffer + distance (buffer) store the original elevation. The cells in the + buffer grid inside the buffer distance have no data. + ''' # Open distance, allocation, elevation grids. vectdist = rasterio.open(vectdist_grid) @@ -120,35 +128,35 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff vectallo_data_window = vectallo.read(1, window = window) elev_data_window = elev.read(1, window = window) - # Define buffer distance and calculate adjustment to compute the - # bufgrid. + # Define buffer distance and calculate adjustment to compute the bufgrid. # half_res adjustment equal to half distance of one cell half_res = elev.res[0]/2 final_buffer = buffer_dist - half_res # assume all units in meters. - # Calculate bufgrid. Assign NODATA to areas where vectdist_data <= - # buffered value. + # Calculate bufgrid. Assign NODATA to areas where vectdist_data <= buffered value. bufgrid_window = np.where(vectdist_data_window > final_buffer, elev_data_window, dem_profile['nodata']) - # Write out raster. + # Write out raster raster.write(bufgrid_window.astype('float32'), indexes = 1, window = window) vectdist.close() vectallo.close() elev.close() - #------------------------------------------------------------------ - # 5. From Hellweger documentation: Compute the buffer distance grids - # (bufdist and bufallo). The cells in the buffer distance grid - # (bufdist) store the distance to the closest valued buffer grid - # cell (bufgrid2). The cells in buffer allocation grid (bufallo) - # store the elevation of the closest valued buffer cell. - - # Compute allocation and proximity grid using GRASS gis - # r.grow.distance. Output distance grid in meters. Set datatype for - # output allocation and proximity grids to float32. + + ''' + ------------------------------------------------------------------ + 5. From Hellweger documentation: Compute the buffer distance grids + (bufdist and bufallo). The cells in the buffer distance grid + (bufdist) store the distance to the closest valued buffer grid + cell (bufgrid2). The cells in buffer allocation grid (bufallo) + store the elevation of the closest valued buffer cell. + ''' + + # Compute allocation and proximity grid using GRASS gis r.grow.distance. + # Output distance grid in meters. 
Set datatype for output allocation and proximity grids to float32. bufdist_grid, bufallo_grid = r_grow_distance(buf_output, grass_workspace, 'Float32', 'Float32') - # Open distance, allocation, elevation grids. + # Open distance, allocation, elevation grids bufdist = rasterio.open(bufdist_grid) bufallo = rasterio.open(bufallo_grid) vectdist = rasterio.open(vectdist_grid) @@ -156,7 +164,7 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff rivers = rasterio.open(rivers_raster) elev = rasterio.open(dem) - # Define profile output file. + # Define profile output file agree_output = output_raster agree_profile = dem_profile.copy() agree_profile.update(dtype = 'float32') @@ -165,7 +173,7 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff with rasterio.Env(): with rasterio.open(agree_output, 'w', **agree_profile) as raster: for ji, window in elev.block_windows(1): - # Read elevation data and mask, distance and allocation grids, and river data. + # Read elevation data and mask, distance and allocation grids, and river data elev_data_window = elev.read(1, window = window) elev_mask_window = elev.read_masks(1, window = window).astype('bool') bufdist_data_window = bufdist.read(1, window = window) @@ -176,37 +184,42 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff river_data_window = np.where(elev_mask_window == True, river_raw_data_window, -20.0) - #------------------------------------------------------------------ - # 6. From Hellweger documentation: Compute the smooth modified - # elevation grid (smoelev). The cells in the smooth modified - # elevation grid store the results of the smooth surface - # reconditioning process. Note that for cells outside the buffer the - # equation below assigns the original elevation. - - # Calculate smoelev. - smoelev_window = vectallo_data_window + ((bufallo_data_window - vectallo_data_window)/(bufdist_data_window + vectdist_data_window)) * vectdist_data_window - #------------------------------------------------------------------ - # 7. From Hellweger documentation: Compute the sharp drop/raise grid - # (shagrid). The cells in the sharp drop/raise grid corresponding to - # the vector lines have an elevation equal to that of the smooth - # modified elevation grid (smoelev) plus a certain distance - # (sharpdist). All other cells have no data. + ''' + ------------------------------------------------------------------ + 6. From Hellweger documentation: Compute the smooth modified + elevation grid (smoelev). The cells in the smooth modified + elevation grid store the results of the smooth surface + reconditioning process. Note that for cells outside the buffer the + equation below assigns the original elevation. + ''' - # Define sharp drop distance and calculate the sharp drop grid where - # only river cells are dropped by the sharp_dist amount. - sharp_dist = -1 * sharp_drop # in meters. + # Calculate smoelev + smoelev_window = vectallo_data_window + ((bufallo_data_window - vectallo_data_window)/(bufdist_data_window + vectdist_data_window)) * vectdist_data_window + + ''' + ------------------------------------------------------------------ + 7. From Hellweger documentation: Compute the sharp drop/raise grid + (shagrid). The cells in the sharp drop/raise grid corresponding to + the vector lines have an elevation equal to that of the smooth + modified elevation grid (smoelev) plus a certain distance + (sharpdist). All other cells have no data. 
+ ''' + + # Define sharp drop distance and calculate the sharp drop grid where only river cells are dropped by the sharp_dist amount. + sharp_dist = -1 * sharp_drop # in meters shagrid_window = (smoelev_window + sharp_dist) * river_data_window - #------------------------------------------------------------------ - # 8. From Hellweger documentation: Compute the modified elevation - # grid (elevgrid). The cells in the modified elevation grid store - # the results of the surface reconditioning process. Note that for - # cells outside the buffer the the equation below assigns the - # original elevation. + ''' + ------------------------------------------------------------------ + 8. From Hellweger documentation: Compute the modified elevation + grid (elevgrid). The cells in the modified elevation grid store + the results of the surface reconditioning process. Note that for + cells outside the buffer the the equation below assigns the + original elevation. + ''' - # Merge sharp drop grid with smoelev grid. Then apply the same - # NODATA mask as original elevation grid. + # Merge sharp drop grid with smoelev grid. Then apply the same NODATA mask as original elevation grid. elevgrid_window = np.where(river_data_window == 0, smoelev_window, shagrid_window) agree_dem_window = np.where(elev_mask_window == True, elevgrid_window, dem_profile['nodata']) @@ -219,7 +232,8 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff vectallo.close() rivers.close() elev.close() - # If the '-t' flag is called, intermediate data is removed. + + # If the '-t' flag is called, intermediate data is removed if delete_intermediate_data: os.remove(smo_output) os.remove(buf_output) @@ -231,7 +245,7 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff if __name__ == '__main__': - #Parse arguments + # Parse arguments parser = argparse.ArgumentParser(description = 'Calculate AGREE DEM') parser.add_argument('-r', '--rivers', help = 'flows grid boolean layer', required = True) parser.add_argument('-d', '--dem_m', help = 'DEM raster in meters', required = True) @@ -243,10 +257,9 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff parser.add_argument('-sh', '---sharp', help = 'Sharp drop (m)', required = True) parser.add_argument('-t', '--del', help = 'Optional flag to delete intermediate datasets', action = 'store_true') - #Extract to dictionary and assign to variables. 
+ # Extract to dictionary and assign to variables args = vars(parser.parse_args()) - # rename variable inputs rivers_raster = args['rivers'] dem = args['dem_m'] workspace = args['workspace'] @@ -257,5 +270,5 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff sharp_drop = float(args['sharp']) delete_intermediate_data = args['del'] - #Run agreedem + # Run agreedem agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buffer_dist, smooth_drop, sharp_drop, delete_intermediate_data) diff --git a/src/reduce_nhd_stream_density.py b/src/reduce_nhd_stream_density.py index 9614bfe32..5ab8cf8de 100644 --- a/src/reduce_nhd_stream_density.py +++ b/src/reduce_nhd_stream_density.py @@ -37,6 +37,7 @@ def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_,headwaters_file # Masking headwaters by HUC8 headwaters_mask = gpd.read_file(headwaters_filename, mask = huc8_mask) + # headwaters_mask = headwaters_mask.loc[headwaters_mask.headwater=True] headwaters_mask = headwaters_mask.reset_index(drop=True) # Masking subset streams by HUC8 @@ -64,6 +65,12 @@ def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_,headwaters_file streams_subset[id_col] = n # Find stream segment closest to headwater point + # co_located_sites = headwaters_mask.loc[headwaters_mask.co_located==True].to_list() + # true_headwater_sites = headwaters_mask.loc[headwaters_mask.co_located==False].to_list() + + # additional headwaters = function_to_determine_true_headwater(co_located_sites) + # headwaters_mask = true_headwater_sites.append(additional) + for index, point in headwaters_mask.iterrows(): # Convert headwaterpoint geometries to WKB representation diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 0c5e65cf5..be57108a7 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -139,9 +139,8 @@ $srcDir/burn_in_levees.py -dem $outputHucDataDir/dem_meters.tif -nld $outputHucD Tcount ## DEM Reconditioning ## -# Using AGREE methodology, hydroenforce the DEM so that it is consistent -# with the supplied stream network. This allows for more realistic catchment -# delineation which is ultimately reflected in the output FIM mapping. +# Using AGREE methodology, hydroenforce the DEM so that it is consistent with the supplied stream network. +# This allows for more realistic catchment delineation which is ultimately reflected in the output FIM mapping. 
echo -e $startDiv"Creating AGREE DEM using $agree_DEM_buffer meter buffer"$stopDiv date -u Tstart @@ -149,6 +148,14 @@ Tstart $srcDir/agreedem.py -r $outputHucDataDir/flows_grid_boolean.tif -d $outputHucDataDir/dem_meters.tif -w $outputHucDataDir -g $outputHucDataDir/temp_work -o $outputHucDataDir/dem_burned.tif -b $agree_DEM_buffer -sm 10 -sh 1000 Tcount +## CHECK THALWEG DROP ## +echo -e $startDiv"Check Thalweg Drop $hucNumber"$stopDiv +date -u +Tstart +[ -f $outputHucDataDir/dem_burned.tif ] && \ +$srcDir/thalweg_drop_check.py -d $outputHucDataDir/NHDPlusBurnLineEvent_subset.gpkg -o $outputHucDataDir/dem_burned.tif +Tcount + ## PIT REMOVE BURNED DEM ## echo -e $startDiv"Pit remove Burned DEM $hucNumber"$stopDiv date -u diff --git a/src/thalweg_drop_check.py b/src/thalweg_drop_check.py new file mode 100755 index 000000000..3c42f6081 --- /dev/null +++ b/src/thalweg_drop_check.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python3 + +import os +import sys +import geopandas as gpd +from shapely.geometry import Point +import rasterio +import pandas as pd +import numpy as np +import argparse +import matplotlib.pyplot as plt +import seaborn as sns +from collections import deque +from functools import reduce +from os.path import isfile, join, dirname +import shutil +import warnings +from pathlib import Path +from collections import OrderedDict +import time +warnings.simplefilter(action='ignore', category=FutureWarning) + +""" + Plot Rating Curves and Compare to USGS Gages + + Parameters + ---------- + fim_dir : str + Directory containing FIM output folders. + output_dir : str + Directory containing rating curve plots and tables. + usgs_gages_filename : str + File name of USGS rating curves. + nwm_flow_dir : str + Directory containing NWM recurrence flows files. + number_of_jobs : str + Number of jobs. + stat_groups : str + string of columns to group eval metrics. 
+""" +outfolder = '/data/outputs/single_pixel_huc_ms_c/02030103' +# outfolder = '/data/outputs/single_pixel_huc_ms_c/12090301' + +dem_meters_filename = os.path.join(outfolder,'dem_meters.tif') +dem_burned_filename = os.path.join(outfolder,'dem_burned.tif') +dem_burned_filled_filename = os.path.join(outfolder,'dem_burned_filled.tif') +dem_lateral_thalweg_adj_filename = os.path.join(outfolder,'dem_lateral_thalweg_adj.tif') +dem_thalwegCond_filename = os.path.join(outfolder,'dem_thalwegCond.tif') + +reaches_filename = os.path.join(outfolder,'NHDPlusBurnLineEvent_subset.gpkg') + + +def compare_thalweg(args): + + huc = args[0] + reaches_split_points_filename = args[1] + reaches_filename = args[2] + dem_burned_filename = args[3] + dem_meters_filename = args[4] + +# reaches_split_points = gpd.read_file(reaches_split_points_filename) +reaches = gpd.read_file(reaches_filename) +dem_meters = rasterio.open(dem_meters_filename,'r') +dem_burned = rasterio.open(dem_burned_filename,'r') +dem_burned_filled = rasterio.open(dem_burned_filled_filename,'r') +dem_lateral_thalweg_adj = rasterio.open(dem_lateral_thalweg_adj_filename,'r') +dem_thalwegCond = rasterio.open(dem_thalwegCond_filename,'r') + +### Get lists of all complete reaches using headwater attributes +######################################### + + +headwater_col = 'true_headwater' +reaches[headwater_col] = False +reaches.loc[reaches.NHDPlusID==10000100014087.0,headwater_col] = True +headwaters = reaches.loc[reaches[headwater_col]==True] + +for index, headwater in headwaters.iterrows(): + reaches["headwater_path"] = headwater.nws_lid + reaches.set_index('NHDPlusID',inplace=True,drop=False) + + stream_path = get_downstream_segments(reaches,headwater_col, 'downstream') + + +def get_downstream_segments(streams, headwater_col,flag_column): + streams[flag_column] = False + streams.loc[streams[headwater_col],flag_column] = True + Q = deque(streams.loc[streams[headwater_col],'NHDPlusID'].tolist()) + visited = set() + while Q: + q = Q.popleft() + if q in visited: + continue + visited.add(q) + toNode,DnLevelPat = streams.loc[q,['ToNode','DnLevelPat']] + try: + downstream_ids = streams.loc[streams['FromNode'] == toNode,:].index.tolist() + except ValueError: # 18050002 has duplicate nhd stream feature + if len(toNode.unique()) == 1: + toNode = toNode.iloc[0] + downstream_ids = streams.loc[streams['FromNode'] == toNode,:].index.tolist() + # If multiple downstream_ids are returned select the ids that are along the main flow path (i.e. 
exclude segments that are diversions) + if len(set(downstream_ids))>1: # special case: remove duplicate NHDPlusIDs + relevant_ids = [segment for segment in downstream_ids if DnLevelPat == streams.loc[segment,'LevelPathI']] + else: + relevant_ids = downstream_ids + streams.loc[relevant_ids,flag_column] = True + for i in relevant_ids: + if i not in visited: + Q.append(i) + streams = streams.loc[streams[flag_column],:] + return(streams) + +######################################### +# Collect elevation values from multiple grids along each individual reach point + +# Get all vertices +split_points = [] +stream_ids = [] +dem_m_elev = [] +dem_burned_elev = [] +dem_burned_filled_elev = [] +dem_lat_thal_adj_elev = [] +dem_thal_adj_elev = [] +index_count = [] +count = 0 +for index, segment in stream_path.iterrows(): + lineString = segment.geometry + # x,y = lineString.coords.xy + # count = len(x) + for point in zip(*lineString.coords.xy): + stream_ids = stream_ids + [segment.NHDPlusID] + split_points = split_points + [Point(point)] + count = count + 1 + index_count = index_count + [count] + dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(point).coords), indexes=1))).item()] + dem_burned_elev = dem_burned_elev + [np.array(list(dem_burned.sample((Point(point).coords), indexes=1))).item()] + dem_burned_filled_elev = dem_burned_filled_elev + [np.array(list(dem_burned_filled.sample((Point(point).coords), indexes=1))).item()] + dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(point).coords), indexes=1))).item()] + dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(point).coords), indexes=1))).item()] + +dem_m_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'dem_m', 'elevation_m': dem_m_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') +# dem_burned_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'dem_burned', 'elevation_m': dem_burned_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') +dem_burned_filled_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'dem_burned_filled', 'elevation_m': dem_burned_filled_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') +dem_lat_thal_adj_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'dem_lat_thal_adj', 'elevation_m': dem_lat_thal_adj_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') +dem_thal_adj_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'thal_adj_dem', 'elevation_m': dem_thal_adj_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') + +burnline_points = dem_m_pts.append([dem_thal_adj_pts,dem_lat_thal_adj_pts]) # dem_burned_pts, dem_burned_filled_pts, + +# remove nodata_pts +burnline_points = burnline_points.loc[burnline_points.elevation_m>-9999.0] +# burnline_points = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'ToNode': ToNodes, 'FromNode': FromNodes, 'elevation_m': dem_burned_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') + +######################################### +# Identify significant drops in elevation (trace multiple grids) +def find_elevation_drops(burnline_points): + drop_streams = [] + for index, segment in burnline_points.iterrows(): + upstream_elev = segment.elevation_m + try: + downstream_elev = 
burnline_points.loc[(burnline_points.index_count==(segment.index_count + 1))].elevation_m.item() + if (downstream_elev - upstream_elev) > 5: + print (f"elevation drop of {downstream_elev - upstream_elev} meters ") + drop_streams = drop_streams + [index] + except: # terminal point + pass + return drop_streams + +burnline_points["headwater_path"] = 'WNQN4' + +profile_plots_filename = '/data/outputs/single_pixel_huc_ms_c/02030103/profile_drop_plots2.png' + +# num_plots = len(burnline_points.headwater_path.unique()) +num_plots = len(burnline_points.source.unique()) + +if num_plots > 3: + columns = num_plots // 3 +else: + columns = 1 + +sns.set(style="ticks") +# g = sns.FacetGrid(burnline_points, col="headwater_path", hue="source",sharex=True, sharey=False,col_wrap=columns) +# g.map(sns.lineplot, "index_count", "elevation_m", palette="tab20c") # , marker="o" +# g.set_axis_labels(x_var="Longitudinal Distance (ft)", y_var="Elevation (ft)") +g = sns.FacetGrid(burnline_points, col="source", hue="headwater_path",sharex=True, sharey=False,col_wrap=columns) +g.map(sns.lineplot, "index_count", "elevation_m", palette="tab20c") # , marker="o" +g.set_axis_labels(x_var="Longitudinal Distance (ft)", y_var="Elevation (ft)") + +# Iterate thorugh each axis to get individual y-axis bounds +for ax in g.axes.flat: + print (ax.lines) + mins = [] + maxes = [] + for line in ax.lines: + mins = mins + [min(line.get_ydata())] + maxes = maxes + [max(line.get_ydata())] + min_y = min(mins) - (max(maxes) - min(mins))/10 + # min_y = -100 + max_y = max(maxes) + (max(maxes) - min(mins))/10 + ax.set_ylim(min_y,max_y) + +# Adjust the arrangement of the plots +g.fig.tight_layout(w_pad=1) +g.add_legend() + +plt.savefig(profile_plots_filename) +plt.close() + +############################################################################################################################################### + +dem_thalweg_elevations = pd.DataFrame({'HydroID': hydroid, 'pt_order': index_order, 'elevation_m': dem_m_elev,'source': 'thalweg_adj'}) +dem_adj_thalweg_elevations = pd.DataFrame({'HydroID': hydroid, 'pt_order': index_order, 'elevation_m': thal_adj_elev,'source': 'dem_meters'}) + +all_elevations = dem_thalweg_elevations.append(dem_adj_thalweg_elevations) + +reach_att = reaches[['HydroID', 'From_Node', 'To_Node', 'NextDownID']] + +thalweg_elevations = all_elevations.merge(reach_att, on="HydroID") + +# Find segments where elevation drops 5 m per +# drops = thalweg_elevations.loc[thalweg_elevations.HydroID +# all_hydro_ids = dict(thalweg_elevations[['HydroID','elevation_m']]) +thalweg_elevations.NextDownID = thalweg_elevations.NextDownID.astype('int') +dem_adj_thalweg_elevations = thalweg_elevations.loc[thalweg_elevations.source=='thalweg_adj'] +min_index = dem_adj_thalweg_elevations.groupby(['HydroID']).pt_order.min() +min_index = min_index.reset_index() +min_index = min_index.rename(columns={'pt_order': 'min_index'}) + +for index, downstream_id in dem_adj_thalweg_elevations.iterrows(): + if index == 1: + break + if downstream_id.NextDownID != -1: + downstream_elevs = dem_adj_thalweg_elevations.loc[(dem_adj_thalweg_elevations.HydroID==downstream_id.NextDownID) & (dem_adj_thalweg_elevations.source=='thalweg_adj')].elevation_m + if (downstream_id.elevation_m - downstream_elevs[0]) > 5: + print (f"HydroID {HydroID} drops {(downstream_id.elevation_m - downstream_elev)} meters down from HydroID {NextDownID}") + upstream_elev = dem_adj_thalweg_elevations.loc[dem_adj_thalweg_elevations.NextDownID==downstream_id.NextDownID].elevation_m + +# 
drops = thalweg_elevations. + +select_hydroids = [10680001,10680002,10680020,10680034,10680061,10680076,10680077,10680148,10680094] + +select_elevations = thalweg_elevations.loc[thalweg_elevations.HydroID.isin(select_hydroids)] + +# Convert index to longitudinal distance + +# Find reference index for each segment to convert index to longitudinal distance +min_index = select_elevations.groupby(['HydroID']).pt_order.min() +min_index = min_index.reset_index() +min_index = min_index.rename(columns={'pt_order': 'min_index'}) + +# Subtract reference index from index and convert to feet +segment_distance = pd.merge(select_elevations[['HydroID', 'pt_order','source']],min_index, on="HydroID").reset_index(drop=True) +segment_distance['distance'] = (segment_distance.pt_order - segment_distance.min_index)* 32.8084 +segment_distance.distance = segment_distance.distance.round(1) +# merge distances back into table +select_elevations = select_elevations.reset_index(drop=True) +# segment_distance_sub = segment_distance.filter(items=['HydroID', 'distance']).reset_index(drop=True) +select_elevations = pd.concat([select_elevations.set_index('HydroID'), segment_distance[['HydroID', 'distance']].set_index('HydroID')], axis=1, join="inner") +select_elevations = select_elevations.reset_index() +# Convert elevation to feet +select_elevations['elevation_ft'] = select_elevations.elevation_m * 3.28084 # convert from m to ft +select_elevations.elevation_ft = select_elevations.elevation_ft.round(1) + +select_elevations = select_elevations.sort_values(['HydroID', 'distance','elevation_ft'], ascending=[1, 0, 0]) +select_elevations = select_elevations.reset_index(drop=True) + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='generate rating curve plots and tables for FIM and USGS gages') + parser.add_argument('-fim_dir','--fim-dir', help='FIM output dir', required=True,type=str) + parser.add_argument('-output_dir','--output-dir', help='rating curves output folder', required=True,type=str) + parser.add_argument('-gages','--usgs-gages-filename',help='USGS rating curves',required=True,type=str) + parser.add_argument('-flows','--nwm-flow-dir',help='NWM recurrence flows dir',required=True,type=str) + parser.add_argument('-catfim', '--catfim-flows-filename', help='Categorical FIM flows file',required = True,type=str) + parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) + parser.add_argument('-group','--stat-groups',help='column(s) to group stats',required=False,type=str) + + args = vars(parser.parse_args()) + + fim_dir = args['fim_dir'] + output_dir = args['output_dir'] + usgs_gages_filename = args['usgs_gages_filename'] + nwm_flow_dir = args['nwm_flow_dir'] + catfim_flows_filename = args['catfim_flows_filename'] + number_of_jobs = args['number_of_jobs'] + stat_groups = args['stat_groups'] + + stat_groups = stat_groups.split() + procs_list = [] + + plots_dir = join(output_dir,'plots') + os.makedirs(plots_dir, exist_ok=True) + tables_dir = join(output_dir,'tables') + os.makedirs(tables_dir, exist_ok=True) + + #Check age of gages csv and recommend updating if older than 30 days. 
+ print(check_file_age(usgs_gages_filename)) + + # Open log file + sys.__stdout__ = sys.stdout + log_file = open(join(output_dir,'rating_curve_comparison.log'),"w") + sys.stdout = log_file + + huc_list = os.listdir(fim_dir) + for huc in huc_list: + + if huc != 'logs': + elev_table_filename = join(fim_dir,huc,'usgs_elev_table.csv') + hydrotable_filename = join(fim_dir,huc,'hydroTable.csv') + usgs_recurr_stats_filename = join(tables_dir,f"usgs_interpolated_elevation_stats_{huc}.csv") + nwm_recurr_data_filename = join(tables_dir,f"nwm_recurrence_flow_elevations_{huc}.csv") + rc_comparison_plot_filename = join(plots_dir,f"FIM-USGS_rating_curve_comparison_{huc}.png") + + if isfile(elev_table_filename): + procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir, catfim_flows_filename, huc]) + + # Initiate multiprocessing + print(f"Generating rating curve metrics for {len(procs_list)} hucs using {number_of_jobs} jobs") + with Pool(processes=number_of_jobs) as pool: + pool.map(generate_rating_curve_metrics, procs_list) + + print(f"Aggregating rating curve metrics for {len(procs_list)} hucs") + aggregate_metrics(output_dir,procs_list,stat_groups) + + print('Delete intermediate tables') + shutil.rmtree(tables_dir, ignore_errors=True) + + # Close log file + sys.stdout = sys.__stdout__ + log_file.close() diff --git a/tools/thalweg_comparison.py b/tools/thalweg_comparison.py new file mode 100755 index 000000000..5f9f734e6 --- /dev/null +++ b/tools/thalweg_comparison.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 + +import os +import sys +import geopandas as gpd +import rasterio +import pandas as pd +import numpy as np +import argparse +import matplotlib.pyplot as plt +import seaborn as sns +from functools import reduce +from multiprocessing import Pool +from os.path import isfile, join, dirname +import shutil +import warnings +from pathlib import Path +import time +warnings.simplefilter(action='ignore', category=FutureWarning) + +""" + Plot Rating Curves and Compare to USGS Gages + + Parameters + ---------- + fim_dir : str + Directory containing FIM output folders. + output_dir : str + Directory containing rating curve plots and tables. + usgs_gages_filename : str + File name of USGS rating curves. + nwm_flow_dir : str + Directory containing NWM recurrence flows files. + number_of_jobs : str + Number of jobs. + stat_groups : str + string of columns to group eval metrics. 
+""" +outfolder = '/data/outputs/single_pixel_huc_ms_c/02030103' # dev_v3_0_15_7_adj_huc_test +dem_thalwegCond_filename = os.path.join(outfolder,'dem_thalwegCond.tif') +dem_meters_filename = os.path.join(outfolder,'dem_meters.tif') +reaches_split_points_filename = os.path.join(outfolder,'demDerived_reaches_split_points.gpkg') +reaches_filename = os.path.join(outfolder,'demDerived_reaches_split.gpkg') + + +def compare_thalweg(args): + + huc = args[0] + reaches_split_points_filename = args[1] + reaches_filename = args[2] + dem_thalwegCond_filename = args[3] + dem_meters_filename = args[4] + +reaches_split_points = gpd.read_file(reaches_split_points_filename) +reaches = gpd.read_file(reaches_filename) +dem_thalwegCond = rasterio.open(dem_thalwegCond_filename,'r') +dem_meters = rasterio.open(dem_meters_filename,'r') + +plot_filename = '/data/outputs/single_pixel_huc_ms_c/02030103/elev_plots.png' + +reaches_split_points = reaches_split_points.rename(columns={'id': 'HydroID'}) + +hydroid = [] +index_order = [] +thal_adj_elev = [] +dem_m_elev = [] +for index, point in reaches_split_points.iterrows(): + hydroid = hydroid + [point.HydroID] + index_order = index_order + [index] + dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((point.geometry.coords), indexes=1))).item()] + thal_adj_elev = thal_adj_elev + [np.array(list(dem_thalwegCond.sample((point.geometry.coords), indexes=1))).item()] + +dem_thalweg_elevations = pd.DataFrame({'HydroID': hydroid, 'pt_order': index_order, 'elevation_m': dem_m_elev,'source': 'dem_meters'}) +dem_adj_thalweg_elevations = pd.DataFrame({'HydroID': hydroid, 'pt_order': index_order, 'elevation_m': thal_adj_elev,'source': 'thalweg_adj'}) + +all_elevations = dem_thalweg_elevations.append(dem_adj_thalweg_elevations) + +reach_att = reaches[['HydroID', 'From_Node', 'To_Node', 'NextDownID']] + +thalweg_elevations = all_elevations.merge(reach_att, on="HydroID") + +# Find segments where elevation drops 5 m per +# drops = thalweg_elevations.loc[thalweg_elevations.HydroID +# all_hydro_ids = dict(thalweg_elevations[['HydroID','elevation_m']]) +thalweg_elevations.NextDownID = thalweg_elevations.NextDownID.astype('int') +dem_adj_thalweg_elevations = thalweg_elevations.loc[thalweg_elevations.source=='thalweg_adj'] +min_index = dem_adj_thalweg_elevations.groupby(['HydroID']).pt_order.min() +min_index = min_index.reset_index() +min_index = min_index.rename(columns={'pt_order': 'min_index'}) + +for index, downstream_id in dem_adj_thalweg_elevations.iterrows(): + if index == 1: + break + if downstream_id.NextDownID != -1: + downstream_elevs = dem_adj_thalweg_elevations.loc[(dem_adj_thalweg_elevations.HydroID==downstream_id.NextDownID) & (dem_adj_thalweg_elevations.source=='thalweg_adj')].elevation_m + if (downstream_id.elevation_m - downstream_elevs[0]) > 5: + print (f"HydroID {HydroID} drops {(downstream_id.elevation_m - downstream_elev)} meters down from HydroID {NextDownID}") + upstream_elev = dem_adj_thalweg_elevations.loc[dem_adj_thalweg_elevations.NextDownID==downstream_id.NextDownID].elevation_m + +# drops = thalweg_elevations. 
+ +select_hydroids = [10680001,10680002,10680020,10680034,10680061,10680076,10680077,10680148,10680094] + +select_elevations = thalweg_elevations.loc[thalweg_elevations.HydroID.isin(select_hydroids)] + +# Convert index to longitudinal distance + +# Find reference index for each segment to convert index to longitudinal distance +min_index = select_elevations.groupby(['HydroID']).pt_order.min() +min_index = min_index.reset_index() +min_index = min_index.rename(columns={'pt_order': 'min_index'}) + +# Subtract reference index from index and convert to feet +segment_distance = pd.merge(select_elevations[['HydroID', 'pt_order','source']],min_index, on="HydroID").reset_index(drop=True) +segment_distance['distance'] = (segment_distance.pt_order - segment_distance.min_index)* 32.8084 +segment_distance.distance = segment_distance.distance.round(1) +# merge distances back into table +select_elevations = select_elevations.reset_index(drop=True) +# segment_distance_sub = segment_distance.filter(items=['HydroID', 'distance']).reset_index(drop=True) +select_elevations = pd.concat([select_elevations.set_index('HydroID'), segment_distance[['HydroID', 'distance']].set_index('HydroID')], axis=1, join="inner") +select_elevations = select_elevations.reset_index() +# Convert elevation to feet +select_elevations['elevation_ft'] = select_elevations.elevation_m * 3.28084 # convert from m to ft +select_elevations.elevation_ft = select_elevations.elevation_ft.round(1) + +select_elevations = select_elevations.sort_values(['HydroID', 'distance','elevation_ft'], ascending=[1, 0, 0]) +select_elevations = select_elevations.reset_index(drop=True) + +## Generate rating curve plots +num_plots = len(select_elevations.HydroID.unique()) + +if num_plots > 3: + columns = num_plots // 3 +else: + columns = 1 + +sns.set(style="ticks") +g = sns.FacetGrid(select_elevations, col="HydroID", hue="source",sharex=True, sharey=False,col_wrap=columns) +g.map(sns.lineplot, "distance", "elevation_ft", palette="tab20c") # , marker="o" +g.set_axis_labels(x_var="Longitudinal Distance (ft)", y_var="Elevation (ft)") + +# Iterate thorugh each axis to get individual y-axis bounds +for ax in g.axes.flat: + print (ax.lines) + mins = [] + maxes = [] + for line in ax.lines: + mins = mins + [min(line.get_ydata())] + maxes = maxes + [max(line.get_ydata())] + min_y = min(mins) - (max(maxes) - min(mins))/10 + max_y = max(maxes) + (max(maxes) - min(mins))/10 + ax.set_ylim(min_y,max_y) + +# Adjust the arrangement of the plots +g.fig.tight_layout(w_pad=1) +g.add_legend() + +plt.savefig(plot_filename) +plt.close() + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='generate rating curve plots and tables for FIM and USGS gages') + parser.add_argument('-fim_dir','--fim-dir', help='FIM output dir', required=True,type=str) + parser.add_argument('-output_dir','--output-dir', help='rating curves output folder', required=True,type=str) + parser.add_argument('-gages','--usgs-gages-filename',help='USGS rating curves',required=True,type=str) + parser.add_argument('-flows','--nwm-flow-dir',help='NWM recurrence flows dir',required=True,type=str) + parser.add_argument('-catfim', '--catfim-flows-filename', help='Categorical FIM flows file',required = True,type=str) + parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) + parser.add_argument('-group','--stat-groups',help='column(s) to group stats',required=False,type=str) + + args = vars(parser.parse_args()) + + fim_dir = args['fim_dir'] + 
output_dir = args['output_dir'] + usgs_gages_filename = args['usgs_gages_filename'] + nwm_flow_dir = args['nwm_flow_dir'] + catfim_flows_filename = args['catfim_flows_filename'] + number_of_jobs = args['number_of_jobs'] + stat_groups = args['stat_groups'] + + stat_groups = stat_groups.split() + procs_list = [] + + plots_dir = join(output_dir,'plots') + os.makedirs(plots_dir, exist_ok=True) + tables_dir = join(output_dir,'tables') + os.makedirs(tables_dir, exist_ok=True) + + #Check age of gages csv and recommend updating if older than 30 days. + print(check_file_age(usgs_gages_filename)) + + # Open log file + sys.__stdout__ = sys.stdout + log_file = open(join(output_dir,'rating_curve_comparison.log'),"w") + sys.stdout = log_file + + huc_list = os.listdir(fim_dir) + for huc in huc_list: + + if huc != 'logs': + elev_table_filename = join(fim_dir,huc,'usgs_elev_table.csv') + hydrotable_filename = join(fim_dir,huc,'hydroTable.csv') + usgs_recurr_stats_filename = join(tables_dir,f"usgs_interpolated_elevation_stats_{huc}.csv") + nwm_recurr_data_filename = join(tables_dir,f"nwm_recurrence_flow_elevations_{huc}.csv") + rc_comparison_plot_filename = join(plots_dir,f"FIM-USGS_rating_curve_comparison_{huc}.png") + + if isfile(elev_table_filename): + procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir, catfim_flows_filename, huc]) + + # Initiate multiprocessing + print(f"Generating rating curve metrics for {len(procs_list)} hucs using {number_of_jobs} jobs") + with Pool(processes=number_of_jobs) as pool: + pool.map(generate_rating_curve_metrics, procs_list) + + print(f"Aggregating rating curve metrics for {len(procs_list)} hucs") + aggregate_metrics(output_dir,procs_list,stat_groups) + + print('Delete intermediate tables') + shutil.rmtree(tables_dir, ignore_errors=True) + + # Close log file + sys.stdout = sys.__stdout__ + log_file.close() From b16a0338e411fab103d4ca455552cbbfe6545d12 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Thu, 13 May 2021 16:47:07 +0000 Subject: [PATCH 52/66] adding tool to check elevation changes along thalweg --- src/clip_vectors_to_wbd.py | 10 +- src/output_cleanup.py | 5 +- src/raster.py | 462 --------------------------- src/reachID_grid_to_vector_points.py | 50 +-- src/reduce_nhd_stream_density.py | 4 +- src/split_flows.py | 18 +- src/thalweg_drop_check.py | 331 ------------------- tools/thalweg_drop_check.py | 382 ++++++++++++++++++++++ 8 files changed, 402 insertions(+), 860 deletions(-) delete mode 100644 src/raster.py delete mode 100755 src/thalweg_drop_check.py create mode 100644 tools/thalweg_drop_check.py diff --git a/src/clip_vectors_to_wbd.py b/src/clip_vectors_to_wbd.py index 11d2fd262..576297ac4 100755 --- a/src/clip_vectors_to_wbd.py +++ b/src/clip_vectors_to_wbd.py @@ -7,7 +7,7 @@ from shapely.geometry import MultiPolygon,Polygon,Point from utils.shared_functions import getDriver -def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,extent,dissolveLinks=False): +def 
subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,extent): hucUnitLength = len(str(hucCode)) @@ -64,10 +64,6 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l if len(nhd_streams) > 0: - # Get headwater segments - # nhd_streams['is_headwater'] = False - # nhd_streams_headwaters = nhd_streams.loc[~(nhd_streams.nws_lid=='') & (nhd_streams.is_headwater==True)] - # Find incoming stream segments (to WBD buffer) and identify which are upstream threshold_segments = gpd.overlay(nhd_streams, wbd_buffer, how='symmetric_difference') from_list = threshold_segments.FromNode.to_list() @@ -125,7 +121,6 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l parser.add_argument('-b','--subset-nwm-streams',help='NWM streams subset',required=True) parser.add_argument('-x','--subset-landsea',help='LandSea subset',required=True) parser.add_argument('-extent','--extent',help='FIM extent',required=True) - parser.add_argument('-o','--dissolve-links',help='remove multi-line strings',action="store_true",default=False) args = vars(parser.parse_args()) @@ -148,6 +143,5 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l subset_nwm_streams_filename = args['subset_nwm_streams'] subset_landsea_filename = args['subset_landsea'] extent = args['extent'] - dissolveLinks = args['dissolve_links'] - subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,extent,dissolveLinks) + subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,extent) diff --git a/src/output_cleanup.py b/src/output_cleanup.py index 879103ad6..63c551c64 100755 --- a/src/output_cleanup.py +++ b/src/output_cleanup.py @@ -38,7 +38,10 @@ def output_cleanup(huc_number, output_folder_path, additional_whitelist, is_prod 'bathy_xs_area_hydroid_lookup.csv', 'src_full_crosswalked.csv', 'usgs_elev_table.csv', - 'hand_ref_elev_table.csv' + 'hand_ref_elev_table.csv', + 'dem_lateral_thalweg_adj.tif', + 'dem_thalwegCond.tif', + 'dem_meters.tif' ] # List of files that will be saved during a viz run diff --git a/src/raster.py b/src/raster.py deleted file mode 100644 index a10a02430..000000000 --- a/src/raster.py +++ /dev/null @@ -1,462 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -from osgeo import gdal, ogr, osr -import numpy as np -from os.path import isfile -from os import remove -from copy import deepcopy -from subprocess import call - -class Raster: - - """ - Raster object from single band rasters - - ... 
- - Attributes - ---------- - array : numpy array - raster data in numpy array - gt : list - geotransform. see gdal docs for more info. - proj : str - Projection string - ndv : number - No data value - des : str - band description - ct : gdal.colorTable - color table - dt : int - GDAL GDT data type. See notes. - dim : tuple - raster dimensions (bands, rows, columns) for multi-bands and (row, columns) for single band - nbands : int - number of bands. - nrows : int - number of rows - ncols : int - number of columns - - Methods - ------- - writeRaster(fileName,dtype=None,driverName='GTiff',verbose=False) - Write out raster file as geotiff - copy() - Copy method. Uses deepcopy since array data is present - clipToVector(raster_fileName,vector_fileName,verbose=False,output_fileType='GTiff',output_fileName=None,loadOutput=True) - Clips to vector using gdalwarp command line utility - - Raises - ------ - OSError - If fileName does not exist - ValueError - Raises if input raster - - See Also - -------- - - Notes - ----- - Currently only accepts single band rasters. - - Multiple datatypes are used. The table below shows which numpy datatypes correspond to the the GDAL types and their integer codes. - - # ## Integer Code ## ## Global Descriptor Table ## ## Numpy ## - # 0 GDT_Unknown NA - # 1 GDT_Byte np.bool, np.int ,np.int8, np.long, np.byte, np.uint8 - # 2 GDT_UInt16 np.uint16, np.ushort - # 3 GDT_Int16 np.int16, np.short - # 4 GDT_UInt32 np.uint32 , np.uintc - # 5 GDT_Int32 np.int32, np.intc - # 6 GDT_Float32 np.float32, np.single - # 7 GDT_Float64 np.float64, np.double - # 8 GDT_CInt16 np.complex64 - # 9 GDT_CInt32 np.complex64 - # 10 GDT_CFloat32 np.complex64 - # 11 GDT_CFloat64 np.complex128 - # 12 GDT_TypeCount NA - - Examples - -------- - Load Raster - >>> rasterData = fldpln.Raster('path/to/raster') - - """ - - # converts numpy datatypes and gdal GDT variables to integer codes - dataTypeConversion_name_to_integer = { np.int8 : 1 , np.bool : 1 , np.int : 1 , np.long : 1 , np.byte : 1, np.uint8 : 1, - np.uint16 : 2 , np.int16 : 3 , - np.ushort : 2 , np.short : 3 , - np.uint32 : 4 , np.uintc : 4 , np.int32 : 5 , np.intc : 5 , - np.float32 : 6 , np.single : 6 , - np.float64 : 7 , np.double : 7 , - np.complex64 : 10 , np.complex128 : 11 , - 0:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7,8:8,9:9,10:10,11:11,12:12 } - - # converts integer codes and gdal GDT variables to numpy datatypes - dataTypeConversion_integer_to_name = {0 : np.complex128 , 1 : np.int8 , 2 : np.uint16 , 3 : np.int16 , - 4 : np.uint32 , 5 : np.int32 , 6 : np.float32 , 7 : np.float64 , - 8 : np.complex64 , 9 : np.complex64 , 10 : np.complex64 , 11 : np.complex128 } - - - def __init__(self,fileName,loadArray=True,dtype=None): - - """ - Initializes Raster Instance from single band raster - - ... 
- - Parameters - ---------- - fileName : str - File path to single band raster - dtype : numpy datatype or int, optional - Numpy, GDT, or integer code data type used to override the data type on the file when imported to array (Default Value = None, None sets to the numpy array data type to the one in the raster file) - - Returns - ------- - raster - Instance of raster object - - """ - - if not isfile(fileName): - raise OSError("File \'{}\' does not exist".format(fileName)) - - stream = gdal.Open(fileName,gdal.GA_ReadOnly) - - self.nrows,self.ncols = stream.RasterYSize , stream.RasterXSize - self.nbands = stream.RasterCount - - if loadArray: - self.array = stream.ReadAsArray() - - self.gt = stream.GetGeoTransform() - self.proj = stream.GetProjection() - - # if self.nbands > 1: - # raise ValueError('Raster class only accepts single band rasters for now') - - band = stream.GetRasterBand(1) - - self.ndv = band.GetNoDataValue() - - # set data type - if dtype is not None: # override raster file type - - # sets dt to dtype integer code - try: - self.dt = self.dataTypeConversion_name_to_integer[dtype] - except KeyError: - raise ValueError('{} dtype parameter not accepted. check docs for valid input or set to None to use data type from raster'.format(dtype)) - - # sets array data type - if isinstance(dtype,type): # if dtype is a numpy data tpe - - self.array = self.array.astype(dtype) - - else: # if dtype is an integer code of GDAL GDT variable - - try: - self.array = self.array.astype(self.dataTypeConversion_integer_to_name[dtype]) - except KeyError: - raise ValueError('{} dtype parameter not accepted. check docs for valid input or set to None to use data type from raster'.format(dtype)) - - else: # sets to default data type in raster file - - self.dt = band.DataType - - try: - self.array.astype(self.dataTypeConversion_integer_to_name[self.dt]) - except KeyError: - raise ValueError('{} dtype parameter not accepted. check docs for valid input or set to None to use data type from raster'.format(self.dt)) - - try: - self.des = band.GetDescription() - except AttributeError: - pass - - try: - self.ct = stream.GetRasterColorTable() - except AttributeError: - pass - - # self.dim = self.array.shape - self.fileName = fileName - - stream,band = None,None - - - @property - def dim(self): - """ Property method for number of dimensions """ - - if self.nbands == 1: - DIMS = self.nrows,self.ncols - if self.nbands > 1: - DIMS = self.nbands,self.nrows,self.ncols - - return(DIMS) - - - def copy(self): - """ Copy method. Uses deepcopy since array data is present """ - return(deepcopy(self)) - - - def writeRaster(self,fileName,dtype=None,driverName='GTiff',verbose=False): - - """ - Write out raster file as geotiff - - Parameters - ---------- - fileName : str - File path to output raster to - dtype : numpy datatype or int, optional - Numpy, GDT, or integer code data type (Default Value = self.dt attribute value, otherwise uses data type from the numpy array) - driverName : str, optional - GDAL driver type. See gdal docs for more details. Only tested for GTiff. (Default Value = 'GTiff') - verbose : Boolean, optional - Verbose output (Default Value = False) - - Returns - ------- - None - - Raises - ------ - ValueError - Raises ValueError when the data type parameter is not recognized. See the help docs for raster class to see which numpy, gdal, or encoded values are accepted. 
- - Examples - -------- - Write Geotiff raster - >>> rasterData = fldpln.Raster('path/to/raster') - >>> rasterData.writeRaster('/different/path/to/raster',dtype=np.int8) - - """ - - driver = gdal.GetDriverByName(driverName) - - if dtype is None: - try: - dtype = self.dt - except AttributeError: - # dtype = gdal.GDT_Float64 - try: - dtype = self.dataTypeConversion_name_to_integer[self.array.dtype] - except KeyError: - raise ValueError('{} dtype parameter not accepted. check docs for valid input or set to None to use data type from numpy array'.format(self.array.dtype)) - else: - try: - dtype = self.dataTypeConversion_name_to_integer[dtype] - except KeyError: - raise ValueError('{} dtype parameter not accepted. check docs for valid input or set to None to use data type from numpy array'.format(self.array.dtype)) - - dataset = driver.Create(fileName, self.ncols, self.nrows, 1, dtype) - dataset.SetGeoTransform(self.gt) - dataset.SetProjection(self.proj) - band = dataset.GetRasterBand(1) - - # set color table and color interpretation - #print(band.__dict__) - try: - band.SetRasterColorTable(self.ct) - #band.SetRasterColorInterpretation(gdal.GCI_PaletteIndex) - except AttributeError: - pass - - try: - band.SetDescription(self.des) - except AttributeError: - pass - - band.SetNoDataValue(self.ndv) - band.WriteArray(self.array) - band, dataset = None,None # Close the file - - if verbose: - print("Successfully wrote out raster to {}".format(fileName)) - - def polygonize(self,vector_fileName,vector_driver,layer_name,verbose): - - gdal.UseExceptions() - - # get raster datasource - # - src_ds = gdal.Open( self.fileName ) - srcband = src_ds.GetRasterBand(1) - - # - # create output datasource - driver_ext_dict = {'ESRI Shapefile' : 'shp' , 'GPKG' : 'gpkg'} - - if vector_driver not in driver_ext_dict: - raise ValueError('Driver not found in {}'.format(driver_ext_dict)) - - drv = ogr.GetDriverByName(vector_driver) - dst_ds = drv.CreateDataSource( vector_fileName) - - srs = osr.SpatialReference() - srs.ImportFromWkt(self.proj) - - dst_layer = dst_ds.CreateLayer(layer_name, srs = srs, geom_type = ogr.wkbPolygon ) - - if verbose: - prog_func = gdal.TermProgress_nocb - else: - prog_func = None - - gdal.Polygonize( srcband, None, dst_layer, -1, ['8CONNECTED=8'], callback=prog_func ) - - @classmethod - def clipToVector(cls,raster_fileName,vector_fileName,output_fileName=None,output_fileType='GTiff',verbose=False): - """ - Clips to vector using gdalwarp command line utility - - ... 
- - Parameters - ---------- - raster_fileName : str - File path to raster to clip - vector_fileName : str - File path to vector layer to clip with - output_fileName : str - Set file path to output clipped raster (Default Value = None) - output_fileType : str - Set file type of output from GDAL drivers list (Default Value = 'GTiff') - verbose : Boolean - Verbose output (Default Value = False) - - Returns - ------- - raster : raster - Clipped raster layer - - Notes - ----- - gdalwarp utility must be installed and callable via a subprocess - - Examples - -------- - clip raster and don't return - >>> fldpln.raster.clipToVector('path/to/raster','path/to/clipping/vector','path/to/write/output/raster/to') - Clip raster and return but don't write - >>> clippedRaster = fldpln.raster.clipToVector('path/to/raster','path/to/clipping/vector') - - - """ - - # create temp output if none is desired - if output_fileName is None: - output_fileName = 'temp.tif' - - # generate command - command = ['gdalwarp','-overwrite','-of',output_fileType,'-cutline',vector_fileName,'-crop_to_cutline',raster_fileName,output_fileName] - - # insert quiet flag if not verbose - if not verbose: - command = command.insert(1,'-q') - - # call command - call(command) - - # remove temp file - if output_fileName is None: - remove(output_fileName) - - return(cls(output_fileName)) - - def getCoordinatesFromIndex(self,row,col): - """ - Returns coordinates in the rasters projection from a given multi-index - - """ - - # extract variables for readability - x_upper_limit, y_upper_limit = self.gt[0], self.gt[3] - x_resolution, y_resolution = self.gt[1], self.gt[5] - nrows, ncols = self.nrows, self.ncols - - x = x_upper_limit + (col * x_resolution) - y = y_upper_limit + (row * y_resolution) - - return(x,y) - - - def sampleFromCoordinates(self,x,y,returns='value'): - """ - Sample raster value from coordinates - ... 
- - Parameters - ---------- - raster_fileName : str - File path to raster to clip - vector_fileName : str - File path to vector layer to clip with - output_fileName : str - Set file path to output clipped raster (Default Value = None) - output_fileType : str - Set file type of output from GDAL drivers list (Default Value = 'GTiff') - verbose : Boolean - Verbose output (Default Value = False) - - Returns - ------- - raster : raster - Clipped raster layer - - Notes - ----- - gdalwarp utility must be installed and callable via a subprocess - - Examples - -------- - clip raster and don't return - >>> fldpln.raster.clipToVector('path/to/raster','path/to/clipping/vector','path/to/write/output/raster/to') - Clip raster and return but don't write - >>> clippedRaster = fldpln.raster.clipToVector('path/to/raster','path/to/clipping/vector') - - - """ - - # extract variables for readability - x_upper_limit, y_upper_limit = self.gt[0], self.gt[3] - x_resolution, y_resolution = self.gt[1], self.gt[5] - nrows, ncols = self.nrows, self.ncols - - # get upper left hand corner coordinates from the centroid coordinates of the upper left pixel - x_upper_limit = x_upper_limit - (x_resolution/2) - y_upper_limit = y_upper_limit - (y_resolution/2) - - # get indices - columnIndex = int( ( x - x_upper_limit) / x_resolution) - rowIndex = int( ( y - y_upper_limit) / y_resolution) - - # check indices lie within raster limits - columnIndexInRange = ncols > columnIndex >= 0 - rowIndexInRange = nrows > rowIndex >= 0 - - if (not columnIndexInRange) | (not rowIndexInRange): - raise ValueError("Row Index {} or column index {} not in raster range ({},{})".format(rowIndex,columnIndex,nrows,ncols)) - - # check value is not ndv - if self.array[rowIndex,columnIndex] == self.ndv: - raise ValueError("Sample value is no data at ({},{})".format(nrows,ncols)) - - # return if statements - if returns == 'value': - return(self.array[rowIndex,columnIndex]) - elif returns == 'multi-index': - return(rowIndex,columnIndex) - elif returns == 'ravel-index': - return(np.ravel_multi_index((rowIndex,columnIndex),(nrows,ncols))) - else: - raise ValueError('Enter valid returns argument') diff --git a/src/reachID_grid_to_vector_points.py b/src/reachID_grid_to_vector_points.py index 5dadd43c1..790b09a2b 100755 --- a/src/reachID_grid_to_vector_points.py +++ b/src/reachID_grid_to_vector_points.py @@ -1,17 +1,13 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -from osgeo import gdal import numpy as np import osgeo.ogr import osgeo.osr import sys - -import cProfile from tqdm import tqdm import geopandas as gpd from shapely.geometry import Point -from raster import Raster +import rasterio from utils.shared_functions import getDriver """ @@ -24,57 +20,26 @@ outputFileName = sys.argv[2] writeOption = sys.argv[3] -#r = gdal.Open(path) -#band = r.GetRasterBand(1) -boolean=Raster(path) - -#(upper_left_x, x_size, x_rotation, upper_left_y, y_rotation, y_size) = r.GetGeoTransform() -(upper_left_x, x_size, x_rotation, upper_left_y, y_rotation, y_size) = boolean.gt - -#a = band.ReadAsArray().astype(np.float) - -# indices = np.nonzero(a != band.GetNoDataValue()) -indices = np.nonzero(boolean.array >= 1) - -# Init the shapefile stuff.. 
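# [Editor's sketch, not part of the original patch] This hunk replaces the project's
# GDAL-based Raster helper with rasterio. A minimal, hedged example of the same
# cell-center lookup using rasterio's affine transform (assumes a north-up raster
# with no rotation terms; the file path is hypothetical):
#
#   import numpy as np
#   import rasterio
#
#   with rasterio.open('path/to/flows_grid_boolean.tif') as src:
#       rows, cols = np.nonzero(src.read(1) >= 1)
#       xs, ys = src.xy(rows, cols)  # cell-center coordinates (default offset='center')
#
# rasterio's xy() applies the half-cell offset internally, which is what the manual
# "+ (x_size / 2)" terms in the rewritten loop below accomplish with the raw
# geotransform tuple.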
-#srs = osgeo.osr.SpatialReference() -#srs.ImportFromWkt(r.GetProjection()) +boolean = rasterio.open(path,'r') -#driver = osgeo.ogr.GetDriverByName('GPKG') -#shapeData = driver.CreateDataSource(outputFileName) - -#layer = shapeData.CreateLayer('ogr_pts', srs, osgeo.ogr.wkbPoint) -#layerDefinition = layer.GetLayerDefn() - -#idField = osgeo.ogr.FieldDefn("id", osgeo.ogr.OFTInteger) -#layer.CreateField(idField) +(upper_left_x, x_size, x_rotation, upper_left_y, y_rotation, y_size) = boolean.get_transform() +indices = np.nonzero(boolean.read(1) >= 1) id =[None] * len(indices[0]);points = [None]*len(indices[0]) # Iterate over the Numpy points.. i = 1 for y_index,x_index in tqdm(zip(*indices),total=len(indices[0])): - x = x_index * x_size + upper_left_x + (x_size / 2) #add half the cell size - y = y_index * y_size + upper_left_y + (y_size / 2) #to centre the point - - # get raster value - #reachID = a[y_index,x_index] - - #point = osgeo.ogr.Geometry(osgeo.ogr.wkbPoint) - #point.SetPoint(0, x, y) + x = x_index * x_size + upper_left_x + (x_size / 2) # add half the cell size + y = y_index * y_size + upper_left_y + (y_size / 2) # to center the point points[i-1] = Point(x,y) - #feature = osgeo.ogr.Feature(layerDefinition) - #feature.SetGeometry(point) - #feature.SetFID(i) if writeOption == 'reachID': reachID = a[y_index,x_index] id[i-1] = reachID - #feature.SetField("id",reachID) + elif (writeOption == 'featureID') |( writeOption == 'pixelID'): - #feature.SetField("id",i) id[i-1] = i - #layer.CreateFeature(feature) i += 1 @@ -82,4 +47,3 @@ pointGDF.to_file(outputFileName,driver=getDriver(outputFileName),index=False) print("Complete") -#shapeData.Destroy() diff --git a/src/reduce_nhd_stream_density.py b/src/reduce_nhd_stream_density.py index e52fab3dd..c32afd990 100644 --- a/src/reduce_nhd_stream_density.py +++ b/src/reduce_nhd_stream_density.py @@ -71,10 +71,10 @@ def subset_nhd_network(huc4,huc4_mask,selected_wbd8,nhd_streams_,headwaters_file for index, point in headwaters_mask.iterrows(): # Convert headwaterpoint geometries to WKB representation - wkb_points = dumps(point.geometry) + wkb_point = dumps(point.geometry) # Create pygeos headwaterpoint geometries from WKB representation - pointbin_geom = pygeos.io.from_wkb(wkb_points) + pointbin_geom = pygeos.io.from_wkb(wkb_point) # Distance to each stream segment distances = pygeos.measurement.distance(streambin_geom, pointbin_geom) diff --git a/src/split_flows.py b/src/split_flows.py index 67b69f7e9..9d51065dd 100755 --- a/src/split_flows.py +++ b/src/split_flows.py @@ -181,27 +181,19 @@ else: print ('Error: Could not add network attributes to stream segments') -# Get Outlet Point Only -#outlet = OrderedDict() -#for i,segment in split_flows_gdf.iterrows(): -# outlet[segment.geometry.coords[-1]] = segment[hydro_id] - -#hydroIDs_points = [hidp for hidp in outlet.values()] -#split_points = [Point(*point) for point in outlet] - # Get all vertices split_points = OrderedDict() -for row in split_flows_gdf[['geometry',hydro_id, 'NextDownID']].iterrows(): - lineString = row[1][0] +for index, segment in split_flows_gdf.iterrows(): + lineString = segment.geometry for point in zip(*lineString.coords.xy): if point in split_points: - if row[1][2] == split_points[point]: + if segment.NextDownID == split_points[point]: pass else: - split_points[point] = row[1][1] + split_points[point] = segment[hydro_id] else: - split_points[point] = row[1][1] + split_points[point] = segment[hydro_id] hydroIDs_points = [hidp for hidp in split_points.values()] split_points = 
[Point(*point) for point in split_points] diff --git a/src/thalweg_drop_check.py b/src/thalweg_drop_check.py deleted file mode 100755 index 25c7098f0..000000000 --- a/src/thalweg_drop_check.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import geopandas as gpd -from shapely.geometry import Point -import rasterio -import pandas as pd -import numpy as np -import argparse -import matplotlib.pyplot as plt -import seaborn as sns -from collections import deque -from functools import reduce -from os.path import isfile, join, dirname -import shutil -import warnings -from pathlib import Path -from collections import OrderedDict -import time -warnings.simplefilter(action='ignore', category=FutureWarning) - -""" - Plot Rating Curves and Compare to USGS Gages - - Parameters - ---------- - fim_dir : str - Directory containing FIM output folders. - output_dir : str - Directory containing rating curve plots and tables. - usgs_gages_filename : str - File name of USGS rating curves. - nwm_flow_dir : str - Directory containing NWM recurrence flows files. - number_of_jobs : str - Number of jobs. - stat_groups : str - string of columns to group eval metrics. -""" -outfolder = '/data/outputs/single_pixel_huc_ms_c/02030103' -# outfolder = '/data/outputs/single_pixel_huc_ms_c/12090301' - -dem_meters_filename = os.path.join(outfolder,'dem_meters.tif') -dem_burned_filename = os.path.join(outfolder,'dem_burned.tif') -dem_burned_filled_filename = os.path.join(outfolder,'dem_burned_filled.tif') -dem_lateral_thalweg_adj_filename = os.path.join(outfolder,'dem_lateral_thalweg_adj.tif') -dem_thalwegCond_filename = os.path.join(outfolder,'dem_thalwegCond.tif') - -reaches_filename = os.path.join(outfolder,'NHDPlusBurnLineEvent_subset.gpkg') - - -def compare_thalweg(args): - - huc = args[0] - reaches_split_points_filename = args[1] - reaches_filename = args[2] - dem_burned_filename = args[3] - dem_meters_filename = args[4] - -# reaches_split_points = gpd.read_file(reaches_split_points_filename) -reaches = gpd.read_file(reaches_filename) -dem_meters = rasterio.open(dem_meters_filename,'r') -dem_burned = rasterio.open(dem_burned_filename,'r') -dem_burned_filled = rasterio.open(dem_burned_filled_filename,'r') -dem_lateral_thalweg_adj = rasterio.open(dem_lateral_thalweg_adj_filename,'r') -dem_thalwegCond = rasterio.open(dem_thalwegCond_filename,'r') - -### Get lists of all complete reaches using headwater attributes -######################################### - - -headwater_col = 'true_headwater' -reaches[headwater_col] = False -reaches.loc[reaches.NHDPlusID==10000100014087.0,headwater_col] = True -headwaters = reaches.loc[reaches[headwater_col]==True] - -for index, headwater in headwaters.iterrows(): - reaches["headwater_path"] = headwater.nws_lid - reaches.set_index('NHDPlusID',inplace=True,drop=False) - - stream_path = get_downstream_segments(reaches,headwater_col, 'downstream') - - -def get_downstream_segments(streams, headwater_col,flag_column): - streams[flag_column] = False - streams.loc[streams[headwater_col],flag_column] = True - Q = deque(streams.loc[streams[headwater_col],'NHDPlusID'].tolist()) - visited = set() - while Q: - q = Q.popleft() - if q in visited: - continue - visited.add(q) - toNode,DnLevelPat = streams.loc[q,['ToNode','DnLevelPat']] - try: - downstream_ids = streams.loc[streams['FromNode'] == toNode,:].index.tolist() - except ValueError: # 18050002 has duplicate nhd stream feature - if len(toNode.unique()) == 1: - toNode = toNode.iloc[0] - downstream_ids = 
streams.loc[streams['FromNode'] == toNode,:].index.tolist() - # If multiple downstream_ids are returned select the ids that are along the main flow path (i.e. exclude segments that are diversions) - if len(set(downstream_ids))>1: # special case: remove duplicate NHDPlusIDs - relevant_ids = [segment for segment in downstream_ids if DnLevelPat == streams.loc[segment,'LevelPathI']] - else: - relevant_ids = downstream_ids - streams.loc[relevant_ids,flag_column] = True - for i in relevant_ids: - if i not in visited: - Q.append(i) - streams = streams.loc[streams[flag_column],:] - return(streams) - -######################################### -# Collect elevation values from multiple grids along each individual reach point - -# Get all vertices -for index, path in stream_path.iterrows(): - split_points = [] - stream_ids = [] - dem_m_elev = [] - dem_burned_elev = [] - dem_burned_filled_elev = [] - dem_lat_thal_adj_elev = [] - dem_thal_adj_elev = [] - index_count = [] - count = 0 - headwater_id = - for index, segment in path.iterrows(): - lineString = segment.geometry - - for point in zip(*lineString.coords.xy): - stream_ids = stream_ids + [segment.NHDPlusID] - split_points = split_points + [Point(point)] - count = count + 1 - index_count = index_count + [count] - dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(point).coords), indexes=1))).item()] - dem_burned_elev = dem_burned_elev + [np.array(list(dem_burned.sample((Point(point).coords), indexes=1))).item()] - dem_burned_filled_elev = dem_burned_filled_elev + [np.array(list(dem_burned_filled.sample((Point(point).coords), indexes=1))).item()] - dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(point).coords), indexes=1))).item()] - dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(point).coords), indexes=1))).item()] - - dem_m_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'dem_m', 'elevation_m': dem_m_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') - # dem_burned_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'dem_burned', 'elevation_m': dem_burned_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') - dem_burned_filled_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'dem_burned_filled', 'elevation_m': dem_burned_filled_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') - dem_lat_thal_adj_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'dem_lat_thal_adj', 'elevation_m': dem_lat_thal_adj_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') - dem_thal_adj_pts = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'source': 'thal_adj_dem', 'elevation_m': dem_thal_adj_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') - -burnline_points = dem_m_pts.append([dem_thal_adj_pts,dem_lat_thal_adj_pts]) # dem_burned_pts, dem_burned_filled_pts, - -# remove nodata_pts -burnline_points = burnline_points.loc[burnline_points.elevation_m>-9999.0] -# burnline_points = gpd.GeoDataFrame({'NHDPlusID': stream_ids, 'ToNode': ToNodes, 'FromNode': FromNodes, 'elevation_m': dem_burned_elev, 'index_count': index_count, 'geometry': split_points}, crs=reaches.crs, geometry='geometry') - -######################################### -# Identify significant drops in elevation (trace multiple grids) -def 
find_elevation_drops(burnline_points): - drop_streams = [] - for index, segment in burnline_points.iterrows(): - upstream_elev = segment.elevation_m - try: - downstream_elev = burnline_points.loc[(burnline_points.index_count==(segment.index_count + 1))].elevation_m.item() - if (downstream_elev - upstream_elev) > 5: - print (f"elevation drop of {downstream_elev - upstream_elev} meters ") - drop_streams = drop_streams + [index] - except: # terminal point - pass - return drop_streams - -burnline_points["headwater_path"] = 'WNQN4' - -profile_plots_filename = '/data/outputs/single_pixel_huc_ms_c/02030103/profile_drop_plots2.png' - -# num_plots = len(burnline_points.headwater_path.unique()) -num_plots = len(burnline_points.source.unique()) - -if num_plots > 3: - columns = num_plots // 3 -else: - columns = 1 - -sns.set(style="ticks") -# g = sns.FacetGrid(burnline_points, col="headwater_path", hue="source",sharex=True, sharey=False,col_wrap=columns) -# g.map(sns.lineplot, "index_count", "elevation_m", palette="tab20c") # , marker="o" -# g.set_axis_labels(x_var="Longitudinal Distance (ft)", y_var="Elevation (ft)") -g = sns.FacetGrid(burnline_points, col="source", hue="headwater_path",sharex=True, sharey=False,col_wrap=columns) -g.map(sns.lineplot, "index_count", "elevation_m", palette="tab20c") # , marker="o" -g.set_axis_labels(x_var="Longitudinal Distance (ft)", y_var="Elevation (ft)") - -# Iterate thorugh each axis to get individual y-axis bounds -for ax in g.axes.flat: - print (ax.lines) - mins = [] - maxes = [] - for line in ax.lines: - mins = mins + [min(line.get_ydata())] - maxes = maxes + [max(line.get_ydata())] - min_y = min(mins) - (max(maxes) - min(mins))/10 - # min_y = -100 - max_y = max(maxes) + (max(maxes) - min(mins))/10 - ax.set_ylim(min_y,max_y) - -# Adjust the arrangement of the plots -g.fig.tight_layout(w_pad=1) -g.add_legend() - -plt.savefig(profile_plots_filename) -plt.close() - -############################################################################################################################################### - -dem_thalweg_elevations = pd.DataFrame({'HydroID': hydroid, 'pt_order': index_order, 'elevation_m': dem_m_elev,'source': 'thalweg_adj'}) -dem_adj_thalweg_elevations = pd.DataFrame({'HydroID': hydroid, 'pt_order': index_order, 'elevation_m': thal_adj_elev,'source': 'dem_meters'}) - -all_elevations = dem_thalweg_elevations.append(dem_adj_thalweg_elevations) - -reach_att = reaches[['HydroID', 'From_Node', 'To_Node', 'NextDownID']] - -thalweg_elevations = all_elevations.merge(reach_att, on="HydroID") - -# Find segments where elevation drops 5 m per -# drops = thalweg_elevations.loc[thalweg_elevations.HydroID -# all_hydro_ids = dict(thalweg_elevations[['HydroID','elevation_m']]) -thalweg_elevations.NextDownID = thalweg_elevations.NextDownID.astype('int') -dem_adj_thalweg_elevations = thalweg_elevations.loc[thalweg_elevations.source=='thalweg_adj'] -min_index = dem_adj_thalweg_elevations.groupby(['HydroID']).pt_order.min() -min_index = min_index.reset_index() -min_index = min_index.rename(columns={'pt_order': 'min_index'}) - -for index, downstream_id in dem_adj_thalweg_elevations.iterrows(): - if index == 1: - break - if downstream_id.NextDownID != -1: - downstream_elevs = dem_adj_thalweg_elevations.loc[(dem_adj_thalweg_elevations.HydroID==downstream_id.NextDownID) & (dem_adj_thalweg_elevations.source=='thalweg_adj')].elevation_m - if (downstream_id.elevation_m - downstream_elevs[0]) > 5: - print (f"HydroID {HydroID} drops {(downstream_id.elevation_m - 
downstream_elev)} meters down from HydroID {NextDownID}") - upstream_elev = dem_adj_thalweg_elevations.loc[dem_adj_thalweg_elevations.NextDownID==downstream_id.NextDownID].elevation_m - -# drops = thalweg_elevations. - -select_hydroids = [10680001,10680002,10680020,10680034,10680061,10680076,10680077,10680148,10680094] - -select_elevations = thalweg_elevations.loc[thalweg_elevations.HydroID.isin(select_hydroids)] - -# Convert index to longitudinal distance - -# Find reference index for each segment to convert index to longitudinal distance -min_index = select_elevations.groupby(['HydroID']).pt_order.min() -min_index = min_index.reset_index() -min_index = min_index.rename(columns={'pt_order': 'min_index'}) - -# Subtract reference index from index and convert to feet -segment_distance = pd.merge(select_elevations[['HydroID', 'pt_order','source']],min_index, on="HydroID").reset_index(drop=True) -segment_distance['distance'] = (segment_distance.pt_order - segment_distance.min_index)* 32.8084 -segment_distance.distance = segment_distance.distance.round(1) -# merge distances back into table -select_elevations = select_elevations.reset_index(drop=True) -# segment_distance_sub = segment_distance.filter(items=['HydroID', 'distance']).reset_index(drop=True) -select_elevations = pd.concat([select_elevations.set_index('HydroID'), segment_distance[['HydroID', 'distance']].set_index('HydroID')], axis=1, join="inner") -select_elevations = select_elevations.reset_index() -# Convert elevation to feet -select_elevations['elevation_ft'] = select_elevations.elevation_m * 3.28084 # convert from m to ft -select_elevations.elevation_ft = select_elevations.elevation_ft.round(1) - -select_elevations = select_elevations.sort_values(['HydroID', 'distance','elevation_ft'], ascending=[1, 0, 0]) -select_elevations = select_elevations.reset_index(drop=True) - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='generate rating curve plots and tables for FIM and USGS gages') - parser.add_argument('-fim_dir','--fim-dir', help='FIM output dir', required=True,type=str) - parser.add_argument('-output_dir','--output-dir', help='rating curves output folder', required=True,type=str) - parser.add_argument('-gages','--usgs-gages-filename',help='USGS rating curves',required=True,type=str) - parser.add_argument('-flows','--nwm-flow-dir',help='NWM recurrence flows dir',required=True,type=str) - parser.add_argument('-catfim', '--catfim-flows-filename', help='Categorical FIM flows file',required = True,type=str) - parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) - parser.add_argument('-group','--stat-groups',help='column(s) to group stats',required=False,type=str) - - args = vars(parser.parse_args()) - - fim_dir = args['fim_dir'] - output_dir = args['output_dir'] - usgs_gages_filename = args['usgs_gages_filename'] - nwm_flow_dir = args['nwm_flow_dir'] - catfim_flows_filename = args['catfim_flows_filename'] - number_of_jobs = args['number_of_jobs'] - stat_groups = args['stat_groups'] - - stat_groups = stat_groups.split() - procs_list = [] - - plots_dir = join(output_dir,'plots') - os.makedirs(plots_dir, exist_ok=True) - tables_dir = join(output_dir,'tables') - os.makedirs(tables_dir, exist_ok=True) - - #Check age of gages csv and recommend updating if older than 30 days. 
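# [Editor's sketch, not part of the original patch] The main block removed here and the
# one added in tools/thalweg_drop_check.py follow the same driver pattern: build a
# per-HUC list of argument lists, then fan the work out with multiprocessing.Pool.
# A minimal self-contained illustration of that pattern (all names are hypothetical):
#
#   from multiprocessing import Pool
#
#   def process_huc(args):
#       huc, input_raster, output_csv = args  # unpack the positional argument list
#       print(f"processing {huc}: {input_raster} -> {output_csv}")
#
#   if __name__ == '__main__':
#       huc_list = ['02030103', '12090301']
#       number_of_jobs = 2
#       procs_list = [[huc, f"/data/{huc}/dem.tif", f"/data/{huc}/out.csv"] for huc in huc_list]
#       with Pool(processes=number_of_jobs) as pool:
#           pool.map(process_huc, procs_list)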
-    print(check_file_age(usgs_gages_filename))
-
-    # Open log file
-    sys.__stdout__ = sys.stdout
-    log_file = open(join(output_dir,'rating_curve_comparison.log'),"w")
-    sys.stdout = log_file
-
-    huc_list = os.listdir(fim_dir)
-    for huc in huc_list:
-
-        if huc != 'logs':
-            elev_table_filename = join(fim_dir,huc,'usgs_elev_table.csv')
-            hydrotable_filename = join(fim_dir,huc,'hydroTable.csv')
-            usgs_recurr_stats_filename = join(tables_dir,f"usgs_interpolated_elevation_stats_{huc}.csv")
-            nwm_recurr_data_filename = join(tables_dir,f"nwm_recurrence_flow_elevations_{huc}.csv")
-            rc_comparison_plot_filename = join(plots_dir,f"FIM-USGS_rating_curve_comparison_{huc}.png")
-
-            if isfile(elev_table_filename):
-                procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir, catfim_flows_filename, huc])
-
-    # Initiate multiprocessing
-    print(f"Generating rating curve metrics for {len(procs_list)} hucs using {number_of_jobs} jobs")
-    with Pool(processes=number_of_jobs) as pool:
-        pool.map(generate_rating_curve_metrics, procs_list)
-
-    print(f"Aggregating rating curve metrics for {len(procs_list)} hucs")
-    aggregate_metrics(output_dir,procs_list,stat_groups)
-
-    print('Delete intermediate tables')
-    shutil.rmtree(tables_dir, ignore_errors=True)
-
-    # Close log file
-    sys.stdout = sys.__stdout__
-    log_file.close()
diff --git a/tools/thalweg_drop_check.py b/tools/thalweg_drop_check.py
new file mode 100644
index 000000000..387e3c5a4
--- /dev/null
+++ b/tools/thalweg_drop_check.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import geopandas as gpd
+from utils.shared_variables import PREP_PROJECTION
+from shapely.geometry import Point, LineString
+import rasterio
+import pandas as pd
+import numpy as np
+import argparse
+import matplotlib.pyplot as plt
+import seaborn as sns
+from collections import deque
+from os.path import join
+from multiprocessing import Pool
+from utils.shared_functions import getDriver
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+"""
+    Generate thalweg elevation profiles and compare elevations across DEM processing stages.
+
+    Parameters
+    ----------
+    fim_dir : str
+        Directory containing FIM output folders.
+    output_dir : str
+        Directory for output plots, spatial layers, and tables.
+    stream_type : str
+        Stream layer to be evaluated ('derived' or 'burnline').
+    point_density : str
+        Elevation sampling density ('midpoints' or 'all_points').
+    number_of_jobs : int
+        Number of jobs (worker processes).
+""" + +huc_dir,stream_type,point_density,huc,dem_meters_filename,dem_lateral_thalweg_adj_filename,dem_thalwegCond_filename,profile_plots_filename,profile_gpkg_filename,profile_table_filename=procs_list[0] +def compare_thalweg(args): + + huc_dir = args[0] + stream_type = args[1] + point_density = args[2] + huc = args[3] + dem_meters_filename = args[4] + dem_lateral_thalweg_adj_filename = args[5] + dem_thalwegCond_filename = args[6] + profile_plots_filename = args[7] + profile_gpkg_filename = args[8] + profile_table_filename = args[9] + + if stream_type == 'derived': + + dem_derived_reaches_filename = os.path.join(huc_dir,'demDerived_reaches_split.gpkg') + streams = gpd.read_file(dem_derived_reaches_filename) + nhd_headwater_filename = os.path.join(huc_dir,'nhd_headwater_points_subset.gpkg') + wbd_filename = os.path.join(huc_dir,'wbd.gpkg') + wbd = gpd.read_file(wbd_filename) + headwaters_layer = gpd.read_file(nhd_headwater_filename,mask=wbd) + headwater_list = headwaters_layer.loc[headwaters_layer.pt_type == 'nws_lid'] + stream_id = 'HydroID' + + elif stream_type == 'burnline': + + nhd_reaches_filename = os.path.join(huc_dir,'NHDPlusBurnLineEvent_subset.gpkg') + nhd_reaches = gpd.read_file(nhd_reaches_filename) + streams = nhd_reaches.copy() + headwaters_layer = None + + # Get lists of all complete reaches using headwater attributes + headwater_list = streams.loc[streams.nws_lid!=''].nws_lid + stream_id = 'NHDPlusID' + + headwater_col = 'is_headwater' + streams[headwater_col] = False + headwater_list = headwater_list.reset_index(drop=True) + + if stream_type == 'derived': + streams['nws_lid'] = '' + + if streams.NextDownID.dtype != 'int': streams.NextDownID = streams.NextDownID.astype(int) + + min_dist = np.empty(len(headwater_list)) + streams['min_dist'] = 1000 + + for i, point in headwater_list.iterrows(): + streams['min_dist'] = [point.geometry.distance(line) for line in streams.geometry] + streams.loc[streams.min_dist==np.min(streams.min_dist),'nws_lid'] = point.site_id + + headwater_list = headwater_list.site_id + + streams.set_index(stream_id,inplace=True,drop=False) + + # Collect headwater streams + single_stream_paths = [] + for index, headwater_site in enumerate(headwater_list): + + stream_path = get_downstream_segments(streams.copy(),'nws_lid', headwater_site,'downstream',stream_id,stream_type) + + stream_path["headwater_path"] = headwater_site + stream_path = stream_path.reset_index(drop=True) + stream_path = stream_path.sort_values(by=['downstream_count']) + single_stream_paths = single_stream_paths + [stream_path.loc[stream_path.downstream==True]] + print(f"length of {headwater_site} path: {len(stream_path.loc[stream_path.downstream==True])}") + + # Collect elevation values from multiple grids along each individual reach point + dem_meters = rasterio.open(dem_meters_filename,'r') + dem_lateral_thalweg_adj = rasterio.open(dem_lateral_thalweg_adj_filename,'r') + dem_thalwegCond = rasterio.open(dem_thalwegCond_filename,'r') + + thalweg_points = gpd.GeoDataFrame() + for path in single_stream_paths: + + split_points = [] + stream_ids = [] + dem_m_elev = [] + dem_burned_filled_elev = [] + dem_lat_thal_adj_elev = [] + dem_thal_adj_elev = [] + headwater_path = [] + index_count = [] + + for index, segment in path.iterrows(): + + if stream_type == 'derived': + linestring = segment.geometry + + elif stream_type == 'burnline': + linestring = LineString(segment.geometry.coords[::-1]) + + if point_density == 'midpoints': + + midpoint = linestring.interpolate(0.5,normalized=True) + 
stream_ids = stream_ids + [segment[stream_id]] + split_points = split_points + [midpoint] + index_count = index_count + [segment.downstream_count] + dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(midpoint).coords), indexes=1))).item()] + dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(midpoint).coords), indexes=1))).item()] + dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(midpoint).coords), indexes=1))).item()] + headwater_path = headwater_path + [segment.headwater_path] + + elif point_density == 'all_points': + + count=0 + for point in zip(*linestring.coords.xy): + stream_ids = stream_ids + [segment[stream_id]] + split_points = split_points + [Point(point)] + count = count + 1 + index_count = index_count + [segment.downstream_count*1000 + count] + dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(point).coords), indexes=1))).item()] + dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(point).coords), indexes=1))).item()] + dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(point).coords), indexes=1))).item()] + headwater_path = headwater_path + [segment.headwater_path] + + # gpd.GeoDataFrame({**data, "source": "dem_m"}) + dem_m_pts = gpd.GeoDataFrame({'stream_id': stream_ids, 'source': 'dem_m', 'elevation_m': dem_m_elev, 'headwater_path': headwater_path, 'index_count': index_count, 'geometry': split_points}, crs=path.crs, geometry='geometry') + dem_lat_thal_adj_pts = gpd.GeoDataFrame({'stream_id': stream_ids, 'source': 'dem_lat_thal_adj', 'elevation_m': dem_lat_thal_adj_elev, 'headwater_path': headwater_path, 'index_count': index_count, 'geometry': split_points}, crs=path.crs, geometry='geometry') + dem_thal_adj_pts = gpd.GeoDataFrame({'stream_id': stream_ids, 'source': 'thal_adj_dem', 'elevation_m': dem_thal_adj_elev, 'headwater_path': headwater_path, 'index_count': index_count, 'geometry': split_points}, crs=path.crs, geometry='geometry') + + for raster in [dem_m_pts,dem_lat_thal_adj_pts,dem_thal_adj_pts]: + + raster = raster.sort_values(by=['index_count']) + raster.set_index('index_count',inplace=True,drop=True) + raster = raster.reset_index(drop=True) + raster.index.names = ['index_count'] + raster = raster.reset_index(drop=False) + thalweg_points = thalweg_points.append(raster,ignore_index = True) + + del raster + + del dem_m_pts,dem_lat_thal_adj_pts,dem_thal_adj_pts + + del dem_lateral_thalweg_adj,dem_thalwegCond,dem_meters + + try: + # Remove nodata_pts and convert elevation to ft + thalweg_points = thalweg_points.loc[thalweg_points.elevation_m>-9999.0] + thalweg_points.elevation_m = np.round(thalweg_points.elevation_m,3) + thalweg_points['elevation_ft'] = np.round(thalweg_points.elevation_m*3.28084,3) + + # Plot thalweg profile + plot_profile(thalweg_points, profile_plots_filename) + + # Filter final thalweg ajdusted layer + thal_adj_points = thalweg_points.loc[thalweg_points.source=='thal_adj_dem'].copy() + # thal_adj_points.to_file(profile_gpkg_filename,driver=getDriver(profile_gpkg_filename)) + + # Identify significant rises/drops in elevation + thal_adj_points['elev_change'] = thal_adj_points.groupby(['headwater_path', 'source'])['elevation_m'].apply(lambda x: x - x.shift()) + elev_changes = thal_adj_points.loc[(thal_adj_points.elev_change<=-5.0) | (thal_adj_points.elev_change>0.0)] + + if not elev_changes.empty: + # 
elev_changes.to_csv(profile_table_filename,index=False) + elev_changes.to_file(profile_gpkg_filename,index=False,driver=getDriver(profile_gpkg_filename)) + + + # Zoom in to plot only areas with steep elevation changes + # select_streams = elev_changes.stream_id.to_list() + # downstream_segments = [index + 1 for index in select_streams] + # upstream_segments = [index - 1 for index in select_streams] + # select_streams = list(set(upstream_segments + downstream_segments + select_streams)) + # thal_adj_points_select = thal_adj_points.loc[thal_adj_points.stream_id.isin(select_streams)] + # plot_profile(thal_adj_points_select, profile_plots_filename_zoom) + + except: + print(f"huc {huc} has {len(thalweg_points)} thalweg points") + +def get_downstream_segments(streams, headwater_col,headwater_id,flag_column,stream_id,stream_type): + + streams[flag_column] = False + streams['downstream_count'] = -9 + streams.loc[streams[headwater_col]==headwater_id,flag_column] = True + streams.loc[streams[headwater_col]==headwater_id,'downstream_count'] = 0 + count = 0 + + Q = deque(streams.loc[streams[headwater_col]==headwater_id,stream_id].tolist()) + visited = set() + + while Q: + + q = Q.popleft() + + if q in visited: + continue + + visited.add(q) + count = count + 1 + + if stream_type == 'burnline': + + toNode,DnLevelPat = streams.loc[q,['ToNode','DnLevelPat']] + downstream_ids = streams.loc[streams['FromNode'] == toNode,:].index.tolist() + + # If multiple downstream_ids are returned select the ids that are along the main flow path (i.e. exclude segments that are diversions) + if len(set(downstream_ids)) > 1: # special case: remove duplicate NHDPlusIDs + + relevant_ids = [segment for segment in downstream_ids if DnLevelPat == streams.loc[segment,'LevelPathI']] + + else: + relevant_ids = downstream_ids + + elif stream_type == 'derived': + + toNode = streams.loc[q,['NextDownID']].item() + relevant_ids = streams.loc[streams[stream_id] == toNode,:].index.tolist() + + streams.loc[relevant_ids,flag_column] = True + streams.loc[relevant_ids,'downstream_count'] = count + + for i in relevant_ids: + + if i not in visited: + Q.append(i) + + streams = streams.loc[streams[flag_column],:] + + return streams + + +def plot_profile(elevation_table,profile_plots_filename): + + num_plots = len(elevation_table.headwater_path.unique()) + unique_rasters = elevation_table.source.unique() + + if num_plots > 3: + columns = int(np.ceil(num_plots / 3)) + else: + columns = 1 + + # palette = dict(zip(unique_rasters, sns.color_palette(n_colors=len(unique_rasters)))) + # palette.update({'dem_m':'gray'}) + sns.set(style="ticks") + + if len(unique_rasters) > 1: + g = sns.FacetGrid(elevation_table, col="headwater_path", hue="source", hue_order=['dem_m', 'dem_lat_thal_adj', 'thal_adj_dem'], sharex=False, sharey=False,col_wrap=columns) + else: + g = sns.FacetGrid(elevation_table, col="headwater_path", hue="source", sharex=False, sharey=False,col_wrap=columns) + + g.map(sns.lineplot, "index_count", "elevation_ft", palette="tab20c") + g.set_axis_labels(x_var="Longitudinal Profile (index)", y_var="Elevation (ft)") + + # Iterate thorugh each axis to get individual y-axis bounds + for ax in g.axes.flat: + mins = [] + maxes = [] + for line in ax.lines: + mins = mins + [min(line.get_ydata())] + maxes = maxes + [max(line.get_ydata())] + min_y = min(mins) - (max(maxes) - min(mins))/10 + max_y = max(maxes) + (max(maxes) - min(mins))/10 + ax.set_ylim(min_y,max_y) + + # if len(unique_rasters) > 1: + # ax.lines[0].set_linestyle("--") + # 
ax.lines[0].set_color('gray') + + # box = ax.get_position() + # ax.set_position([box.x0, box.y0 + box.height * 0.1,box.width, box.height * 0.9]) + # Adjust the arrangement of the plots + # g.fig.tight_layout(w_pad=5) #w_pad=2 + g.add_legend() + # plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.subplots_adjust(bottom=0.25) + + plt.savefig(profile_plots_filename) + plt.close() + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='generate rating curve plots and tables for FIM and USGS gages') + parser.add_argument('-fim_dir','--fim-dir', help='FIM output dir', required=True,type=str) + parser.add_argument('-output_dir','--output-dir', help='rating curves output folder', required=True,type=str) + # parser.add_argument('-rasters','--raster-list',help='list of rasters to be evaluated',required=True,type=str) + parser.add_argument('-stream_type','--stream-type',help='stream layer to be evaluated',required=True,type=str,choices=['derived','burnline']) + parser.add_argument('-point_density','--point-density',help='elevation sampling density',required=True,type=str,choices=['midpoints','all_points']) + parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) + + args = vars(parser.parse_args()) + + fim_dir = args['fim_dir'] + output_dir = args['output_dir'] + # raster_list = args['raster_list'] + stream_type = args['stream_type'] + point_density = args['point_density'] + number_of_jobs = args['number_of_jobs'] + + # dem_meters_dir = os.environ.get('dem_meters') + + plots_dir = join(output_dir,'plots') + os.makedirs(plots_dir, exist_ok=True) + spatial_dir = os.path.join(output_dir,'tables') + os.makedirs(spatial_dir, exist_ok=True) + + # Open log file + sys.__stdout__ = sys.stdout + log_file = open(join(output_dir,'thalweg_profile_comparison.log'),"w") + sys.stdout = log_file + + procs_list = [] + huc_list = os.listdir(fim_dir) + for huc in huc_list: + if huc != 'logs': + + huc_dir = os.path.join(fim_dir,huc) + dem_meters_filename = os.path.join(huc_dir,'dem_meters.tif') + dem_lateral_thalweg_adj_filename = os.path.join(huc_dir,'dem_lateral_thalweg_adj.tif') + dem_thalwegCond_filename = os.path.join(huc_dir,'dem_thalwegCond.tif') + profile_plots_filename = os.path.join(plots_dir,f"profile_drop_plots_{huc}_{point_density}_{stream_type}.png") + profile_gpkg_filename = os.path.join(huc_dir,f"thalweg_points_{huc}_{point_density}_{stream_type}.gpkg") + profile_table_filename = os.path.join(spatial_dir,f"thalweg_elevation_changes_{huc}_{point_density}_{stream_type}.csv") + + procs_list.append([huc_dir,stream_type,point_density,huc,dem_meters_filename,dem_lateral_thalweg_adj_filename,dem_thalwegCond_filename,profile_plots_filename,profile_gpkg_filename,profile_table_filename]) + + # Initiate multiprocessing + print(f"Generating thalweg elevation profiles for {len(procs_list)} hucs using {number_of_jobs} jobs") + with Pool(processes=number_of_jobs) as pool: + # Get elevation values along thalweg for each headwater stream path + pool.map(compare_thalweg, procs_list) + + # Append all elevation change spatial layers to a single gpkg + table_list = os.listdir(spatial_dir) + agg_thalweg_elevations_gpkg_fileName = os.path.join(output_dir, f"agg_thalweg_elevation_changes_{point_density}_{stream_type}.gpkg") + agg_thalweg_elevation_table_fileName = os.path.join(output_dir, f"agg_thalweg_elevation_changes_{point_density}_{stream_type}.csv") + for table in table_list: + + huc_gpd = 
gpd.read_file(os.path.join(spatial_dir,table)) + # Write aggregate table + if os.path.isfile(agg_thalweg_elevations_gpkg_fileName): + huc_gpd.to_file(agg_thalweg_elevations_gpkg_fileName,driver=getDriver(agg_thalweg_elevations_gpkg_fileName),index=False, mode='a') + else: + huc_gpd.to_file(agg_thalweg_elevations_gpkg_fileName,driver=getDriver(agg_thalweg_elevations_gpkg_fileName),index=False) + + del huc_gpd + + # Create csv of elevation table + huc_table = pd.read_csv(agg_thalweg_elevations_gpkg_fileName) + huc_table.to_csv(agg_thalweg_elevation_table_fileName,index=False) + + # Close log file + sys.stdout = sys.__stdout__ + log_file.close() From e782b123294d370f3010966facf0f7869c2dedca Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Thu, 13 May 2021 16:59:05 +0000 Subject: [PATCH 53/66] removing dissolved links arg --- src/clip_vectors_to_wbd.py | 1 - tools/thalweg_drop_check.py | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/clip_vectors_to_wbd.py b/src/clip_vectors_to_wbd.py index c042aef33..6e7c2fd93 100755 --- a/src/clip_vectors_to_wbd.py +++ b/src/clip_vectors_to_wbd.py @@ -174,6 +174,5 @@ def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_l great_lakes_filename = args['great_lakes_filename'] wbd_buffer_distance = args['wbd_buffer_distance'] lake_buffer_distance = args['lake_buffer_distance'] - dissolveLinks = args['dissolve_links'] subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,extent,great_lakes_filename,wbd_buffer_distance,lake_buffer_distance) diff --git a/tools/thalweg_drop_check.py b/tools/thalweg_drop_check.py index 387e3c5a4..7953566e3 100644 --- a/tools/thalweg_drop_check.py +++ b/tools/thalweg_drop_check.py @@ -3,6 +3,7 @@ import os import sys import geopandas as gpd +sys.path.append('/foss_fim/src') from utils.shared_variables import PREP_PROJECTION from shapely.geometry import Point, LineString import rasterio @@ -35,7 +36,6 @@ Number of jobs. 
""" -huc_dir,stream_type,point_density,huc,dem_meters_filename,dem_lateral_thalweg_adj_filename,dem_thalwegCond_filename,profile_plots_filename,profile_gpkg_filename,profile_table_filename=procs_list[0] def compare_thalweg(args): huc_dir = args[0] @@ -359,13 +359,14 @@ def plot_profile(elevation_table,profile_plots_filename): pool.map(compare_thalweg, procs_list) # Append all elevation change spatial layers to a single gpkg - table_list = os.listdir(spatial_dir) + spatial_list = os.listdir(spatial_dir) agg_thalweg_elevations_gpkg_fileName = os.path.join(output_dir, f"agg_thalweg_elevation_changes_{point_density}_{stream_type}.gpkg") agg_thalweg_elevation_table_fileName = os.path.join(output_dir, f"agg_thalweg_elevation_changes_{point_density}_{stream_type}.csv") - for table in table_list: + for table in spatial_list: huc_gpd = gpd.read_file(os.path.join(spatial_dir,table)) - # Write aggregate table + + # Write aggregate layer if os.path.isfile(agg_thalweg_elevations_gpkg_fileName): huc_gpd.to_file(agg_thalweg_elevations_gpkg_fileName,driver=getDriver(agg_thalweg_elevations_gpkg_fileName),index=False, mode='a') else: From 66358b9276b932dcecffd2d1a9f59ab849619f16 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Fri, 14 May 2021 02:11:29 +0000 Subject: [PATCH 54/66] updating method to get burnline points --- src/reachID_grid_to_vector_points.py | 66 +++++++++++------ src/run_by_unit.sh | 6 +- tools/thalweg_drop_check.py | 105 +++++++++++++-------------- 3 files changed, 101 insertions(+), 76 deletions(-) diff --git a/src/reachID_grid_to_vector_points.py b/src/reachID_grid_to_vector_points.py index 790b09a2b..bcbc205aa 100755 --- a/src/reachID_grid_to_vector_points.py +++ b/src/reachID_grid_to_vector_points.py @@ -4,8 +4,10 @@ import osgeo.ogr import osgeo.osr import sys +import argparse from tqdm import tqdm import geopandas as gpd +from utils.shared_variables import PREP_PROJECTION from shapely.geometry import Point import rasterio from utils.shared_functions import getDriver @@ -15,35 +17,55 @@ ./reachID_grid_to_vector_points.py """ +def convert_grid_cells_to_points(raster,index_option,output_points_filename=False): -path = sys.argv[1] -outputFileName = sys.argv[2] -writeOption = sys.argv[3] + # Input raster + if isinstance(raster,str): + raster = rasterio.open(raster,'r') -boolean = rasterio.open(path,'r') + elif isinstance(raster,rasterio.io.DatasetReader): + pass -(upper_left_x, x_size, x_rotation, upper_left_y, y_rotation, y_size) = boolean.get_transform() -indices = np.nonzero(boolean.read(1) >= 1) + else: + raise TypeError("Pass raster dataset or filepath for raster") -id =[None] * len(indices[0]);points = [None]*len(indices[0]) + (upper_left_x, x_size, x_rotation, upper_left_y, y_rotation, y_size) = raster.get_transform() + indices = np.nonzero(raster.read(1) >= 1) -# Iterate over the Numpy points.. -i = 1 -for y_index,x_index in tqdm(zip(*indices),total=len(indices[0])): - x = x_index * x_size + upper_left_x + (x_size / 2) # add half the cell size - y = y_index * y_size + upper_left_y + (y_size / 2) # to center the point - points[i-1] = Point(x,y) + id =[None] * len(indices[0]);points = [None]*len(indices[0]) - if writeOption == 'reachID': - reachID = a[y_index,x_index] - id[i-1] = reachID + # Iterate over the Numpy points.. 
+ i = 1 + for y_index,x_index in zip(*indices): + x = x_index * x_size + upper_left_x + (x_size / 2) # add half the cell size + y = y_index * y_size + upper_left_y + (y_size / 2) # to center the point + points[i-1] = Point(x,y) + if index_option == 'reachID': + reachID = np.array(list(raster.sample((Point(x,y).coords), indexes=1))).item() # check this; needs to add raster cell value + index + id[i-1] = reachID*1000 + i #reachID + i/100 + elif (index_option == 'featureID') |(index_option == 'pixelID'): + id[i-1] = i + i += 1 - elif (writeOption == 'featureID') |( writeOption == 'pixelID'): - id[i-1] = i + pointGDF = gpd.GeoDataFrame({'id' : id, 'geometry' : points},crs=PREP_PROJECTION,geometry='geometry') - i += 1 + if output_points_filename == False: + return pointGDF + else: + pointGDF.to_file(output_points_filename,driver=getDriver(output_points_filename),index=False) -pointGDF = gpd.GeoDataFrame({'id' : id, 'geometry' : points},crs=boolean.proj,geometry='geometry') -pointGDF.to_file(outputFileName,driver=getDriver(outputFileName),index=False) +if __name__ == '__main__': -print("Complete") + # Parse arguments + parser = argparse.ArgumentParser(description='Converts a raster to points') + parser.add_argument('-r','--raster',help='Raster to be converted to points',required=True,type=str) + parser.add_argument('-i', '--index-option',help='Indexing option',required=True,type=str,choices=['reachID','featureID','pixelID']) + parser.add_argument('-p', '--output-points-filename',help='Output points layer filename',required=False,type=str,default=False) + + args = vars(parser.parse_args()) + + raster = args['raster'] + index_option = args['index_option'] + output_points_filename = args['output_points_filename'] + + convert_grid_cells_to_points(raster,index_option,output_points_filename) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 53fcacec4..5e348949d 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -104,6 +104,10 @@ Tstart gdal_rasterize -ot Int32 -burn 1 -init 0 -co "COMPRESS=LZW" -co "BIGTIFF=YES" -co "TILED=YES" -te $xmin $ymin $xmax $ymax -ts $ncols $nrows $outputHucDataDir/NHDPlusBurnLineEvent_subset.gpkg $outputHucDataDir/flows_grid_boolean.tif Tcount +##gdal_rasterize -ot Float32 -a NHDPlusID -init 0 -co "COMPRESS=LZW" -co "BIGTIFF=YES" -co "TILED=YES" -te $xmin $ymin $xmax $ymax -ts $ncols $nrows $outputHucDataDir/NHDPlusBurnLineEvent_subset.gpkg $outputHucDataDir/flows_grid_nhd.tif + + + ## RASTERIZE NHD HEADWATERS (1 & 0) ## echo -e $startDiv"Rasterize NHD Headwaters $hucNumber"$stopDiv date -u @@ -275,7 +279,7 @@ echo -e $startDiv"Vectorize Pixel Centroids $hucNumber"$stopDiv date -u Tstart [ ! -f $outputHucDataDir/flows_points_pixels.gpkg ] && \ -$srcDir/reachID_grid_to_vector_points.py $demDerived_streamPixels $outputHucDataDir/flows_points_pixels.gpkg featureID +$srcDir/reachID_grid_to_vector_points.py -r $demDerived_streamPixels -i featureID -p $outputHucDataDir/flows_points_pixels.gpkg Tcount ## GAGE WATERSHED FOR PIXELS ## diff --git a/tools/thalweg_drop_check.py b/tools/thalweg_drop_check.py index 7953566e3..3f82654d6 100644 --- a/tools/thalweg_drop_check.py +++ b/tools/thalweg_drop_check.py @@ -16,6 +16,8 @@ from os.path import join from multiprocessing import Pool from utils.shared_functions import getDriver +from rasterio import features +from reachID_grid_to_vector_points import convert_grid_cells_to_points import warnings warnings.simplefilter(action='ignore', category=FutureWarning) @@ -36,6 +38,8 @@ Number of jobs. 
""" + +# huc_dir,stream_type,point_density,huc,dem_meters_filename,dem_lateral_thalweg_adj_filename,dem_thalwegCond_filename,profile_plots_filename ,profile_gpkg_filename,profile_table_filename = procs_list[0] def compare_thalweg(args): huc_dir = args[0] @@ -93,24 +97,31 @@ def compare_thalweg(args): # Collect headwater streams single_stream_paths = [] + dem_meters = rasterio.open(dem_meters_filename,'r') + index_option = 'reachID' for index, headwater_site in enumerate(headwater_list): - stream_path = get_downstream_segments(streams.copy(),'nws_lid', headwater_site,'downstream',stream_id,stream_type) - - stream_path["headwater_path"] = headwater_site stream_path = stream_path.reset_index(drop=True) stream_path = stream_path.sort_values(by=['downstream_count']) - single_stream_paths = single_stream_paths + [stream_path.loc[stream_path.downstream==True]] - print(f"length of {headwater_site} path: {len(stream_path.loc[stream_path.downstream==True])}") + stream_path = stream_path.loc[stream_path.downstream==True] + if stream_type == 'burnline': + geom_value = [] + for index, segment in stream_path.iterrows(): + geom_value = geom_value + [(segment.geometry, segment.downstream_count)] + nhd_reaches_raster = features.rasterize(shapes=geom_value , out_shape=[dem_meters.height, dem_meters.width],fill=dem_meters.nodata,transform=dem_meters.transform, all_touched=True, dtype=np.float32) + out_dem_filename = os.path.join(huc_dir,'NHDPlusBurnLineEvent_raster.tif') + with rasterio.open(out_dem_filename, "w", **dem_meters.profile, BIGTIFF='YES') as dest: + dest.write(nhd_reaches_raster, indexes = 1) + stream_path = convert_grid_cells_to_points(out_dem_filename,index_option) + stream_path["headwater_path"] = headwater_site + single_stream_paths = single_stream_paths + [stream_path] + print(f"length of {headwater_site} path: {len(stream_path)}") # Collect elevation values from multiple grids along each individual reach point - dem_meters = rasterio.open(dem_meters_filename,'r') dem_lateral_thalweg_adj = rasterio.open(dem_lateral_thalweg_adj_filename,'r') dem_thalwegCond = rasterio.open(dem_thalwegCond_filename,'r') - thalweg_points = gpd.GeoDataFrame() for path in single_stream_paths: - split_points = [] stream_ids = [] dem_m_elev = [] @@ -119,55 +130,49 @@ def compare_thalweg(args): dem_thal_adj_elev = [] headwater_path = [] index_count = [] - for index, segment in path.iterrows(): - if stream_type == 'derived': linestring = segment.geometry - - elif stream_type == 'burnline': - linestring = LineString(segment.geometry.coords[::-1]) - - if point_density == 'midpoints': - - midpoint = linestring.interpolate(0.5,normalized=True) - stream_ids = stream_ids + [segment[stream_id]] - split_points = split_points + [midpoint] - index_count = index_count + [segment.downstream_count] - dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(midpoint).coords), indexes=1))).item()] - dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(midpoint).coords), indexes=1))).item()] - dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(midpoint).coords), indexes=1))).item()] - headwater_path = headwater_path + [segment.headwater_path] - - elif point_density == 'all_points': - - count=0 - for point in zip(*linestring.coords.xy): + if point_density == 'midpoints': + midpoint = linestring.interpolate(0.5,normalized=True) stream_ids = stream_ids + [segment[stream_id]] - split_points = split_points + [Point(point)] - count = count + 1 - 
index_count = index_count + [segment.downstream_count*1000 + count] - dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(point).coords), indexes=1))).item()] - dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(point).coords), indexes=1))).item()] - dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(point).coords), indexes=1))).item()] + split_points = split_points + [midpoint] + index_count = index_count + [segment.downstream_count] + dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(midpoint).coords), indexes=1))).item()] + dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(midpoint).coords), indexes=1))).item()] + dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(midpoint).coords), indexes=1))).item()] headwater_path = headwater_path + [segment.headwater_path] - + elif point_density == 'all_points': + count=0 + for point in zip(*linestring.coords.xy): + stream_ids = stream_ids + [segment[stream_id]] + split_points = split_points + [Point(point)] + count = count + 1 + index_count = index_count + [segment.downstream_count*1000 + count] + dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(point).coords), indexes=1))).item()] + dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(point).coords), indexes=1))).item()] + dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(point).coords), indexes=1))).item()] + headwater_path = headwater_path + [segment.headwater_path] + elif stream_type == 'burnline': + stream_ids = stream_ids + [segment['id']] + split_points = split_points + [Point(segment.geometry)] + index_count = index_count + [segment['id']] + dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((Point(segment.geometry).coords), indexes=1))).item()] + dem_lat_thal_adj_elev = dem_lat_thal_adj_elev + [np.array(list(dem_lateral_thalweg_adj.sample((Point(segment.geometry).coords), indexes=1))).item()] + dem_thal_adj_elev = dem_thal_adj_elev + [np.array(list(dem_thalwegCond.sample((Point(segment.geometry).coords), indexes=1))).item()] + headwater_path = headwater_path + [segment.headwater_path] # gpd.GeoDataFrame({**data, "source": "dem_m"}) dem_m_pts = gpd.GeoDataFrame({'stream_id': stream_ids, 'source': 'dem_m', 'elevation_m': dem_m_elev, 'headwater_path': headwater_path, 'index_count': index_count, 'geometry': split_points}, crs=path.crs, geometry='geometry') dem_lat_thal_adj_pts = gpd.GeoDataFrame({'stream_id': stream_ids, 'source': 'dem_lat_thal_adj', 'elevation_m': dem_lat_thal_adj_elev, 'headwater_path': headwater_path, 'index_count': index_count, 'geometry': split_points}, crs=path.crs, geometry='geometry') dem_thal_adj_pts = gpd.GeoDataFrame({'stream_id': stream_ids, 'source': 'thal_adj_dem', 'elevation_m': dem_thal_adj_elev, 'headwater_path': headwater_path, 'index_count': index_count, 'geometry': split_points}, crs=path.crs, geometry='geometry') - for raster in [dem_m_pts,dem_lat_thal_adj_pts,dem_thal_adj_pts]: - raster = raster.sort_values(by=['index_count']) raster.set_index('index_count',inplace=True,drop=True) raster = raster.reset_index(drop=True) raster.index.names = ['index_count'] raster = raster.reset_index(drop=False) thalweg_points = thalweg_points.append(raster,ignore_index = True) - del raster - del dem_m_pts,dem_lat_thal_adj_pts,dem_thal_adj_pts del 
dem_lateral_thalweg_adj,dem_thalwegCond,dem_meters @@ -206,7 +211,6 @@ def compare_thalweg(args): print(f"huc {huc} has {len(thalweg_points)} thalweg points") def get_downstream_segments(streams, headwater_col,headwater_id,flag_column,stream_id,stream_type): - streams[flag_column] = False streams['downstream_count'] = -9 streams.loc[streams[headwater_col]==headwater_id,flag_column] = True @@ -217,30 +221,26 @@ def get_downstream_segments(streams, headwater_col,headwater_id,flag_column,stre visited = set() while Q: - q = Q.popleft() if q in visited: continue visited.add(q) - count = count + 1 + count = count + 1 if stream_type == 'burnline': - toNode,DnLevelPat = streams.loc[q,['ToNode','DnLevelPat']] downstream_ids = streams.loc[streams['FromNode'] == toNode,:].index.tolist() - # If multiple downstream_ids are returned select the ids that are along the main flow path (i.e. exclude segments that are diversions) - if len(set(downstream_ids)) > 1: # special case: remove duplicate NHDPlusIDs + if len(set(downstream_ids)) > 1: # special case: remove duplicate NHDPlusIDs relevant_ids = [segment for segment in downstream_ids if DnLevelPat == streams.loc[segment,'LevelPathI']] else: relevant_ids = downstream_ids elif stream_type == 'derived': - toNode = streams.loc[q,['NextDownID']].item() relevant_ids = streams.loc[streams[stream_id] == toNode,:].index.tolist() @@ -248,7 +248,6 @@ def get_downstream_segments(streams, headwater_col,headwater_id,flag_column,stre streams.loc[relevant_ids,'downstream_count'] = count for i in relevant_ids: - if i not in visited: Q.append(i) @@ -329,7 +328,7 @@ def plot_profile(elevation_table,profile_plots_filename): plots_dir = join(output_dir,'plots') os.makedirs(plots_dir, exist_ok=True) - spatial_dir = os.path.join(output_dir,'tables') + spatial_dir = os.path.join(output_dir,'spatial_layers') os.makedirs(spatial_dir, exist_ok=True) # Open log file @@ -347,7 +346,7 @@ def plot_profile(elevation_table,profile_plots_filename): dem_lateral_thalweg_adj_filename = os.path.join(huc_dir,'dem_lateral_thalweg_adj.tif') dem_thalwegCond_filename = os.path.join(huc_dir,'dem_thalwegCond.tif') profile_plots_filename = os.path.join(plots_dir,f"profile_drop_plots_{huc}_{point_density}_{stream_type}.png") - profile_gpkg_filename = os.path.join(huc_dir,f"thalweg_points_{huc}_{point_density}_{stream_type}.gpkg") + profile_gpkg_filename = os.path.join(spatial_dir,f"thalweg_elevation_changes_{huc}_{point_density}_{stream_type}.gpkg") profile_table_filename = os.path.join(spatial_dir,f"thalweg_elevation_changes_{huc}_{point_density}_{stream_type}.csv") procs_list.append([huc_dir,stream_type,point_density,huc,dem_meters_filename,dem_lateral_thalweg_adj_filename,dem_thalwegCond_filename,profile_plots_filename,profile_gpkg_filename,profile_table_filename]) @@ -362,9 +361,9 @@ def plot_profile(elevation_table,profile_plots_filename): spatial_list = os.listdir(spatial_dir) agg_thalweg_elevations_gpkg_fileName = os.path.join(output_dir, f"agg_thalweg_elevation_changes_{point_density}_{stream_type}.gpkg") agg_thalweg_elevation_table_fileName = os.path.join(output_dir, f"agg_thalweg_elevation_changes_{point_density}_{stream_type}.csv") - for table in spatial_list: + for layer in spatial_list: - huc_gpd = gpd.read_file(os.path.join(spatial_dir,table)) + huc_gpd = gpd.read_file(os.path.join(spatial_dir,layer)) # Write aggregate layer if os.path.isfile(agg_thalweg_elevations_gpkg_fileName): @@ -375,7 +374,7 @@ def plot_profile(elevation_table,profile_plots_filename): del huc_gpd # Create csv of 
elevation table - huc_table = pd.read_csv(agg_thalweg_elevations_gpkg_fileName) + huc_table = gpd.read_file(agg_thalweg_elevations_gpkg_fileName) huc_table.to_csv(agg_thalweg_elevation_table_fileName,index=False) # Close log file From 4023e148e77b6070785ac7293cd36facec735af8 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Fri, 14 May 2021 19:03:43 +0000 Subject: [PATCH 55/66] adding gpkg layers to -p --- src/output_cleanup.py | 6 +++++- src/reachID_grid_to_vector_points.py | 2 +- tools/thalweg_drop_check.py | 30 +++++++++++++++------------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/output_cleanup.py b/src/output_cleanup.py index 63c551c64..529ea1a5b 100755 --- a/src/output_cleanup.py +++ b/src/output_cleanup.py @@ -41,7 +41,11 @@ def output_cleanup(huc_number, output_folder_path, additional_whitelist, is_prod 'hand_ref_elev_table.csv', 'dem_lateral_thalweg_adj.tif', 'dem_thalwegCond.tif', - 'dem_meters.tif' + 'dem_meters.tif', + 'demDerived_reaches_split.gpkg', + 'nhd_headwater_points_subset.gpkg', + 'wbd.gpkg', + 'NHDPlusBurnLineEvent_subset.gpkg' ] # List of files that will be saved during a viz run diff --git a/src/reachID_grid_to_vector_points.py b/src/reachID_grid_to_vector_points.py index bcbc205aa..c77bcc732 100755 --- a/src/reachID_grid_to_vector_points.py +++ b/src/reachID_grid_to_vector_points.py @@ -42,7 +42,7 @@ def convert_grid_cells_to_points(raster,index_option,output_points_filename=Fals points[i-1] = Point(x,y) if index_option == 'reachID': reachID = np.array(list(raster.sample((Point(x,y).coords), indexes=1))).item() # check this; needs to add raster cell value + index - id[i-1] = reachID*1000 + i #reachID + i/100 + id[i-1] = reachID*10000 + i #reachID + i/100 elif (index_option == 'featureID') |(index_option == 'pixelID'): id[i-1] = i i += 1 diff --git a/tools/thalweg_drop_check.py b/tools/thalweg_drop_check.py index 3f82654d6..bded2c75a 100644 --- a/tools/thalweg_drop_check.py +++ b/tools/thalweg_drop_check.py @@ -39,7 +39,6 @@ """ -# huc_dir,stream_type,point_density,huc,dem_meters_filename,dem_lateral_thalweg_adj_filename,dem_thalwegCond_filename,profile_plots_filename ,profile_gpkg_filename,profile_table_filename = procs_list[0] def compare_thalweg(args): huc_dir = args[0] @@ -52,6 +51,7 @@ def compare_thalweg(args): profile_plots_filename = args[7] profile_gpkg_filename = args[8] profile_table_filename = args[9] + flows_grid_boolean_filename = args[10] if stream_type == 'derived': @@ -107,8 +107,12 @@ def compare_thalweg(args): if stream_type == 'burnline': geom_value = [] for index, segment in stream_path.iterrows(): - geom_value = geom_value + [(segment.geometry, segment.downstream_count)] + lineString = LineString(segment.geometry.coords[::-1]) + geom_value = geom_value + [(lineString, segment.downstream_count)] nhd_reaches_raster = features.rasterize(shapes=geom_value , out_shape=[dem_meters.height, dem_meters.width],fill=dem_meters.nodata,transform=dem_meters.transform, all_touched=True, dtype=np.float32) + flow_bool = rasterio.open(flows_grid_boolean_filename) + flow_bool_data = flow_bool.read(1) + nhd_reaches_raster = np.where(flow_bool_data == int(0), -9999.0, (nhd_reaches_raster).astype(rasterio.float32)) out_dem_filename = os.path.join(huc_dir,'NHDPlusBurnLineEvent_raster.tif') with rasterio.open(out_dem_filename, "w", **dem_meters.profile, BIGTIFF='YES') as dest: dest.write(nhd_reaches_raster, indexes = 1) @@ -179,7 +183,7 @@ def compare_thalweg(args): try: # Remove nodata_pts and convert elevation to ft - thalweg_points = 
thalweg_points.loc[thalweg_points.elevation_m>-9999.0] + thalweg_points = thalweg_points.loc[thalweg_points.elevation_m > 0.0] thalweg_points.elevation_m = np.round(thalweg_points.elevation_m,3) thalweg_points['elevation_ft'] = np.round(thalweg_points.elevation_m*3.28084,3) @@ -211,6 +215,7 @@ def compare_thalweg(args): print(f"huc {huc} has {len(thalweg_points)} thalweg points") def get_downstream_segments(streams, headwater_col,headwater_id,flag_column,stream_id,stream_type): + streams[flag_column] = False streams['downstream_count'] = -9 streams.loc[streams[headwater_col]==headwater_id,flag_column] = True @@ -230,17 +235,21 @@ def get_downstream_segments(streams, headwater_col,headwater_id,flag_column,stre count = count + 1 if stream_type == 'burnline': + toNode,DnLevelPat = streams.loc[q,['ToNode','DnLevelPat']] downstream_ids = streams.loc[streams['FromNode'] == toNode,:].index.tolist() - # If multiple downstream_ids are returned select the ids that are along the main flow path (i.e. exclude segments that are diversions) + # If multiple downstream_ids are returned select the ids that are along the main flow path (i.e. exclude segments that are diversions) if len(set(downstream_ids)) > 1: # special case: remove duplicate NHDPlusIDs + relevant_ids = [segment for segment in downstream_ids if DnLevelPat == streams.loc[segment,'LevelPathI']] else: + relevant_ids = downstream_ids elif stream_type == 'derived': + toNode = streams.loc[q,['NextDownID']].item() relevant_ids = streams.loc[streams[stream_id] == toNode,:].index.tolist() @@ -248,6 +257,7 @@ def get_downstream_segments(streams, headwater_col,headwater_id,flag_column,stre streams.loc[relevant_ids,'downstream_count'] = count for i in relevant_ids: + if i not in visited: Q.append(i) @@ -257,27 +267,21 @@ def get_downstream_segments(streams, headwater_col,headwater_id,flag_column,stre def plot_profile(elevation_table,profile_plots_filename): - num_plots = len(elevation_table.headwater_path.unique()) unique_rasters = elevation_table.source.unique() - if num_plots > 3: columns = int(np.ceil(num_plots / 3)) else: columns = 1 - # palette = dict(zip(unique_rasters, sns.color_palette(n_colors=len(unique_rasters)))) # palette.update({'dem_m':'gray'}) sns.set(style="ticks") - if len(unique_rasters) > 1: g = sns.FacetGrid(elevation_table, col="headwater_path", hue="source", hue_order=['dem_m', 'dem_lat_thal_adj', 'thal_adj_dem'], sharex=False, sharey=False,col_wrap=columns) else: g = sns.FacetGrid(elevation_table, col="headwater_path", hue="source", sharex=False, sharey=False,col_wrap=columns) - g.map(sns.lineplot, "index_count", "elevation_ft", palette="tab20c") g.set_axis_labels(x_var="Longitudinal Profile (index)", y_var="Elevation (ft)") - # Iterate thorugh each axis to get individual y-axis bounds for ax in g.axes.flat: mins = [] @@ -288,11 +292,9 @@ def plot_profile(elevation_table,profile_plots_filename): min_y = min(mins) - (max(maxes) - min(mins))/10 max_y = max(maxes) + (max(maxes) - min(mins))/10 ax.set_ylim(min_y,max_y) - # if len(unique_rasters) > 1: # ax.lines[0].set_linestyle("--") # ax.lines[0].set_color('gray') - # box = ax.get_position() # ax.set_position([box.x0, box.y0 + box.height * 0.1,box.width, box.height * 0.9]) # Adjust the arrangement of the plots @@ -300,7 +302,6 @@ def plot_profile(elevation_table,profile_plots_filename): g.add_legend() # plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) plt.subplots_adjust(bottom=0.25) - plt.savefig(profile_plots_filename) plt.close() @@ -342,6 +343,7 @@ def 
plot_profile(elevation_table,profile_plots_filename): if huc != 'logs': huc_dir = os.path.join(fim_dir,huc) + flows_grid_boolean_filename = os.path.join(huc_dir,'flows_grid_boolean.tif') dem_meters_filename = os.path.join(huc_dir,'dem_meters.tif') dem_lateral_thalweg_adj_filename = os.path.join(huc_dir,'dem_lateral_thalweg_adj.tif') dem_thalwegCond_filename = os.path.join(huc_dir,'dem_thalwegCond.tif') @@ -349,7 +351,7 @@ def plot_profile(elevation_table,profile_plots_filename): profile_gpkg_filename = os.path.join(spatial_dir,f"thalweg_elevation_changes_{huc}_{point_density}_{stream_type}.gpkg") profile_table_filename = os.path.join(spatial_dir,f"thalweg_elevation_changes_{huc}_{point_density}_{stream_type}.csv") - procs_list.append([huc_dir,stream_type,point_density,huc,dem_meters_filename,dem_lateral_thalweg_adj_filename,dem_thalwegCond_filename,profile_plots_filename,profile_gpkg_filename,profile_table_filename]) + procs_list.append([huc_dir,stream_type,point_density,huc,dem_meters_filename,dem_lateral_thalweg_adj_filename,dem_thalwegCond_filename,profile_plots_filename,profile_gpkg_filename,profile_table_filename,flows_grid_boolean_filename]) # Initiate multiprocessing print(f"Generating thalweg elevation profiles for {len(procs_list)} hucs using {number_of_jobs} jobs") From 3c7bec41e2be5db710fa9cd6b51a5ce417108bd9 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 2 Jun 2021 14:31:24 +0000 Subject: [PATCH 56/66] temp change to prepro files --- src/aggregate_vector_inputs.py | 15 ++++++++------- src/utils/shared_variables.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/aggregate_vector_inputs.py b/src/aggregate_vector_inputs.py index 87b7d6a49..30817bc0b 100755 --- a/src/aggregate_vector_inputs.py +++ b/src/aggregate_vector_inputs.py @@ -184,7 +184,8 @@ def find_nwm_incoming_streams(nwm_streams_,wbd,huc_unit): def collect_stream_attributes(nhdplus_vectors_dir, huc): - print ('Starting huc: ' + str(huc)) + print (f"Starting attribute collection for HUC {huc}",flush=True) + # Collecting NHDPlus HR attributes burnline_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusBurnLineEvent' + str(huc) + '.gpkg') vaa_filename = os.path.join(nhdplus_vectors_dir,huc,'NHDPlusFlowLineVAA' + str(huc) + '.gpkg') @@ -216,10 +217,10 @@ def collect_stream_attributes(nhdplus_vectors_dir, huc): nhd_streams.to_file(nhd_streams_agg_fileName,driver=getDriver(nhd_streams_agg_fileName),index=False) del nhd_streams - print ('finished huc: ' + str(huc)) + print (f"finished attribute collection for HUC {huc}",flush=True) else: - print ('missing data for huc ' + str(huc)) + print (f"missing data for HUC {huc}",flush=True) def subset_stream_networks(args, huc): @@ -231,7 +232,7 @@ def subset_stream_networks(args, huc): nhdplus_vectors_dir = args[4] nwm_huc4_intersections_filename = args[5] - print("starting HUC " + str(huc),flush=True) + print(f"starting stream subset for HUC {huc}",flush=True) nwm_headwater_id = 'ID' ahps_headwater_id = 'nws_lid' headwater_pts_id = 'site_id' @@ -297,7 +298,7 @@ def subset_stream_networks(args, huc): else: - print (f"skipping headwater adjustments for HUC: {huc}") + print (f"skipping headwater adjustments for HUC {huc}") del nhd_streams_fr @@ -393,11 +394,11 @@ def clean_up_intermediate_files(nhdplus_vectors_dir): missing_subsets = missing_subsets + [huc] print (f"running subset_results on {len(missing_subsets)} HUC4s") - num_workers=11 + num_workers=8 with ProcessPoolExecutor(max_workers=num_workers) as executor: # Preprocess nhd hr and add 
attributes - collect_attributes = [executor.submit(collect_stream_attributes, nhdplus_vectors_dir, str(huc)) for huc in huc_list] + # collect_attributes = [executor.submit(collect_stream_attributes, nhdplus_vectors_dir, str(huc)) for huc in huc_list] # Subset nhd hr network subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in missing_subsets] diff --git a/src/utils/shared_variables.py b/src/utils/shared_variables.py index 2a6e98ada..816d736d3 100644 --- a/src/utils/shared_variables.py +++ b/src/utils/shared_variables.py @@ -56,7 +56,7 @@ os.environ['nwm_headwaters_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_headwaters.gpkg') os.environ['nwm_huc4_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_huc4_intersections.gpkg') os.environ['nhd_huc8_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nhd_huc8_intersections.gpkg') -os.environ['ahps_filename'] = os.path.join(os.environ.get('ahps_dir'),'nws_lid_new.gpkg') +os.environ['ahps_filename'] = os.path.join(os.environ.get('ahps_dir'),'updated_lid','nws_lid.gpkg') os.environ['agg_nhd_headwaters_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_headwaters_adj_new.gpkg') os.environ['agg_nhd_streams_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_streams_adj_new.gpkg') os.environ['nwm_catchments_orig_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_catchments_original.gpkg') From d9e61e2a2e8842347dc716a01e31da29132260db Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Thu, 3 Jun 2021 19:49:45 +0000 Subject: [PATCH 57/66] fixing bug in lateral thalweg adjustment to skip large drops --- src/adjust_thalweg_lateral.py | 85 ++++++++++++++++------------------ src/aggregate_vector_inputs.py | 16 +++++-- src/utils/shared_variables.py | 2 +- 3 files changed, 53 insertions(+), 50 deletions(-) diff --git a/src/adjust_thalweg_lateral.py b/src/adjust_thalweg_lateral.py index 24b0222e2..500a44acf 100755 --- a/src/adjust_thalweg_lateral.py +++ b/src/adjust_thalweg_lateral.py @@ -8,41 +8,42 @@ def adjust_thalweg_laterally(elevation_raster, stream_raster, allocation_raster, cost_distance_raster, cost_distance_tolerance, dem_lateral_thalweg_adj): - + # ------------------------------------------- Get catchment_min_dict --------------------------------------------------- # # The following algorithm searches for the zonal minimum elevation in each pixel catchment # It updates the catchment_min_dict with this zonal minimum elevation value. @njit def make_zone_min_dict(elevation_window, zone_min_dict, zone_window, cost_window, cost_tolerance, ndv): - for i,cm in enumerate(zone_window): + for i,elev_m in enumerate(zone_window): # If the zone really exists in the dictionary, compare elevation values. i = int(i) - cm = int(cm) - + elev_m = int(elev_m) + if (cost_window[i] <= cost_tolerance): if elevation_window[i] > 0: # Don't allow bad elevation values - if (cm in zone_min_dict): - - if (elevation_window[i] < zone_min_dict[cm]): + if (elev_m in zone_min_dict): + + if (elevation_window[i] < zone_min_dict[elev_m]): # If the elevation_window's elevation value is less than the zone_min_dict min, update the zone_min_dict min. - zone_min_dict[cm] = elevation_window[i] + zone_min_dict[elev_m] = elevation_window[i] else: - zone_min_dict[cm] = elevation_window[i] + zone_min_dict[elev_m] = elevation_window[i] + return(zone_min_dict) - + # Open the masked gw_catchments_pixels_masked and dem_thalwegCond_masked. 
elevation_raster_object = rasterio.open(elevation_raster) allocation_zone_raster_object = rasterio.open(allocation_raster) cost_distance_raster_object = rasterio.open(cost_distance_raster) - + meta = elevation_raster_object.meta.copy() meta['tiled'], meta['compress'] = True, 'lzw' - + # -- Create zone_min_dict -- # print("Create zone_min_dict") zone_min_dict = typed.Dict.empty(types.int32,types.float32) # Initialize an empty dictionary to store the catchment minimums. # Update catchment_min_dict with pixel sheds minimum. - + for ji, window in elevation_raster_object.block_windows(1): # Iterate over windows, using elevation_raster_object as template. elevation_window = elevation_raster_object.read(1,window=window).ravel() # Define elevation_window. zone_window = allocation_zone_raster_object.read(1,window=window).ravel() # Define zone_window. @@ -50,72 +51,69 @@ def make_zone_min_dict(elevation_window, zone_min_dict, zone_window, cost_window # Call numba-optimized function to update catchment_min_dict with pixel sheds minimum. zone_min_dict = make_zone_min_dict(elevation_window, zone_min_dict, zone_window, cost_window, int(cost_distance_tolerance), meta['nodata']) - - # ------------------------------------------------------------------------------------------------------------------------ # - + + # ------------------------------------------------------------------------------------------------------------------------ # + elevation_raster_object.close() allocation_zone_raster_object.close() cost_distance_raster_object.close() - + # ------------------------------------------- Assign zonal min to thalweg ------------------------------------------------ # @njit def minimize_thalweg_elevation(dem_window, zone_min_dict, zone_window, thalweg_window): - + # Copy elevation values into new array that will store the minimized elevation values. dem_window_to_return = np.empty_like (dem_window) dem_window_to_return[:] = dem_window - - for i,cm in enumerate(zone_window): + + for i,elev_m in enumerate(zone_window): i = int(i) - cm = int(cm) + elev_m = int(elev_m) thalweg_cell = thalweg_window[i] # From flows_grid_boolean.tif (0s and 1s) if thalweg_cell == 1: # Make sure thalweg cells are checked. - if cm in zone_min_dict: - zone_min_elevation = zone_min_dict[cm] + if elev_m in zone_min_dict: + zone_min_elevation = zone_min_dict[elev_m] dem_thalweg_elevation = dem_window[i] - - elevation_difference = zone_min_elevation - dem_thalweg_elevation - - if zone_min_elevation < dem_thalweg_elevation and elevation_difference <= 5: + + elevation_difference = dem_thalweg_elevation - zone_min_elevation + + if (zone_min_elevation < dem_thalweg_elevation) and (elevation_difference <= 5): dem_window_to_return[i] = zone_min_elevation return(dem_window_to_return) - + # Specify raster object metadata. elevation_raster_object = rasterio.open(elevation_raster) allocation_zone_raster_object = rasterio.open(allocation_raster) thalweg_object = rasterio.open(stream_raster) - + dem_lateral_thalweg_adj_object = rasterio.open(dem_lateral_thalweg_adj, 'w', **meta) - + for ji, window in elevation_raster_object.block_windows(1): # Iterate over windows, using dem_rasterio_object as template. dem_window = elevation_raster_object.read(1,window=window) # Define dem_window. window_shape = dem_window.shape dem_window = dem_window.ravel() - + zone_window = allocation_zone_raster_object.read(1,window=window).ravel() # Define catchments_window. thalweg_window = thalweg_object.read(1,window=window).ravel() # Define thalweg_window. 
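        # Sketch of the replacement rule applied below, on plain numpy arrays rather
        # than raster windows (values are made up; 5 m is the threshold used in this
        # version of the function):
        #   dem      = np.array([10.2, 11.0, 9.5])   # thalweg cell elevations
        #   zone_min = np.array([ 9.8,  4.0, 9.4])   # zonal minimum for each cell
        #   drop     = dem - zone_min                 # positive where the zone is lower
        #   np.where((zone_min < dem) & (drop <= 5), zone_min, dem)   # -> [9.8, 11.0, 9.4]
        # Small drops adopt the zonal minimum; the 7 m drop is skipped and the original
        # thalweg elevation is kept.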
- + # Call numba-optimized function to reassign thalweg cell values to catchment minimum value. minimized_dem_window = minimize_thalweg_elevation(dem_window, zone_min_dict, zone_window, thalweg_window) minimized_dem_window = minimized_dem_window.reshape(window_shape).astype(np.float32) - dem_lateral_thalweg_adj_object.write(minimized_dem_window, window=window, indexes=1) - + dem_lateral_thalweg_adj_object.write(minimized_dem_window, window=window, indexes=1) + elevation_raster_object.close() allocation_zone_raster_object.close() cost_distance_raster_object.close() - - # Delete allocation_raster and distance_raster. - - - + + if __name__ == '__main__': - + # Parse arguments. parser = argparse.ArgumentParser(description='Adjusts the elevation of the thalweg to the lateral zonal minimum.') parser.add_argument('-e','--elevation_raster',help='Raster of elevation.',required=True) @@ -124,11 +122,8 @@ def minimize_thalweg_elevation(dem_window, zone_min_dict, zone_window, thalweg_w parser.add_argument('-d','--cost_distance_raster',help='Raster of cost distances for the allocation raster.',required=True) parser.add_argument('-t','--cost_distance_tolerance',help='Tolerance in meters to use when searching for zonal minimum.',required=True) parser.add_argument('-o','--dem_lateral_thalweg_adj',help='Output elevation raster with adjusted thalweg.',required=True) - + # Extract to dictionary and assign to variables. args = vars(parser.parse_args()) - + adjust_thalweg_laterally(**args) - - - diff --git a/src/aggregate_vector_inputs.py b/src/aggregate_vector_inputs.py index 30817bc0b..7aeb168dd 100755 --- a/src/aggregate_vector_inputs.py +++ b/src/aggregate_vector_inputs.py @@ -255,10 +255,16 @@ def subset_stream_networks(args, huc): selected_wbd8 = selected_wbd8.reset_index(drop=True) # Identify FR/NWM headwaters and subset HR network - nhd_streams_fr = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersections_filename) + try: + nhd_streams_fr = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_filename,nwm_headwaters_filename,nwm_headwater_id,nwm_huc4_intersections_filename) + except: + print (f"Error subsetting NHD HR network for HUC {huc}",flush=True) # Identify nhd mainstem streams - nhd_streams_all = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_fr,ahps_filename,ahps_headwater_id,nwm_huc4_intersections_filename,True) + try: + nhd_streams_all = subset_nhd_network(huc,huc_mask,selected_wbd8,nhd_streams_fr,ahps_filename,ahps_headwater_id,nwm_huc4_intersections_filename,True) + except: + print (f"Error identifing MS network for HUC {huc}",flush=True) # Identify HUC8 intersection points nhd_huc8_intersections = find_nwm_incoming_streams(nhd_streams_all,selected_wbd8,8) @@ -302,6 +308,7 @@ def subset_stream_networks(args, huc): del nhd_streams_fr + print(f"finished stream subset for HUC {huc}",flush=True) def aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileName,agg_nhd_streams_adj_fileName,huc_list): @@ -394,7 +401,7 @@ def clean_up_intermediate_files(nhdplus_vectors_dir): missing_subsets = missing_subsets + [huc] print (f"running subset_results on {len(missing_subsets)} HUC4s") - num_workers=8 + num_workers=11 with ProcessPoolExecutor(max_workers=num_workers) as executor: # Preprocess nhd hr and add attributes @@ -404,7 +411,8 @@ def clean_up_intermediate_files(nhdplus_vectors_dir): del wbd4,wbd8 - # Aggregate fr and ms nhd netowrks for entire nwm domain + # Aggregate subset nhd networks 
for entire nwm domain + print ('Aggregating subset NHD networks for entire NWM domain') aggregate_stream_networks(nhdplus_vectors_dir,agg_nhd_headwaters_adj_fileName,agg_nhd_streams_adj_fileName,huc_list) # Remove intermediate files diff --git a/src/utils/shared_variables.py b/src/utils/shared_variables.py index 816d736d3..f04d8b605 100644 --- a/src/utils/shared_variables.py +++ b/src/utils/shared_variables.py @@ -56,7 +56,7 @@ os.environ['nwm_headwaters_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_headwaters.gpkg') os.environ['nwm_huc4_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_huc4_intersections.gpkg') os.environ['nhd_huc8_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nhd_huc8_intersections.gpkg') -os.environ['ahps_filename'] = os.path.join(os.environ.get('ahps_dir'),'updated_lid','nws_lid.gpkg') +os.environ['ahps_filename'] = os.path.join(os.environ.get('ahps_dir'),'nws_lid.gpkg') os.environ['agg_nhd_headwaters_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_headwaters_adj_new.gpkg') os.environ['agg_nhd_streams_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_streams_adj_new.gpkg') os.environ['nwm_catchments_orig_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_catchments_original.gpkg') From 0cc2bbccb28014e0abe7c9831d5adf868b1e7040 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Fri, 4 Jun 2021 05:25:29 +0000 Subject: [PATCH 58/66] using new nhd inputs --- fim_run.sh | 4 ++-- src/aggregate_vector_inputs.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fim_run.sh b/fim_run.sh index cf5de36da..daae0d3e8 100755 --- a/fim_run.sh +++ b/fim_run.sh @@ -114,8 +114,8 @@ export input_WBD_gdb=$inputDataDir/wbd/WBD_National.gpkg export input_nwm_lakes=$inputDataDir/nwm_hydrofabric/nwm_lakes.gpkg export input_nwm_catchments=$inputDataDir/nwm_hydrofabric/nwm_catchments.gpkg export input_nwm_flows=$inputDataDir/nwm_hydrofabric/nwm_flows.gpkg -export input_nhd_flowlines=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_streams_adj.gpkg -export input_nhd_headwaters=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_headwaters_adj.gpkg +export input_nhd_flowlines=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_streams_adj_new.gpkg +export input_nhd_headwaters=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_headwaters_adj_new.gpkg export input_GL_boundaries=$inputDataDir/landsea/gl_water_polygons.gpkg ## Input handling ## $srcDir/check_huc_inputs.py -u "$hucList" diff --git a/src/aggregate_vector_inputs.py b/src/aggregate_vector_inputs.py index 7aeb168dd..3675b88b8 100755 --- a/src/aggregate_vector_inputs.py +++ b/src/aggregate_vector_inputs.py @@ -400,12 +400,12 @@ def clean_up_intermediate_files(nhdplus_vectors_dir): if not os.path.isfile(streams_adj_path): missing_subsets = missing_subsets + [huc] - print (f"running subset_results on {len(missing_subsets)} HUC4s") + print (f"Subsetting stream network for {len(missing_subsets)} HUC4s") num_workers=11 with ProcessPoolExecutor(max_workers=num_workers) as executor: # Preprocess nhd hr and add attributes - # collect_attributes = [executor.submit(collect_stream_attributes, nhdplus_vectors_dir, str(huc)) for huc in huc_list] + collect_attributes = [executor.submit(collect_stream_attributes, nhdplus_vectors_dir, str(huc)) for huc in huc_list] # Subset nhd hr network subset_results = [executor.submit(subset_stream_networks, subset_arg_list, str(huc)) for huc in missing_subsets] From 
7c4f01ba91c6d66792e94a0007947951305bf130 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Mon, 14 Jun 2021 14:06:17 +0000 Subject: [PATCH 59/66] setting lateral elevation adjustment threshold limit to 3 m --- fim_run.sh | 4 ++-- src/adjust_thalweg_lateral.py | 2 +- src/utils/shared_variables.py | 4 ++-- tools/thalweg_drop_check.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fim_run.sh b/fim_run.sh index daae0d3e8..cf5de36da 100755 --- a/fim_run.sh +++ b/fim_run.sh @@ -114,8 +114,8 @@ export input_WBD_gdb=$inputDataDir/wbd/WBD_National.gpkg export input_nwm_lakes=$inputDataDir/nwm_hydrofabric/nwm_lakes.gpkg export input_nwm_catchments=$inputDataDir/nwm_hydrofabric/nwm_catchments.gpkg export input_nwm_flows=$inputDataDir/nwm_hydrofabric/nwm_flows.gpkg -export input_nhd_flowlines=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_streams_adj_new.gpkg -export input_nhd_headwaters=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_headwaters_adj_new.gpkg +export input_nhd_flowlines=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_streams_adj.gpkg +export input_nhd_headwaters=$inputDataDir/nhdplus_vectors_aggregate/agg_nhd_headwaters_adj.gpkg export input_GL_boundaries=$inputDataDir/landsea/gl_water_polygons.gpkg ## Input handling ## $srcDir/check_huc_inputs.py -u "$hucList" diff --git a/src/adjust_thalweg_lateral.py b/src/adjust_thalweg_lateral.py index 500a44acf..8255efec4 100755 --- a/src/adjust_thalweg_lateral.py +++ b/src/adjust_thalweg_lateral.py @@ -79,7 +79,7 @@ def minimize_thalweg_elevation(dem_window, zone_min_dict, zone_window, thalweg_w elevation_difference = dem_thalweg_elevation - zone_min_elevation - if (zone_min_elevation < dem_thalweg_elevation) and (elevation_difference <= 5): + if (zone_min_elevation < dem_thalweg_elevation) and (elevation_difference <= 3): dem_window_to_return[i] = zone_min_elevation return(dem_window_to_return) diff --git a/src/utils/shared_variables.py b/src/utils/shared_variables.py index f04d8b605..fefad3cfa 100644 --- a/src/utils/shared_variables.py +++ b/src/utils/shared_variables.py @@ -57,7 +57,7 @@ os.environ['nwm_huc4_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_huc4_intersections.gpkg') os.environ['nhd_huc8_intersections_filename'] = os.path.join(os.environ.get('nwm_dir'),'nhd_huc8_intersections.gpkg') os.environ['ahps_filename'] = os.path.join(os.environ.get('ahps_dir'),'nws_lid.gpkg') -os.environ['agg_nhd_headwaters_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_headwaters_adj_new.gpkg') -os.environ['agg_nhd_streams_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_streams_adj_new.gpkg') +os.environ['agg_nhd_headwaters_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_headwaters_adj.gpkg') +os.environ['agg_nhd_streams_adj_fileName'] = os.path.join(os.environ.get('nhdplus_aggregate_dir'),'agg_nhd_streams_adj.gpkg') os.environ['nwm_catchments_orig_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_catchments_original.gpkg') os.environ['nwm_catchments_all_filename'] = os.path.join(os.environ.get('nwm_dir'),'nwm_catchments.gpkg') diff --git a/tools/thalweg_drop_check.py b/tools/thalweg_drop_check.py index bded2c75a..a864dc9c0 100644 --- a/tools/thalweg_drop_check.py +++ b/tools/thalweg_drop_check.py @@ -196,7 +196,7 @@ def compare_thalweg(args): # Identify significant rises/drops in elevation thal_adj_points['elev_change'] = thal_adj_points.groupby(['headwater_path', 'source'])['elevation_m'].apply(lambda x: x - 
x.shift()) - elev_changes = thal_adj_points.loc[(thal_adj_points.elev_change<=-5.0) | (thal_adj_points.elev_change>0.0)] + elev_changes = thal_adj_points.loc[(thal_adj_points.elev_change<=-3.0) | (thal_adj_points.elev_change>0.0)] if not elev_changes.empty: # elev_changes.to_csv(profile_table_filename,index=False) From 8ebd01fed063e45ec21b32fb6b64a1e61024694e Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Mon, 14 Jun 2021 18:00:38 +0000 Subject: [PATCH 60/66] cleaning up feature branch for pull request --- src/agreedem.py | 171 +++++++++++++-------------- src/run_by_unit.sh | 4 - tools/thalweg_comparison.py | 225 ------------------------------------ 3 files changed, 79 insertions(+), 321 deletions(-) delete mode 100755 tools/thalweg_comparison.py diff --git a/src/agreedem.py b/src/agreedem.py index 95d4d6623..3d92a2d42 100755 --- a/src/agreedem.py +++ b/src/agreedem.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - import rasterio import numpy as np import os @@ -37,15 +36,13 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff None. ''' + #------------------------------------------------------------------ + # 1. From Hellweger documentation: Compute the vector grid + # (vectgrid). The cells in the vector grid corresponding to the + # lines in the vector coverage have data. All other cells have no + # data. - ''' - ------------------------------------------------------------------ - 1. From Hellweger documentation: Compute the vector grid (vectgrid). - The cells in the vector grid corresponding to the lines in the vector - coverage have data. All other cells have no data. - ''' - - # Import dem layer and river layer and get dem profile + # Import dem layer and river layer and get dem profile. elev = rasterio.open(dem) dem_profile = elev.profile @@ -69,17 +66,15 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff river_raw_data_window = rivers.read(1, window = window) river_data_window = np.where(elev_mask_window == True, river_raw_data_window, 0) - ''' - --------------------------------------------------------------- - 2. From Hellweger documentation: Compute the smooth drop/raise - grid (smogrid). The cells in the smooth drop/raise grid - corresponding to the vector lines have an elevation equal to that - of the original DEM (oelevgrid) plus a certain distance - (smoothdist). All other cells have no data. - ''' - - # Assign smooth distance and calculate the smogrid - smooth_dist = -1 * smooth_drop # in meters + #--------------------------------------------------------------- + # 2. From Hellweger documentation: Compute the smooth drop/raise + # grid (smogrid). The cells in the smooth drop/raise grid + # corresponding to the vector lines have an elevation equal to that + # of the original DEM (oelevgrid) plus a certain distance + # (smoothdist). All other cells have no data. + + # Assign smooth distance and calculate the smogrid. + smooth_dist = -1 * smooth_drop # in meters. smogrid_window = river_data_window*(elev_data_window + smooth_dist) # Write out raster @@ -88,26 +83,23 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff elev.close() rivers.close() raster.close() - - ''' - ------------------------------------------------------------------ - 3. From Hellweger documentation: Compute the vector distance grids - (vectdist and vectallo). The cells in the vector distance grid - (vectdist) store the distance to the closest vector cell. 
The - cells in vector allocation grid (vectallo) store the elevation of - the closest vector cell. - ''' - # Compute allocation and proximity grid using GRASS gis r.grow.distance tool. - # Output distance grid in meters. Set datatype for output allocation and proximity grids to float32. + #------------------------------------------------------------------ + # 3. From Hellweger documentation: Compute the vector distance grids + # (vectdist and vectallo). The cells in the vector distance grid + # (vectdist) store the distance to the closest vector cell. The + # cells in vector allocation grid (vectallo) store the elevation of + # the closest vector cell. + + # Compute allocation and proximity grid using GRASS gis + # r.grow.distance tool. Output distance grid in meters. Set datatype + # for output allocation and proximity grids to float32. vectdist_grid, vectallo_grid = r_grow_distance(smo_output, grass_workspace, 'Float32', 'Float32') - ''' - ------------------------------------------------------------------ - 4. From Hellweger documentation: Compute the buffer grid - (bufgrid2). The cells in the buffer grid outside the buffer - distance (buffer) store the original elevation. The cells in the - buffer grid inside the buffer distance have no data. - ''' + #------------------------------------------------------------------ + # 4. From Hellweger documentation: Compute the buffer grid + # (bufgrid2). The cells in the buffer grid outside the buffer + # distance (buffer) store the original elevation. The cells in the + # buffer grid inside the buffer distance have no data. # Open distance, allocation, elevation grids. vectdist = rasterio.open(vectdist_grid) @@ -128,35 +120,35 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff vectallo_data_window = vectallo.read(1, window = window) elev_data_window = elev.read(1, window = window) - # Define buffer distance and calculate adjustment to compute the bufgrid. + # Define buffer distance and calculate adjustment to compute the + # bufgrid. # half_res adjustment equal to half distance of one cell half_res = elev.res[0]/2 final_buffer = buffer_dist - half_res # assume all units in meters. - # Calculate bufgrid. Assign NODATA to areas where vectdist_data <= buffered value. + # Calculate bufgrid. Assign NODATA to areas where vectdist_data <= + # buffered value. bufgrid_window = np.where(vectdist_data_window > final_buffer, elev_data_window, dem_profile['nodata']) - # Write out raster + # Write out raster. raster.write(bufgrid_window.astype('float32'), indexes = 1, window = window) vectdist.close() vectallo.close() elev.close() - - ''' - ------------------------------------------------------------------ - 5. From Hellweger documentation: Compute the buffer distance grids - (bufdist and bufallo). The cells in the buffer distance grid - (bufdist) store the distance to the closest valued buffer grid - cell (bufgrid2). The cells in buffer allocation grid (bufallo) - store the elevation of the closest valued buffer cell. - ''' - - # Compute allocation and proximity grid using GRASS gis r.grow.distance. - # Output distance grid in meters. Set datatype for output allocation and proximity grids to float32. + #------------------------------------------------------------------ + # 5. From Hellweger documentation: Compute the buffer distance grids + # (bufdist and bufallo). The cells in the buffer distance grid + # (bufdist) store the distance to the closest valued buffer grid + # cell (bufgrid2). 
The cells in buffer allocation grid (bufallo) + # store the elevation of the closest valued buffer cell. + + # Compute allocation and proximity grid using GRASS gis + # r.grow.distance. Output distance grid in meters. Set datatype for + # output allocation and proximity grids to float32. bufdist_grid, bufallo_grid = r_grow_distance(buf_output, grass_workspace, 'Float32', 'Float32') - # Open distance, allocation, elevation grids + # Open distance, allocation, elevation grids. bufdist = rasterio.open(bufdist_grid) bufallo = rasterio.open(bufallo_grid) vectdist = rasterio.open(vectdist_grid) @@ -164,7 +156,7 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff rivers = rasterio.open(rivers_raster) elev = rasterio.open(dem) - # Define profile output file + # Define profile output file. agree_output = output_raster agree_profile = dem_profile.copy() agree_profile.update(dtype = 'float32') @@ -173,7 +165,7 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff with rasterio.Env(): with rasterio.open(agree_output, 'w', **agree_profile) as raster: for ji, window in elev.block_windows(1): - # Read elevation data and mask, distance and allocation grids, and river data + # Read elevation data and mask, distance and allocation grids, and river data. elev_data_window = elev.read(1, window = window) elev_mask_window = elev.read_masks(1, window = window).astype('bool') bufdist_data_window = bufdist.read(1, window = window) @@ -184,42 +176,37 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff river_data_window = np.where(elev_mask_window == True, river_raw_data_window, -20.0) - - ''' - ------------------------------------------------------------------ - 6. From Hellweger documentation: Compute the smooth modified - elevation grid (smoelev). The cells in the smooth modified - elevation grid store the results of the smooth surface - reconditioning process. Note that for cells outside the buffer the - equation below assigns the original elevation. - ''' - - # Calculate smoelev + #------------------------------------------------------------------ + # 6. From Hellweger documentation: Compute the smooth modified + # elevation grid (smoelev). The cells in the smooth modified + # elevation grid store the results of the smooth surface + # reconditioning process. Note that for cells outside the buffer the + # equation below assigns the original elevation. + + # Calculate smoelev. smoelev_window = vectallo_data_window + ((bufallo_data_window - vectallo_data_window)/(bufdist_data_window + vectdist_data_window)) * vectdist_data_window - ''' - ------------------------------------------------------------------ - 7. From Hellweger documentation: Compute the sharp drop/raise grid - (shagrid). The cells in the sharp drop/raise grid corresponding to - the vector lines have an elevation equal to that of the smooth - modified elevation grid (smoelev) plus a certain distance - (sharpdist). All other cells have no data. - ''' - - # Define sharp drop distance and calculate the sharp drop grid where only river cells are dropped by the sharp_dist amount. - sharp_dist = -1 * sharp_drop # in meters + #------------------------------------------------------------------ + # 7. From Hellweger documentation: Compute the sharp drop/raise grid + # (shagrid). 
The cells in the sharp drop/raise grid corresponding to + # the vector lines have an elevation equal to that of the smooth + # modified elevation grid (smoelev) plus a certain distance + # (sharpdist). All other cells have no data. + + # Define sharp drop distance and calculate the sharp drop grid where + # only river cells are dropped by the sharp_dist amount. + sharp_dist = -1 * sharp_drop # in meters. shagrid_window = (smoelev_window + sharp_dist) * river_data_window - ''' - ------------------------------------------------------------------ - 8. From Hellweger documentation: Compute the modified elevation - grid (elevgrid). The cells in the modified elevation grid store - the results of the surface reconditioning process. Note that for - cells outside the buffer the the equation below assigns the - original elevation. - ''' + #------------------------------------------------------------------ + # 8. From Hellweger documentation: Compute the modified elevation + # grid (elevgrid). The cells in the modified elevation grid store + # the results of the surface reconditioning process. Note that for + # cells outside the buffer the the equation below assigns the + # original elevation. - # Merge sharp drop grid with smoelev grid. Then apply the same NODATA mask as original elevation grid. + # Merge sharp drop grid with smoelev grid. Then apply the same + # NODATA mask as original elevation grid. elevgrid_window = np.where(river_data_window == 0, smoelev_window, shagrid_window) agree_dem_window = np.where(elev_mask_window == True, elevgrid_window, dem_profile['nodata']) @@ -232,8 +219,7 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff vectallo.close() rivers.close() elev.close() - - # If the '-t' flag is called, intermediate data is removed + # If the '-t' flag is called, intermediate data is removed. if delete_intermediate_data: os.remove(smo_output) os.remove(buf_output) @@ -245,7 +231,7 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff if __name__ == '__main__': - # Parse arguments + #Parse arguments parser = argparse.ArgumentParser(description = 'Calculate AGREE DEM') parser.add_argument('-r', '--rivers', help = 'flows grid boolean layer', required = True) parser.add_argument('-d', '--dem_m', help = 'DEM raster in meters', required = True) @@ -257,9 +243,10 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff parser.add_argument('-sh', '---sharp', help = 'Sharp drop (m)', required = True) parser.add_argument('-t', '--del', help = 'Optional flag to delete intermediate datasets', action = 'store_true') - # Extract to dictionary and assign to variables + #Extract to dictionary and assign to variables. 
args = vars(parser.parse_args()) + # rename variable inputs rivers_raster = args['rivers'] dem = args['dem_m'] workspace = args['workspace'] @@ -270,5 +257,5 @@ def agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buff sharp_drop = float(args['sharp']) delete_intermediate_data = args['del'] - # Run agreedem + #Run agreedem agreedem(rivers_raster, dem, output_raster, workspace, grass_workspace, buffer_dist, smooth_drop, sharp_drop, delete_intermediate_data) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 5b4070ac3..34be2fc2d 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -105,10 +105,6 @@ Tstart gdal_rasterize -ot Int32 -burn 1 -init 0 -co "COMPRESS=LZW" -co "BIGTIFF=YES" -co "TILED=YES" -te $xmin $ymin $xmax $ymax -ts $ncols $nrows $outputHucDataDir/NHDPlusBurnLineEvent_subset.gpkg $outputHucDataDir/flows_grid_boolean.tif Tcount -##gdal_rasterize -ot Float32 -a NHDPlusID -init 0 -co "COMPRESS=LZW" -co "BIGTIFF=YES" -co "TILED=YES" -te $xmin $ymin $xmax $ymax -ts $ncols $nrows $outputHucDataDir/NHDPlusBurnLineEvent_subset.gpkg $outputHucDataDir/flows_grid_nhd.tif - - - ## RASTERIZE NHD HEADWATERS (1 & 0) ## echo -e $startDiv"Rasterize NHD Headwaters $hucNumber"$stopDiv date -u diff --git a/tools/thalweg_comparison.py b/tools/thalweg_comparison.py deleted file mode 100755 index 5f9f734e6..000000000 --- a/tools/thalweg_comparison.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import geopandas as gpd -import rasterio -import pandas as pd -import numpy as np -import argparse -import matplotlib.pyplot as plt -import seaborn as sns -from functools import reduce -from multiprocessing import Pool -from os.path import isfile, join, dirname -import shutil -import warnings -from pathlib import Path -import time -warnings.simplefilter(action='ignore', category=FutureWarning) - -""" - Plot Rating Curves and Compare to USGS Gages - - Parameters - ---------- - fim_dir : str - Directory containing FIM output folders. - output_dir : str - Directory containing rating curve plots and tables. - usgs_gages_filename : str - File name of USGS rating curves. - nwm_flow_dir : str - Directory containing NWM recurrence flows files. - number_of_jobs : str - Number of jobs. - stat_groups : str - string of columns to group eval metrics. 
-""" -outfolder = '/data/outputs/single_pixel_huc_ms_c/02030103' # dev_v3_0_15_7_adj_huc_test -dem_thalwegCond_filename = os.path.join(outfolder,'dem_thalwegCond.tif') -dem_meters_filename = os.path.join(outfolder,'dem_meters.tif') -reaches_split_points_filename = os.path.join(outfolder,'demDerived_reaches_split_points.gpkg') -reaches_filename = os.path.join(outfolder,'demDerived_reaches_split.gpkg') - - -def compare_thalweg(args): - - huc = args[0] - reaches_split_points_filename = args[1] - reaches_filename = args[2] - dem_thalwegCond_filename = args[3] - dem_meters_filename = args[4] - -reaches_split_points = gpd.read_file(reaches_split_points_filename) -reaches = gpd.read_file(reaches_filename) -dem_thalwegCond = rasterio.open(dem_thalwegCond_filename,'r') -dem_meters = rasterio.open(dem_meters_filename,'r') - -plot_filename = '/data/outputs/single_pixel_huc_ms_c/02030103/elev_plots.png' - -reaches_split_points = reaches_split_points.rename(columns={'id': 'HydroID'}) - -hydroid = [] -index_order = [] -thal_adj_elev = [] -dem_m_elev = [] -for index, point in reaches_split_points.iterrows(): - hydroid = hydroid + [point.HydroID] - index_order = index_order + [index] - dem_m_elev = dem_m_elev + [np.array(list(dem_meters.sample((point.geometry.coords), indexes=1))).item()] - thal_adj_elev = thal_adj_elev + [np.array(list(dem_thalwegCond.sample((point.geometry.coords), indexes=1))).item()] - -dem_thalweg_elevations = pd.DataFrame({'HydroID': hydroid, 'pt_order': index_order, 'elevation_m': dem_m_elev,'source': 'dem_meters'}) -dem_adj_thalweg_elevations = pd.DataFrame({'HydroID': hydroid, 'pt_order': index_order, 'elevation_m': thal_adj_elev,'source': 'thalweg_adj'}) - -all_elevations = dem_thalweg_elevations.append(dem_adj_thalweg_elevations) - -reach_att = reaches[['HydroID', 'From_Node', 'To_Node', 'NextDownID']] - -thalweg_elevations = all_elevations.merge(reach_att, on="HydroID") - -# Find segments where elevation drops 5 m per -# drops = thalweg_elevations.loc[thalweg_elevations.HydroID -# all_hydro_ids = dict(thalweg_elevations[['HydroID','elevation_m']]) -thalweg_elevations.NextDownID = thalweg_elevations.NextDownID.astype('int') -dem_adj_thalweg_elevations = thalweg_elevations.loc[thalweg_elevations.source=='thalweg_adj'] -min_index = dem_adj_thalweg_elevations.groupby(['HydroID']).pt_order.min() -min_index = min_index.reset_index() -min_index = min_index.rename(columns={'pt_order': 'min_index'}) - -for index, downstream_id in dem_adj_thalweg_elevations.iterrows(): - if index == 1: - break - if downstream_id.NextDownID != -1: - downstream_elevs = dem_adj_thalweg_elevations.loc[(dem_adj_thalweg_elevations.HydroID==downstream_id.NextDownID) & (dem_adj_thalweg_elevations.source=='thalweg_adj')].elevation_m - if (downstream_id.elevation_m - downstream_elevs[0]) > 5: - print (f"HydroID {HydroID} drops {(downstream_id.elevation_m - downstream_elev)} meters down from HydroID {NextDownID}") - upstream_elev = dem_adj_thalweg_elevations.loc[dem_adj_thalweg_elevations.NextDownID==downstream_id.NextDownID].elevation_m - -# drops = thalweg_elevations. 
- -select_hydroids = [10680001,10680002,10680020,10680034,10680061,10680076,10680077,10680148,10680094] - -select_elevations = thalweg_elevations.loc[thalweg_elevations.HydroID.isin(select_hydroids)] - -# Convert index to longitudinal distance - -# Find reference index for each segment to convert index to longitudinal distance -min_index = select_elevations.groupby(['HydroID']).pt_order.min() -min_index = min_index.reset_index() -min_index = min_index.rename(columns={'pt_order': 'min_index'}) - -# Subtract reference index from index and convert to feet -segment_distance = pd.merge(select_elevations[['HydroID', 'pt_order','source']],min_index, on="HydroID").reset_index(drop=True) -segment_distance['distance'] = (segment_distance.pt_order - segment_distance.min_index)* 32.8084 -segment_distance.distance = segment_distance.distance.round(1) -# merge distances back into table -select_elevations = select_elevations.reset_index(drop=True) -# segment_distance_sub = segment_distance.filter(items=['HydroID', 'distance']).reset_index(drop=True) -select_elevations = pd.concat([select_elevations.set_index('HydroID'), segment_distance[['HydroID', 'distance']].set_index('HydroID')], axis=1, join="inner") -select_elevations = select_elevations.reset_index() -# Convert elevation to feet -select_elevations['elevation_ft'] = select_elevations.elevation_m * 3.28084 # convert from m to ft -select_elevations.elevation_ft = select_elevations.elevation_ft.round(1) - -select_elevations = select_elevations.sort_values(['HydroID', 'distance','elevation_ft'], ascending=[1, 0, 0]) -select_elevations = select_elevations.reset_index(drop=True) - -## Generate rating curve plots -num_plots = len(select_elevations.HydroID.unique()) - -if num_plots > 3: - columns = num_plots // 3 -else: - columns = 1 - -sns.set(style="ticks") -g = sns.FacetGrid(select_elevations, col="HydroID", hue="source",sharex=True, sharey=False,col_wrap=columns) -g.map(sns.lineplot, "distance", "elevation_ft", palette="tab20c") # , marker="o" -g.set_axis_labels(x_var="Longitudinal Distance (ft)", y_var="Elevation (ft)") - -# Iterate thorugh each axis to get individual y-axis bounds -for ax in g.axes.flat: - print (ax.lines) - mins = [] - maxes = [] - for line in ax.lines: - mins = mins + [min(line.get_ydata())] - maxes = maxes + [max(line.get_ydata())] - min_y = min(mins) - (max(maxes) - min(mins))/10 - max_y = max(maxes) + (max(maxes) - min(mins))/10 - ax.set_ylim(min_y,max_y) - -# Adjust the arrangement of the plots -g.fig.tight_layout(w_pad=1) -g.add_legend() - -plt.savefig(plot_filename) -plt.close() - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='generate rating curve plots and tables for FIM and USGS gages') - parser.add_argument('-fim_dir','--fim-dir', help='FIM output dir', required=True,type=str) - parser.add_argument('-output_dir','--output-dir', help='rating curves output folder', required=True,type=str) - parser.add_argument('-gages','--usgs-gages-filename',help='USGS rating curves',required=True,type=str) - parser.add_argument('-flows','--nwm-flow-dir',help='NWM recurrence flows dir',required=True,type=str) - parser.add_argument('-catfim', '--catfim-flows-filename', help='Categorical FIM flows file',required = True,type=str) - parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) - parser.add_argument('-group','--stat-groups',help='column(s) to group stats',required=False,type=str) - - args = vars(parser.parse_args()) - - fim_dir = args['fim_dir'] - 
output_dir = args['output_dir'] - usgs_gages_filename = args['usgs_gages_filename'] - nwm_flow_dir = args['nwm_flow_dir'] - catfim_flows_filename = args['catfim_flows_filename'] - number_of_jobs = args['number_of_jobs'] - stat_groups = args['stat_groups'] - - stat_groups = stat_groups.split() - procs_list = [] - - plots_dir = join(output_dir,'plots') - os.makedirs(plots_dir, exist_ok=True) - tables_dir = join(output_dir,'tables') - os.makedirs(tables_dir, exist_ok=True) - - #Check age of gages csv and recommend updating if older than 30 days. - print(check_file_age(usgs_gages_filename)) - - # Open log file - sys.__stdout__ = sys.stdout - log_file = open(join(output_dir,'rating_curve_comparison.log'),"w") - sys.stdout = log_file - - huc_list = os.listdir(fim_dir) - for huc in huc_list: - - if huc != 'logs': - elev_table_filename = join(fim_dir,huc,'usgs_elev_table.csv') - hydrotable_filename = join(fim_dir,huc,'hydroTable.csv') - usgs_recurr_stats_filename = join(tables_dir,f"usgs_interpolated_elevation_stats_{huc}.csv") - nwm_recurr_data_filename = join(tables_dir,f"nwm_recurrence_flow_elevations_{huc}.csv") - rc_comparison_plot_filename = join(plots_dir,f"FIM-USGS_rating_curve_comparison_{huc}.png") - - if isfile(elev_table_filename): - procs_list.append([elev_table_filename, hydrotable_filename, usgs_gages_filename, usgs_recurr_stats_filename, nwm_recurr_data_filename, rc_comparison_plot_filename,nwm_flow_dir, catfim_flows_filename, huc]) - - # Initiate multiprocessing - print(f"Generating rating curve metrics for {len(procs_list)} hucs using {number_of_jobs} jobs") - with Pool(processes=number_of_jobs) as pool: - pool.map(generate_rating_curve_metrics, procs_list) - - print(f"Aggregating rating curve metrics for {len(procs_list)} hucs") - aggregate_metrics(output_dir,procs_list,stat_groups) - - print('Delete intermediate tables') - shutil.rmtree(tables_dir, ignore_errors=True) - - # Close log file - sys.stdout = sys.__stdout__ - log_file.close() From e3be072d8225321c98b917604669a85bf28a3a4a Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Tue, 15 Jun 2021 15:19:59 -0500 Subject: [PATCH 61/66] replacing profile (accidentally removed ) --- src/clip_vectors_to_wbd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clip_vectors_to_wbd.py b/src/clip_vectors_to_wbd.py index 6e7c2fd93..65fd72d20 100755 --- a/src/clip_vectors_to_wbd.py +++ b/src/clip_vectors_to_wbd.py @@ -7,7 +7,7 @@ from shapely.geometry import MultiPolygon,Polygon,Point from utils.shared_functions import getDriver - +@profile def subset_vector_layers(hucCode,nwm_streams_filename,nhd_streams_filename,nwm_lakes_filename,nld_lines_filename,nwm_catchments_filename,nhd_headwaters_filename,landsea_filename,wbd_filename,wbd_buffer_filename,subset_nhd_streams_filename,subset_nld_lines_filename,subset_nwm_lakes_filename,subset_nwm_catchments_filename,subset_nhd_headwaters_filename,subset_nwm_streams_filename,subset_landsea_filename,extent,great_lakes_filename,wbd_buffer_distance,lake_buffer_distance): hucUnitLength = len(str(hucCode)) From 8b0969131fb6daaa28fd8db2e8a7b73c2a3101a1 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 16 Jun 2021 14:55:08 -0500 Subject: [PATCH 62/66] Update CHANGELOG.md --- CHANGELOG.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a12d1911c..f01f3e25a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,25 @@ All notable changes to this project will be documented in this file. 
We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.

+## v3.0.19.1 - 2021-06-17 - [PR #417](https://github.com/NOAA-OWP/cahaba/pull/417) + +Feature to evaluate performance of alternative CatFIM techniques. + +## Additions +- `thalweg_drop_check.py` checks the elevation along the thalweg for each stream path downstream of MS headwaters within a HUC. + +## Removals +- Removing 'dissolveLinks' arg from `clip_vectors_to_wbd.py`. + + +## Changes +- Cleaned up code in `split_flows.py` to make it more readable. +- Refactored `reduce_nhd_stream_density.py` and `adjust_headwater_streams.py` to limit MS headwater points in `agg_nhd_headwaters_adj.gpkg`. +- Fixed a bug in `adjust_thalweg_lateral.py` lateral elevation replacement threshold; changed threshold to 3 meters. +- Updated `aggregate_vector_inputs.py` to log intermediate processes. + +
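For reference, the screening performed by `thalweg_drop_check.py` amounts to differencing consecutive thalweg elevations along each headwater path and flagging changes beyond the threshold. A minimal pandas sketch of that idea (synthetic values; the column names mirror the script, everything else is illustrative):

    import pandas as pd

    # Toy thalweg points for a single headwater path, ordered downstream
    pts = pd.DataFrame({'headwater_path': ['a'] * 4,
                        'source': ['thal_adj_dem'] * 4,
                        'elevation_m': [102.0, 101.5, 97.0, 96.8]})

    # Elevation change between consecutive points along each path/source,
    # equivalent to the x - x.shift() difference used in the script
    pts['elev_change'] = pts.groupby(['headwater_path', 'source'])['elevation_m'].diff()

    # Keep drops of 3 m or more (and any rises) for review
    flagged = pts[(pts.elev_change <= -3.0) | (pts.elev_change > 0.0)]
    print(flagged)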

+ ## v3.0.19.0 - 2021-06-10 - [PR #415](https://github.com/NOAA-OWP/cahaba/pull/415) Feature to evaluate performance of alternative CatFIM techniques. From bdf3b70451abd91e6ec1a90d696440cce0ca66b6 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 16 Jun 2021 14:56:37 -0500 Subject: [PATCH 63/66] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f01f3e25a..2ed8a0e99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ We follow the [Semantic Versioning 2.0.0](http://semver.org/) format. ## v3.0.19.1 - 2021-06-17 - [PR #417](https://github.com/NOAA-OWP/cahaba/pull/417) -Feature to evaluate performance of alternative CatFIM techniques. +Adding a thalweg profile tool to identify significant drops in thalweg elevation. Also setting lateral thalweg adjustment threshold in hydroconditioning. ## Additions - `thalweg_drop_check.py` checks the elevation along the thalweg for each stream path downstream of MS headwaters within a HUC. From 5c5b0501337bf69cc06f4bd395614bfa749081d7 Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Wed, 16 Jun 2021 20:07:24 +0000 Subject: [PATCH 64/66] moving parameters to param file and reverting production whitelist --- config/params_calibrated.env | 1 + config/params_template.env | 1 + src/adjust_thalweg_lateral.py | 5 +++-- src/output_cleanup.py | 7 ------- src/run_by_unit.sh | 2 +- tools/thalweg_drop_check.py | 3 ++- 6 files changed, 8 insertions(+), 11 deletions(-) diff --git a/config/params_calibrated.env b/config/params_calibrated.env index aa7aba1b0..89ebe9556 100644 --- a/config/params_calibrated.env +++ b/config/params_calibrated.env @@ -4,6 +4,7 @@ export negative_burn_value=1000 export agree_DEM_buffer=70 export wbd_buffer=5000 +thalweg_lateral_elev_threshold=3.0 #### geospatial parameters #### export max_split_distance_meters=1500 diff --git a/config/params_template.env b/config/params_template.env index a998dc675..02b50d78b 100644 --- a/config/params_template.env +++ b/config/params_template.env @@ -4,6 +4,7 @@ export negative_burn_value=1000 export agree_DEM_buffer=70 export wbd_buffer=5000 +thalweg_lateral_elev_threshold=3.0 #### geospatial parameters #### export max_split_distance_meters=1500 diff --git a/src/adjust_thalweg_lateral.py b/src/adjust_thalweg_lateral.py index 1a2c88247..601a92dd1 100755 --- a/src/adjust_thalweg_lateral.py +++ b/src/adjust_thalweg_lateral.py @@ -7,7 +7,7 @@ import numpy as np @profile -def adjust_thalweg_laterally(elevation_raster, stream_raster, allocation_raster, cost_distance_raster, cost_distance_tolerance, dem_lateral_thalweg_adj): +def adjust_thalweg_laterally(elevation_raster, stream_raster, allocation_raster, cost_distance_raster, cost_distance_tolerance, dem_lateral_thalweg_adj,lateral_elevation_threshold): # ------------------------------------------- Get catchment_min_dict --------------------------------------------------- # # The following algorithm searches for the zonal minimum elevation in each pixel catchment @@ -79,7 +79,7 @@ def minimize_thalweg_elevation(dem_window, zone_min_dict, zone_window, thalweg_w elevation_difference = dem_thalweg_elevation - zone_min_elevation - if (zone_min_elevation < dem_thalweg_elevation) and (elevation_difference <= 3): + if (zone_min_elevation < dem_thalweg_elevation) and (elevation_difference <= lateral_elevation_threshold): dem_window_to_return[i] = zone_min_elevation return(dem_window_to_return) @@ -122,6 +122,7 @@ def minimize_thalweg_elevation(dem_window, zone_min_dict, 
zone_window, thalweg_w parser.add_argument('-d','--cost_distance_raster',help='Raster of cost distances for the allocation raster.',required=True) parser.add_argument('-t','--cost_distance_tolerance',help='Tolerance in meters to use when searching for zonal minimum.',required=True) parser.add_argument('-o','--dem_lateral_thalweg_adj',help='Output elevation raster with adjusted thalweg.',required=True) + parser.add_argument('-th','--lateral_elevation_threshold',help='Maximum difference between current thalweg elevation and lowest lateral elevation in meters.',required=True) # Extract to dictionary and assign to variables. args = vars(parser.parse_args()) diff --git a/src/output_cleanup.py b/src/output_cleanup.py index a5ca316c3..d73a2deee 100755 --- a/src/output_cleanup.py +++ b/src/output_cleanup.py @@ -40,13 +40,6 @@ def output_cleanup(huc_number, output_folder_path, additional_whitelist, is_prod 'src_full_crosswalked.csv', 'usgs_elev_table.csv', 'hand_ref_elev_table.csv', - 'dem_lateral_thalweg_adj.tif', - 'dem_thalwegCond.tif', - 'dem_meters.tif', - 'demDerived_reaches_split.gpkg', - 'nhd_headwater_points_subset.gpkg', - 'wbd.gpkg', - 'NHDPlusBurnLineEvent_subset.gpkg' ] # List of files that will be saved during a viz run diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index 34be2fc2d..e1e3fd87e 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -190,7 +190,7 @@ Tcount echo -e $startDiv"Performing lateral thalweg adjustment $hucNumber"$stopDiv date -u Tstart -python3 -m memory_profiler $srcDir/adjust_thalweg_lateral.py -e $outputHucDataDir/dem_meters.tif -s $outputHucDataDir/demDerived_streamPixels.tif -a $outputHucDataDir/demDerived_streamPixels_ids_allo.tif -d $outputHucDataDir/demDerived_streamPixels_ids_dist.tif -t 50 -o $outputHucDataDir/dem_lateral_thalweg_adj.tif +python3 -m memory_profiler $srcDir/adjust_thalweg_lateral.py -e $outputHucDataDir/dem_meters.tif -s $outputHucDataDir/demDerived_streamPixels.tif -a $outputHucDataDir/demDerived_streamPixels_ids_allo.tif -d $outputHucDataDir/demDerived_streamPixels_ids_dist.tif -t 50 -o $outputHucDataDir/dem_lateral_thalweg_adj.tif -th $thalweg_lateral_elev_threshold Tcount ## MASK BURNED DEM FOR STREAMS ONLY ### diff --git a/tools/thalweg_drop_check.py b/tools/thalweg_drop_check.py index a864dc9c0..c56d743e5 100644 --- a/tools/thalweg_drop_check.py +++ b/tools/thalweg_drop_check.py @@ -196,7 +196,7 @@ def compare_thalweg(args): # Identify significant rises/drops in elevation thal_adj_points['elev_change'] = thal_adj_points.groupby(['headwater_path', 'source'])['elevation_m'].apply(lambda x: x - x.shift()) - elev_changes = thal_adj_points.loc[(thal_adj_points.elev_change<=-3.0) | (thal_adj_points.elev_change>0.0)] + elev_changes = thal_adj_points.loc[(thal_adj_points.elev_change<=-lateral_elevation_threshold) | (thal_adj_points.elev_change>0.0)] if not elev_changes.empty: # elev_changes.to_csv(profile_table_filename,index=False) @@ -314,6 +314,7 @@ def plot_profile(elevation_table,profile_plots_filename): # parser.add_argument('-rasters','--raster-list',help='list of rasters to be evaluated',required=True,type=str) parser.add_argument('-stream_type','--stream-type',help='stream layer to be evaluated',required=True,type=str,choices=['derived','burnline']) parser.add_argument('-point_density','--point-density',help='elevation sampling density',required=True,type=str,choices=['midpoints','all_points']) + parser.add_argument('-th','--elevation_threshold',help='significant elevation drop threshold in meters.',required=True) 
parser.add_argument('-j','--number-of-jobs',help='number of workers',required=False,default=1,type=int) args = vars(parser.parse_args()) From fae1fcf5baa8a83ab9039ee2931baefa1d2145fa Mon Sep 17 00:00:00 2001 From: Brian Avant Date: Thu, 17 Jun 2021 19:53:11 +0000 Subject: [PATCH 65/66] fixed param export --- config/params_calibrated.env | 2 +- config/params_template.env | 2 +- src/adjust_thalweg_lateral.py | 2 +- src/run_by_unit.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/params_calibrated.env b/config/params_calibrated.env index 89ebe9556..3ca30650e 100644 --- a/config/params_calibrated.env +++ b/config/params_calibrated.env @@ -4,7 +4,7 @@ export negative_burn_value=1000 export agree_DEM_buffer=70 export wbd_buffer=5000 -thalweg_lateral_elev_threshold=3.0 +export thalweg_lateral_elev_threshold=3 #### geospatial parameters #### export max_split_distance_meters=1500 diff --git a/config/params_template.env b/config/params_template.env index 02b50d78b..87270a669 100644 --- a/config/params_template.env +++ b/config/params_template.env @@ -4,7 +4,7 @@ export negative_burn_value=1000 export agree_DEM_buffer=70 export wbd_buffer=5000 -thalweg_lateral_elev_threshold=3.0 +export thalweg_lateral_elev_threshold=3 #### geospatial parameters #### export max_split_distance_meters=1500 diff --git a/src/adjust_thalweg_lateral.py b/src/adjust_thalweg_lateral.py index 601a92dd1..47cd2e209 100755 --- a/src/adjust_thalweg_lateral.py +++ b/src/adjust_thalweg_lateral.py @@ -122,7 +122,7 @@ def minimize_thalweg_elevation(dem_window, zone_min_dict, zone_window, thalweg_w parser.add_argument('-d','--cost_distance_raster',help='Raster of cost distances for the allocation raster.',required=True) parser.add_argument('-t','--cost_distance_tolerance',help='Tolerance in meters to use when searching for zonal minimum.',required=True) parser.add_argument('-o','--dem_lateral_thalweg_adj',help='Output elevation raster with adjusted thalweg.',required=True) - parser.add_argument('-th','--lateral_elevation_threshold',help='Maximum difference between current thalweg elevation and lowest lateral elevation in meters.',required=True) + parser.add_argument('-th','--lateral_elevation_threshold',help='Maximum difference between current thalweg elevation and lowest lateral elevation in meters.',required=True,type=int) # Extract to dictionary and assign to variables. 
args = vars(parser.parse_args()) diff --git a/src/run_by_unit.sh b/src/run_by_unit.sh index e1e3fd87e..cab360898 100755 --- a/src/run_by_unit.sh +++ b/src/run_by_unit.sh @@ -190,7 +190,7 @@ Tcount echo -e $startDiv"Performing lateral thalweg adjustment $hucNumber"$stopDiv date -u Tstart -python3 -m memory_profiler $srcDir/adjust_thalweg_lateral.py -e $outputHucDataDir/dem_meters.tif -s $outputHucDataDir/demDerived_streamPixels.tif -a $outputHucDataDir/demDerived_streamPixels_ids_allo.tif -d $outputHucDataDir/demDerived_streamPixels_ids_dist.tif -t 50 -o $outputHucDataDir/dem_lateral_thalweg_adj.tif -th $thalweg_lateral_elev_threshold +python3 -m memory_profiler $srcDir/adjust_thalweg_lateral.py -e $outputHucDataDir/dem_meters.tif -s $outputHucDataDir/demDerived_streamPixels.tif -a $outputHucDataDir/demDerived_streamPixels_ids_allo.tif -d $outputHucDataDir/demDerived_streamPixels_ids_dist.tif -t 50 -o $outputHucDataDir/dem_lateral_thalweg_adj.tif -th $thalweg_lateral_elev_threshold Tcount ## MASK BURNED DEM FOR STREAMS ONLY ### From 9ac4737a4b852e712412dc4e30847d0a00047781 Mon Sep 17 00:00:00 2001 From: Brad Date: Mon, 21 Jun 2021 07:44:39 -0500 Subject: [PATCH 66/66] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ed8a0e99..808effc01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ Adding a thalweg profile tool to identify significant drops in thalweg elevation - `thalweg_drop_check.py` checks the elevation along the thalweg for each stream path downstream of MS headwaters within a HUC. ## Removals -- Removing 'dissolveLinks' arg from `clip_vectors_to_wbd.py`. +- Removing `dissolveLinks` arg from `clip_vectors_to_wbd.py`. ## Changes
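For context, the lateral thalweg adjustment parameterized in these patches lowers a thalweg cell to the zonal minimum of its pixel catchment only when that minimum is lower and the drop stays within `thalweg_lateral_elev_threshold`, which is now defined (and exported) in `params_calibrated.env`/`params_template.env`, passed by `run_by_unit.sh` via `-th`, and parsed as an integer. The sketch below is an illustrative, vectorized restatement of that replacement rule under those assumptions, not the tool's window-based implementation; the function and argument names are hypothetical.

```python
# Illustrative restatement of the lateral replacement rule from
# adjust_thalweg_lateral.py: replace a thalweg cell with the zonal minimum of
# its pixel catchment only when the minimum is lower AND the drop is within
# the configured threshold (default 3 m in the params files). Names here are
# hypothetical; arrays are assumed to be aligned float elevations in meters.
import numpy as np

def adjust_thalweg_cells(dem_thalweg_elev: np.ndarray,
                         zone_min_elev: np.ndarray,
                         lateral_elevation_threshold: int = 3) -> np.ndarray:
    adjusted = dem_thalweg_elev.copy()
    elevation_difference = dem_thalweg_elev - zone_min_elev
    # Replace only where the zonal minimum is lower and the drop is acceptable.
    replace = (zone_min_elev < dem_thalweg_elev) & (elevation_difference <= lateral_elevation_threshold)
    adjusted[replace] = zone_min_elev[replace]
    return adjusted
```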