Merge pull request dailyerosion#114 from akrherz/dev220126

Omnibus

akrherz authored Feb 1, 2022
2 parents 93399f3 + 3531c90 commit 52e72a5

Showing 6 changed files with 59 additions and 31 deletions.
5 changes: 2 additions & 3 deletions scripts/RT/proctor_sweep.py
@@ -24,7 +24,7 @@
import subprocess
from multiprocessing import Pool

-from pyiem.util import get_dbconn, logger
+from pyiem.util import get_dbconnstr, logger
import pandas as pd
import requests
from tqdm import tqdm
@@ -135,7 +135,6 @@ def main(argv):
"""Go Main Go."""
parser = usage()
args = parser.parse_args(argv[1:])
-    pgconn = get_dbconn("idep")
df = read_sql(
"""
SELECT huc_12, fpath, scenario,
@@ -144,7 +143,7 @@
from flowpaths where scenario = %s
and huc_12 in %s
""",
-        pgconn,
+        get_dbconnstr("idep"),
params=(args.scenario, tuple(HUC12S)),
index_col=None,
)
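
Note: the recurring change in this commit swaps the live psycopg2 connection
from get_dbconn for the connection string returned by get_dbconnstr, so that
pandas opens and closes its own connection when reading. A minimal sketch of
the pattern, assuming get_dbconnstr("idep") returns a string that
pandas.read_sql accepts as its con argument; the fetch_flowpaths helper is
illustrative, not repository code:

    from pandas import read_sql
    from pyiem.util import get_dbconnstr


    def fetch_flowpaths(scenario, huc12s):
        """Read flowpath rows; pandas manages the connection lifecycle."""
        return read_sql(
            "SELECT huc_12, fpath, scenario from flowpaths "
            "WHERE scenario = %s and huc_12 in %s",
            get_dbconnstr("idep"),  # connection string, not a connection
            params=(scenario, tuple(huc12s)),
            index_col=None,
        )
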
16 changes: 6 additions & 10 deletions scripts/cligen/r_factor.py
@@ -1,7 +1,7 @@
"""R factor work."""

from pyiem.dep import read_cli
-from pyiem.util import get_dbconn
+from pyiem.util import get_dbconnstr
from pyiem.plot.use_agg import plt
from pyiem.plot import MapPlot
import cartopy.crs as ccrs
@@ -11,19 +11,16 @@
from tqdm import tqdm
from geopandas import read_postgis
import pandas as pd
-from pandas.io.sql import read_sql
+from pandas import read_sql


def plot():
"""Plot."""
df2 = pd.read_csv("/tmp/data.csv", dtype={"huc12": str}).set_index("huc12")
-    pgconn = get_dbconn("idep")
    df = read_postgis(
-        """
-        SELECT huc_12, ST_Transform(simple_geom, 4326) as geom
-        from huc12 WHERE scenario = 0
-        """,
-        pgconn,
+        "SELECT huc_12, ST_Transform(simple_geom, 4326) as geom from huc12 "
+        "WHERE scenario = 0",
+        get_dbconnstr("idep"),
geom_col="geom",
index_col="huc_12",
)
@@ -71,13 +68,12 @@ def plot():

def dump_data():
"""Go main Go."""
-    pgconn = get_dbconn("idep")
df = read_sql(
"""
SELECT huc_12, max(climate_file) as cli from flowpaths where
scenario = 0 GROUP by huc_12
""",
-        pgconn,
+        get_dbconnstr("idep"),
index_col="huc_12",
)
data = {
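
Note: the same swap works for geopandas, as the plot() change above relies on
read_postgis forwarding its con argument to pandas. A sketch of how the two
halves of r_factor.py fit together after this change; the join step is an
assumption, since the diff only shows the reads:

    import pandas as pd
    from geopandas import read_postgis
    from pyiem.util import get_dbconnstr

    huc12s = read_postgis(
        "SELECT huc_12, ST_Transform(simple_geom, 4326) as geom from huc12 "
        "WHERE scenario = 0",
        get_dbconnstr("idep"),  # string in place of get_dbconn("idep")
        geom_col="geom",
        index_col="huc_12",
    )
    # dump_data() writes /tmp/data.csv; join its metric onto the geometries
    metrics = pd.read_csv("/tmp/data.csv", dtype={"huc12": str}).set_index("huc12")
    joined = huc12s.join(metrics)
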
7 changes: 3 additions & 4 deletions scripts/import/check_huc12_zero_flowpaths.py
@@ -1,19 +1,18 @@
"""Report which HUC12s have 0 flowpaths."""
import sys

-from pyiem.util import get_dbconn
-from pandas.io.sql import read_sql
+from pyiem.util import get_dbconnstr
+from pandas import read_sql


def main(argv):
"""Go Main Go."""
scenario = int(argv[1])
huc12s = [s.strip() for s in open("myhucs.txt", encoding="utf8")]
-    pgconn = get_dbconn("idep")
df = read_sql(
"SELECT huc_12, count(*) from flowpaths where scenario = %s "
"GROUP by huc_12",
-        pgconn,
+        get_dbconnstr("idep"),
params=(scenario,),
index_col="huc_12",
)
12 changes: 6 additions & 6 deletions scripts/import/flowpath2prj.py
@@ -38,8 +38,8 @@
from math import atan2, degrees, pi

from tqdm import tqdm
-from pandas.io.sql import read_sql
-from pyiem.util import get_dbconn, logger
+from pandas import read_sql
+from pyiem.util import get_dbconn, get_dbconnstr, logger
from pyiem.dep import load_scenarios

LOG = logger()
@@ -356,7 +356,7 @@ def rewrite_flowpath(cursor, scenario, flowpath_id, df):
)


-def do_flowpath(pgconn, cursor, scenario, zone, metadata):
+def do_flowpath(cursor, scenario, zone, metadata):
"""Process a given flowpathid"""
# slope = compute_slope(fid)
# I need bad soilfiles so that the length can be computed
@@ -371,7 +371,7 @@ def do_flowpath(pgconn, cursor, scenario, zone, metadata):
WHERE flowpath = %s and length < 9999
ORDER by segid ASC
""",
-        pgconn,
+        get_dbconnstr("idep"),
params=(metadata["fid"],),
)
origsize = len(df.index)
@@ -577,7 +577,7 @@ def main(argv):
"SELECT ST_ymax(ST_Transform(geom, 4326)) as lat, fpath, fid, huc_12, "
"climate_file from flowpaths WHERE scenario = %s and fpath != 0 "
"ORDER by huc_12 ASC",
-        pgconn,
+        get_dbconnstr("idep"),
params=(get_flowpath_scenario(scenario),),
)
if os.path.isfile("myhucs.txt"):
@@ -596,7 +596,7 @@
zone = "IA_CENTRAL"
elif row["lat"] >= 40.5:
zone = "IA_SOUTH"
-        data = do_flowpath(pgconn, cursor, scenario, zone, row)
+        data = do_flowpath(cursor, scenario, zone, row)
if data is not None:
write_prj(data)
cursor.close()
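
Note: this file keeps get_dbconn for the writing cursor while reads move to
get_dbconnstr, which is why do_flowpath no longer threads pgconn through. A
sketch of the resulting split, assuming writes stay on one explicit
transaction; everything except the query is illustrative:

    from pandas import read_sql
    from pyiem.util import get_dbconn, get_dbconnstr

    pgconn = get_dbconn("idep")  # writes: explicit commit semantics
    cursor = pgconn.cursor()
    df = read_sql(  # reads: pandas manages its own connection
        "SELECT segid, length from flowpath_points "
        "WHERE flowpath = %s and length < 9999 ORDER by segid ASC",
        get_dbconnstr("idep"),
        params=(42,),  # illustrative flowpath id
    )
    # ... rewrite the flowpath through cursor, then ...
    cursor.close()
    pgconn.commit()
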
43 changes: 39 additions & 4 deletions scripts/import/flowpath_importer.py
@@ -26,6 +26,9 @@
PREFIX = "fp"
TRUNC_GRIDORDER_AT = 4
GENLU_CODES = {}
+PROCESSING_COUNTS = {
+    "flowpaths_deduped": 0,
+}


def get_flowpath(cursor, scenario, huc12, fpath):
@@ -111,6 +114,24 @@ def get_genlu_code(cursor, label):
return GENLU_CODES[label]


+def dedupe(df, lencolname):
+    """Deduplicate by checking the FBndID."""
+    # Optimization: a field with a single point is likely the dup to drop
+    fields = df["FBndID"].value_counts().sort_values(ascending=False)
+    # Find any fields with a count of 1
+    fields2 = fields[fields == 1]
+    if not fields2.empty and len(fields.index) == 2:
+        PROCESSING_COUNTS["flowpaths_deduped"] += 1
+        return df[df["FBndID"] != fields2.index[0]]
+    # Could have a perfect duplicate?
+    if fields.min() == fields.max():
+        PROCESSING_COUNTS["flowpaths_deduped"] += 1
+        return df[df["FBndID"] != fields.index[0]]
+    # high field wins
+    PROCESSING_COUNTS["flowpaths_deduped"] += 1
+    return df[df["FBndID"] == fields.index[0]]


def process_flowpath(cursor, scenario, huc12, db_fid, df):
"""Do one flowpath please."""
lencolname = f"{PREFIX}Len{huc12}"
@@ -119,6 +140,11 @@ def process_flowpath(cursor, scenario, huc12, db_fid, df):
# Sort along the length column, which orders the points from top
# to bottom
df = df.sort_values(lencolname, ascending=True)
+    # remove duplicate points due to a bkgelder sampling issue whereby some
+    # points exist in two fields
+    if df[lencolname].duplicated().any():
+        df = dedupe(df, lencolname)

# Remove any previous data for this flowpath
cursor.execute(
"DELETE from flowpath_points WHERE flowpath = %s", (db_fid,)
@@ -142,7 +168,10 @@ def process_flowpath(cursor, scenario, huc12, db_fid, df):
elev_change += dy
dx = abs(row2[lencolname] - row[lencolname])
if dx == 0:
-            raise Exception(f"dx is zero at segid: {segid} {row} {row2}")
+            # We have a duplicate point; abort, as this should not be possible
+            print(f"ABORT duplicate point {segid} {row} {row2}")
+            print(df[["OBJECTID", "FBndID", lencolname]])
+            sys.exit()
x_change += dx
gridorder = row[gordcolname]
if gridorder > TRUNC_GRIDORDER_AT or pd.isnull(gridorder):
@@ -215,17 +244,19 @@ def process(cursor, scenario, huc12df):
"""
# Hack compute the huc12 by finding the fp field name
huc12 = None
+    fpcol = None
for col in huc12df.columns:
if col.startswith(PREFIX):
-            huc12 = col[len(PREFIX) :]
+            fpcol = col
+            huc12 = col[len(PREFIX) :].replace("_tif", "")
break
-    if huc12 is None:
+    if huc12 is None or len(huc12) != 12:
raise Exception(f"Could not find huc12 from {huc12df.columns}")

delete_previous(cursor, scenario, huc12)
# the inbound dataframe has lots of data, one row per flowpath point
# We group the dataframe by the column which uses a PREFIX and the huc8
-    for flowpath_num, df in huc12df.groupby(f"{PREFIX}{huc12}"):
+    for flowpath_num, df in huc12df.groupby(fpcol):
# These are upstream errors I should ignore
if flowpath_num == 0 or len(df.index) < 2:
continue
@@ -276,6 +307,10 @@ def main(argv):
cursor.close()
pgconn.commit()

+    print("Processing accounting:")
+    for key, val in PROCESSING_COUNTS.items():
+        print(f" {key}: {val}")


if __name__ == "__main__":
main(sys.argv)
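
Note: dedupe() is the subtle addition here; process() also now derives the
groupby column (fpcol) directly and strips a "_tif" suffix before validating
that the parsed HUC12 is 12 characters long. A worked sketch of dedupe()'s
first branch on toy data; the field IDs and HUC12 column name are
illustrative:

    import pandas as pd

    # Two fields claim points on one flowpath; F2 contributes a single
    # stray point whose length value duplicates one of F1's.
    df = pd.DataFrame(
        {
            "FBndID": ["F1", "F1", "F1", "F2"],
            "fpLen070801050901": [0.0, 10.0, 20.0, 20.0],
        }
    )
    assert df["fpLen070801050901"].duplicated().any()  # triggers dedupe()
    fields = df["FBndID"].value_counts().sort_values(ascending=False)
    fields2 = fields[fields == 1]  # F2 -> 1, the likely dup to drop
    # First branch applies (two fields, one with a single point): drop the
    # stray F2 row and keep all of F1's points intact.
    assert not fields2.empty and len(fields.index) == 2
    deduped = df[df["FBndID"] != fields2.index[0]]
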
7 changes: 3 additions & 4 deletions scripts/util/yearly_report.py
@@ -3,8 +3,8 @@
import datetime

import matplotlib.pyplot as plt
-from pandas.io.sql import read_sql
-from pyiem.util import get_dbconn
+from pandas import read_sql
+from pyiem.util import get_dbconnstr
from pyiem.reference import state_names


@@ -13,7 +13,6 @@ def main(argv):
scenario = int(argv[1])
state = argv[2]
print(f"This report covers the inclusive years 2008-2021 for {state}")
-    pgconn = get_dbconn("idep")

df = read_sql(
"""
@@ -35,7 +34,7 @@
round((avg(detachment) * 4.463)::numeric, 2) as detachment_ta
from agg GROUP by yr ORDER by yr
""",
-        pgconn,
+        get_dbconnstr("idep"),
params=(state, scenario),
index_col="yr",
)
