"""Compare locally staged StreamCat zips with what is published on the EPA FTP.

Reconstructed from the post-image of commit 5685be1 ("add print stmnt for
LakeCat published comparison (#92)", Rick Debbout, 30 Aug 2021) of
``ListPublishedUnpublishedFiles.py``.

Running the script:

1. lists the zips in the StreamCat ``HydroRegions`` FTP directory and in the
   local staging directory,
2. recomputes the ``Published`` column of ``ControlTable_StreamCat.csv``,
   prints the metrics that became published and saves the table if changed,
3. prints the metrics published in StreamCat that have no zip in the LakeCat
   ``FinalTables`` FTP directory (mask metrics excluded).
"""
import os
from ftplib import FTP

import pandas as pd

FTP_HOST = "newftp.epa.gov"
STREAMCAT_FTP_DIR = (
    "/EPADataCommons/ORD/NHDPlusLandscapeAttributes/StreamCat/HydroRegions/"
)
LAKECAT_FTP_DIR = (
    "/EPADataCommons/ORD/NHDPlusLandscapeAttributes/LakeCat/FinalTables/"
)
LOCAL_ZIP_DIR = (
    "O:/PRIV/CPHEA/PESD/COR/CORFiles/"
    "Geospatial_Library_Projects/StreamCat/FTP_Staging/"
    "HydroRegions/zips"
)
CONTROL_FILE = "ControlTable_StreamCat.csv"
# assuming that LakeCat repo is in same parent folder as StreamCat
LAKECAT_CONTROL_FILE = "../LakeCat/ControlTable_LakeCat.csv"
# mask metrics are excluded from the LakeCat comparison -- NO MASKS IN LKCAT
MASK_PATTERN = r"(?:RipBuf100|MidSlp|HiSlp)"
ZIP_SUFFIX = ".zip"


def published_table_names(local_files, ftp_files):
    """Return the sorted table names of local zips that are also on the FTP.

    The table name is the part of the file name before ``_Region``; the
    per-region zips of one table collapse to a single entry.
    """
    on_ftp = set(ftp_files)  # build once: O(1) membership per local file
    return sorted(
        {name.split("_Region")[0] for name in local_files if name in on_ftp}
    )


def flag_published(control, table_names):
    """Return a copy of *control* with ``Published`` set to "Yes"/"No".

    A row is "Yes" when its ``Final_Table_Name`` is in *table_names*.
    The input frame is left untouched.
    """
    updated = control.copy()
    updated["Published"] = updated.Final_Table_Name.isin(table_names).map(
        {True: "Yes", False: "No"}
    )
    return updated


def newly_published_metrics(before, after):
    """Return ``Final_Table_Name`` of rows that turned "Yes" in *after*.

    Only rows that are "Yes" now and were not "Yes" in *before* are
    returned, so a metric that flipped from "Yes" to "No" is not reported
    as published.  A *before* table without a ``Published`` column is
    treated as entirely unpublished.
    """
    now_yes = after.Published.eq("Yes")
    if "Published" in before.columns:
        was_yes = before.Published.eq("Yes")
    else:
        was_yes = pd.Series(False, index=before.index)
    return after.loc[now_yes & ~was_yes, "Final_Table_Name"]


def zip_stems(file_names):
    """Return the names ending in ``.zip`` with that suffix removed."""
    return [
        name[: -len(ZIP_SUFFIX)]
        for name in file_names
        if name.endswith(ZIP_SUFFIX)
    ]


def lakecat_unpublished(control, lake_cat_tables):
    """Return ``FullTableName`` of metrics in StreamCat but not in LakeCat.

    Keeps rows that are published in *control*, whose ``Final_Table_Name``
    is not in *lake_cat_tables*, and whose ``FullTableName`` does not match
    the mask pattern.  Rows without a ``FullTableName`` are dropped so the
    result can always be joined into a message.
    """
    is_published = control.Published.eq("Yes")
    in_lakecat = control.Final_Table_Name.isin(lake_cat_tables)
    # na=False keeps the mask boolean when a name is missing
    is_mask = control.FullTableName.str.contains(MASK_PATTERN, na=False)
    keep = is_published & ~in_lakecat & ~is_mask
    return control.loc[keep, "FullTableName"].dropna()


def main():
    """Run the comparison; return the StreamCat and LakeCat control tables."""
    # the context manager closes the connection even if a step fails
    with FTP(FTP_HOST) as ftp:
        ftp.login()
        ftp.cwd(STREAMCAT_FTP_DIR)
        published = ftp.nlst()

        local_list = os.listdir(LOCAL_ZIP_DIR)
        local_published = published_table_names(local_list, published)

        orig = pd.read_csv(CONTROL_FILE)
        control = flag_published(orig, local_published)

        # TODO: not sure of the best way to deal with metrics that should
        # never be published -- we could add a list of them to this script
        # and check against that?
        newly_published = newly_published_metrics(orig, control)
        if not newly_published.empty:
            print(
                "The following metrics have been recently published in StreamCat:\n\t->",
                ", ".join(newly_published.tolist()),
            )

        # save any change to the table, including a metric going back to "No"
        if not control.equals(orig):
            try:
                control.to_csv(CONTROL_FILE, index=False)
            except PermissionError as e:
                print(f"You may have {CONTROL_FILE} open in Excel?\n", e)

        # move over to LakeCat directory for published zips
        ftp.cwd(LAKECAT_FTP_DIR)
        lk_control = pd.read_csv(LAKECAT_CONTROL_FILE)
        lake_cat_ftp = zip_stems(ftp.nlst())

    # find zips that are published in StreamCat but not in LakeCat
    lkcat_unpublished = lakecat_unpublished(control, lake_cat_ftp)
    print(
        "The following metrics have been run in StreamCat but not in LakeCat:\n\t->",
        ", ".join(lkcat_unpublished.tolist()),
    )
    return control, lk_control


if __name__ == "__main__":
    # bind the tables at module level so the exploratory snippets below can
    # be run interactively after the script finishes
    control, lk_control = main()


# LOOKING AT DIFFERENCES BETWEEN LKCAT AND STRMCAT CONTROL TABLES
#
# lk_control.columns
# control.columns
# lk_control.FullTableName.isin(control.FullTableName)
# check = lk_control.loc[
#     ~lk_control.FullTableName.isin(control.FullTableName),
#     ["FullTableName", "LandscapeLayer", "MetricName"],
# ]
# for row in check.itertuples():
#     if not control.loc[control.LandscapeLayer == row.LandscapeLayer].empty:
#         print(row)
#
# check2 = lk_control.loc[
#     ~lk_control.LandscapeLayer.isin(control.LandscapeLayer),
#     ["FullTableName", "LandscapeLayer", "MetricName"],
# ]
#
# check3 = lk_control.loc[
#     ~lk_control.MetricName.isin(control.MetricName),
#     ["FullTableName", "LandscapeLayer", "MetricName"],
# ]