Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LakeCat published comparison #92

Merged
merged 2 commits into from
Aug 30, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 65 additions & 12 deletions ListPublishedUnpublishedFiles.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,78 @@
import pandas as pd
import os
from ftplib import FTP

import pandas as pd

# Connect to the EPA FTP host. The constructor call appears twice, differing
# only in quote style; the second call rebinds `ftp` to a new FTP object.
# NOTE(review): these look like the before/after sides of a quoting change --
# confirm that only one of them is meant to run.
ftp = FTP('newftp.epa.gov')
ftp = FTP("newftp.epa.gov")
# No credentials are passed, so ftplib's login defaults apply.
ftp.login()
# Work inside the StreamCat HydroRegions directory on the server.
ftp.cwd("/EPADataCommons/ORD/NHDPlusLandscapeAttributes/StreamCat/HydroRegions/")


# Names the server lists for that directory.
published = ftp.nlst()

# Names found in the local FTP staging folder. The directory is listed twice:
# once via a raw backslash path and once via the same path written with
# forward slashes and split across adjacent string literals; the second
# result replaces the first.
# NOTE(review): as with `ftp` above, this looks like both sides of a
# formatting change -- confirm.
local_list = os.listdir(r'O:\PRIV\CPHEA\PESD\COR\CORFiles\Geospatial_Library_Projects\StreamCat\FTP_Staging\HydroRegions\zips')
local_list = os.listdir(
"O:/PRIV/CPHEA/PESD/COR/CORFiles/"
"Geospatial_Library_Projects/StreamCat/FTP_Staging/"
"HydroRegions/zips"
)

# Distinct metric names (the text before "_Region") of the staged files that
# also appear in the FTP listing.
local_published = list(
    {name.split("_Region")[0] for name in local_list if name in published}
)

# Load the control table and keep an unmodified copy so the refreshed flags
# can be compared against what was on disk.
control_file = "ControlTable_StreamCat.csv"
control = pd.read_csv(control_file)
orig = control.copy()

# Mark each row "Yes" when its Final_Table_Name is one of the published
# metric names, otherwise "No".
control["Published"] = (
    control["Final_Table_Name"]
    .isin(local_published)
    .map({True: "Yes", False: "No"})
)

# Open question from the author: what is the best way to handle metrics that
# should stay unpublished? One option is to keep a list of such metrics in
# this script and check against it.
#
# `orig` is the table as read from disk and `control` carries the refreshed
# flag, so `compare` reports the rows whose values changed. Only rows that are
# now marked "Yes" are announced; a row that flipped to "No" has changed but
# has not been published.
changed_rows = control.loc[orig.compare(control).index]
newly_published = changed_rows.loc[
    changed_rows["Published"] == "Yes", "Final_Table_Name"
]
if not newly_published.empty:
    print(
        "The following metrics have been recently published in StreamCat:\n\t->",
        ", ".join(newly_published.tolist()),
    )

# Write the updated table back to the file it was read from. A
# PermissionError is reported instead of raised so the rest of the script
# still runs; the message suggests the file may be open in Excel.
try:
    control.to_csv(control_file, index=False)
except PermissionError as e:
    print(f"You may have {control_file} open in Excel?\n", e)

# Switch the FTP session to the LakeCat FinalTables directory.
ftp.cwd("/EPADataCommons/ORD/NHDPlusLandscapeAttributes/LakeCat/FinalTables/")

# The relative path assumes the LakeCat repo sits next to the StreamCat repo.
lk_control = pd.read_csv("../LakeCat/ControlTable_LakeCat.csv")

# Names of the zips listed in the LakeCat directory, cut at ".zip".
lake_cat_ftp = [
    fname.split(".zip")[0] for fname in ftp.nlst() if fname.endswith(".zip")
]

# Tables published in StreamCat that have no zip in LakeCat. Tables whose
# FullTableName contains RipBuf100, MidSlp or HiSlp are left out: those are
# mask metrics, and there are no masks in LakeCat.
lkcat_unpublished = control.loc[
    (control["Published"] == "Yes")
    & ~control["Final_Table_Name"].isin(lake_cat_ftp)
    & ~control["FullTableName"].str.contains(r"(?:RipBuf100|MidSlp|HiSlp)"),
    "FullTableName",
]
print(
    "The following metrics have been run in StreamCat but not in LakeCat:\n\t->",
    ", ".join(lkcat_unpublished),
)


# NOTE(review): the live statements in this span repeat the control-table
# update performed earlier in this listing, but with absolute E:\ paths and
# single-quoted strings; they look like the pre-change side of the diff,
# interleaved with newly added commented-out exploration code -- confirm
# before running them together.

# Split the local staging listing by whether each name is also in the FTP
# listing.
local_published = [x for x in local_list if x in published]
local_unpublished = [x for x in local_list if x not in published]
# LOOKING AT DIFFERENCES BETWEEN LKCAT AND STRMCAT CONTROL TABLES
#
# lk_control.columns
# control.columns
# lk_control.FullTableName.isin(control.FullTableName)
# check = lk_control.loc[~lk_control.FullTableName.isin(control.FullTableName), ["FullTableName", "LandscapeLayer","MetricName"]]
# for row in check.itertuples():
# if not control.loc[control.LandscapeLayer == row.LandscapeLayer].empty:
# print(row)

# Re-read the control table from an absolute path, replacing `control`.
control = pd.read_csv(r'E:\GitProjects\StreamCat\ControlTable_StreamCat.csv')
# Reduce each published file name to the text before '_Region', then drop
# duplicates.
local_published_metrics = [elem.split('_Region')[0] for elem in local_published]
local_published_metrics = list(set(local_published_metrics))
# check2 = lk_control.loc[~lk_control.LandscapeLayer.isin(control.LandscapeLayer), ["FullTableName", "LandscapeLayer","MetricName"]]

# Bare expression: builds the list of column labels and discards it, so it
# has no effect when run as a script (presumably left from interactive use).
list(control)
# Flag rows whose Final_Table_Name is among the published metric names, then
# turn the booleans into 'Yes' / 'No'.
control['Published'] = control['Final_Table_Name'].isin(local_published_metrics)
control['Published'] = control['Published'].map({True: 'Yes', False: 'No'})
# Overwrite the same CSV, leaving out the index column.
control.to_csv(r'E:\GitProjects\StreamCat\ControlTable_StreamCat.csv', index=False)
# check3 = lk_control.loc[~lk_control.MetricName.isin(control.MetricName), ["FullTableName", "LandscapeLayer","MetricName"]]