Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle None values in vrpt data #83

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions HISTORY.txt
Original file line number Diff line number Diff line change
Expand Up @@ -363,3 +363,5 @@
22-Oct-2024 V1.725 Remove dependency on edmaps holdings file (no longer generating Map Coefficient MTZ files);
Add CLI support for performing final sanity check for ExDB loading and holdings in etl.load_ex.DbLoadingWorkflow task;
Update CI/CD testing to use python 3.10
23-Dec-2024 V1.726 Skip integers that exceed max int32 in DataTransformFactory
23-Dec-2024 V1.727 Handle "None" values in vrpt data
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update the date to 2025

2 changes: 1 addition & 1 deletion rcsb/db/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
__author__ = "John Westbrook"
__email__ = "[email protected]"
__license__ = "Apache 2.0"
__version__ = "1.725"
__version__ = "1.727"
23 changes: 17 additions & 6 deletions rcsb/db/processors/DataTransformFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# to minimize costly function calls for simple casts.
# 24-Mar-2019 jdw adjust null value filtering
# 4-Apr-2022 bv handle embedded iterable float values in 'castIterableFloat' method
# 21-Dec-2024 bv Skip integers that exceed max int32 (2147483647)
# 23-Dec-2024 bv Handle "None" values in vrpt data
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix/update the date to 2025

##
"""
Factory for functional elements of the transformations between input data and
Expand Down Expand Up @@ -89,6 +91,8 @@ def __init__(self, schemaDefAccessObj, filterType):
self.__transFlags["normalizeEnums"] = "normalize-enums" in filterType
self.__transFlags["translateXMLCharRefs"] = "translateXMLCharRefs" in filterType
self.__transFlags["normalizeDates"] = True
# Can be added to filterType later if needed
self.__transFlags["dropLargeIntegers"] = True
logger.debug("FLAGS settings are %r", self.__transFlags)
#
self.__wsPattern = re.compile(r"\s+", flags=re.UNICODE | re.MULTILINE)
Expand Down Expand Up @@ -217,7 +221,7 @@ def processRecord(self, tableId, row, attributeNameList, containerName=None):
if atName in dT["pureCast"]:
if nullFlag and self.__transFlags["dropEmpty"]:
continue
if (row[ii] == "?") or (row[ii] == ".") or (row[ii]) == "":
if (row[ii] == "?") or (row[ii] == ".") or (row[ii]) == "" or (row[ii]) == "None":
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about simply:

Suggested change
if (row[ii] == "?") or (row[ii] == ".") or (row[ii]) == "" or (row[ii]) == "None":
if row[ii] in {"?", ".", "", "None"}:

Also, do you want to check for None (non-string version) as well? If so, see my similar suggestion made below.

if self.__transFlags["dropEmpty"]:
continue
else:
Expand All @@ -227,7 +231,14 @@ def processRecord(self, tableId, row, attributeNameList, containerName=None):
if dT["pureCast"][atName] == "string":
dD[dT["atNameD"][atName]] = row[ii]
elif dT["pureCast"][atName] == "integer":
dD[dT["atNameD"][atName]] = int(row[ii])
if abs(int(row[ii])) > 2147483647 and self.__transFlags["dropLargeIntegers"]:
# Skip large integers
logger.warning("Skipping large integer in entry %s table %s attribute %s", containerName, tableId, atName)
continue
Comment on lines +234 to +237
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a minor adjustment (swap the order of conditions to save time if the flag is False, and specify int32 in the comment):

Suggested change
if abs(int(row[ii])) > 2147483647 and self.__transFlags["dropLargeIntegers"]:
# Skip large integers
logger.warning("Skipping large integer in entry %s table %s attribute %s", containerName, tableId, atName)
continue
if self.__transFlags["dropLargeIntegers"] and abs(int(row[ii])) > 2147483647:
# Skip large integers (greater than int32)
logger.warning("Skipping large integer in entry %s table %s attribute %s", containerName, tableId, atName)
continue

# Or set large integers to maxInt32
# dD[dT["atNameD"][atName]] = 2147483647
Comment on lines +238 to +239
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

delete

Suggested change
# Or set large integers to maxInt32
# dD[dT["atNameD"][atName]] = 2147483647

else:
dD[dT["atNameD"][atName]] = int(row[ii])
elif dT["pureCast"][atName] == "float":
dD[dT["atNameD"][atName]] = float(row[ii])
continue
Expand Down Expand Up @@ -321,7 +332,7 @@ def castInteger(self, trfTup):
"""
if trfTup.isNull:
return trfTup
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == ""):
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == "") or (trfTup.value == "None"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again here, as well as in the other identical lines further below (i.e., lines 345, 357, and 368), how about:

Suggested change
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == "") or (trfTup.value == "None"):
if trfTup.value in {"?", ".", "", "None"} or trfTup.value is None:

return TrfValue(self.__nullValueOther, trfTup.atId, trfTup.origLength, True)
return TrfValue(int(trfTup.value), trfTup.atId, trfTup.origLength, False)

Expand All @@ -331,7 +342,7 @@ def castIterableInteger(self, trfTup):
"""
if trfTup.isNull:
return trfTup
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == ""):
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == "") or (trfTup.value == "None"):
return TrfValue(self.__nullValueOther, trfTup.atId, trfTup.origLength, True)
# vL = [int(v.strip()) for v in str(trfTup.value).split(self.__tObj.getIterableSeparator(trfTup.atId))]
vL = [int(v.strip()) if v.strip() not in [".", "?"] else None for v in str(trfTup.value).split(self.__tObj.getIterableSeparator(trfTup.atId))]
Expand All @@ -343,7 +354,7 @@ def castFloat(self, trfTup):
"""
if trfTup.isNull:
return trfTup
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == ""):
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == "") or (trfTup.value == "None"):
return TrfValue(self.__nullValueOther, trfTup.atId, trfTup.origLength, True)
return TrfValue(float(trfTup.value), trfTup.atId, trfTup.origLength, False)

Expand All @@ -354,7 +365,7 @@ def castIterableFloat(self, trfTup):
# logger.info(">> atId %r value %r delimiter %r", trfTup.atId, trfTup.value, self.__tObj.getIterableSeparator(trfTup.atId))
if trfTup.isNull:
return trfTup
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == ""):
if (trfTup.value == "?") or (trfTup.value == ".") or (trfTup.value is None) or (trfTup.value == "") or (trfTup.value == "None"):
return TrfValue(self.__nullValueOther, trfTup.atId, trfTup.origLength, True)
# vL = [float(v.strip()) for v in str(trfTup.value).split(self.__tObj.getIterableSeparator(trfTup.atId))]
if not self.__tObj.isEmbeddedIterable(trfTup.atId):
Expand Down