-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Ceph Upstream Prediction Models #1
Changes from 8 commits
4c3512b
36ef936
86991e6
c69b6a9
03cff55
199f947
000bc56
ca1dd07
13f73d1
c8773b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,262 @@ | ||
#!/usr/bin/env python3 | ||
# vim: ts=4 sw=4 expandtab | ||
"""Machine learning model for disk failure prediction. | ||
|
||
This classes defined here provide the disk failure prediction module. | ||
RHDiskFailurePredictor uses the models developed at the AICoE in the | ||
Office of the CTO at Red Hat. These models were built using the open | ||
source Backblaze SMART metrics dataset. | ||
PSDiskFailurePredictor uses the models developed by ProphetStor as an | ||
example. | ||
|
||
An instance of the predictor is initialized by providing the path to trained | ||
models. Then, to predict hard drive health and deduce time to failure, the | ||
predict function is called with 6 days worth of SMART data from the hard drive. | ||
It will return a string to indicate disk failure status: "Good", "Warning", | ||
"Bad", or "Unknown". | ||
|
||
An example code is as follows: | ||
|
||
>>> model = disk_failure_predictor.RHDiskFailurePredictor() | ||
>>> status = model.initialize("./models") | ||
>>> if status: | ||
>>> model.predict(device_data) | ||
'Bad' | ||
""" | ||
import os | ||
import pickle | ||
import logging | ||
import argparse | ||
|
||
import numpy as np | ||
|
||
import json | ||
import sys | ||
from enum import Enum | ||
|
||
|
||
# SMART attribute IDs, kept as strings to match the attribute-number keys
# in smartctl's JSON output. These five are the classic failure indicators.
# NOTE(review): none of these constants are referenced by the code visible
# in this file — presumably leftovers used by the simple placeholder
# predictor; confirm before removing.
s_Reallocated_Sector_Count = '5'
s_Reported_Uncorrectable_Errors = '187'
s_Command_Timeout = '188'
s_Current_Pending_Sector_Count = '197'
s_Offline_Uncorrectable = '198'
|
||
|
||
def get_diskfailurepredictor_path():
    """Return the absolute path of the directory containing this module."""
    return os.path.dirname(os.path.abspath(__file__))
|
||
|
||
class RHDiskFailurePredictor(object):
    """Disk failure prediction module developed at Red Hat.

    Uses per-manufacturer scaler + predictor model files (pickled) located
    in a directory passed to :meth:`initialize`. Prediction input is a dict
    of smartctl-style device data; output is one of the strings in
    ``PREDICTION_CLASSES``.
    """

    # json with manufacturer names as keys
    # and features used for prediction as values
    CONFIG_FILE = "config.json"
    PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}

    # model name prefixes to identify vendor
    MANUFACTURER_MODELNAME_PREFIXES = {
        "WD": "WDC",  # for cases like "WDxxx"
        "WDC": "WDC",
        "Toshiba": "Toshiba",  # for cases like "Toshiba xxx"
        "TOSHIBA": "Toshiba",  # for cases like "TOSHIBA xxx"
        "toshiba": "Toshiba",  # for cases like "toshiba xxx"
        "ST": "Seagate",  # for cases like "STxxxx"
        "Seagate": "Seagate",  # for cases like "Seagate BarraCuda ZAxxx"
        "ZA": "Seagate",  # for cases like "ZAxxxx"
        "Hitachi": "Hitachi",
        "HGST": "HGST",
    }

    LOGGER = logging.getLogger()

    def __init__(self):
        """Create an uninitialized predictor.

        Call :meth:`initialize` before :meth:`predict`; until then no model
        context is loaded.
        """
        self.model_dirpath = ""
        self.model_context = {}

    def initialize(self, model_dirpath):
        """Load model config and verify all model files are present.

        Arguments:
            model_dirpath {str} -- path to directory of trained models

        Returns:
            str -- Error message. If all goes well, return None
        """
        # read config file as json, if it exists
        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            return "Missing config file: " + config_path
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # ensure all manufacturers whose context is defined in config file
        # have models and scalers saved inside model_dirpath
        for manufacturer in self.model_context:
            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl")
            if not os.path.isfile(scaler_path):
                return "Missing scaler file: {}".format(scaler_path)
            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl")
            if not os.path.isfile(model_path):
                return "Missing model file: {}".format(model_path)

        self.model_dirpath = model_dirpath

    def __preprocess(self, device_data, manufacturer):
        """Scale and transform input device data to feed it to the model.

        Arguments:
            device_data {dict} -- dict with 'capacity_bytes' and 'smart_data'
                                  (per-day SMART attribute readings) keys.
            manufacturer {str} -- manufacturer of the hard drive (lowercase,
                                  must be a key in the loaded config).

        Returns:
            numpy.ndarray -- (n, d) shaped array of n days worth of data and
            d features, scaled; or None if no context exists for the
            manufacturer.
        """
        # get the attributes that were used to train model for current manufacturer
        try:
            model_features = self.model_context[manufacturer]
            model_smart_attr = [i for i in model_features if 'smart' in i]
        except KeyError:
            # was `except KeyError as e` with `e` unused; lazy %-args avoid
            # formatting cost when debug logging is disabled
            RHDiskFailurePredictor.LOGGER.debug(
                "No context (SMART attributes on which model has been trained)"
                " found for manufacturer: %s",
                manufacturer,
            )
            return None

        # assuming capacity does not change across days
        user_capacity = device_data['capacity_bytes']

        # what timestamps do we have the data from
        # NOTE(review): keys are sorted lexicographically — assumes day keys
        # sort chronologically (e.g. ISO dates); confirm against caller
        days = sorted(device_data['smart_data'].keys())

        # (n,d) shaped array to hold smart attribute values for device
        # where n is the number of days and d is number of SMART attributes
        device_smart_attr_values = np.empty(
            shape=(len(days), len(model_smart_attr))
        )
        for di, day in enumerate(days):
            for ai, attr in enumerate(model_smart_attr):
                if 'raw' in attr:
                    device_smart_attr_values[di][ai] = device_data['smart_data'][day]['attr'][attr.split('_')[1]]['val_raw']
                elif 'normalized' in attr:
                    device_smart_attr_values[di][ai] = device_data['smart_data'][day]['attr'][attr.split('_')[1]]['val_norm']
                elif 'user_capacity' in attr:
                    # capacity is appended once after featurization, not per-day
                    continue
                else:
                    raise ValueError('Unknown attribute found in config')

        # featurize n (6 to 12) days data - mean,std,coefficient of variation

        # rolling time window interval size in days
        roll_window_size = 6

        # extract rolling mean and std for SMART values
        means = np.empty(shape=(len(days) - roll_window_size + 1, device_smart_attr_values.shape[1]))
        stds = np.empty(shape=(len(days) - roll_window_size + 1, device_smart_attr_values.shape[1]))

        for i in range(0, device_smart_attr_values.shape[0] - roll_window_size + 1):
            means[i, :] = device_smart_attr_values[i: i+roll_window_size, :].mean(axis=0)
            stds[i, :] = device_smart_attr_values[i: i+roll_window_size, :].std(axis=0, ddof=1)

        # calculate coefficient of variation; zero where the mean is zero
        cvs = np.divide(stds, means, out=np.zeros_like(stds), where=means != 0)

        # combine all extracted features
        if 'user_capacity' in model_features:
            featurized = np.hstack((
                means,
                stds,
                cvs,
                np.repeat(user_capacity, len(days) - roll_window_size + 1).reshape(-1, 1),
            ))
        else:
            featurized = np.hstack((means, stds, cvs))

        # scale features with the manufacturer-specific scaler trained
        # alongside the model
        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl")
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        featurized = scaler.transform(featurized)
        return featurized

    @staticmethod
    def __get_manufacturer(model_name):
        """Return the manufacturer name for a given hard drive model name.

        Arguments:
            model_name {str} -- hard drive model name

        Returns:
            str -- manufacturer name, or None if it cannot be inferred
        """
        for prefix, manufacturer in RHDiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
            if model_name.startswith(prefix):
                return manufacturer
        # no prefix matched; caller treats None as "unknown manufacturer"
        RHDiskFailurePredictor.LOGGER.debug(
            "Could not infer manufacturer from model name %s", model_name
        )

    def predict(self, device_data):
        """Predict drive health from device data.

        Arguments:
            device_data {dict} -- smartctl-style device data; must carry
                either a "vendor" or a "model" key plus SMART readings.

        Returns:
            str -- one of PREDICTION_CLASSES values ("Unknown", "Good",
            "Warning", "Bad")
        """
        # get manufacturer preferably as a smartctl attribute
        manufacturer = device_data.get("vendor", None)

        # if not available then try to infer using model name
        if manufacturer is None:
            RHDiskFailurePredictor.LOGGER.debug(
                '"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.'
            )
            if "model" not in device_data.keys():
                RHDiskFailurePredictor.LOGGER.debug(
                    '"model" field not found in smartctl output.'
                )
            else:
                manufacturer = RHDiskFailurePredictor.__get_manufacturer(
                    device_data["model"]
                )

        # log error message and return Unknown
        if manufacturer is None:
            # was a backslash-continued string with embedded indentation and
            # a "determiend" typo; implicit concatenation yields a clean message
            RHDiskFailurePredictor.LOGGER.debug(
                "Manufacturer could not be determined. This may be because "
                "DiskPredictor has never encountered this manufacturer before, "
                "or the model name is not according to the manufacturer's "
                "naming conventions known to DiskPredictor"
            )
            return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]
        else:
            manufacturer = manufacturer.lower()

        # preprocess for feeding to model
        preprocessed_data = self.__preprocess(device_data, manufacturer)
        if preprocessed_data is None:
            return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]

        # get model for current manufacturer
        model_path = os.path.join(
            self.model_dirpath, manufacturer + "_predictor.pkl"
        )
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

        # use prediction for latest day as the output
        pred_class_id = model.predict(preprocessed_data)[-1]
        return RHDiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
|
||
|
||
class PreditionResult(Enum):
    """Binary health outcome used by the simple placeholder predictor.

    NOTE(review): the class name carries a typo ("Predition" instead of
    "Prediction") but is kept as-is because callers reference this spelling.
    """

    GOOD = 0
    FAIL = 1
|
@@ -26,11 +272,30 @@ def simple_prediction(device_data): | |
return PreditionResult.GOOD | ||
|
||
def main():
    """Read smartctl-style JSON from stdin, predict disk health, print it.

    Selects the predictor via ``--predictor-model`` (``redhat`` default,
    or ``simple``). Returns None so ``sys.exit(main())`` exits with 0.

    Raises:
        RuntimeError -- if the Red Hat predictor fails to initialize
            (missing config/model files).
        ValueError -- defensive guard for an unexpected predictor choice
            (argparse ``choices`` should already reject it).
    """
    # args to decide which model to use for prediction
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--predictor-model', '--pm',
        type=str,
        choices=['redhat', 'simple'],
        default='redhat',
    )
    args = parser.parse_args()

    # device data arrives as a single JSON document on stdin
    device_data = json.loads(sys.stdin.read())

    if args.predictor_model == 'simple':
        prediction_result = simple_prediction(device_data)
    elif args.predictor_model == 'redhat':
        # init model; surface initialization errors instead of ignoring the
        # returned error string (initialize returns None on success)
        predictor = RHDiskFailurePredictor()
        init_error = predictor.initialize(
            os.path.join(get_diskfailurepredictor_path(), "models", args.predictor_model)
        )
        if init_error is not None:
            raise RuntimeError(init_error)

        # make prediction
        prediction_result = predictor.predict(device_data)
    else:
        raise ValueError(f'Got invalid input for `--predictor-model`: {args.predictor_model}')

    # emit the result — previously computed but never output (the print was
    # left commented out)
    print(prediction_result)


if __name__ == '__main__':
    sys.exit(main())
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"], | ||
"hgst": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_2_normalized", "smart_2_raw", "smart_3_normalized", "smart_3_raw", "smart_4_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_8_normalized", "smart_8_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_12_raw", "smart_192_normalized", "smart_192_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_196_normalized", "smart_196_raw", "smart_197_normalized", "smart_197_raw", "smart_198_raw", "smart_199_raw"] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,5 @@ requests | |
flask | ||
flask_restful | ||
psycopg2 | ||
numpy==1.15.1 | ||
scikit-learn==0.19.2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can also remove this ProphetStor reference