-
Notifications
You must be signed in to change notification settings - Fork 0
/
mrmr_algorithm.py
41 lines (34 loc) · 1.67 KB
/
mrmr_algorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
from sklearn.feature_selection import f_regression
#check
class MRMR(object):
    """Greedy mRMR-style feature selector using the FCQ score.

    Relevance of a feature is its F-statistic against ``y`` (from
    ``sklearn.feature_selection.f_regression``); redundancy is the mean
    absolute correlation with the features selected so far.  At every round
    the not-yet-selected feature with the highest relevance / redundancy
    ratio is picked.

    Attributes
    ----------
    X : pandas.DataFrame
        Feature matrix with ``column_names`` as columns.
    y : array-like
        Target passed unchanged to ``f_regression``.
    column_names : sequence
        Column labels exactly as supplied by the caller.
    K : int
        Number of features to select per ``computing_correlations`` call.
    selected, not_selected : list
        Labels already picked (in pick order) / still available.
    mrm_scores : pandas.DataFrame
        Column ``0`` holds the score each feature had when it was picked.
    F : pandas.Series
        F-statistic per feature.
    corr : pandas.DataFrame
        Feature-by-feature absolute correlations, filled in lazily one
        column per selected feature; untouched cells hold ``_EPS``.
    """

    # Floor for correlations and the redundancy used for the very first pick
    # (when nothing is selected yet), so the score division never hits zero.
    _EPS = 0.00001

    def __init__(self, X, y, column_names, K):
        """Store the data and precompute the per-feature F-statistics.

        Parameters
        ----------
        X : array-like or pandas.DataFrame
            Feature matrix.
        y : array-like
            Regression target.
        column_names : sequence or None
            Labels for the columns of ``X``.
        K : int
            How many features to select.
        """
        self.X = pd.DataFrame(X, columns=column_names)
        self.X = self.X.infer_objects()
        self.y = y
        self.column_names = column_names
        self.selected = []
        self.not_selected = self.X.columns.to_list()
        self.K = K
        # Index by the frame's actual columns so the score table has a usable
        # index even when ``column_names`` is None.
        self.mrm_scores = pd.DataFrame(index=self.X.columns)
        self.F = None
        self.corr = None
        self.compute_F_statistic()

    def compute_F_statistic(self):
        """Compute ``self.F`` and reset ``self.corr`` to the ``_EPS`` placeholder."""
        # f_regression returns (F-statistics, p-values); only the former is used.
        self.F = pd.Series(f_regression(self.X, self.y)[0], index=self.X.columns)
        self.corr = pd.DataFrame(
            self._EPS, index=self.X.columns, columns=self.X.columns
        )

    def computing_correlations(self):
        """Run the greedy selection and return the resulting partition.

        Selects up to ``self.K`` features; if fewer than ``K`` remain, all
        remaining features are selected instead of failing on an empty
        candidate set.

        Returns
        -------
        tuple of (list, list)
            ``(self.selected, self.not_selected)`` -- the instance's own
            lists, not copies.
        """
        rounds = min(self.K, len(self.not_selected))
        for _ in range(rounds):
            # Fill in the (absolute) correlations between the most recently
            # selected feature and every still-excluded feature.  Keyed on
            # ``self.selected`` rather than the loop counter so a repeated
            # call also accounts for the last pick of the previous call.
            if self.selected:
                last_selected = self.selected[-1]
                self.corr.loc[self.not_selected, last_selected] = (
                    self.X[self.not_selected]
                    .corrwith(self.X[last_selected])
                    .abs()
                    .clip(lower=self._EPS)
                )

            # FCQ score: relevance divided by mean redundancy.  With nothing
            # selected yet the mean is NaN, which falls back to ``_EPS``.
            # NOTE: the original author flagged "max instead of mean" as a
            # possible alternative aggregation here.
            redundancy = (
                self.corr.loc[self.not_selected, self.selected]
                .mean(axis=1)
                .fillna(self._EPS)
            )
            score = self.F.loc[self.not_selected] / redundancy

            # ``argmax`` is positional, so read the value with ``iloc``;
            # plain ``score[pos]`` would be a label lookup and breaks for
            # integer column labels.
            best_pos = score.argmax()
            best = score.index[best_pos]
            self.mrm_scores.loc[best, 0] = score.iloc[best_pos]

            self.selected.append(best)
            self.not_selected.remove(best)
        return self.selected, self.not_selected