Skip to content

Commit

Permalink
Merge pull request #25 from theochem/web_app
Browse files (browse the repository at this point in the history)
Update the function to allow direct usage of molecular features
  • Loading branch information
FanwangM authored Oct 9, 2023
2 parents 661ef19 + 48d19f2 commit 288f20b
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 37 deletions.
71 changes: 43 additions & 28 deletions b3clf/b3clf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,31 @@
import numpy as np
from .descriptor_padel import compute_descriptors
from .geometry_opt import geometry_optimize
from .utils import (get_descriptors, predict_permeability,
scale_descriptors, select_descriptors)
from .utils import (
get_descriptors,
predict_permeability,
scale_descriptors,
select_descriptors,
)

__all__ = [
"b3clf",
]


def b3clf(mol_in,
sep="\s+|\t+",
clf="xgb",
sampling="classic_ADASYN",
output="B3clf_output.xlsx",
verbose=1,
random_seed=42,
time_per_mol=-1,
keep_features="no",
keep_sdf="no",
threshold="none",
):
def b3clf(
mol_in,
sep="\s+|\t+",
clf="xgb",
sampling="classic_ADASYN",
output="B3clf_output.xlsx",
verbose=1,
random_seed=42,
time_per_mol=-1,
keep_features="no",
keep_sdf="no",
threshold="none",
):
"""Use B3clf for BBB classifications with resampling strategies.
Parameters
Expand Down Expand Up @@ -110,12 +115,13 @@ def b3clf(mol_in,

geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)

_ = compute_descriptors(sdf_file=internal_sdf,
excel_out=features_out,
output_csv=None,
timeout=None,
time_per_molecule=time_per_mol,
)
_ = compute_descriptors(
sdf_file=internal_sdf,
excel_out=features_out,
output_csv=None,
timeout=None,
time_per_molecule=time_per_mol,
)

# Get computed descriptors
X_features, info_df = get_descriptors(df=features_out)
Expand All @@ -131,16 +137,25 @@ def b3clf(mol_in,
# clf = get_clf(clf_str=clf, sampling_str=sampling)

# Get classifier
result_df = predict_permeability(clf_str=clf,
sampling_str=sampling,
features_df=X_features,
info_df=info_df,
threshold=threshold)
result_df = predict_permeability(
clf_str=clf,
sampling_str=sampling,
mol_features=X_features,
info_df=info_df,
threshold=threshold,
)

# Get classifier
display_cols = ["ID", "SMILES", "B3clf_predicted_probability", "B3clf_predicted_label"]

result_df = result_df[[col for col in result_df.columns.to_list() if col in display_cols]]
display_cols = [
"ID",
"SMILES",
"B3clf_predicted_probability",
"B3clf_predicted_label",
]

result_df = result_df[
[col for col in result_df.columns.to_list() if col in display_cols]
]
if verbose != 0:
print(result_df)

Expand Down
23 changes: 14 additions & 9 deletions b3clf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def scale_descriptors(df):
dirname = os.path.dirname(__file__)
filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
b3db_scaler = load(filename)
df.iloc[:, :] = b3db_scaler.transform(df)
df_new = b3db_scaler.transform(df)

return df
return df_new


def get_clf(clf_str, sampling_str):
Expand Down Expand Up @@ -125,26 +125,31 @@ def get_clf(clf_str, sampling_str):
return clf


def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold="none"):
def predict_permeability(
clf_str, sampling_str, mol_features, info_df, threshold="none"
):
"""Compute and store BBB predicted label and predicted probability to results dataframe."""

# load the threshold data
dirname = os.path.dirname(__file__)
fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
# default threshold is 0.5
label_pool = np.zeros(features_df.shape[0], dtype=int)
label_pool = np.zeros(mol_features.shape[0], dtype=int)

# get the classifier
clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)

if features_df.index.tolist() != info_df.index.tolist():
raise ValueError(
"Features_df and Info_df do not have the same index. Internal processing error"
)
if type(mol_features) == pd.DataFrame:
if mol_features.index.tolist() != info_df.index.tolist():
raise ValueError(
"Features_df and Info_df do not have the same index. Internal processing error"
)

# get predicted probabilities
info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(features_df)[:, 1]
info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(mol_features)[
:, 1
]
# get predicted label from probability using the threshold
mask = np.greater_equal(
info_df["B3clf_predicted_probability"].to_numpy(),
Expand Down

0 comments on commit 288f20b

Please sign in to comment.