Skip to content

Commit

Permalink
improve initialization
Browse files Browse the repository at this point in the history
  • Loading branch information
BeGeiger committed Oct 31, 2023
1 parent b11550d commit 364e378
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 97 deletions.
46 changes: 4 additions & 42 deletions src/salamander/nmf_framework/corrnmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,7 @@

from ..utils import match_signatures_pair, shape_checker, type_checker
from ._utils_klnmf import kl_divergence, poisson_llh, samplewise_kl_divergence
from .initialization import (
init_custom,
init_flat,
init_nndsvd,
init_random,
init_separableNMF,
)
from .initialization import initialize
from .signature_nmf import SignatureNMF

EPSILON = np.finfo(np.float32).eps
Expand Down Expand Up @@ -436,41 +430,9 @@ def _initialize(
self.n_given_signatures = 0

init_kwargs = {} if init_kwargs is None else init_kwargs.copy()

if self.init_method == "custom":
self.W, _ = init_custom(self.X, self.n_signatures, **init_kwargs)

elif self.init_method == "flat":
self.W, _ = init_flat(self.X, self.n_signatures)

elif self.init_method in ["nndsvd", "nndsvda", "nndsvdar"]:
self.W, _ = init_nndsvd(
self.X, self.n_signatures, init=self.init_method, **init_kwargs
)

elif self.init_method == "random":
self.W, _ = init_random(self.X, self.n_signatures, **init_kwargs)

else:
self.W = init_separableNMF(self.X, self.n_signatures)

if given_signatures is not None:
self.W[:, : self.n_given_signatures] = given_signatures.copy().values
given_signatures_names = given_signatures.columns.to_numpy(dtype="<U20")
n_new_signatures = self.n_signatures - self.n_given_signatures
new_signatures_names = np.array(
[f"Sig{k+1}" for k in range(n_new_signatures)]
)
self.signature_names = np.concatenate(
[given_signatures_names, new_signatures_names]
)
else:
self.signature_names = np.array(
[f"Sig{k+1}" for k in range(self.n_signatures)], dtype="<U20"
)

self.W /= np.sum(self.W, axis=0)
self.W = self.W.clip(EPSILON)
self.W, _, self.signature_names = initialize(
self.X, self.n_signatures, self.init_method, given_signatures, **init_kwargs
)
self.sigma_sq = 1.0

if given_signature_biases is None:
Expand Down
101 changes: 93 additions & 8 deletions src/salamander/nmf_framework/initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,19 @@
import numpy as np
from sklearn.decomposition import _nmf as sknmf

from ..utils import shape_checker, type_checker
from ..utils import normalize_WH, shape_checker, type_checker, value_checker

EPSILON = np.finfo(np.float32).eps
INIT_METHODS = [
"custom",
"flat",
"hierarchical_cluster",
"nndsvd",
"nndsvda",
"nndsvdar",
"random",
"separableNMF",
]


def init_custom(
Expand All @@ -16,11 +28,9 @@ def init_custom(
"""
type_checker("W_custom", W_custom, np.ndarray)
type_checker("H_custom", H_custom, np.ndarray)

n_features, n_samples = X.shape
shape_checker("W_custom", W_custom, (n_features, n_signatures))
shape_checker("H_custom", H_custom, (n_signatures, n_samples))

return W_custom, H_custom


Expand All @@ -30,10 +40,8 @@ def init_flat(X: np.ndarray, n_signatures: int):
"""
n_features, n_samples = X.shape
scaling = np.mean(np.sum(X, axis=0))

W = np.full((n_features, n_signatures), 1 / n_features)
H = np.full((n_signatures, n_samples), scaling / n_signatures)

return W, H


Expand Down Expand Up @@ -72,11 +80,10 @@ def init_random(X: np.ndarray, n_signatures: int, seed=None):
W = np.random.dirichlet(np.ones(n_features), size=n_signatures).T
scaling = np.sum(X, axis=0)
H = scaling * np.random.dirichlet(np.ones(n_signatures), size=n_samples).T

return W, H


def init_separableNMF(X: np.ndarray, n_signatures: int):
def init_separableNMF(X: np.ndarray, n_signatures: int, seed=None):
r"""
This code is following Algorithm 1 from "Fast and Robust Recursive
Algorithms for Separable Nonnegative Matrix Factorization"
Expand All @@ -95,5 +102,83 @@ def init_separableNMF(X: np.ndarray, n_signatures: int):
signature_indices[k] = kstar

W = X[:, signature_indices].astype(float)
_, H = init_random(X, n_signatures, seed=seed)
return W, H


def initialize(
X: np.ndarray,
n_signatures: int,
init_method="nndsvd",
given_signatures=None,
**kwargs,
):
"""
Initialize the signature and exposure matrices.
Parameters
----------
X : np.ndarray
count matrix
n_signatures : int
number of signatures
init_method : str
initialization method. One of 'custom', 'flat', 'hierarchical_cluster',
'nndsvd', 'nndsvda', 'nndsvdar', 'random', 'separableNMF'
given_signatures : pd.Dataframe, default=None
At most 'n_signatures' many signatures can be provided to
overwrite some of the initialized signatures. This does not
change the initialized exposurse.
kwargs : dict
Any keyword arguments to be passed to the initialization method.
This includes, for example, a possible 'seed' keyword argument
for all stochastic methods.
Returns
-------
W : np.ndarray
signature matrix
H : np.ndarray
exposure matrix
signature_names : list
The signature names. By default, the signatures are named
'Sigk', where 'k' is one plus the index of the signature.
If 'given_signatures' are provided, the names are adjusted
accordingly.
"""
value_checker("init_method", init_method, INIT_METHODS)

if init_method == "custom":
W, H = init_custom(X, n_signatures, **kwargs)

elif init_method == "flat":
W, H = init_flat(X, n_signatures)

elif init_method in ["nndsvd", "nndsvda", "nndsvdar"]:
W, H = init_nndsvd(X, n_signatures, init=init_method, **kwargs)

elif init_method == "random":
W, H = init_random(X, n_signatures, **kwargs)

else:
W, H = init_separableNMF(X, n_signatures, **kwargs)

if given_signatures is not None:
n_given_signatures = len(given_signatures.columns)
W[:, :n_given_signatures] = given_signatures.copy().values
given_signatures_names = given_signatures.columns.to_numpy(dtype=str)
n_new_signatures = n_signatures - n_given_signatures
new_signatures_names = np.array([f"Sig{k+1}" for k in range(n_new_signatures)])
signature_names = np.concatenate([given_signatures_names, new_signatures_names])
else:
signature_names = np.array([f"Sig{k+1}" for k in range(n_signatures)])

return W
W, H = normalize_WH(W, H)
W, H = W.clip(EPSILON), H.clip(EPSILON)
return W, H, signature_names
57 changes: 10 additions & 47 deletions src/salamander/nmf_framework/nmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@
import numpy as np
import pandas as pd

from ..utils import match_signatures_pair, normalize_WH
from .initialization import (
init_custom,
init_flat,
init_nndsvd,
init_random,
init_separableNMF,
)
from ..utils import match_signatures_pair
from .initialization import initialize
from .signature_nmf import SignatureNMF

EPSILON = np.finfo(np.float32).eps
Expand Down Expand Up @@ -177,6 +171,11 @@ def _initialize(self, given_signatures=None, init_kwargs=None):
Input:
------
given_signatures : pd.Dataframe, default=None
At most 'n_signatures' many signatures can be provided to
overwrite some of the initialized signatures. This does not
change the initialized exposurse.
init_kwargs: dict
Any further keywords arguments to be passed to the initialization method.
This includes, for example, a possible 'seed' keyword argument
Expand All @@ -189,45 +188,9 @@ def _initialize(self, given_signatures=None, init_kwargs=None):
self.n_given_signatures = 0

init_kwargs = {} if init_kwargs is None else init_kwargs.copy()

if self.init_method == "custom":
self.W, self.H = init_custom(self.X, self.n_signatures, **init_kwargs)

elif self.init_method == "flat":
self.W, self.H = init_flat(self.X, self.n_signatures)

elif self.init_method in ["nndsvd", "nndsvda", "nndsvdar"]:
self.W, self.H = init_nndsvd(
self.X, self.n_signatures, init=self.init_method, **init_kwargs
)

elif self.init_method == "random":
self.W, self.H = init_random(self.X, self.n_signatures, **init_kwargs)

else:
self.W = init_separableNMF(self.X, self.n_signatures)

if given_signatures is not None:
self.W[:, : self.n_given_signatures] = given_signatures.copy().values
given_signatures_names = given_signatures.columns.to_numpy(dtype=str)
n_new_signatures = self.n_signatures - self.n_given_signatures
new_signatures_names = np.array(
[f"Sig{k+1}" for k in range(n_new_signatures)]
)
self.signature_names = np.concatenate(
[given_signatures_names, new_signatures_names]
)

else:
self.signature_names = np.array(
[f"Sig{k+1}" for k in range(self.n_signatures)]
)

if not hasattr(self, "H"):
_, self.H = init_random(self.X, self.n_signatures)

self.W, self.H = normalize_WH(self.W, self.H)
self.W, self.H = self.W.clip(EPSILON), self.H.clip(EPSILON)
self.W, self.H, self.signature_names = initialize(
self.X, self.n_signatures, self.init_method, given_signatures, **init_kwargs
)

@property
def corr_signatures(self) -> pd.DataFrame:
Expand Down

0 comments on commit 364e378

Please sign in to comment.