improve initialization

parklab · Oct 31, 2023 · 364e378 · 364e378
1 parent b11550d
commit 364e378
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 97 deletions.
diff --git a/src/salamander/nmf_framework/corrnmf.py b/src/salamander/nmf_framework/corrnmf.py
@@ -7,13 +7,7 @@
 
 from ..utils import match_signatures_pair, shape_checker, type_checker
 from ._utils_klnmf import kl_divergence, poisson_llh, samplewise_kl_divergence
-from .initialization import (
-    init_custom,
-    init_flat,
-    init_nndsvd,
-    init_random,
-    init_separableNMF,
-)
+from .initialization import initialize
 from .signature_nmf import SignatureNMF
 
 EPSILON = np.finfo(np.float32).eps
@@ -436,41 +430,9 @@ def _initialize(
             self.n_given_signatures = 0
 
         init_kwargs = {} if init_kwargs is None else init_kwargs.copy()
-
-        if self.init_method == "custom":
-            self.W, _ = init_custom(self.X, self.n_signatures, **init_kwargs)
-
-        elif self.init_method == "flat":
-            self.W, _ = init_flat(self.X, self.n_signatures)
-
-        elif self.init_method in ["nndsvd", "nndsvda", "nndsvdar"]:
-            self.W, _ = init_nndsvd(
-                self.X, self.n_signatures, init=self.init_method, **init_kwargs
-            )
-
-        elif self.init_method == "random":
-            self.W, _ = init_random(self.X, self.n_signatures, **init_kwargs)
-
-        else:
-            self.W = init_separableNMF(self.X, self.n_signatures)
-
-        if given_signatures is not None:
-            self.W[:, : self.n_given_signatures] = given_signatures.copy().values
-            given_signatures_names = given_signatures.columns.to_numpy(dtype="<U20")
-            n_new_signatures = self.n_signatures - self.n_given_signatures
-            new_signatures_names = np.array(
-                [f"Sig{k+1}" for k in range(n_new_signatures)]
-            )
-            self.signature_names = np.concatenate(
-                [given_signatures_names, new_signatures_names]
-            )
-        else:
-            self.signature_names = np.array(
-                [f"Sig{k+1}" for k in range(self.n_signatures)], dtype="<U20"
-            )
-
-        self.W /= np.sum(self.W, axis=0)
-        self.W = self.W.clip(EPSILON)
+        self.W, _, self.signature_names = initialize(
+            self.X, self.n_signatures, self.init_method, given_signatures, **init_kwargs
+        )
         self.sigma_sq = 1.0
 
         if given_signature_biases is None:

diff --git a/src/salamander/nmf_framework/initialization.py b/src/salamander/nmf_framework/initialization.py
@@ -4,7 +4,19 @@
 import numpy as np
 from sklearn.decomposition import _nmf as sknmf
 
-from ..utils import shape_checker, type_checker
+from ..utils import normalize_WH, shape_checker, type_checker, value_checker
+
+EPSILON = np.finfo(np.float32).eps
+INIT_METHODS = [
+    "custom",
+    "flat",
+    "hierarchical_cluster",
+    "nndsvd",
+    "nndsvda",
+    "nndsvdar",
+    "random",
+    "separableNMF",
+]
 
 
 def init_custom(
@@ -16,11 +28,9 @@ def init_custom(
     """
     type_checker("W_custom", W_custom, np.ndarray)
     type_checker("H_custom", H_custom, np.ndarray)
-
     n_features, n_samples = X.shape
     shape_checker("W_custom", W_custom, (n_features, n_signatures))
     shape_checker("H_custom", H_custom, (n_signatures, n_samples))
-
     return W_custom, H_custom
 
 
@@ -30,10 +40,8 @@ def init_flat(X: np.ndarray, n_signatures: int):
     """
     n_features, n_samples = X.shape
     scaling = np.mean(np.sum(X, axis=0))
-
     W = np.full((n_features, n_signatures), 1 / n_features)
     H = np.full((n_signatures, n_samples), scaling / n_signatures)
-
     return W, H
 
 
@@ -72,11 +80,10 @@ def init_random(X: np.ndarray, n_signatures: int, seed=None):
     W = np.random.dirichlet(np.ones(n_features), size=n_signatures).T
     scaling = np.sum(X, axis=0)
     H = scaling * np.random.dirichlet(np.ones(n_signatures), size=n_samples).T
-
     return W, H
 
 
-def init_separableNMF(X: np.ndarray, n_signatures: int):
+def init_separableNMF(X: np.ndarray, n_signatures: int, seed=None):
     r"""
     This code is following Algorithm 1 from "Fast and Robust Recursive
     Algorithms for Separable Nonnegative Matrix Factorization"
@@ -95,5 +102,83 @@ def init_separableNMF(X: np.ndarray, n_signatures: int):
         signature_indices[k] = kstar
 
     W = X[:, signature_indices].astype(float)
+    _, H = init_random(X, n_signatures, seed=seed)
+    return W, H
+
+
+def initialize(
+    X: np.ndarray,
+    n_signatures: int,
+    init_method="nndsvd",
+    given_signatures=None,
+    **kwargs,
+):
+    """
+    Initialize the signature and exposure matrices.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        count matrix
+
+    n_signatures : int
+        number of signatures
+
+    init_method : str
+        initialization method. One of 'custom', 'flat', 'hierarchical_cluster',
+        'nndsvd', 'nndsvda', 'nndsvdar', 'random', 'separableNMF'
+
+    given_signatures : pd.DataFrame, default=None
+        At most 'n_signatures' many signatures can be provided to
+        overwrite some of the initialized signatures. This does not
+        change the initialized exposures.
+
+    kwargs : dict
+        Any keyword arguments to be passed to the initialization method.
+        This includes, for example, a possible 'seed' keyword argument
+        for all stochastic methods.
+
+    Returns
+    -------
+    W : np.ndarray
+        signature matrix
+
+    H : np.ndarray
+        exposure matrix
+
+    signature_names : np.ndarray
+        The signature names. By default, the signatures are named
+        'Sigk', where 'k' is one plus the index of the signature.
+        If 'given_signatures' are provided, the names are adjusted
+        accordingly.
+    """
+    value_checker("init_method", init_method, INIT_METHODS)
+
+    if init_method == "custom":
+        W, H = init_custom(X, n_signatures, **kwargs)
+
+    elif init_method == "flat":
+        W, H = init_flat(X, n_signatures)
+
+    elif init_method in ["nndsvd", "nndsvda", "nndsvdar"]:
+        W, H = init_nndsvd(X, n_signatures, init=init_method, **kwargs)
+
+    elif init_method == "random":
+        W, H = init_random(X, n_signatures, **kwargs)
+
+    else:
+        W, H = init_separableNMF(X, n_signatures, **kwargs)
+
+    if given_signatures is not None:
+        n_given_signatures = len(given_signatures.columns)
+        W[:, :n_given_signatures] = given_signatures.copy().values
+        given_signatures_names = given_signatures.columns.to_numpy(dtype=str)
+        n_new_signatures = n_signatures - n_given_signatures
+        new_signatures_names = np.array([f"Sig{k+1}" for k in range(n_new_signatures)])
+        signature_names = np.concatenate([given_signatures_names, new_signatures_names])
+    else:
+        signature_names = np.array([f"Sig{k+1}" for k in range(n_signatures)])
 
-    return W
+    W, H = normalize_WH(W, H)
+    W, H = W.clip(EPSILON), H.clip(EPSILON)
+    return W, H, signature_names
diff --git a/src/salamander/nmf_framework/nmf.py b/src/salamander/nmf_framework/nmf.py
@@ -3,14 +3,8 @@
 import numpy as np
 import pandas as pd
 
-from ..utils import match_signatures_pair, normalize_WH
-from .initialization import (
-    init_custom,
-    init_flat,
-    init_nndsvd,
-    init_random,
-    init_separableNMF,
-)
+from ..utils import match_signatures_pair
+from .initialization import initialize
 from .signature_nmf import SignatureNMF
 
 EPSILON = np.finfo(np.float32).eps
@@ -177,6 +171,11 @@ def _initialize(self, given_signatures=None, init_kwargs=None):
 
         Input:
         ------
+        given_signatures : pd.DataFrame, default=None
+            At most 'n_signatures' many signatures can be provided to
+            overwrite some of the initialized signatures. This does not
+            change the initialized exposures.
+
         init_kwargs: dict
             Any further keywords arguments to be passed to the initialization method.
             This includes, for example, a possible 'seed' keyword argument
@@ -189,45 +188,9 @@ def _initialize(self, given_signatures=None, init_kwargs=None):
             self.n_given_signatures = 0
 
         init_kwargs = {} if init_kwargs is None else init_kwargs.copy()
-
-        if self.init_method == "custom":
-            self.W, self.H = init_custom(self.X, self.n_signatures, **init_kwargs)
-
-        elif self.init_method == "flat":
-            self.W, self.H = init_flat(self.X, self.n_signatures)
-
-        elif self.init_method in ["nndsvd", "nndsvda", "nndsvdar"]:
-            self.W, self.H = init_nndsvd(
-                self.X, self.n_signatures, init=self.init_method, **init_kwargs
-            )
-
-        elif self.init_method == "random":
-            self.W, self.H = init_random(self.X, self.n_signatures, **init_kwargs)
-
-        else:
-            self.W = init_separableNMF(self.X, self.n_signatures)
-
-        if given_signatures is not None:
-            self.W[:, : self.n_given_signatures] = given_signatures.copy().values
-            given_signatures_names = given_signatures.columns.to_numpy(dtype=str)
-            n_new_signatures = self.n_signatures - self.n_given_signatures
-            new_signatures_names = np.array(
-                [f"Sig{k+1}" for k in range(n_new_signatures)]
-            )
-            self.signature_names = np.concatenate(
-                [given_signatures_names, new_signatures_names]
-            )
-
-        else:
-            self.signature_names = np.array(
-                [f"Sig{k+1}" for k in range(self.n_signatures)]
-            )
-
-        if not hasattr(self, "H"):
-            _, self.H = init_random(self.X, self.n_signatures)
-
-        self.W, self.H = normalize_WH(self.W, self.H)
-        self.W, self.H = self.W.clip(EPSILON), self.H.clip(EPSILON)
+        self.W, self.H, self.signature_names = initialize(
+            self.X, self.n_signatures, self.init_method, given_signatures, **init_kwargs
+        )
 
     @property
     def corr_signatures(self) -> pd.DataFrame: