From 4ad960318431cdfb7a99361062847f77e0acd852 Mon Sep 17 00:00:00 2001 From: Vesna Tanko Date: Mon, 25 Feb 2019 09:28:34 +0100 Subject: [PATCH] FDR: Calculate FDR using numpy --- Orange/statistics/util.py | 63 +++++++++++---------------------- Orange/tests/test_statistics.py | 29 +++++++++++++-- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py index 08829eab0a0..57c38bbdc9e 100644 --- a/Orange/statistics/util.py +++ b/Orange/statistics/util.py @@ -5,7 +5,7 @@ It also patches bottleneck to contain these functions. """ import warnings -import math +from typing import Iterable import numpy as np import bottleneck as bn @@ -607,59 +607,36 @@ def std(x, axis=None, ddof=0): return np.sqrt(var(x, axis=axis, ddof=ddof)) -# to speed-up FDR, calculate ahead sum([1/i for i in range(1, m+1)]), -# for m in [1,100000]. -# For higher values of m use an approximation, with error less or equal to -# 4.99999157277e-006. (sum([1/i for i in range(1, m+1)]) ~ log(m) + 0.5772..., -# 0.5572 is an Euler-Mascheroni constant) -c = [1.0] -for m in range(2, 100000): - c.append(c[-1] + 1.0/m) - - -def FDR(p_values, dependent=False, m=None, ordered=False): +def FDR(p_values: Iterable, dependent=False, m=None, ordered=False) -> Iterable: """ `False Discovery Rate `_ correction on a list of p-values. - :param p_values: a list of p-values. + :param p_values: list or np.ndarray of p-values. :param dependent: use correction for dependent hypotheses (default False). :param m: number of hypotheses tested (default ``len(p_values)``). :param ordered: prevent sorting of p-values if they are already sorted (default False). + :return: list or np.ndarray, same as the input """ - def is_sorted(l): - return all(l[i] <= l[i + 1] for i in range(len(l) - 1)) + if p_values is None or len(p_values) == 0 or \ + (m is not None and m <= 0): + return None - if not ordered: - ordered = is_sorted(p_values) - - if not ordered: - joined = [(v, i) for i, v in enumerate(p_values)] - joined.sort() - p_values = [p[0] for p in joined] - indices = [p[1] for p in joined] - - if not m: + is_list = isinstance(p_values, list) + p_values = np.array(p_values) + if m is None: m = len(p_values) - if m <= 0 or not p_values: - return [] + if not ordered: + ordered = (np.diff(p_values) >= 0).all() + if not ordered: + indices = np.argsort(p_values) + p_values = p_values[indices] if dependent: # correct q for dependent tests - k = c[m-1] if m <= len(c) else math.log(m) + 0.57721566490153286060651209008240243104215933593992 - m = m * k - - tmp_fdrs = [p*m/(i+1.0) for (i, p) in enumerate(p_values)] - fdrs = [] - cmin = tmp_fdrs[-1] - for f in reversed(tmp_fdrs): - cmin = min(f, cmin) - fdrs.append( cmin) - fdrs.reverse() + m *= sum(1 / np.arange(1, m + 1)) + fdrs = (p_values * m / np.arange(1, len(p_values) + 1))[::-1] + fdrs = np.array(np.minimum.accumulate(fdrs)[::-1]) if not ordered: - new = [None] * len(fdrs) - for v, i in zip(fdrs, indices): - new[i] = v - fdrs = new - - return fdrs + fdrs[indices] = fdrs.copy() + return fdrs if not is_list else list(fdrs) diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py index 391c73273a5..59a04e9e29c 100644 --- a/Orange/tests/test_statistics.py +++ b/Orange/tests/test_statistics.py @@ -264,11 +264,36 @@ def test_nanstd_with_ddof(self): ) def test_FDR(self): - p_values = [0.00001, 0.0001, 0.0002, 0.0003, 0.0004] + p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001]) np.testing.assert_almost_equal( - np.array([0.00005, 0.00025, 0.00033, 0.00038, 0.0004]), + np.array([0.00033, 0.0004, 0.00005, 0.00038, 0.00025]), FDR(p_values), decimal=5) + def test_FDR_dependent(self): + p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001]) + np.testing.assert_almost_equal( + np.array([0.00076, 0.00091, 0.00011, 0.00086, 0.00057]), + FDR(p_values, dependent=True), decimal=5) + + def test_FDR_m(self): + p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001]) + np.testing.assert_almost_equal( + np.array([0.0002, 0.00024, 0.00003, 0.000225, 0.00015]), + FDR(p_values, m=3), decimal=5) + + def test_FDR_no_values(self): + self.assertIsNone(FDR(None)) + self.assertIsNone(FDR([])) + self.assertIsNone(FDR([0.0002, 0.0004], m=0)) + + def test_FDR_list(self): + p_values = [0.0002, 0.0004, 0.00001, 0.0003, 0.0001] + result = FDR(p_values) + self.assertIsInstance(result, list) + np.testing.assert_almost_equal( + np.array([0.00033, 0.0004, 0.00005, 0.00038, 0.00025]), + result, decimal=5) + class TestNanmean(unittest.TestCase): def setUp(self):