Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[ENH] FDR: Calculate FDR using numpy #3625

Merged
merged 1 commit into from
Feb 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 20 additions & 43 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
It also patches bottleneck to contain these functions.
"""
import warnings
import math
from typing import Iterable

import numpy as np
import bottleneck as bn
Expand Down Expand Up @@ -607,59 +607,36 @@ def std(x, axis=None, ddof=0):
return np.sqrt(var(x, axis=axis, ddof=ddof))


def FDR(p_values: Iterable, dependent=False, m=None, ordered=False) -> Iterable:
    """ `False Discovery Rate <http://en.wikipedia.org/wiki/False_discovery_rate>`_
    correction on a list of p-values.

    Implements the Benjamini-Hochberg step-up procedure; with
    ``dependent=True`` the Benjamini-Yekutieli variant for dependent
    hypotheses is used instead.

    :param p_values: list or np.ndarray of p-values.
    :param dependent: use correction for dependent hypotheses (default False).
    :param m: number of hypotheses tested (default ``len(p_values)``).
    :param ordered: prevent sorting of p-values if they are already sorted
        (default False).
    :return: list or np.ndarray (same container kind as the input), or
        ``None`` when there is nothing to correct (empty input or ``m <= 0``).
    """
    # Nothing to correct: no p-values, or a non-positive hypothesis count.
    if p_values is None or len(p_values) == 0 or \
            (m is not None and m <= 0):
        return None

    # Remember the input container kind so the output matches it.
    is_list = isinstance(p_values, list)
    p_values = np.array(p_values)
    if m is None:
        m = len(p_values)
    if not ordered:
        # Skip the argsort when the input happens to be sorted already.
        ordered = (np.diff(p_values) >= 0).all()
    if not ordered:
        indices = np.argsort(p_values)
        p_values = p_values[indices]

    if dependent:
        # Benjamini-Yekutieli: inflate m by the m-th harmonic number
        # (sum of 1/i for i in 1..m) to correct for dependent tests.
        m *= np.sum(1 / np.arange(1, m + 1))

    # Step-up: q_i = p_i * m / rank_i, then enforce monotonicity with a
    # cumulative minimum taken from the largest p-value downwards.
    fdrs = (p_values * m / np.arange(1, len(p_values) + 1))[::-1]
    fdrs = np.array(np.minimum.accumulate(fdrs)[::-1])
    if not ordered:
        # Scatter the corrected values back to the original input order.
        fdrs[indices] = fdrs.copy()
    return list(fdrs) if is_list else fdrs
29 changes: 27 additions & 2 deletions Orange/tests/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,36 @@ def test_nanstd_with_ddof(self):
)

def test_FDR(self):
    """FDR corrects an unsorted array and returns values in input order."""
    p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001])
    np.testing.assert_almost_equal(
        np.array([0.00033, 0.0004, 0.00005, 0.00038, 0.00025]),
        FDR(p_values), decimal=5)

def test_FDR_dependent(self):
    """Benjamini-Yekutieli correction for dependent hypotheses.

    Expected values were computed with the pre-numpy implementation.
    """
    p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001])
    np.testing.assert_almost_equal(
        np.array([0.00076, 0.00091, 0.00011, 0.00086, 0.00057]),
        FDR(p_values, dependent=True), decimal=5)

def test_FDR_m(self):
    """An explicit hypothesis count ``m`` overrides ``len(p_values)``."""
    p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001])
    np.testing.assert_almost_equal(
        np.array([0.0002, 0.00024, 0.00003, 0.000225, 0.00015]),
        FDR(p_values, m=3), decimal=5)

def test_FDR_no_values(self):
    """Degenerate inputs (None, empty, m <= 0) yield None, not a crash."""
    self.assertIsNone(FDR(None))
    self.assertIsNone(FDR([]))
    self.assertIsNone(FDR([0.0002, 0.0004], m=0))

def test_FDR_list(self):
    """A list input produces a list output (container kind is preserved)."""
    p_values = [0.0002, 0.0004, 0.00001, 0.0003, 0.0001]
    result = FDR(p_values)
    self.assertIsInstance(result, list)
    np.testing.assert_almost_equal(
        np.array([0.00033, 0.0004, 0.00005, 0.00038, 0.00025]),
        result, decimal=5)


class TestNanmean(unittest.TestCase):
def setUp(self):
Expand Down