[ENH] FDR: Calculate FDR using numpy #3625
@@ -5,7 +5,7 @@
 It also patches bottleneck to contain these functions.
 """
 import warnings
-import math
+from typing import Iterable

 import numpy as np
 import bottleneck as bn
@@ -607,59 +607,36 @@ def std(x, axis=None, ddof=0):
     return np.sqrt(var(x, axis=axis, ddof=ddof))


-# to speed-up FDR, calculate ahead sum([1/i for i in range(1, m+1)]),
-# for m in [1,100000].
-# For higher values of m use an approximation, with error less or equal to
-# 4.99999157277e-006. (sum([1/i for i in range(1, m+1)]) ~ log(m) + 0.5772...,
-# 0.5572 is an Euler-Mascheroni constant)
-c = [1.0]
-for m in range(2, 100000):
-    c.append(c[-1] + 1.0/m)
-
-
-def FDR(p_values, dependent=False, m=None, ordered=False):
+def FDR(p_values: Iterable, dependent=False, m=None, ordered=False) -> Iterable:
     """ `False Discovery Rate <http://en.wikipedia.org/wiki/False_discovery_rate>`_
     correction on a list of p-values.

-    :param p_values: a list of p-values.
+    :param p_values: list or np.ndarray of p-values.
     :param dependent: use correction for dependent hypotheses (default False).
     :param m: number of hypotheses tested (default ``len(p_values)``).
     :param ordered: prevent sorting of p-values if they are already sorted
         (default False).
+    :return: list or np.ndarray, same as the input
     """
-    def is_sorted(l):
-        return all(l[i] <= l[i + 1] for i in range(len(l) - 1))
-
     if p_values is None or len(p_values) == 0 or \
             (m is not None and m <= 0):
         return None

-    if not ordered:
-        ordered = is_sorted(p_values)
-
-    if not ordered:
-        joined = [(v, i) for i, v in enumerate(p_values)]
-        joined.sort()
-        p_values = [p[0] for p in joined]
-        indices = [p[1] for p in joined]
-
-    if not m:
-        m = len(p_values)
-    if m <= 0 or not p_values:
-        return []
+    is_list = isinstance(p_values, list)
+    p_values = np.array(p_values)
+    if m is None:
+        m = len(p_values)
+    if not ordered:
+        ordered = (np.diff(p_values) >= 0).all()
+    if not ordered:
+        indices = np.argsort(p_values)
+        p_values = p_values[indices]

     if dependent:  # correct q for dependent tests
-        k = c[m-1] if m <= len(c) else math.log(m) + 0.57721566490153286060651209008240243104215933593992
-        m = m * k
+        m *= sum(1 / np.arange(1, m + 1))

-    tmp_fdrs = [p*m/(i+1.0) for (i, p) in enumerate(p_values)]
-    fdrs = []
-    cmin = tmp_fdrs[-1]
-    for f in reversed(tmp_fdrs):
-        cmin = min(f, cmin)
-        fdrs.append(cmin)
-    fdrs.reverse()
+    fdrs = (p_values * m / np.arange(1, len(p_values) + 1))[::-1]
+    fdrs = np.array(np.minimum.accumulate(fdrs)[::-1])

     if not ordered:
-        new = [None] * len(fdrs)
-        for v, i in zip(fdrs, indices):
-            new[i] = v
-        fdrs = new
+        fdrs[indices] = fdrs.copy()

-    return fdrs
+    return fdrs if not is_list else list(fdrs)

Review comment on the np.minimum.accumulate line: Kudos.
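The core of the rewrite replaces the explicit reverse running-minimum loop with np.minimum.accumulate on the reversed array of raw q-values (q_i = p_(i) * m / i for sorted p-values). A minimal standalone sketch checking that the two formulations agree; the helper names bh_loop and bh_numpy are illustrative, not part of the PR:

import numpy as np

def bh_loop(p_sorted, m):
    # Old-style Benjamini-Hochberg: raw q-values, then a reverse
    # running minimum via an explicit loop.
    raw = [p * m / (i + 1.0) for i, p in enumerate(p_sorted)]
    out, cmin = [], raw[-1]
    for q in reversed(raw):
        cmin = min(cmin, q)
        out.append(cmin)
    return out[::-1]

def bh_numpy(p_sorted, m):
    # New-style: the same reverse running minimum, vectorized.
    raw = p_sorted * m / np.arange(1, len(p_sorted) + 1)
    return np.minimum.accumulate(raw[::-1])[::-1]

p = np.array([0.00001, 0.0001, 0.0002, 0.0003, 0.0004])
assert np.allclose(bh_loop(p, len(p)), bh_numpy(p, len(p)))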
@@ -264,11 +264,36 @@ def test_nanstd_with_ddof(self):
         )

     def test_FDR(self):
-        p_values = [0.00001, 0.0001, 0.0002, 0.0003, 0.0004]
+        p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001])
         np.testing.assert_almost_equal(
-            np.array([0.00005, 0.00025, 0.00033, 0.00038, 0.0004]),
+            np.array([0.00033, 0.0004, 0.00005, 0.00038, 0.00025]),
             FDR(p_values), decimal=5)

+    def test_FDR_dependent(self):
+        p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001])
+        np.testing.assert_almost_equal(
+            np.array([0.00076, 0.00091, 0.00011, 0.00086, 0.00057]),
+            FDR(p_values, dependent=True), decimal=5)
+
+    def test_FDR_m(self):
+        p_values = np.array([0.0002, 0.0004, 0.00001, 0.0003, 0.0001])
+        np.testing.assert_almost_equal(
+            np.array([0.0002, 0.00024, 0.00003, 0.000225, 0.00015]),
+            FDR(p_values, m=3), decimal=5)
+
+    def test_FDR_no_values(self):
+        self.assertIsNone(FDR(None))
+        self.assertIsNone(FDR([]))
+        self.assertIsNone(FDR([0.0002, 0.0004], m=0))
+
+    def test_FDR_list(self):
+        p_values = [0.0002, 0.0004, 0.00001, 0.0003, 0.0001]
+        result = FDR(p_values)
+        self.assertIsInstance(result, list)
+        np.testing.assert_almost_equal(
+            np.array([0.00033, 0.0004, 0.00005, 0.00038, 0.00025]),
+            result, decimal=5)
+

 class TestNanmean(unittest.TestCase):
     def setUp(self):

Review thread on the test_FDR_dependent expectations:

Reviewer: Have you gotten these numbers independently, that is, from another source or calculated manually, rather than from the code itself? (I'm asking because I used to be a big sinner myself.)

Author: I calculated them using the old function.
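The dependent=True expectations can also be reproduced independently of either implementation, which addresses the reviewer's concern: for dependent tests (Benjamini-Yekutieli), m is inflated by the harmonic number H_m before the usual Benjamini-Hochberg step. A small sketch, independent of the code under test (variable names are illustrative only):

import numpy as np

p = np.sort([0.0002, 0.0004, 0.00001, 0.0003, 0.0001])
m = len(p)
h_m = (1 / np.arange(1, m + 1)).sum()     # H_5 = 1 + 1/2 + ... + 1/5 ~ 2.2833
q = p * m * h_m / np.arange(1, m + 1)     # raw Benjamini-Yekutieli q-values
q = np.minimum.accumulate(q[::-1])[::-1]  # enforce monotonicity from the right
print(np.round(q, 5))                     # [0.00011 0.00057 0.00076 0.00086 0.00091]

Up to the original ordering of the inputs, these match the expected values in test_FDR_dependent.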
Review comment (codecov): Cover this line in tests to get approval by codecov (and to make sure we don't forget these cases at any future refactoring).
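A hypothetical extra test in the same style, assuming the case codecov flags is already-sorted input, which none of the tests above exercise (the test name and values below are illustrative, not from the PR):

def test_FDR_sorted(self):
    # Already-sorted input takes the (np.diff(p_values) >= 0).all() fast
    # path, so no argsort/reorder is performed.
    p_values = [0.00001, 0.0001, 0.0002, 0.0003, 0.0004]
    np.testing.assert_almost_equal(
        np.array([0.00005, 0.00025, 0.00033, 0.00038, 0.0004]),
        FDR(p_values), decimal=5)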