Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Replace local FDR with Orange's #416

Merged
merged 2 commits into from
May 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 0 additions & 58 deletions orangecontrib/text/stats.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,9 @@
import numpy as np
from scipy import stats, sparse
import math

# Precomputed harmonic numbers H(m) = sum(1/i for i in 1..m), used by the
# dependent-hypotheses FDR correction, for m in [1, 99999].  For larger m an
# approximation is used instead: H(m) ~ ln(m) + 0.5772... (0.5772 being the
# Euler-Mascheroni constant), with error no greater than ~5e-6.
_c = []
_running = 0.0
for m in range(1, 100000):
    _running += 1.0 / m
    _c.append(_running)

def is_sorted(l):
    """Return True if the sequence *l* is in non-decreasing order."""
    return all(earlier <= later for earlier, later in zip(l, l[1:]))


def false_discovery_rate(p_values, dependent=False, m=None, ordered=False):
    """
    `False Discovery Rate <http://en.wikipedia.org/wiki/False_discovery_rate>`_
    correction on a list of p-values (Benjamini-Hochberg procedure).

    Args:
        p_values: a list of p-values.
        dependent: use correction for dependent hypotheses
            (Benjamini-Yekutieli inflation of ``m``).
        m: number of hypotheses tested (default ``len(p_values)``).
        ordered: prevent sorting of p-values if they are already sorted.

    Returns: A list of corrected p-values, in the order of the input.
    """
    if not ordered:
        ordered = is_sorted(p_values)

    # Sort while remembering the original positions so the corrected values
    # can be returned in input order at the end.
    if not ordered:
        joined = sorted((v, i) for i, v in enumerate(p_values))
        p_values = [p[0] for p in joined]
        indices = [p[1] for p in joined]

    if not m:
        m = len(p_values)
    # BUGFIX: use len() instead of truthiness -- `not p_values` raises
    # ValueError for a multi-element numpy array, which can reach this point
    # unconverted when the input is already sorted.
    if m <= 0 or len(p_values) == 0:
        return []

    if dependent:
        # Benjamini-Yekutieli: scale m by the harmonic number H(m); use the
        # precomputed table when available, otherwise the asymptotic
        # approximation H(m) ~ ln(m) + Euler-Mascheroni constant.
        k = _c[m - 1] if m <= len(_c) else \
            math.log(m) + 0.57721566490153286060651209008240243104215933593992
        m = m * k

    # Raw BH values p*m/rank, then enforce monotonicity with a right-to-left
    # cumulative minimum.
    tmp_fdrs = [p * m / (i + 1.0) for i, p in enumerate(p_values)]
    fdrs = []
    cmin = tmp_fdrs[-1]
    for f in reversed(tmp_fdrs):
        cmin = min(f, cmin)
        fdrs.append(cmin)
    fdrs.reverse()

    # Undo the sorting.
    if not ordered:
        new = [None] * len(fdrs)
        for v, i in zip(fdrs, indices):
            new[i] = v
        fdrs = new

    return fdrs


def hypergeom_p_values(data, selected, callback=None):
"""
Calculates p_values using Hypergeometric distribution for two numpy arrays.
Expand Down
32 changes: 1 addition & 31 deletions orangecontrib/text/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
import scipy.sparse as sp

from orangecontrib.text.stats import hypergeom_p_values, false_discovery_rate, is_sorted
from orangecontrib.text.stats import hypergeom_p_values, is_sorted

class StatsTests(unittest.TestCase):
x = np.array([[0, 0, 9, 0, 1],
Expand All @@ -29,36 +29,6 @@ def test_hypergeom_p_values(self):
with self.assertRaises(ValueError):
hypergeom_p_values(self.x, self.x[-2:, :-1])


def test_false_discovery_rate(self):
    """FDR correction reproduces externally computed reference values."""
    pvals = np.array(
        [0.727, 0.281, 0.791, 0.034, 0.628, 0.743, 0.958, 0.552, 0.867, 0.606,
         0.611, 0.594, 0.071, 0.517, 0.526, 0.526, 0.635, 0.932, 0.210, 0.636])
    # reference values calculated with http://www.sdmproject.com/utilities/?show=FDR
    expected = np.array(
        [0.92875, 0.9085714, 0.9305882, 0.68, 0.9085714, 0.92875, 0.958, 0.9085714,
         0.958, 0.9085714, 0.9085714, 0.9085714, 0.71, 0.9085714, 0.9085714, 0.9085714,
         0.9085714, 0.958, 0.9085714, 0.9085714]
    )
    np.testing.assert_allclose(false_discovery_rate(pvals), expected)

    # passing m explicitly equal to the default changes nothing
    np.testing.assert_allclose(
        false_discovery_rate(pvals, m=len(pvals)), expected)

    # pre-sorted input with ordered=True yields the same multiset of values
    np.testing.assert_allclose(
        sorted(false_discovery_rate(sorted(pvals), ordered=True)),
        sorted(expected))

    # degenerate inputs produce an empty result
    np.testing.assert_equal(false_discovery_rate([]), [])
    np.testing.assert_equal(false_discovery_rate(pvals, m=-1), [])

    # dependent hypotheses inflate the corrected values (Benjamini-Yekutieli)
    expected_dependent = [
        3.3414007065721947, 3.2688034599191167, 3.3480141985890031, 2.446462966857704,
        3.2688034599191167, 3.3414007065721947, 3.4466345915436469, 3.2688034599191167,
        3.4466345915436469, 3.2688034599191167, 3.2688034599191167, 3.2688034599191167,
        2.554395156572014, 3.2688034599191167, 3.2688034599191167, 3.2688034599191167,
        3.2688034599191167, 3.4466345915436469, 3.2688034599191167, 3.2688034599191167]
    np.testing.assert_equal(
        false_discovery_rate(pvals, dependent=True), expected_dependent)

def test_is_sorted(self):
    """is_sorted detects ascending order and rejects descending order."""
    ascending = range(10)
    self.assertTrue(is_sorted(ascending))
    self.assertFalse(is_sorted(ascending[::-1]))
5 changes: 3 additions & 2 deletions orangecontrib/text/widgets/owwordenrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
from Orange.widgets import gui
from Orange.widgets.settings import Setting
from Orange.widgets.widget import OWWidget, Msg, Input
from Orange.statistics.util import FDR
from PyQt5.QtCore import QSize
from orangecontrib.text import Corpus
from orangecontrib.text.util import np_sp_sum
from orangecontrib.text.stats import false_discovery_rate, hypergeom_p_values
from orangecontrib.text.stats import hypergeom_p_values
from orangecontrib.text.vectorization import BowVectorizer


Expand Down Expand Up @@ -200,7 +201,7 @@ def apply(self):
self.p_values = hypergeom_p_values(self.data.X,
self.selected_data_transformed.X,
callback=self.progress)
self.fdr_values = false_discovery_rate(self.p_values)
self.fdr_values = FDR(self.p_values)
self.filter_and_display()
self.filter_enabled(True)
self.progressBarFinished()
Expand Down
67 changes: 67 additions & 0 deletions orangecontrib/text/widgets/tests/test_owwordenrichment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import unittest

from Orange.widgets.tests.base import WidgetTest

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.vectorization import BowVectorizer
from orangecontrib.text.widgets.owwordenrichment import OWWordEnrichment


class TestWordEnrichment(WidgetTest):
    """GUI tests for the Word Enrichment widget's p-value and FDR filters."""

    def setUp(self):
        # type comment belongs on the assignment line to be effective
        self.widget = self.create_widget(OWWordEnrichment)  # type: OWWordEnrichment
        self.corpus = Corpus.from_file('book-excerpts')
        vect = BowVectorizer()
        self.corpus_vect = vect.transform(self.corpus)

    def _apply_filters(self, by_p, p_value, by_fdr, fdr_value):
        """Set the widget's filter settings and refresh the displayed words."""
        w = self.widget
        w.filter_by_p = by_p
        w.filter_p_value = p_value
        w.filter_by_fdr = by_fdr
        w.filter_fdr_value = fdr_value
        w.filter_and_display()

    def _shown_words(self):
        """Return the set of words currently shown in the results tree."""
        tree = self.widget.sig_words
        return {tree.topLevelItem(i).text(0)
                for i in range(tree.topLevelItemCount())}

    def test_filter_fdr(self):
        widget = self.widget
        self.send_signal(widget.Inputs.data, self.corpus_vect)
        self.send_signal(widget.Inputs.selected_data, self.corpus_vect[:10])

        # p-value filter alone
        self._apply_filters(True, 1e-9, False, 0.01)
        self.assertEqual(self._shown_words(), {'livesey', 'doctor', 'rum'})

        # FDR filter on top of a looser p-value threshold
        self._apply_filters(True, 1e-4, True, 1e-4)
        self.assertEqual(self._shown_words(),
                         {'livesey', 'doctor', 'rum', 'admiral', 'inn'})

        # the same p-value threshold without FDR admits more words
        self._apply_filters(True, 1e-4, False, 1e-4)
        self.assertEqual(widget.sig_words.topLevelItemCount(), 16)

        # a threshold strict enough to match nothing
        self._apply_filters(True, 1e-11, False, 1e-5)
        self.assertEqual(widget.sig_words.topLevelItemCount(), 0)


# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main()