Statistics - Regex: count matches in the whole document instead of only in tokens

biolab · Jan 30, 2024 · c963d4c
1 parent e7c360d
commit c963d4c
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 10 deletions.
diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py
@@ -293,14 +293,11 @@ def regex(
     """
     pattern = re.compile(expression)
 
-    def number_regex(tokens: List[str]):
+    def regex_matches(text: str):
         callback()
-        return sum(bool(pattern.match(t)) for t in tokens)
+        return len(re.findall(pattern, text))
 
-    return (
-        np.c_[list(map(number_regex, corpus.tokens))],
-        [f"Regex {expression}"],
-    )
+    return np.c_[list(map(regex_matches, corpus.documents))], [f"Regex {expression}"]
 
 
 def pos_tags(

diff --git a/orangecontrib/text/widgets/tests/test_owstatistics.py b/orangecontrib/text/widgets/tests/test_owstatistics.py
@@ -214,14 +214,18 @@ def test_contains(self):
 
     def test_regex(self):
         """ Test regex statistic """
-        # words that contains digit
-        data = self._compute_features("Regex", "\w*\d\w*")
+        # words that contain digit
+        data = self._compute_features("Regex", r"\w*\d\w*")
         np.testing.assert_array_almost_equal(data.X.flatten(), [0, 0, 0, 1])
 
-        # words that contains digit
-        data = self._compute_features("Regex", "\w*is\w*")
+        # words that contain "is"
+        data = self._compute_features("Regex", r"\w*is\w*")
         np.testing.assert_array_almost_equal(data.X.flatten(), [1, 2, 2, 0])
 
+        # count specific n-gram
+        data = self._compute_features("Regex", r"ipsum\ dolor")
+        np.testing.assert_array_almost_equal(data.X.flatten(), [1, 0, 0, 0])
+
         self.send_signal(self.widget.Inputs.corpus, None)
         self.assertIsNone(self.get_output(self.widget.Outputs.corpus))