biolab · PrimozGodec · Nov 26, 2020 · Jul 31, 2020 · Sep 4, 2020 · Sep 14, 2020
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -4,6 +4,7 @@ recursive-include orangecontrib/text/tests *.txt *.json
 recursive-include orangecontrib/text/tutorials *.ows
 recursive-include orangecontrib/text/widgets/icons *.svg *.png *.ai
 recursive-include orangecontrib/text/widgets/resources *.js *.css *.html
+recursive-include orangecontrib/text/widgets/tests/data *
 include orangecontrib/text/widgets/tests/bow-test
 recursive-include scripts *.sh *.py
 

diff --git a/doc/widgets/images/SentimentAnalysis-stamped.png b/doc/widgets/images/SentimentAnalysis-stamped.png
diff --git a/doc/widgets/sentimentanalysis.md b/doc/widgets/sentimentanalysis.md
@@ -11,15 +11,15 @@ Predict sentiment from text.
 
 - Corpus: A corpus with information on the sentiment of each document.
 
-**Sentiment Analysis** predicts sentiment for each document in a corpus. It uses Liu Hu and Vader sentiment modules from [NLTK](http://www.nltk.org/api/nltk.sentiment.html). Both of them are lexicon-based. For Liu Hu, you can choose English or Slovenian version.
+**Sentiment Analysis** predicts sentiment for each document in a corpus. It uses the Liu & Hu and Vader sentiment modules from [NLTK](http://www.nltk.org/api/nltk.sentiment.html) and multilingual sentiment lexicons from the [Data Science Lab](https://sites.google.com/site/datascienceslab/projects/multilingualsentiment). All of them are lexicon-based. For Liu & Hu, you can choose between the English and Slovenian versions. Vader works only on English text. Multilingual sentiment supports several languages, which are listed at the bottom of this page.
 
 ![](images/SentimentAnalysis-stamped.png)
 
-1. *Method*:
-   - *Liu Hu*: lexicon-based sentiment analysis (supports English and Slovenian)
-   - *Vader*: lexicon- and rule-based sentiment analysis
-2. Produce a report.
-3. If *Auto commit is on*, sentiment-tagged corpus is communicated automatically. Alternatively press *Commit*.
+1. *Liu Hu*: lexicon-based sentiment analysis (supports English and Slovenian). The final score is the difference between the sum of positive and sum of negative words, normalized by the length of the document and multiplied by 100. The final score reflects the percentage of sentiment difference in the document.
+2. *Vader*: lexicon- and rule-based sentiment analysis
+3. *Multilingual sentiment*: lexicon-based sentiment analysis for several languages
+4. *Custom dictionary*: add your own positive and negative sentiment dictionaries. The accepted source type is a .txt file with each word on its own line. The final score is computed in the same way as for Liu Hu.
+5. If *Auto commit is on*, sentiment-tagged corpus is communicated automatically. Alternatively press *Commit*.
 
 Example
 -------
@@ -53,6 +53,54 @@ References
 
 Hutto, C.J. and E. E. Gilbert (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
 
-Hu, Minqing and Bing Liu (2004). Mining opinion features in customer reviews. In Proceedings of AAAI Conference on Artificial Intelligence, vol. 4, pp. 755–760. [Available online.](http://www.aaai.org/Papers/AAAI/2004/AAAI04-119.pdf) 
+Hu, Minqing and Bing Liu (2004). Mining opinion features in customer reviews. In Proceedings of AAAI Conference on Artificial Intelligence, vol. 4, pp. 755–760. [Available online.](http://www.aaai.org/Papers/AAAI/2004/AAAI04-119.pdf)
 
 Kadunc, Klemen and Marko Robnik-Šikonja (2016). Analiza mnenj s pomočjo strojnega učenja in slovenskega leksikona sentimenta. Conference on Language Technologies & Digital Humanities, Ljubljana (in Slovene). [Available online.](http://www.sdjt.si/wp/wp-content/uploads/2016/09/JTDH-2016_Kadunc-et-al_Analiza-mnenj-s-pomocjo-strojnega-ucenja.pdf)
+
+## Multilingual Sentiment Languages
+
+- Afrikaans
+- Arabic
+- Azerbaijani
+- Belarusian
+- Bosnian
+- Bulgarian
+- Catalan
+- Chinese
+- Chinese Characters
+- Croatian
+- Czech
+- Danish
+- Dutch
+- English
+- Estonian
+- Farsi
+- Finnish
+- French
+- Gaelic
+- German
+- Greek
+- Hebrew
+- Hindi
+- Hungarian
+- Indonesian
+- Italian
+- Japanese
+- Korean
+- Latin
+- Latvian
+- Lithuanian
+- Macedonian
+- Norwegian
+- Norwegian Nynorsk
+- Polish
+- Portuguese
+- Romanian
+- Russian
+- Serbian
+- Slovak
+- Slovene
+- Spanish
+- Swedish
+- Turkish
+- Ukrainian
diff --git a/orangecontrib/text/sentiment/__init__.py b/orangecontrib/text/sentiment/__init__.py
@@ -19,6 +19,11 @@
     SloSentiment
 
 
+def read_file(file):
+    """Read a word list from a text file, one entry per line.
+
+    Parameters
+    ----------
+    file : str
+        Path of the text file to read.
+
+    Returns
+    -------
+    list of str
+        The non-blank lines of the file, in file order, with surrounding
+        whitespace removed.
+    """
+    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
+    # mark, which would otherwise stay attached to the first word. Naming the
+    # encoding keeps the result independent of the platform's locale default.
+    with open(file, 'r', encoding='utf-8-sig') as f:
+        # Strip each line and skip blanks so that padded words still match
+        # tokens and the final newline does not produce an empty entry.
+        stripped = (line.strip() for line in f)
+        return [word for word in stripped if word]
+
+
 class LiuHuSentiment:
     sentiments = ('sentiment',)
     name = 'Liu Hu'
@@ -182,6 +187,36 @@ def online(self):
             return False
 
 
+class CustomDictionaries:
+    """Lexicon-based sentiment scorer using user-supplied word lists.
+
+    Each document is scored as
+    ``100 * (positive hits - negative hits) / max(number of tokens, 1)``.
+    """
+    sentiments = ['sentiment']
+    name = 'Custom Dictionaries'
+
+    # NOTE(review): wait_nltk_data is defined outside this view; presumably it
+    # delays construction until NLTK resources are ready - confirm.
+    @wait_nltk_data
+    def __init__(self, pos, neg):
+        """Load the word lists.
+
+        ``pos`` and ``neg`` are paths handed to ``read_file``; a falsy value
+        leaves the corresponding dictionary unset (``None``).
+        """
+        self.positive = None
+        if pos:
+            self.positive = set(read_file(pos))
+
+        self.negative = None
+        if neg:
+            self.negative = set(read_file(neg))
+
+    def transform(self, corpus):
+        """Return ``corpus`` extended with one attribute per ``sentiments``.
+
+        The corpus is tokenized with ``WordPunctTokenizer`` before its tokens
+        are looked up in the dictionaries.
+        """
+        tokenized = WordPunctTokenizer()(corpus)
+
+        rows = []
+        for tokens in tokenized.tokens:
+            # An unset or empty dictionary contributes no hits.
+            n_pos = 0
+            if self.positive:
+                n_pos = sum(token in self.positive for token in tokens)
+            n_neg = 0
+            if self.negative:
+                n_neg = sum(token in self.negative for token in tokens)
+            # max(..., 1) guards against division by zero for empty documents.
+            rows.append([100*(n_pos - n_neg)/max(len(tokens), 1)])
+        X = np.array(rows).reshape((-1, len(self.sentiments)))
+
+        # One compute value per output column, all sharing a single transform
+        # built from this instance.
+        shared_cv = SharedTransform(self)
+        compute_values = [VectorizationComputeValue(shared_cv, column)
+                          for column in self.sentiments]
+
+        return tokenized.extend_attributes(
+            X, self.sentiments, compute_values=compute_values)
+
+
 if __name__ == "__main__":
     c = Corpus.from_file('deerwester')
     liu = LiuHuSentiment('Slovenian')

diff --git a/orangecontrib/text/widgets/owsentimentanalysis.py b/orangecontrib/text/widgets/owsentimentanalysis.py
@@ -1,3 +1,5 @@
+from typing import List
+
 from AnyQt.QtCore import Qt
 from AnyQt.QtWidgets import QApplication, QGridLayout, QLabel
 
@@ -6,7 +8,9 @@
 from Orange.widgets.widget import OWWidget, Msg
 from orangecontrib.text import Corpus
 from orangecontrib.text.sentiment import VaderSentiment, LiuHuSentiment, \
-    MultiSentiment, SentimentDictionaries
+    MultiSentiment, SentimentDictionaries, CustomDictionaries
+from orangecontrib.text.widgets.owpreprocess import FileLoader, _to_abspath
+from orangewidget.utils.filedialogs import RecentPath
 
 
 class OWSentimentAnalysis(OWWidget):
@@ -32,20 +36,26 @@ class Outputs:
     METHODS = [
         LiuHuSentiment,
         VaderSentiment,
-        MultiSentiment
+        MultiSentiment,
+        CustomDictionaries
     ]
     LANG = ['English', 'Slovenian']
     MULTI_LANG = MultiSentiment.LANGS.keys()
+    DEFAULT_NONE = None
 
     class Warning(OWWidget.Warning):
+        # Messages about an unavailable connection / missing local resources.
         senti_offline = Msg('No internet connection! Sentiment now only works '
                             'with local models.')
         senti_offline_no_lang = Msg('No internet connection and no local '
                                     'language resources are available.')
+        # Custom-dictionary messages: commit() shows the first when exactly
+        # one of pos_file / neg_file is set and the second when neither is.
+        one_dict_only = Msg('Only one dictionary loaded.')
+        no_dicts_loaded = Msg('No dictionaries loaded.')
 
     def __init__(self):
         super().__init__()
         self.corpus = None
+        self.pos_file = None
+        self.neg_file = None
 
         self.form = QGridLayout()
         self.method_box = box = gui.radioButtonsInBox(
@@ -65,6 +75,18 @@ def __init__(self):
                                       sendSelectedValue=True,
                                       contentsLength=10, items=[''],
                                       callback=self._method_changed)
+        self.custom_list = gui.appendRadioButton(box, "Custom dictionary",
+                                                 addToLayout=False)
+
+        self.__posfile_loader = FileLoader()
+        self.__posfile_loader.set_file_list()
+        self.__posfile_loader.activated.connect(self.__pos_loader_activated)
+        self.__posfile_loader.file_loaded.connect(self.__pos_loader_activated)
+
+        self.__negfile_loader = FileLoader()
+        self.__negfile_loader.set_file_list()
+        self.__negfile_loader.activated.connect(self.__neg_loader_activated)
+        self.__negfile_loader.file_loaded.connect(self.__neg_loader_activated)
 
         self.form.addWidget(self.liu_hu, 0, 0, Qt.AlignLeft)
         self.form.addWidget(QLabel("Language:"), 0, 1, Qt.AlignRight)
@@ -73,6 +95,17 @@ def __init__(self):
         self.form.addWidget(self.multi_sent, 2, 0, Qt.AlignLeft)
         self.form.addWidget(QLabel("Language:"), 2, 1, Qt.AlignRight)
         self.form.addWidget(self.multi_box, 2, 2, Qt.AlignRight)
+        self.form.addWidget(self.custom_list, 3, 0, Qt.AlignLeft)
+        self.filegrid = QGridLayout()
+        self.form.addLayout(self.filegrid, 4, 0, 1, 3)
+        self.filegrid.addWidget(QLabel("Positive:"), 0, 0, Qt.AlignRight)
+        self.filegrid.addWidget(self.__posfile_loader.file_combo, 0, 1)
+        self.filegrid.addWidget(self.__posfile_loader.browse_btn, 0, 2)
+        self.filegrid.addWidget(self.__posfile_loader.load_btn, 0, 3)
+        self.filegrid.addWidget(QLabel("Negative:"), 1, 0, Qt.AlignRight)
+        self.filegrid.addWidget(self.__negfile_loader.file_combo, 1, 1)
+        self.filegrid.addWidget(self.__negfile_loader.browse_btn, 1, 2)
+        self.filegrid.addWidget(self.__negfile_loader.load_btn, 1, 3)
 
         self.senti_dict = SentimentDictionaries()
         self.update_multi_box()
@@ -83,6 +116,28 @@ def __init__(self):
                              'Autocommit is on')
         ac.layout().insertSpacing(1, 8)
 
+    def __pos_loader_activated(self):
+        """Sync ``pos_file`` with the positive-dictionary loader and refresh.
+
+        Stores the absolute path of the loader's current file, or ``None``
+        when the loader returns a falsy value, then calls
+        ``_method_changed``.
+        """
+        selected = self.__posfile_loader.get_current_file()
+        if selected:
+            self.pos_file = selected.abspath
+        else:
+            self.pos_file = None
+        self._method_changed()
+
+    def __neg_loader_activated(self):
+        """Sync ``neg_file`` with the negative-dictionary loader and refresh.
+
+        Stores the absolute path of the loader's current file, or ``None``
+        when the loader returns a falsy value, then calls
+        ``_method_changed``.
+        """
+        selected = self.__negfile_loader.get_current_file()
+        if selected:
+            self.neg_file = selected.abspath
+        else:
+            self.neg_file = None
+        self._method_changed()
+
+    def __set_pos_path(self, path: RecentPath, paths: List[RecentPath] = None):
+        """Point the positive-dictionary loader at ``path``.
+
+        Parameters
+        ----------
+        path : RecentPath
+            File to make current in the loader.
+        paths : list of RecentPath, optional
+            Recent-paths list handed to the loader; a new empty list is used
+            when omitted.
+        """
+        # A ``[]`` default is a single list object shared by every call and
+        # handed to the loader, so build a fresh list per call instead.
+        if paths is None:
+            paths = []
+        # The loader is stored under the name-mangled ``__posfile_loader``;
+        # a single-underscore ``_posfile_loader`` is never assigned in the
+        # visible code and would raise AttributeError here.
+        self.__posfile_loader.recent_paths = paths
+        self.__posfile_loader.set_file_list()
+        self.__posfile_loader.set_current_file(_to_abspath(path))
+        # Keep pos_file an absolute path (or None), as the loader-activated
+        # handler does; commit() forwards it as a file name to be opened.
+        current = self.__posfile_loader.get_current_file()
+        self.pos_file = current.abspath if current else None
+
+    def __set_lx_path(self, path: RecentPath, paths: List[RecentPath] = None):
+        """Point the negative-dictionary loader at ``path``.
+
+        NOTE(review): the name says "lx" although the body configures the
+        negative-dictionary loader; consider renaming once callers are known.
+
+        Parameters
+        ----------
+        path : RecentPath
+            File to make current in the loader.
+        paths : list of RecentPath, optional
+            Recent-paths list handed to the loader; a new empty list is used
+            when omitted.
+        """
+        # A ``[]`` default is a single list object shared by every call and
+        # handed to the loader, so build a fresh list per call instead.
+        if paths is None:
+            paths = []
+        self.__negfile_loader.recent_paths = paths
+        self.__negfile_loader.set_file_list()
+        self.__negfile_loader.set_current_file(_to_abspath(path))
+        # Keep neg_file an absolute path (or None), as the loader-activated
+        # handler does; commit() forwards it as a file name to be opened.
+        current = self.__negfile_loader.get_current_file()
+        self.neg_file = current.abspath if current else None
+
     def update_multi_box(self):
         if self.senti_dict.supported_languages:
             self.multi_box.clear()
@@ -116,6 +171,8 @@ def _method_changed(self):
     def commit(self):
         if self.corpus is not None:
             self.Warning.senti_offline.clear()
+            self.Warning.one_dict_only.clear()
+            self.Warning.no_dicts_loaded.clear()
             method = self.METHODS[self.method_idx]
             if self.method_idx == 0:
                 out = method(language=self.liu_language).transform(self.corpus)
@@ -126,6 +183,13 @@ def commit(self):
                     return
                 else:
                     out = method(language=self.multi_language).transform(self.corpus)
+            elif self.method_idx == 3:
+                out = method(self.pos_file, self.neg_file).transform(self.corpus)
+                if (self.pos_file and not self.neg_file) or \
+                    (self.neg_file and not self.pos_file):
+                    self.Warning.one_dict_only()
+                if not self.pos_file and not self.neg_file:
+                    self.Warning.no_dicts_loaded()
             else:
                 out = method().transform(self.corpus)
             self.Outputs.corpus.send(out)

diff --git a/orangecontrib/text/widgets/tests/bow-test b/orangecontrib/text/widgets/tests/bow-test
diff --git a/...s/data/corrupted/sample_pdf_corrupted.pdf → ...uments/corrupted/sample_pdf_corrupted.pdf b/...s/data/corrupted/sample_pdf_corrupted.pdf → ...uments/corrupted/sample_pdf_corrupted.pdf
diff --git a/.../widgets/tests/data/good/sample_docx.docx → ...ests/data/documents/good/sample_docx.docx b/.../widgets/tests/data/good/sample_docx.docx → ...ests/data/documents/good/sample_docx.docx
diff --git a/...xt/widgets/tests/data/good/sample_odt.odt → .../tests/data/documents/good/sample_odt.odt b/...xt/widgets/tests/data/good/sample_odt.odt → .../tests/data/documents/good/sample_odt.odt
diff --git a/...xt/widgets/tests/data/good/sample_pdf.pdf → .../tests/data/documents/good/sample_pdf.pdf b/...xt/widgets/tests/data/good/sample_pdf.pdf → .../tests/data/documents/good/sample_pdf.pdf
diff --git a/orangecontrib/text/widgets/tests/data/documents/good/sample_txt_ž.txt b/orangecontrib/text/widgets/tests/data/documents/good/sample_txt_ž.txt
@@ -0,0 +1 @@
+This is a test txt_ž file
diff --git a/orangecontrib/text/widgets/tests/data/sentiment/neg.txt b/orangecontrib/text/widgets/tests/data/sentiment/neg.txt
@@ -0,0 +1,5 @@
+bad
+ugly
+sad
+random
+quasi
diff --git a/orangecontrib/text/widgets/tests/data/sentiment/pos.txt b/orangecontrib/text/widgets/tests/data/sentiment/pos.txt
@@ -0,0 +1,5 @@
+good
+nice
+great
+human
+user
diff --git a/orangecontrib/text/widgets/tests/test_owimportdocuments.py b/orangecontrib/text/widgets/tests/test_owimportdocuments.py
@@ -8,17 +8,17 @@
 class TestOWImportDocuments(WidgetTest):
     def setUp(self) -> None:
         self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
-        path = os.path.join(os.path.dirname(__file__), "data")
+        path = os.path.join(os.path.dirname(__file__), "data/documents")
         self.widget.setCurrentPath(path)
         self.widget.reload()
         self.wait_until_finished()
 
     def test_current_path(self):
-        path = os.path.join(os.path.dirname(__file__), "data")
+        path = os.path.join(os.path.dirname(__file__), "data/documents")
         self.assertEqual(path, self.widget.currentPath)
 
     def test_no_skipped(self):
-        path = os.path.join(os.path.dirname(__file__), "data", "good")
+        path = os.path.join(os.path.dirname(__file__), "data/documents", "good")
         self.widget.setCurrentPath(path)
         self.widget.reload()
         self.wait_until_finished()

diff --git a/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py b/orangecontrib/text/widgets/tests/test_owsentimentanalysis.py
@@ -1,3 +1,6 @@
+import os
+import numpy as np
+
 from unittest import mock, skip
 from unittest.mock import patch
 
@@ -39,6 +42,18 @@ def test_output(self):
         out_corpus = self.get_output(self.widget.Outputs.corpus)
         self.assertEqual(len(out_corpus.domain), len(self.corpus.domain) + 1)
 
+        # test custom files
+        self.widget.pos_file = os.path.join(os.path.dirname(__file__),
+                                            "data/sentiment/pos.txt")
+        self.widget.neg_file = os.path.join(os.path.dirname(__file__),
+                                            "data/sentiment/neg.txt")
+        self.widget.custom_list.click()
+        out_corpus = self.get_output(self.widget.Outputs.corpus)
+        self.assertEqual(len(out_corpus.domain), len(self.corpus.domain) + 1)
+        res = np.array([[0], [10], [16.66666667], [12.5], [11.11111111],
+                        [-14.28571429], [0], [-10], [0]])
+        np.testing.assert_array_almost_equal(out_corpus.X, res, decimal=8)
+
     def test_language_changed(self):
         """Test if output changes on language change"""
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
@@ -63,3 +78,23 @@ def test_sentiment_offline(self):
             self.send_signal(widget.Inputs.corpus, self.corpus)
             widget.multi_sent.click()
             self.assertTrue(widget.Warning.senti_offline.is_shown())
+
+    def test_no_file_warnings(self):
+        """Custom-dictionary warnings follow the number of files set."""
+        here = os.path.dirname(__file__)
+        pos_path = os.path.join(here, "data/sentiment/pos.txt")
+        neg_path = os.path.join(here, "data/sentiment/neg.txt")
+
+        widget = self.create_widget(OWSentimentAnalysis)
+        self.send_signal(widget.Inputs.corpus, self.corpus)
+        self.assertFalse(widget.Warning.no_dicts_loaded.is_shown())
+
+        # Custom method selected with no files at all.
+        widget.custom_list.click()
+        self.assertTrue(widget.Warning.no_dicts_loaded.is_shown())
+
+        # Only the positive dictionary is set.
+        widget.pos_file = pos_path
+        widget.commit()
+        self.assertTrue(widget.Warning.one_dict_only.is_shown())
+        self.assertFalse(widget.Warning.no_dicts_loaded.is_shown())
+
+        # Both dictionaries are set.
+        widget.neg_file = neg_path
+        widget.commit()
+        self.assertFalse(widget.Warning.one_dict_only.is_shown())
+        self.assertFalse(widget.Warning.no_dicts_loaded.is_shown())
+
+        # Switching to another method leaves both warnings hidden.
+        widget.vader.click()
+        self.assertFalse(widget.Warning.one_dict_only.is_shown())
+        self.assertFalse(widget.Warning.no_dicts_loaded.is_shown())
-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
+    bad
+    ugly
+    sad
+    random
+    quasi