Commit

Import Documents: Add conllu reader
ajdapretnar committed Jun 22, 2021
1 parent b9b9612 commit ffacabb
Showing 3 changed files with 91 additions and 7 deletions.
Empty file added orangecontrib/text/conllu.py
Empty file.
97 changes: 90 additions & 7 deletions orangecontrib/text/import_documents.py
@@ -6,6 +6,8 @@
import re
import yaml
from urllib.parse import quote, unquote

+from conllu import parse, parse_incr
from requests.exceptions import ConnectionError

from collections import namedtuple
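
Of the two imports, only parse_incr is used in this commit; parse loads a whole file into a list of sentences, while parse_incr yields them one by one, which keeps memory flat on large corpora. A minimal sketch (file name hypothetical):

from conllu import parse, parse_incr

with open("corpus.conllu", encoding="utf-8") as f:
    sentences = parse(f.read())        # whole file in memory at once

with open("corpus.conllu", encoding="utf-8") as f:
    for sentence in parse_incr(f):     # streamed sentence by sentence
        print(sentence.metadata.get("sent_id"))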
@@ -36,7 +38,7 @@

from orangecontrib.text.corpus import Corpus

-DefaultFormats = ("docx", "odt", "txt", "pdf", "xml")
+DefaultFormats = ("docx", "odt", "txt", "pdf", "xml", "conllu")

TextData = namedtuple(
    "Text",
@@ -211,6 +213,72 @@ def make_text_data(self):
                        text_data.category, text_data.content)


class ConlluReader(Reader):
    TextData = namedtuple(
        "Text",
        ["name", "path", "ext", "category", "doc_id", "content"]
    )

    ext = [".conllu"]

    def __init__(self, path):
        super().__init__(path)
        self.tokens = None

    @staticmethod
    def parse_ner(tokens):
        entities = []
        temp_ner = []
        for token in tokens:
            # "O" means the token is not a named entity
            if token["misc"]["NER"] != "O":
                # the surface form is used here; the lemma might work as well
                temp_ner.append(token["form"])
            elif temp_ner:
                entities.append(" ".join(temp_ner))
                temp_ner = []
        if temp_ner:
            entities.append(" ".join(temp_ner))
        return entities
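
For intuition, a minimal sketch of the grouping parse_ner performs; the token dicts and NER labels are hypothetical stand-ins for what the conllu parser yields, and the sketch assumes the MISC column carries an NER field:

tokens = [
    {"form": "New", "misc": {"NER": "B-LOC"}},
    {"form": "York", "misc": {"NER": "I-LOC"}},
    {"form": "is", "misc": {"NER": "O"}},
]
print(ConlluReader.parse_ner(tokens))  # ['New York']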

    def read_file(self):
        content = []
        file = open(self.path, "r", encoding="utf-8")
        utterance_id = None
        utterance = []
        tokens = []
        temp_tokens = []
        for sentence in parse_incr(file):
            if "newdoc id" in sentence.metadata.keys():
                if utterance_id is not None:
                    # flush the previous document before starting a new one
                    content.append([utterance_id, " ".join(utterance)])
                    tokens.append(temp_tokens)
                    # two fresh lists; a chained `a = b = []` would alias them
                    utterance, temp_tokens = [], []
                utterance_id = sentence.metadata["newdoc id"]
            utterance.append(sentence.metadata["text"])
            temp_tokens.extend([token["lemma"] for token in sentence])
        if temp_tokens or utterance:
            content.append([utterance_id, " ".join(utterance)])
            tokens.append(temp_tokens)
        file.close()
        self.tokens = tokens
        self.content = pd.DataFrame(content, columns=["newdoc id", "text"])
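
As a reference for the metadata read above, a sketch with a minimal, hypothetical CoNLL-U fragment showing what parse_incr exposes:

from io import StringIO
from conllu import parse_incr

sample = StringIO(
    "# newdoc id = doc1\n"
    "# sent_id = 1\n"
    "# text = Dogs bark\n"
    "1\tDogs\tdog\tNOUN\t_\t_\t2\tnsubj\t_\t_\n"
    "2\tbark\tbark\tVERB\t_\t_\t0\troot\t_\t_\n"
    "\n"
)
for sentence in parse_incr(sample):
    print(sentence.metadata["newdoc id"])          # doc1
    print(sentence.metadata["text"])               # Dogs bark
    print([token["lemma"] for token in sentence])  # ['dog', 'bark']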

    def make_text_data(self):
        text_objects = []
        name = pathlib.Path(self.path).stem
        directory = pathlib.PurePath(self.path).parent
        category = directory.parts[-1] or "None"
        for _, row in self.content.iterrows():
            if self.replace_white_space:
                row["text"] = re.sub(r'\s+', ' ', row["text"])
            text_objects.append(self.TextData(name, self.path, self.ext,
                                              category,
                                              row["newdoc id"],
                                              row["text"]))
        return text_objects
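
A quick sketch of driving the reader directly; the path is hypothetical, and in normal use ImportDocuments below does this wiring through reader.read():

reader = ConlluReader("/data/interviews/session1.conllu")
reader.read_file()
for td in reader.make_text_data():
    print(td.doc_id, td.content[:40])
print(reader.tokens)  # one list of lemmas per "newdoc id" document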


class ImportDocuments:
    META_DATA_FILE_KEY = "Text file"

@@ -229,6 +297,8 @@ def __init__(self, startdir: str,
        self._is_url = is_url
        self._text_data = []
        self._meta_data: pd.DataFrame = None
+       self.is_conllu = False
+       self.tokens = None

    def run(self) -> Tuple[Corpus, List]:
        self._text_data, errors_text = self._read_text_data()
@@ -245,6 +315,7 @@ def _read_text_data(self):
        paths = scan(self.startdir, include_patterns=patterns)
        n_paths = len(paths)
        batch = []
+       tokens = []

        if n_paths == 0:
            raise NoDocumentsException()
@@ -261,14 +332,22 @@
                else UrlReader(path)
            text, error = reader.read()
            if text is not None:
-               text_data.append(text)
+               if isinstance(reader, ConlluReader):
+                   self.is_conllu = True
+                   for t in text:
+                       text_data.append(t)
+                   tokens.extend(reader.tokens)
+               else:
+                   self.is_conllu = False
+                   text_data.append(text)
                batch.append(text_data)
            else:
                errors.append(error)

            if self.cancelled:
                return

+       self.tokens = tokens
        return text_data, errors

def _read_meta_data(self):
Expand All @@ -295,15 +374,15 @@ def _read_meta_data(self):

    def _create_corpus(self) -> Corpus:
        corpus = None
-       names = ["name", "path", "content"]
+       names = ["name", "path", "content"] if not self.is_conllu else [
+           "name", "path", "utterance", "content"]
        data = []
        category_data = []
        text_categories = list(set(t.category for t in self._text_data))
        values = list(set(text_categories))
        category_var = DiscreteVariable.make("category", values=values)
        for textdata in self._text_data:
-           data.append(
-               [
+           datum = [
                # some characters are written as decomposed (č is char c
                # and separate char for caron), with NFC normalization we
                # normalize them to be written as precomposed (č is one
@@ -313,7 +392,9 @@
                normalize('NFC', textdata.path),
                normalize('NFC', textdata.content)
            ]
-           )
+           if self.is_conllu:
+               datum.insert(2, normalize('NFC', textdata.doc_id))
+           data.append(datum)
            category_data.append(category_var.to_val(textdata.category))
        if len(text_categories) > 1:
            category_data = np.array(category_data)
@@ -329,7 +410,9 @@
        corpus = Corpus(domain,
                        Y=category_data,
                        metas=data,
-                       text_features=[domain.metas[2]])
+                       text_features=[domain.metas[-1]])
+       if self.is_conllu and self.tokens:
+           corpus.store_tokens(self.tokens)

        return corpus
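
Taken together, a sketch of the new path end to end; the directory is hypothetical and the remaining ImportDocuments constructor arguments are assumed to keep their defaults:

importer = ImportDocuments("/data/conllu_corpus")
corpus, errors = importer.run()
# with .conllu input the corpus gains an "utterance" meta column, and the
# lemmas collected by ConlluReader become the corpus tokens
print(corpus.tokens[:1])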

1 change: 1 addition & 0 deletions requirements.txt
@@ -18,3 +18,4 @@ biopython # Enables Pubmed widget.
ufal.udpipe >=1.2.0.3
orange-widget-base >=4.12.0
yake
+conllu
