Merge pull request #675 from ajdapretnar/conllu

Import Documents: Add conllu reader
VesnaT authored Jul 23, 2021
2 parents eddfa45 + 68b4408 commit b093ab5
Showing 12 changed files with 454 additions and 49 deletions.
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -5,7 +5,7 @@ recursive-include orangecontrib/text/tests *.txt *.json
recursive-include orangecontrib/text/tutorials *.ows
recursive-include orangecontrib/text/widgets/icons *.svg *.png *.ai
recursive-include orangecontrib/text/widgets/resources *.js *.css *.html
recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt
recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt *.conllu
include orangecontrib/text/widgets/tests/bow-test
recursive-include scripts *.sh *.py

Binary file removed doc/widgets/images/Import-Documents-stamped.png
Binary file added doc/widgets/images/ImportDocuments-Conllu.png
Binary file added doc/widgets/images/ImportDocuments.png
17 changes: 14 additions & 3 deletions doc/widgets/importdocuments.md
@@ -10,18 +10,29 @@ Import text documents from folders.
**Outputs**

- Corpus: A collection of documents from the local machine.
- Skipped Documents: A list of documents that couldn't be imported.

**Import Documents** widget retrieves text files from folders and creates a corpus. The widget reads .txt, .docx, .odt, .pdf and .xml files. If a folder contains subfolders, they will be used as class labels.
The **Import Documents** widget retrieves text files from folders and creates a corpus. It reads .txt, .docx, .odt, .pdf, .xml, and .conllu files. If a folder contains subfolders, their names will be used as class labels.

![](images/Import-Documents-stamped.png)
![](images/ImportDocuments.png)

1. Folder being loaded.
2. Load folder from a local machine.
3. Reload the data.
4. Number of documents retrieved.
4. Options for importing .conllu files.
5. Number of documents retrieved.

If the widget cannot read a file for some reason, the file will be skipped and listed in the Skipped Documents output. Files that were successfully read will still appear in the Corpus output.

Conllu files
------------

![](images/ImportDocuments-Conllu.png)

Since Text version 1.5.0, Orange supports reading [.conllu files](https://universaldependencies.org/format.html). Each file is considered a separate document in the corpus. If utterance IDs (`newdoc id`) are present, each utterance becomes its own document instead (each row in the corpus is a single utterance).

Lemmas and POS tags selected under *Conllu import options* are added as tokens, and the corpus is considered preprocessed. Named entities, if present in the file, are added as a comma-separated string.
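
The reader is built on the [conllu](https://pypi.org/project/conllu/) package's `parse_incr`. The snippet below is a minimal sketch of the same idea outside the widget: it collects lemmas and UPOS tags per utterance, keyed by `newdoc id`. The file name `sample.conllu` is a placeholder and the `conllu` package is assumed to be installed.

```python
# Minimal sketch of the .conllu handling described above (not part of the widget).
# "sample.conllu" is a hypothetical file; requires `pip install conllu`.
from conllu import parse_incr

texts, lemmas, pos_tags = {}, {}, {}
with open("sample.conllu", encoding="utf-8") as fp:
    doc_id = ""
    for sentence in parse_incr(fp):
        # a "# newdoc id = ..." comment starts a new utterance/document
        if "newdoc id" in sentence.metadata:
            doc_id = sentence.metadata["newdoc id"]
        texts.setdefault(doc_id, []).append(sentence.metadata.get("text", ""))
        lemmas.setdefault(doc_id, []).extend(t["lemma"] for t in sentence)
        pos_tags.setdefault(doc_id, []).extend(t["upos"] for t in sentence)

for doc_id, parts in texts.items():
    print(doc_id, "|", " ".join(parts))
    print("tokens:", lemmas[doc_id][:5], "POS:", pos_tags[doc_id][:5])
```

Each utterance becomes one row of the corpus: its text is the document, its lemmas the tokens, and its UPOS tags the POS annotations.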

Example
-------

189 changes: 157 additions & 32 deletions orangecontrib/text/import_documents.py
@@ -7,6 +7,8 @@
import re
import yaml
from urllib.parse import quote, unquote

from conllu import parse_incr
from requests.exceptions import ConnectionError

from collections import namedtuple
@@ -39,7 +41,7 @@

from orangecontrib.text.corpus import Corpus

DefaultFormats = ("docx", "odt", "txt", "pdf", "xml")
DefaultFormats = ("docx", "odt", "txt", "pdf", "xml", "conllu")

TextData = namedtuple(
"Text",
@@ -88,7 +90,8 @@ def read(self, ):
return textdata, error

def read_file(self):
raise NotImplementedError("No reader for {}".format(pathlib.Path(self.path).suffix))
raise NotImplementedError(
"No reader for {}".format(pathlib.Path(self.path).suffix))

def make_text_data(self):
name = pathlib.Path(self.path).stem
@@ -153,7 +156,8 @@ def read_file(self):
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj,
LTTextLine):
extracted_text.append(lt_obj.get_text())
self.content = ' '.join(extracted_text).replace('\x00', '')

@@ -186,6 +190,13 @@ def read_file(self):
self.content[k] = ""


class TsvMetaReader(Reader):
ext = [".tsv"]

def read_file(self):
self.content = pd.read_csv(self.path, delimiter="\t")


class UrlReader(Reader, CoreUrlReader):
ext = [".url"]

@@ -217,8 +228,95 @@ def make_text_data(self):
text_data.category, text_data.content)


class ConlluReader(Reader):
TextData = namedtuple(
"Text",
["name", "path", "ext", "category", "doc_id", "content"]
)

ext = [".conllu"]

def __init__(self, path):
super().__init__(path)
self.tokens = None
self.pos = None
self.ner = None

@staticmethod
def parse_ner(tokens):
entities = []
temp_ner = []
for token in tokens:
if token["misc"] is None or "NER" not in token["misc"]:
continue
# "0" means the token is not named entity
if token["misc"]["NER"] != "O":
                # collect the lemma as the entity text (surface form could be used instead)
temp_ner.append(token["lemma"])
elif temp_ner:
entities.append(" ".join(temp_ner))
temp_ner = []
if temp_ner:
entities.append(" ".join(temp_ner))
return entities

def read_file(self):
content = []
file = open(self.path, "r", encoding="utf-8")
utterance_id = ""
utterance = []
tokens = []
pos = []
ner = []
temp_tokens = []
temp_pos = []
temp_ner = []
for sentence in parse_incr(file):
if "newdoc id" in sentence.metadata.keys():
if utterance_id:
content.append([utterance_id, " ".join(utterance)])
tokens.append(temp_tokens)
pos.append(temp_pos)
ner.append(temp_ner)
utterance = []
temp_tokens = []
temp_pos = []
temp_ner = []
utterance_id = sentence.metadata["newdoc id"]
utterance.append(sentence.metadata["text"])
temp_tokens.extend([token["lemma"] for token in sentence])
temp_pos.extend([token["upos"] for token in sentence])
temp_ner.extend(self.parse_ner(sentence))
if temp_tokens or utterance:
content.append([utterance_id, " ".join(utterance)])
tokens.append(temp_tokens)
pos.append(temp_pos)
ner.append(temp_ner)
file.close()
self.tokens = tokens
self.pos = pos
self.ner = np.array([", ".join(tokens) for tokens in ner], dtype=object)
self.content = pd.DataFrame(content, columns=["newdoc id", "text"])

def make_text_data(self):
text_objects = []
name = pathlib.Path(self.path).stem
directory = pathlib.PurePath(self.path).parent
category = directory.parts[-1] or "None"
for _, row in self.content.iterrows():
if self.replace_white_space:
row["text"] = re.sub(r'\s+', ' ', row["text"])
text_objects.append(self.TextData(name, self.path, self.ext,
category,
row["newdoc id"],
row["text"]))
return text_objects


class ImportDocuments:
META_DATA_FILE_KEY = "Text file"
# this is what we will merge meta data on, change to user-set variable
CONLLU_META_DATA = "ID"

def __init__(self, startdir: str,
is_url: bool = False,
@@ -235,13 +333,19 @@ def __init__(self, startdir: str,
self._is_url = is_url
self._text_data = []
self._meta_data: pd.DataFrame = None

def run(self) -> Tuple[Corpus, List]:
self._text_data, errors_text = self._read_text_data()
self.is_conllu = False
self.tokens = None
self.pos = None
self.ner = None

def run(self) -> Tuple[Corpus, List, List, List, List, bool]:
self._text_data, errors_text, tokens, pos, ner, conllu \
= self._read_text_data()
self._meta_data, errors_meta = self._read_meta_data()
self.is_conllu = conllu
corpus = self._create_corpus()
corpus = self._add_metadata(corpus)
return corpus, errors_text + errors_meta
return corpus, errors_text + errors_meta, tokens, pos, ner, conllu

def _read_text_data(self):
text_data = []
@@ -251,6 +355,10 @@ def _read_text_data(self):
paths = scan(self.startdir, include_patterns=patterns)
n_paths = len(paths)
batch = []
tokens = []
pos = []
ner = []
conllu = False

if n_paths == 0:
raise NoDocumentsException()
@@ -267,19 +375,28 @@ def _read_text_data(self):
else UrlReader(path)
text, error = reader.read()
if text is not None:
text_data.append(text)
if type(reader) == ConlluReader:
conllu = True
for t in text:
text_data.append(t)
tokens.extend(reader.tokens)
pos.extend(reader.pos)
ner.extend(reader.ner)
else:
conllu = False
text_data.append(text)
batch.append(text_data)
else:
errors.append(error)

if self.cancelled:
return

return text_data, errors
return text_data, errors, tokens, pos, ner, conllu

def _read_meta_data(self):
scan = self.scan_url if self._is_url else self.scan
patterns = ["*.csv", "*.yaml", "*.yml"]
patterns = ["*.csv", "*.yaml", "*.yml", "*.tsv"]
paths = scan(self.startdir, include_patterns=patterns)
meta_dfs, errors = [], []
for path in paths:
@@ -301,25 +418,27 @@

def _create_corpus(self) -> Corpus:
corpus = None
names = ["name", "path", "content"]
names = ["name", "path", "content"] if not self.is_conllu else [
"name", "path", "utterance", "content"]
data = []
category_data = []
text_categories = list(set(t.category for t in self._text_data))
values = list(set(text_categories))
category_var = DiscreteVariable.make("category", values=values)
for textdata in self._text_data:
data.append(
[
# some characters are written as decomposed (č is char c
# and separate char for caron), with NFC normalization we
# normalize them to be written as precomposed (č is one
# unicode char - 0x10D)
# https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
normalize('NFC', textdata.name),
normalize('NFC', textdata.path),
normalize('NFC', textdata.content)
]
)
datum = [
# some characters are written as decomposed (č is char c
# and separate char for caron), with NFC normalization we
# normalize them to be written as precomposed (č is one
# unicode char - 0x10D)
# https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
normalize('NFC', textdata.name),
normalize('NFC', textdata.path),
normalize('NFC', textdata.content)
]
if self.is_conllu:
datum.insert(2, normalize('NFC', textdata.doc_id))
data.append(datum)
category_data.append(category_var.to_val(textdata.category))
if len(text_categories) > 1:
category_data = np.array(category_data)
@@ -335,19 +454,24 @@ def _create_corpus(self) -> Corpus:
corpus = Corpus(domain,
Y=category_data,
metas=data,
text_features=[domain.metas[2]])

text_features=[domain.metas[-1]])
return corpus

def _add_metadata(self, corpus: Corpus) -> Corpus:
if "path" not in corpus.domain or self._meta_data is None \
or self.META_DATA_FILE_KEY not in self._meta_data.columns:
or (self.META_DATA_FILE_KEY not in self._meta_data.columns
and self.CONLLU_META_DATA not in self._meta_data.columns):
return corpus

df = self._meta_data.set_index(
self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
)
path_column = corpus.get_column_view("path")[0]
if self.is_conllu:
df = self._meta_data.set_index(self.CONLLU_META_DATA)
path_column = corpus.get_column_view("utterance")[0]
else:
df = self._meta_data.set_index(
self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
)
path_column = corpus.get_column_view("path")[0]

if len(df.index.drop_duplicates()) != len(df.index):
df = df[~df.index.duplicated(keep='first')]
filtered = df.reindex(path_column)
@@ -396,8 +520,9 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):

filenames = [fname for fname in filenames
if matches_any(fname, include_patterns)
and not matches_any(fname, exclude_patterns)]
paths = paths + [os.path.join(dirpath, fname) for fname in filenames]
and not matches_any(fname, exclude_patterns)]
paths = paths + [os.path.join(dirpath, fname) for fname in
filenames]
return paths

@staticmethod
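
With this change, `ImportDocuments.run()` returns six values instead of two: besides the corpus and the list of errors, it hands back the per-document tokens (lemmas), POS tags and named-entity strings collected by `ConlluReader`, plus a flag telling the caller whether the corpus came from .conllu files. Below is a sketch of the updated contract; the folder path is hypothetical and the remaining constructor arguments are assumed to keep their defaults.

```python
# Sketch only: the folder path is hypothetical, and the extra return values
# are empty lists / False unless .conllu files were imported.
from orangecontrib.text.import_documents import ImportDocuments

importer = ImportDocuments("/path/to/conllu/folder")
corpus, errors, tokens, pos, ner, is_conllu = importer.run()

print(len(corpus), "documents imported,", len(errors), "skipped")
if is_conllu:
    # lemmas, UPOS tags and comma-separated named entities of the first utterance
    print(tokens[0][:5], pos[0][:5], ner[0])
```

For .conllu corpora, metadata files (now including .tsv) are merged on an `ID` column matched against the utterance IDs (`CONLLU_META_DATA = "ID"`), rather than on the `Text file` path used for other formats.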