From 443089c301ea4b3340d23aba41315b491bc5c219 Mon Sep 17 00:00:00 2001 From: Panem Sandeep Date: Wed, 27 Jun 2018 15:10:12 +0530 Subject: [PATCH 1/5] Changes done to fetch probabilities of CRF assigned tags from LSTM last layer. This helps in applying thresholds at the end to tweak precision of tagging. --- src/brat_to_conll.py | 37 ++++++++++++++++++++++++++++++++++++- src/neuroner.py | 8 +++++--- src/train.py | 30 +++++++++++++++++++++++++++--- src/utils_plots.py | 2 +- 4 files changed, 69 insertions(+), 8 deletions(-) diff --git a/src/brat_to_conll.py b/src/brat_to_conll.py index cb2b62f6..404ade7a 100755 --- a/src/brat_to_conll.py +++ b/src/brat_to_conll.py @@ -66,10 +66,40 @@ def get_sentences_and_tokens_from_stanford(text, core_nlp): sentences.append(tokens) return sentences -def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False): +def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False, scores = False, indices = False): # load text with codecs.open(text_filepath, 'r', 'UTF-8') as f: text =f.read() + + index_score_map = {} + + if scores: + num_scores = len(scores) + num_indices = len(indices) + assert(num_scores == num_indices), "scores are not in sync with text!" + + cum_scores = [-1]*num_scores + cum_scores[0] = scores[0] + for i in range(1, num_scores): + cum_scores[i] = cum_scores[i-1] + scores[i] + + index_score_map = {} + i = 0 + while i < num_scores: + st = indices[i][0] + j = i + while j < num_scores: + ed = indices[j][1] + try: + tag_score = float(cum_scores[j] - cum_scores[i] + scores[i]) / (j-i+1) # tag score, considering avg. + except: + import pdb; pdb.set_trace() + key = "%s-%s" % (st, ed) + index_score_map[key] = tag_score + j += 1 + i += 1 + + if verbose: print("\ntext:\n{0}\n".format(text)) # parse annotation file @@ -86,6 +116,11 @@ def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False): entity['start'] = int(anno[2]) entity['end'] = int(anno[3]) entity['text'] = ' '.join(anno[4:]) + key = "%s-%s" % (anno[2], anno[3]) + entity_score = index_score_map.get(key, "no score from model") + if entity_score != "no score from model": + entity_score = float("{0:.3f}".format(entity_score)) + entity['score'] = entity_score if verbose: print("entity: {0}".format(entity)) # Check compatibility between brat text and anootation diff --git a/src/neuroner.py b/src/neuroner.py index cb25314a..146f53a0 100644 --- a/src/neuroner.py +++ b/src/neuroner.py @@ -1,5 +1,7 @@ import matplotlib +import gc matplotlib.use('Agg') +from distutils import util import train import dataset as ds import tensorflow as tf @@ -469,13 +471,13 @@ def predict(self, text): # Predict labels and output brat output_filepaths = {} prediction_output = train.prediction_step(self.sess, self.dataset, dataset_type, self.model, self.transition_params_trained, self.stats_graph_folder, self.prediction_count, self.parameters, self.dataset_filepaths) - _, _, output_filepaths[dataset_type] = prediction_output + _, _, output_filepaths[dataset_type], _scores, _indices = prediction_output conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders, self.stats_graph_folder, overwrite=True) # Print and output result text_filepath = os.path.join(self.stats_graph_folder, 'brat', 'deploy', os.path.basename(dataset_brat_deploy_filepath)) annotation_filepath = os.path.join(self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(utils.get_basename_without_extension(dataset_brat_deploy_filepath))) - text2, entities = brat_to_conll.get_entities_from_brat(text_filepath, annotation_filepath, verbose=True) + text2, entities = brat_to_conll.get_entities_from_brat(text_filepath, annotation_filepath, verbose=True, scores=_scores, indices=_indices) assert(text == text2) return entities @@ -488,4 +490,4 @@ def close(self): def __del__(self): self.sess.close() - +gc.collect() diff --git a/src/train.py b/src/train.py index 3b4ba7bc..386fccf6 100644 --- a/src/train.py +++ b/src/train.py @@ -7,6 +7,8 @@ import utils_tf import codecs import utils_nlp +from sklearn.preprocessing import minmax_scale +from sklearn.preprocessing import normalize #from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file def train_step(sess, dataset, sequence_number, model, parameters): @@ -34,11 +36,17 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine else: print('Evaluate model on the {0} set'.format(dataset_type)) all_predictions = [] + all_scores = [] + all_indices = [] all_y_true = [] output_filepath = os.path.join(stats_graph_folder, '{1:03d}_{0}.txt'.format(dataset_type,epoch_number)) output_file = codecs.open(output_filepath, 'w', 'UTF-8') original_conll_file = codecs.open(dataset_filepaths[dataset_type], 'r', 'UTF-8') - + #print("PrintingDataset") + #print(dataset.index_to_label) + minmax_unary_scores = [] + normalize_unary_scores = [] + #assert(len(dataset.token_indices[dataset_type]) == 1),"lines more than 1!" for i in range(len(dataset.token_indices[dataset_type])): feed_dict = { model.input_token_indices: dataset.token_indices[dataset_type][i], @@ -48,19 +56,32 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine model.dropout_keep_prob: 1. } unary_scores, predictions = sess.run([model.unary_scores, model.predictions], feed_dict) + nn_predictions = predictions.tolist() + #print(unary_scores) + #print(predictions) if parameters['use_crf']: predictions, _ = tf.contrib.crf.viterbi_decode(unary_scores, transition_params_trained) predictions = predictions[1:-1] else: predictions = predictions.tolist() - assert(len(predictions) == len(dataset.tokens[dataset_type][i])) + #assert(len(predictions) == len(dataset.tokens[dataset_type][i])) output_string = '' prediction_labels = [dataset.index_to_label[prediction] for prediction in predictions] + #minmax_unary_scores = minmax_scale(np.exp(unary_scores[1:-1]), axis=1) + normalize_unary_scores = normalize(np.exp(unary_scores[1:-1]), norm='l1', axis=1) + #normalize_unary_scores = normalize(minmax_unary_scores, norm='l1', axis=1) + #prediction_scores = np.amax(normalize_unary_scores, axis=1) + #my_predictions = np.argmax(normalize_unary_scores, axis=1) + prediction_scores = [] + for ind, pred in enumerate(predictions): + prediction_scores.append(normalize_unary_scores[ind][pred]) + gold_labels = dataset.labels[dataset_type][i] if parameters['tagging_format'] == 'bioes': prediction_labels = utils_nlp.bioes_to_bio(prediction_labels) gold_labels = utils_nlp.bioes_to_bio(gold_labels) + indices = [] for prediction, token, gold_label in zip(prediction_labels, dataset.tokens[dataset_type][i], gold_labels): while True: line = original_conll_file.readline() @@ -68,6 +89,7 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine if '-DOCSTART-' in split_line[0] or len(split_line) == 0 or len(split_line[0]) == 0: continue else: + indices.append((int(split_line[2]), int(split_line[3]))) token_original = split_line[0] if parameters['tagging_format'] == 'bioes': split_line.pop() @@ -79,6 +101,8 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine output_file.write(output_string+'\n') all_predictions.extend(predictions) + all_scores.extend(prediction_scores) + all_indices.extend(indices) all_y_true.extend(dataset.label_indices[dataset_type][i]) output_file.close() @@ -97,7 +121,7 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine new_y_pred, new_y_true, new_label_indices, new_label_names, _, _ = remap_labels(all_predictions, all_y_true, dataset, parameters['main_evaluation_mode']) print(sklearn.metrics.classification_report(new_y_true, new_y_pred, digits=4, labels=new_label_indices, target_names=new_label_names)) - return all_predictions, all_y_true, output_filepath + return all_predictions, all_y_true, output_filepath, all_scores, all_indices def predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths): diff --git a/src/utils_plots.py b/src/utils_plots.py index ff9af5e6..7aedbdd4 100644 --- a/src/utils_plots.py +++ b/src/utils_plots.py @@ -33,7 +33,7 @@ def show_values(pc, fmt="%.2f", **kw): By HYRY ''' pc.update_scalarmappable() - ax = pc.get_axes() + ax = pc.axes for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), pc.get_array()): x, y = p.vertices[:-2, :].mean(0) if np.all(color[:3] > 0.5): From 2c2ef1baae6c7fb1aa9d9c4f80406e5c6b82f0f9 Mon Sep 17 00:00:00 2001 From: Panem Sandeep Date: Wed, 27 Jun 2018 15:17:00 +0530 Subject: [PATCH 2/5] Removed debugging statements --- src/brat_to_conll.py | 1 - src/train.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/src/brat_to_conll.py b/src/brat_to_conll.py index 404ade7a..46afbc67 100755 --- a/src/brat_to_conll.py +++ b/src/brat_to_conll.py @@ -15,7 +15,6 @@ def get_start_and_end_offset_of_token_from_spacy(token): def get_sentences_and_tokens_from_spacy(text, spacy_nlp): document = spacy_nlp(text) - # sentences sentences = [] for span in document.sents: sentence = [document[i] for i in range(span.start, span.end)] diff --git a/src/train.py b/src/train.py index 386fccf6..16ccf78b 100644 --- a/src/train.py +++ b/src/train.py @@ -42,8 +42,6 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine output_filepath = os.path.join(stats_graph_folder, '{1:03d}_{0}.txt'.format(dataset_type,epoch_number)) output_file = codecs.open(output_filepath, 'w', 'UTF-8') original_conll_file = codecs.open(dataset_filepaths[dataset_type], 'r', 'UTF-8') - #print("PrintingDataset") - #print(dataset.index_to_label) minmax_unary_scores = [] normalize_unary_scores = [] #assert(len(dataset.token_indices[dataset_type]) == 1),"lines more than 1!" @@ -57,8 +55,6 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine } unary_scores, predictions = sess.run([model.unary_scores, model.predictions], feed_dict) nn_predictions = predictions.tolist() - #print(unary_scores) - #print(predictions) if parameters['use_crf']: predictions, _ = tf.contrib.crf.viterbi_decode(unary_scores, transition_params_trained) predictions = predictions[1:-1] From 911fa2746f71d5e3789dc78c22cab5e8201e892b Mon Sep 17 00:00:00 2001 From: Panem Sandeep Date: Wed, 27 Jun 2018 16:05:33 +0530 Subject: [PATCH 3/5] Changes done to fetch scores of CRF assigned tags from LSTM last layer. This helps in applying thresholds at the end to tweak precision of tagging. --- src/neuroner.py | 2 +- src/train.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/neuroner.py b/src/neuroner.py index 146f53a0..097e5ae7 100644 --- a/src/neuroner.py +++ b/src/neuroner.py @@ -470,7 +470,7 @@ def predict(self, text): # Predict labels and output brat output_filepaths = {} - prediction_output = train.prediction_step(self.sess, self.dataset, dataset_type, self.model, self.transition_params_trained, self.stats_graph_folder, self.prediction_count, self.parameters, self.dataset_filepaths) + prediction_output = train.prediction_step(self.sess, self.dataset, dataset_type, self.model, self.transition_params_trained, self.stats_graph_folder, self.prediction_count, self.parameters, self.dataset_filepaths,prediction_flag=True) _, _, output_filepaths[dataset_type], _scores, _indices = prediction_output conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders, self.stats_graph_folder, overwrite=True) diff --git a/src/train.py b/src/train.py index 16ccf78b..0afd1618 100644 --- a/src/train.py +++ b/src/train.py @@ -30,7 +30,7 @@ def train_step(sess, dataset, sequence_number, model, parameters): feed_dict) return transition_params_trained -def prediction_step(sess, dataset, dataset_type, model, transition_params_trained, stats_graph_folder, epoch_number, parameters, dataset_filepaths): +def prediction_step(sess, dataset, dataset_type, model, transition_params_trained, stats_graph_folder, epoch_number, parameters, dataset_filepaths,prediction_flag=False): if dataset_type == 'deploy': print('Predict labels for the {0} set'.format(dataset_type)) else: @@ -117,7 +117,10 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine new_y_pred, new_y_true, new_label_indices, new_label_names, _, _ = remap_labels(all_predictions, all_y_true, dataset, parameters['main_evaluation_mode']) print(sklearn.metrics.classification_report(new_y_true, new_y_pred, digits=4, labels=new_label_indices, target_names=new_label_names)) - return all_predictions, all_y_true, output_filepath, all_scores, all_indices + if prediction_flag: + return all_predictions, all_y_true, output_filepath, all_scores, all_indices + else: + return all_predictions, all_y_true, output_filepath def predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths): From 155c48466f38555aa7fab6ee2f4dab910c2a0801 Mon Sep 17 00:00:00 2001 From: Panem Sandeep Date: Wed, 27 Jun 2018 16:05:33 +0530 Subject: [PATCH 4/5] Changes done to fetch scores of CRF assigned tags from LSTM last layer. This helps in applying thresholds at the end to tweak precision of tagging. --- src/neuroner.py | 2 +- src/train.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/neuroner.py b/src/neuroner.py index 146f53a0..097e5ae7 100644 --- a/src/neuroner.py +++ b/src/neuroner.py @@ -470,7 +470,7 @@ def predict(self, text): # Predict labels and output brat output_filepaths = {} - prediction_output = train.prediction_step(self.sess, self.dataset, dataset_type, self.model, self.transition_params_trained, self.stats_graph_folder, self.prediction_count, self.parameters, self.dataset_filepaths) + prediction_output = train.prediction_step(self.sess, self.dataset, dataset_type, self.model, self.transition_params_trained, self.stats_graph_folder, self.prediction_count, self.parameters, self.dataset_filepaths,prediction_flag=True) _, _, output_filepaths[dataset_type], _scores, _indices = prediction_output conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders, self.stats_graph_folder, overwrite=True) diff --git a/src/train.py b/src/train.py index 16ccf78b..0afd1618 100644 --- a/src/train.py +++ b/src/train.py @@ -30,7 +30,7 @@ def train_step(sess, dataset, sequence_number, model, parameters): feed_dict) return transition_params_trained -def prediction_step(sess, dataset, dataset_type, model, transition_params_trained, stats_graph_folder, epoch_number, parameters, dataset_filepaths): +def prediction_step(sess, dataset, dataset_type, model, transition_params_trained, stats_graph_folder, epoch_number, parameters, dataset_filepaths,prediction_flag=False): if dataset_type == 'deploy': print('Predict labels for the {0} set'.format(dataset_type)) else: @@ -117,7 +117,10 @@ def prediction_step(sess, dataset, dataset_type, model, transition_params_traine new_y_pred, new_y_true, new_label_indices, new_label_names, _, _ = remap_labels(all_predictions, all_y_true, dataset, parameters['main_evaluation_mode']) print(sklearn.metrics.classification_report(new_y_true, new_y_pred, digits=4, labels=new_label_indices, target_names=new_label_names)) - return all_predictions, all_y_true, output_filepath, all_scores, all_indices + if prediction_flag: + return all_predictions, all_y_true, output_filepath, all_scores, all_indices + else: + return all_predictions, all_y_true, output_filepath def predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths): From ee2b0ac4e87cc1f29dd51ca3cd61bc52b3df0bb7 Mon Sep 17 00:00:00 2001 From: Panem Sandeep Date: Wed, 27 Jun 2018 20:27:17 +0530 Subject: [PATCH 5/5] Pushing with Correct username --- src/neuroner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/neuroner.py b/src/neuroner.py index 097e5ae7..f3667788 100644 --- a/src/neuroner.py +++ b/src/neuroner.py @@ -442,7 +442,6 @@ def fit(self): def predict(self, text): self.prediction_count += 1 - if self.prediction_count == 1: self.parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp') self.stats_graph_folder, _ = self._create_stats_graph_folder(self.parameters)