Skip to content

Commit

Permalink
Throw an exception if --force_exact_pieces specified on a non-composa…
Browse files Browse the repository at this point in the history
…ble MWT dataset
  • Loading branch information
AngledLuffa committed Nov 28, 2024
1 parent 1a36efb commit 2a1c1aa
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions stanza/models/mwt_expander.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ def train(args):
logger.warning("Training data available, but dev data has no MWTs. Only training a dict based MWT")
args['dict_only'] = True

if args['force_exact_pieces'] and not mwts_composed_of_words(train_doc):
raise ValueError("Cannot train model with --force_exact_pieces, as the MWT in this dataset are not entirely composed of their subwords")

if args['force_exact_pieces'] is None and mwts_composed_of_words(train_doc):
# the force_exact_pieces mechanism trains a separate version of the MWT expander in the Trainer
# (the training loop here does not need to change)
Expand All @@ -147,6 +150,8 @@ def train(args):
# this behavior can be turned off at training time with --no_force_exact_pieces
logger.info("Train MWTs entirely composed of their subwords. Training the MWT to match that paradigm as closely as possible")
args['force_exact_pieces'] = True

if args['force_exact_pieces']:
logger.info("Reconverting to BinaryDataLoader")
train_batch = BinaryDataLoader(train_doc, args['batch_size'], args, evaluation=False)
vocab = train_batch.vocab
Expand Down

0 comments on commit 2a1c1aa

Please sign in to comment.