Skip to content

Commit

Permalink
fix: Clean the text before taggifying it
Browse files Browse the repository at this point in the history
  • Loading branch information
Romuald Rousseau committed Dec 15, 2024
1 parent 38aadfb commit f5025d7
Showing 1 changed file with 9 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public SimpleTagClassifier(final Model model, final TagClassifier.TagStyle tagSt
this.model = model;
this.tagStyle = tagStyle;

this.lexicon =(model != null && model.getData().get("lexicon").isPresent())
this.lexicon = (model != null && model.getData().get("lexicon").isPresent())
? model.getData().getList("lexicon")
: StringUtils.getSymbols().stream().toList();
this.tagTokenizer = new ShingleTokenizer(this.getLexicon(), 1);
Expand Down Expand Up @@ -71,20 +71,23 @@ public TagClassifier setLexicon(final List<String> lexicon) {

// NOTE(review): this listing appears to be a diff view whose +/- markers were
// stripped. Where two near-identical statements follow each other, the first
// (reading `text`) looks like the removed line and the second (reading
// `cleanText`) like its replacement -- confirm against the repository.
/**
 * Converts {@code text} into a tag formatted according to {@code this.tagStyle}.
 *
 * <p>In the {@code cleanText} variant, the input is first cleaned: when a model
 * is set, every entry of {@code model.getFilterList()} is applied as a
 * case-insensitive regular expression and each match is replaced by a single
 * space. The result is then handed to {@code StringUtils.toSnake} or
 * {@code StringUtils.toCamel} together with this instance's tag tokenizer.
 *
 * <p>Calling this method also toggles lemmatization on {@code this.tagTokenizer}:
 * it is disabled for the explicit SNAKE and CAMEL styles and enabled otherwise.
 *
 * @param text the raw text to turn into a tag
 * @return the value produced by {@code StringUtils.toSnake} or
 *         {@code StringUtils.toCamel} for the (cleaned) text
 */
@Override
public String ensureTagStyle(final String text) {
// Without a model the text is used untouched. With a model, fold the filter
// list over the text: "(?i)" makes each filter match case-insensitively and
// replaceAll swaps every match for one space.
// NOTE(review): filter entries are concatenated into the pattern unquoted, so
// any regex metacharacters they contain are interpreted as regex syntax --
// presumably intended; confirm against how the filter list is populated.
final var cleanText = (this.model == null)
? text
: this.model.getFilterList().stream().reduce(text, (a, x) -> a.replaceAll("(?i)" + x, " "));
// Explicit SNAKE style: lemmatization is switched off before converting.
if (this.tagStyle == TagClassifier.TagStyle.SNAKE) {
this.tagTokenizer.disableLemmatization();
return StringUtils.toSnake(text, this.tagTokenizer);
return StringUtils.toSnake(cleanText, this.tagTokenizer);
}
// Explicit CAMEL style: lemmatization is switched off before converting.
if (this.tagStyle == TagClassifier.TagStyle.CAMEL) {
this.tagTokenizer.disableLemmatization();
return StringUtils.toCamel(text, this.tagTokenizer);
return StringUtils.toCamel(cleanText, this.tagTokenizer);
}
// Any other style: pick the format from the shape of the text, with
// lemmatization switched on. A space or underscore selects snake case,
// anything else camel case.
// NOTE(review): "> 0" does not count a separator at index 0, so text that
// only has a leading space/underscore (e.g. after a filter matched at the very
// start and was replaced by " ") falls through to the camel branch -- confirm
// this is intended rather than ">= 0".
if (text.indexOf(" ") > 0 || text.indexOf("_") > 0) {
if (cleanText.indexOf(" ") > 0 || cleanText.indexOf("_") > 0) {
this.tagTokenizer.enableLemmatization();
return StringUtils.toSnake(text, this.tagTokenizer);
return StringUtils.toSnake(cleanText, this.tagTokenizer);
} else {
this.tagTokenizer.enableLemmatization();
return StringUtils.toCamel(text, this.tagTokenizer);
return StringUtils.toCamel(cleanText, this.tagTokenizer);
}
}

Expand Down

0 comments on commit f5025d7

Please sign in to comment.