-
Notifications
You must be signed in to change notification settings - Fork 2
/
tokenizer.mli
executable file
·50 lines (40 loc) · 1.65 KB
/
tokenizer.mli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
(** Module used to tokenize the words in a document, as well
as tokenize the sentences within the doc. Contains functionality
to tokenize sentences and tokenize words.*)
module Tokenizer : sig
(** Tokenizer type, containing tokenized
sentences and tokenized words in 2 list
attributes *)
type t = {
sentences : string list;
words : string list;
}
(** [sent_tokenize text] tokenizes the string text into sentences,
returns list of sentences with process:
1. seperate sentences into string list by '.'
2. trimming all from start with spaces, \n, \t
*)
val sent_tokenize: (string) -> string list
(** [make_lower word] make all letters of the word lowercase,
if applicable. *)
val make_lower : (string) -> string
(** [split_on_chars chars s] splits string
s into list of strings separated
by any one of the chars in specified char list*)
val split_on_chars : (char list) -> (string) -> string list
(** [word_tokenize text] tokenizes the text and returns a string list in
two steps:
1. making all the words lowercase
2. splitting the words into a string list by punctuation and spaces
*)
val word_tokenize : (string) -> string list
(** [tokenize text] is the tokenized form of [text] in a document,
containing two attributes: a string list of sentence tokens and a
string list of word tokens. *)
val tokenize : (string) -> t
(** [get_sentences] returns a string list of tokenized sentences
in a document [tok]. *)
val get_sentences : (t) -> string list
(** [get_words tok] returns a tokenized word list of [tok]. *)
val get_words : (t) -> string list
end