-
Notifications
You must be signed in to change notification settings - Fork 2
/
tokenizer.ml
executable file
·76 lines (67 loc) · 2.68 KB
/
tokenizer.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
(** Module used to tokenize the words in a document, as well
as tokenize the sentences within the doc. Contains functionality
to tokenize sentences and tokenize words. *)
module Tokenizer = struct
(** Tokenizer type, which contains tokenized
sentences and tokenized words in 2 separate list
attributes. *)
type t = {
sentences : string list;
words : string list;
}
(** [sent_tokenize text] tokenizes the string text into sentences,
returns list of sentences with process:
1. seperate sentences into string list by '.'
2. trimming all from start with spaces, \n, \t
*)
let sent_tokenize (text:string) : string list =
let sentences = String.split_on_char '.' text in
let strip_sents = List.map (fun s -> String.trim s) sentences in
List.filter (fun s -> s <> "") strip_sents
(** [make_lower word] make all letters of the word lowercase,
if applicable. *)
let make_lower (word:string) : string =
String.lowercase_ascii word
(** [split_on_chars chars s] splits string
s into list of strings separated
by any one of the chars in specified char list*)
let rec split_on_chars (chars : char list)
(s : string) : string list =
match chars with
| h::t ->
let split_strings = String.split_on_char h s in
let split_strings2 = List.map (fun x ->
split_on_chars t x) split_strings in
List.concat split_strings2
| [] -> [s]
(** [word_tokenize text] tokenizes the text and returns a string list in
two steps:
1. making all the words lowercase
2. splitting the words into a string list by punctuation and spaces
*)
let word_tokenize (text:string) : string list =
let puncts = [' '; '\n'; '\t'; ';'; ','; '.'; ':'; '?'; '$';
'-'; '!'; '#'; '%'; '^'; '&'; '*';
'('; ')'; '+'; '='; '/'; '\\';
'\"' ] in
let words = split_on_chars puncts text in
let filter_words = List.filter (fun w -> w <> "") words in
let strip_words = List.map (fun w -> String.trim w) filter_words in
let all_lower = List.map (fun w -> make_lower w) strip_words in
all_lower
(** [tokenize text] is the tokenized form of [text] in a document,
containing two attributes: a string list of sentence tokens and a
string list of word tokens. *)
let tokenize (text:string) : t =
{
sentences = sent_tokenize text;
words = word_tokenize text;
}
(** [get_sentences] returns a string list of tokenized sentences
in a document [tok]. *)
let get_sentences (tok:t) : string list =
tok.sentences
(** [get_words tok] returns a tokenized word list of [tok]. *)
let get_words (tok:t) : string list =
tok.words
end