whitespace_tokenizer.rs (forked from quickwit-oss/tantivy)

use std::str::CharIndices;

use super::{Token, TokenStream, Tokenizer};

/// Tokenize the text by splitting on ASCII whitespace.
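///
/// A minimal usage sketch, mirroring the test at the bottom of this file.
/// It assumes this module is exposed as `tantivy::tokenizer`, as in the
/// upstream crate:
///
/// ```rust
/// use tantivy::tokenizer::{TextAnalyzer, TokenStream, WhitespaceTokenizer};
///
/// let mut analyzer = TextAnalyzer::from(WhitespaceTokenizer::default());
/// let mut stream = analyzer.token_stream("Hello, happy tax payer!");
/// // Only whitespace separates tokens, so punctuation stays attached ("Hello,").
/// while stream.advance() {
///     println!("{}", stream.token().text);
/// }
/// ```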
#[derive(Clone, Default)]
pub struct WhitespaceTokenizer {
    // Scratch token, reset and reused across `token_stream` calls.
    token: Token,
}
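
/// The `TokenStream` produced by the `WhitespaceTokenizer`.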
pub struct WhitespaceTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: &'a mut Token,
}

impl Tokenizer for WhitespaceTokenizer {
    type TokenStream<'a> = WhitespaceTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> WhitespaceTokenStream<'a> {
        self.token.reset();
        WhitespaceTokenStream {
            text,
            chars: text.char_indices(),
            token: &mut self.token,
        }
    }
}

impl<'a> WhitespaceTokenStream<'a> {
    // Search for the end of the current token: the byte offset of the next
    // ASCII whitespace character, or the end of the text if there is none.
    fn search_token_end(&mut self) -> usize {
        (&mut self.chars)
            .filter(|(_, c)| c.is_ascii_whitespace())
            .map(|(offset, _)| offset)
            .next()
            .unwrap_or(self.text.len())
    }
}

impl<'a> TokenStream for WhitespaceTokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        // `Token::reset` leaves the position at `usize::MAX`, so this wrapping
        // increment yields position 0 for the first token.
        self.token.position = self.token.position.wrapping_add(1);
        // Skip any leading whitespace, then cut the token at the next
        // whitespace character (or at the end of the text).
        while let Some((offset_from, c)) = self.chars.next() {
            if !c.is_ascii_whitespace() {
                let offset_to = self.search_token_end();
                self.token.offset_from = offset_from;
                self.token.offset_to = offset_to;
                self.token.text.push_str(&self.text[offset_from..offset_to]);
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}

#[cfg(test)]
mod tests {
    use crate::tokenizer::tests::assert_token;
    use crate::tokenizer::{TextAnalyzer, Token, WhitespaceTokenizer};

    #[test]
    fn test_whitespace_tokenizer() {
        let tokens = token_stream_helper("Hello, happy tax payer!");
        assert_eq!(tokens.len(), 4);
        // assert_token(token, position, text, offset_from, offset_to)
        assert_token(&tokens[0], 0, "Hello,", 0, 6);
        assert_token(&tokens[1], 1, "happy", 7, 12);
        assert_token(&tokens[2], 2, "tax", 13, 16);
        assert_token(&tokens[3], 3, "payer!", 17, 23);
    }

    fn token_stream_helper(text: &str) -> Vec<Token> {
        let mut a = TextAnalyzer::from(WhitespaceTokenizer::default());
        let mut token_stream = a.token_stream(text);
        let mut tokens: Vec<Token> = vec![];
        let mut add_token = |token: &Token| {
            tokens.push(token.clone());
        };
        token_stream.process(&mut add_token);
        tokens
    }
}