-
Notifications
You must be signed in to change notification settings - Fork 10
/
Tokenizer.php
119 lines (101 loc) · 3.2 KB
/
Tokenizer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
<?php
namespace QueryTranslator\Languages\Galach;
use QueryTranslator\Tokenizing;
use QueryTranslator\Values\TokenSequence;
/**
* Galach implementation of the Tokenizing interface.
*/
final class Tokenizer implements Tokenizing
{
    /**
     * Represents the whitespace in the input string.
     */
    const TOKEN_WHITESPACE = 1;

    /**
     * Combines two adjoining elements with logical AND.
     */
    const TOKEN_LOGICAL_AND = 2;

    /**
     * Combines two adjoining elements with logical OR.
     */
    const TOKEN_LOGICAL_OR = 4;

    /**
     * Applies logical NOT to the next (right-side) element.
     */
    const TOKEN_LOGICAL_NOT = 8;

    /**
     * Applies logical NOT to the next (right-side) element.
     *
     * This is an alternative to TOKEN_LOGICAL_NOT, with the difference that
     * the parser expects it to be placed directly to the left of the element
     * it applies to, without whitespace in between.
     */
    const TOKEN_LOGICAL_NOT_2 = 16;

    /**
     * Mandatory operator applies to the next (right-side) element and means
     * that the element must be present. There must be no whitespace between it
     * and the element it applies to.
     */
    const TOKEN_MANDATORY = 32;

    /**
     * Prohibited operator applies to the next (right-side) element and means
     * that the element must not be present. There must be no whitespace between
     * it and the element it applies to.
     */
    const TOKEN_PROHIBITED = 64;

    /**
     * Left side delimiter of a group.
     *
     * Group is used to group elements in order to form a sub-query.
     *
     * @see \QueryTranslator\Languages\Galach\Values\Token\GroupBegin
     */
    const TOKEN_GROUP_BEGIN = 128;

    /**
     * Right side delimiter of a group.
     *
     * Group is used to group elements in order to form a sub-query.
     */
    const TOKEN_GROUP_END = 256;

    /**
     * Term token type represents a category of term type tokens.
     *
     * This type is intended to be used as an extension point through subtyping.
     *
     * @see \QueryTranslator\Languages\Galach\Values\Token\Phrase
     * @see \QueryTranslator\Languages\Galach\Values\Token\Tag
     * @see \QueryTranslator\Languages\Galach\Values\Token\User
     * @see \QueryTranslator\Languages\Galach\Values\Token\Word
     */
    const TOKEN_TERM = 512;

    /**
     * Bailout token.
     *
     * If a token could not be recognized, the next character is extracted into
     * a token of this type. Ignored by the parser.
     */
    const TOKEN_BAILOUT = 1024;

    /**
     * Extractor asked for the token found at each position of the input.
     *
     * @var \QueryTranslator\Languages\Galach\TokenExtractor
     */
    private $tokenExtractor;

    /**
     * @param \QueryTranslator\Languages\Galach\TokenExtractor $tokenExtractor
     */
    public function __construct(TokenExtractor $tokenExtractor)
    {
        $this->tokenExtractor = $tokenExtractor;
    }

    /**
     * Splits the given string into a sequence of tokens.
     *
     * Starting at position 0, the token extractor is asked for the token at
     * the current position, and the position is then advanced by the length
     * of that token's lexeme (as measured by mb_strlen()). This repeats until
     * the position reaches the length of the input. An empty input produces a
     * sequence without tokens.
     *
     * @param string $string
     *
     * @throws \RuntimeException If the extractor returns a token with an empty
     *                           lexeme, which would prevent any progress
     *
     * @return \QueryTranslator\Values\TokenSequence
     */
    public function tokenize($string)
    {
        $length = mb_strlen($string);
        $position = 0;
        $tokens = [];

        while ($position < $length) {
            $token = $this->tokenExtractor->extract($string, $position);
            $consumed = mb_strlen($token->lexeme);

            // A zero-length lexeme would leave $position unchanged and make
            // this loop run forever, so fail loudly instead.
            if ($consumed < 1) {
                throw new \RuntimeException(
                    "Token extractor returned an empty lexeme at position {$position}"
                );
            }

            $tokens[] = $token;
            $position += $consumed;
        }

        return new TokenSequence($tokens, $string);
    }
}