Skip to content

Commit

Permalink
Added inclusive and exclusive range tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
thePanz committed Apr 5, 2018
1 parent 6af5e8b commit 25e49ce
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 6 deletions.
7 changes: 4 additions & 3 deletions lib/Languages/Galach/TokenExtractor/Full.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ final class Full extends TokenExtractor
'/(?<lexeme>(?:(?<marker>(?<!\\\\)\#)(?<tag>[a-zA-Z0-9_][a-zA-Z0-9_\-.]*)))(?:[\s"()+!]|$)/Au' => Tokenizer::TOKEN_TERM,
'/(?<lexeme>(?:(?<marker>(?<!\\\\)@)(?<user>[a-zA-Z0-9_][a-zA-Z0-9_\-.]*)))(?:[\s"()+!]|$)/Au' => Tokenizer::TOKEN_TERM,
'/(?<lexeme>(?:(?<domain>[a-zA-Z_][a-zA-Z0-9_\-.]*):)?(?<quote>(?<!\\\\)["])(?<phrase>.*?)(?:(?<!\\\\)(?P=quote)))/Aus' => Tokenizer::TOKEN_TERM,
'/(?<lexeme>(?:(?<domain>[a-zA-Z_][a-zA-Z0-9_\-.]*):)?\[(?<rangeFrom>[a-zA-Z0-9]+) TO (?<rangeTo>[a-zA-Z0-9]+)\])/Aus' => Tokenizer::TOKEN_TERM,
'/(?<lexeme>(?:(?<domain>[a-zA-Z_][a-zA-Z0-9_\-.]*):)?(?<rangeStartSymbol>[\[\{])(?<rangeFrom>[a-zA-Z0-9]+) TO (?<rangeTo>[a-zA-Z0-9]+)[\]\}])/Aus' => Tokenizer::TOKEN_TERM,
'/(?<lexeme>(?:(?<domain>[a-zA-Z_][a-zA-Z0-9_\-.]*):)?(?<word>(?:\\\\\\\\|\\\\ |\\\\\(|\\\\\)|\\\\"|[^"()\s])+?))(?:(?<!\\\\)["]|\(|\)|$|\s)/Au' => Tokenizer::TOKEN_TERM,
];

Expand All @@ -50,12 +50,13 @@ protected function createTermToken($position, array $data)
$lexeme = $data['lexeme'];

switch (true) {
case isset($data['rangeFrom']) && isset($data['rangeTo']):
case isset($data['rangeStartSymbol']):
return new Range(
$lexeme,
$position,
$data['domain'],
$data['rangeFrom'], $data['rangeTo']
$data['rangeFrom'], $data['rangeTo'],
Range::getTypeByStart($data['rangeStartSymbol'])
);
case isset($data['word']):
return new Word(
Expand Down
38 changes: 36 additions & 2 deletions lib/Languages/Galach/Values/Token/Range.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
*/
final class Range extends Token
{
const TYPE_INCLUSIVE = 'inclusive';
const TYPE_EXCLUSIVE = 'exclusive';

/**
* Holds domain string.
*
Expand All @@ -29,19 +32,50 @@ final class Range extends Token
*/
public $rangeTo;

/**
* @var string
*/
public $type;

/**
* @param string $lexeme
* @param int $position
* @param int $position
* @param string $domain
* @param string $rangeFrom
* @param string $rangeTo
* @param string $type
*/
public function __construct($lexeme, $position, $domain, $rangeFrom, $rangeTo)
public function __construct($lexeme, $position, $domain, $rangeFrom, $rangeTo, $type)
{
    // Strict comparison is required here: with the default loose in_array(),
    // non-string values (e.g. boolean true) compare equal to the type
    // constants and would slip through validation.
    if (!in_array($type, [self::TYPE_EXCLUSIVE, self::TYPE_INCLUSIVE], true)) {
        throw new \InvalidArgumentException(sprintf('Invalid range type: %s', $type));
    }

    // Validation happens before the parent constructor runs, so an invalid
    // type never yields a partially initialised token.
    parent::__construct(Tokenizer::TOKEN_TERM, $lexeme, $position);

    $this->domain = $domain;
    $this->rangeFrom = $rangeFrom;
    $this->rangeTo = $rangeTo;
    $this->type = $type;
}

/**
 * Maps a range opening bracket to the corresponding range type constant.
 *
 * '[' yields self::TYPE_INCLUSIVE and '{' yields self::TYPE_EXCLUSIVE;
 * the comparison is strict, so only these exact strings are accepted.
 *
 * @param string $startSymbol the start symbol, either '[' or '{'
 *
 * @return string
 *
 * @throws \InvalidArgumentException when the symbol is neither '[' nor '{'
 */
public static function getTypeByStart($startSymbol)
{
    // Resolve the type in a single expression; null marks an unknown symbol.
    $resolvedType = '[' === $startSymbol
        ? self::TYPE_INCLUSIVE
        : ('{' === $startSymbol ? self::TYPE_EXCLUSIVE : null);

    if (null === $resolvedType) {
        throw new \InvalidArgumentException(sprintf('Invalid range start symbol: %s', $startSymbol));
    }

    return $resolvedType;
}
}
8 changes: 7 additions & 1 deletion tests/Galach/Tokenizer/FullTokenizerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,13 @@ public function providerForTestTokenize()
[
'[a TO b]',
[
new RangeToken('[a TO b]', 0, '', 'a', 'b'),
new RangeToken('[a TO b]', 0, '', 'a', 'b', 'inclusive'),
],
],
[
'{a TO b}',
[
new RangeToken('{a TO b}', 0, '', 'a', 'b', 'exclusive'),
],
],
[
Expand Down
7 changes: 7 additions & 0 deletions tests/Galach/Tokenizer/TextTokenizerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ public static function setUpBeforeClass()
new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5),
new WordToken('b]', 6, '', 'b]'),
],
'{a TO b}' => [
new WordToken('{a', 0, '', '{a'),
new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 2),
new WordToken('TO', 3, '', 'TO'),
new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5),
new WordToken('b}', 6, '', 'b}'),
],
'domain:domain:' => [
new WordToken('domain:domain:', 0, '', 'domain:domain:'),
],
Expand Down

0 comments on commit 25e49ce

Please sign in to comment.