Skip to content

Commit af647a6

Browse files
committed
bug fixes and more optimizations
1 parent 2bbbb8e commit af647a6

File tree

4 files changed

+6
-6
lines changed

4 files changed

+6
-6
lines changed

src/Analysis/Keywords/Rake.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -113,7 +113,7 @@ public function getKeywordScores()
113113
$tokens = explode(" ", $phrase);
114114
foreach($tokens as $keyword)
115115
{
116-
$degrees[$keyword] += substr_count($phrase, " ")+1;
116+
$degrees[$keyword] += count($tokens);
117117
}
118118
}
119119

src/Filters/WhitespaceFilter.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ class WhitespaceFilter implements ITokenTransformation
1212
{
1313
public function transform($word)
1414
{
15-
return preg_replace("/(^\s+)|(\s+$)/us", "", preg_replace( '/\s+/', ' ', $word ));
15+
return preg_replace("/\s[[:space:]]+/", " ", str_replace(["\r", "\n"], ' ', $word));
1616
}
1717

1818
}

src/Tokenizers/PennTreeBankTokenizer.php

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
* Based on http://www.cis.upenn.edu/~treebank/tokenizer.sed
1010
* @author yooper
1111
*/
12-
class PennTreeBankTokenizer extends WhitespaceTokenizer
12+
class PennTreeBankTokenizer
1313
{
1414
/**
1515
*
@@ -24,11 +24,11 @@ public function __construct()
2424

2525
/**
2626
* Calls internal functions to handle data processing
27-
* @param type $string
27+
* @param string $str
2828
*/
2929
public function tokenize($str)
3030
{
31-
return parent::tokenize($this->execute($str));
31+
return preg_split('/[\pZ\pC]+/u', $this->execute($str), null, PREG_SPLIT_NO_EMPTY);
3232
}
3333
/**
3434
* Handles the data processing

src/Tokenizers/WhitespaceTokenizer.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ class WhitespaceTokenizer extends TokenizerAbstract
99
{
1010
public function tokenize($string)
1111
{
12-
return preg_split('/[\pZ\pC]+/u', $string, null, PREG_SPLIT_NO_EMPTY);
12+
return mb_split('\s+', $string);
1313
}
1414
}
1515

0 commit comments

Comments
 (0)