Skip to content

Commit e3ad0f0

Browse files
committed
optimized algorithms and filters
1 parent 0792dff commit e3ad0f0

File tree

6 files changed

+70
-81
lines changed

6 files changed

+70
-81
lines changed

src/Analysis/Keywords/Rake.php

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ class Rake
2222
* @var ContentDocument;
2323
*/
2424
protected $document = null;
25+
26+
/**
27+
*
28+
* @var array
29+
*/
30+
protected $tokens = [];
2531

2632
/**
2733
*
@@ -34,6 +40,18 @@ public function __construct(TokensDocument $document, $nGramSize = NGramFactory:
3440
$this->nGramSize = $nGramSize;
3541
}
3642

43+
/**
44+
*
45+
* @return array
46+
*/
47+
public function getTokens()
48+
{
49+
if(empty($this->tokens)) {
50+
$this->tokens = array_values(array_filter($this->getTokensDocument()->getDocumentData()));
51+
}
52+
return $this->tokens;
53+
}
54+
3755
/**
3856
*
3957
* @return TokensDocument
@@ -51,19 +69,16 @@ public function getPhrases()
5169
{
5270
$phrases = [];
5371

54-
// filter empty tokens
55-
$tokens = array_values(array_filter($this->getTokensDocument()->getDocumentData()));
56-
5772
for($index = $this->nGramSize; $index >= 2; $index--)
5873
{
59-
$phrases = array_merge($phrases, NGramFactory::create($tokens, $index));
74+
$phrases = array_merge($phrases, NGramFactory::create($this->getTokens(), $index));
6075
}
6176

6277
// you cannot use a phrase if it is a substring of a longer phrase
6378
// we must exclude all of the substring phrases
6479
$add = [];
6580
$remove = [];
66-
foreach($phrases as &$phrase)
81+
foreach($phrases as $phrase)
6782
{
6883
if(isset($remove[$phrase])) {
6984
continue;
@@ -74,7 +89,7 @@ public function getPhrases()
7489
//remove the prefix
7590
$remove[substr($phrase, strpos($phrase," ")+1)] = true;
7691
}
77-
}
92+
}
7893
return array_keys($add);
7994
}
8095

@@ -86,38 +101,30 @@ public function getKeywordScores()
86101
{
87102
$phrases = $this->getPhrases();
88103
// we must filter the null values before computing the frequencies
89-
$tokens = array_values( array_filter( $this->getTokensDocument()->getDocumentData() ));
90-
$freqDist = (new FreqDist($tokens))->getKeyValuesByFrequency();
91-
unset($tokens);
104+
$freqDist = (new FreqDist($this->getTokens()))->getKeyValuesByFrequency();
92105

93106
$keywords = array_keys($freqDist);
94107
// track the total degrees for a token
95108
$degrees = array_fill_keys($keywords, 0);
96-
109+
97110
// tally the results
98111
foreach($phrases as $phrase)
99112
{
100113
foreach($keywords as $keyword)
101114
{
102115
if(strpos($phrase, $keyword) !== false) {
103116
$degrees[$keyword] += substr_count($phrase, " ")+1;
104-
}
105-
117+
}
106118
}
107-
}
108-
$tally = [];
109-
foreach($freqDist as $keyword => $freqValue)
110-
{
111-
$tally[$keyword] = $degrees[$keyword] / $freqValue;
112-
}
113-
114-
$phraseScores = array_fill_keys($phrases, 0);
119+
}
120+
121+
$phraseScores = array_fill_keys($phrases, 0);
115122
foreach($phrases as $phrase)
116123
{
117124
$tokens = explode(" ", $phrase);
118125
foreach($tokens as $token)
119126
{
120-
$phraseScores[$phrase] += $tally[$token];
127+
$phraseScores[$phrase] += ($degrees[$token] / $freqDist[$token]);
121128
}
122129
}
123130

@@ -129,6 +136,7 @@ public function __destruct()
129136
{
130137
unset($this->document);
131138
unset($this->nGramSize);
139+
unset($this->tokens);
132140
}
133141

134142
}

src/Filters/CharFilter.php

Lines changed: 3 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,47 +5,14 @@
55
use TextAnalysis\Interfaces\ITokenTransformation;
66

77
/**
8-
* Removes a single character, but not a number. Allows for whitelisted and blacklisted
9-
* items to be passed in
8+
* Removes a single character, but not a number.
109
* @author yooper
1110
*/
1211
class CharFilter implements ITokenTransformation
13-
{
14-
/**
15-
*
16-
* @var array
17-
*/
18-
protected $whiteList = [];
19-
20-
/**
21-
*
22-
* @var array
23-
*/
24-
protected $blackList = [];
25-
26-
/**
27-
* Set a white list or black list
28-
* @param array $whiteList
29-
* @param array $blackList
30-
*/
31-
public function __construct(array $whiteList = [], array $blackList = [])
32-
{
33-
$this->whiteList = $whiteList;
34-
$this->blackList = $blackList;
35-
}
36-
12+
{
3713
public function transform($word)
3814
{
39-
if(strlen($word) === 1) {
40-
if(in_array($word, $this->whiteList)) {
41-
return $word;
42-
} elseif(in_array($word, $this->blackList)) {
43-
return null;
44-
} elseif(!is_numeric($word)) {
45-
return null;
46-
}
47-
}
48-
return $word;
15+
return preg_replace("/ \D /", " ", $word);
4916
}
5017

5118
}

src/Filters/QuotesFilter.php

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,31 @@ class QuotesFilter implements ITokenTransformation
1717
protected $search = null;
1818

1919

20+
/**
21+
*
22+
* @var string
23+
*/
24+
protected $regex = null;
25+
26+
2027
/**
2128
* Specify which chars or strings need to be searched for and replaced with an empty string
2229
* @param array|null $search
2330
*/
24-
public function __construct(array $search = ["'",'"','`','“','”','’'])
31+
public function __construct(array $search = ["\'",'\"','`','“','”','’'])
32+
{
33+
$this->search = $search;
34+
$this->regex = "/([".implode("", $this->search)."])/";
35+
36+
}
37+
38+
/**
39+
*
40+
* @return string
41+
*/
42+
public function getRegex()
2543
{
26-
$this->search = $search;
44+
return $this->regex;
2745
}
2846

2947
/**
@@ -33,7 +51,13 @@ public function __construct(array $search = ["'",'"','`','“','”','’'])
3351
*/
3452
public function transform($word)
3553
{
36-
return str_replace($this->search, "", $word);
54+
return preg_replace($this->getRegex(), '', $word);
55+
}
56+
57+
public function __destruct()
58+
{
59+
unset($this->regex);
60+
unset($this->search);
3761
}
3862
}
3963

src/Filters/WhitespaceFilter.php

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,7 @@ class WhitespaceFilter implements ITokenTransformation
1212
{
1313
public function transform($word)
1414
{
15-
$word = preg_replace("/(^\s+)|(\s+$)/us", "", preg_replace( '/\s+/', ' ', $word ));
16-
if(empty($word)) {
17-
return null;
18-
}
19-
return $word;
15+
return preg_replace("/(^\s+)|(\s+$)/us", "", preg_replace( '/\s+/', ' ', $word ));
2016
}
2117

2218
}

src/NGrams/NGramFactory.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ static public function create(array $tokens, $nGramSize = self::BIGRAM, $separat
2727
{
2828
$separatorLength = strlen($separator);
2929
$length = count($tokens) - $nGramSize + 1;
30+
if($length < 1) {
31+
return [];
32+
}
3033
$ngrams = array_fill(0, $length, ''); // initialize the array
3134

3235
for($index = 0; $index < $length; $index++)

tests/TextAnalysis/Filters/CharFilterTest.php

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,26 +14,17 @@ class CharFilterTest extends \PHPUnit_Framework_TestCase
1414
public function testCharFilterDefaults()
1515
{
1616
$transformer = new CharFilter();
17-
$this->assertEquals(null, $transformer->transform('A'));
18-
$this->assertEquals(null, $transformer->transform('!'));
19-
$this->assertEquals('9', $transformer->transform('9'));
20-
}
21-
22-
public function testCharFilterWhiteList()
23-
{
24-
$transformer = new CharFilter(['A','!']);
17+
18+
$this->assertEquals(' ', $transformer->transform(' A '));
19+
$this->assertEquals(' ', $transformer->transform(' ! '));
20+
$this->assertEquals(' 9 ', $transformer->transform(' 9 '));
21+
2522
$this->assertEquals('A', $transformer->transform('A'));
2623
$this->assertEquals('!', $transformer->transform('!'));
2724
$this->assertEquals('9', $transformer->transform('9'));
28-
}
25+
}
2926

30-
public function testCharFilterBlackList()
31-
{
32-
$transformer = new CharFilter([],['9']);
33-
$this->assertEquals(null, $transformer->transform('A'));
34-
$this->assertEquals(null, $transformer->transform('!'));
35-
$this->assertEquals(null, $transformer->transform('9'));
36-
}
27+
3728

3829

3930
}

0 commit comments

Comments
 (0)