@@ -22,6 +22,12 @@ class Rake
22
22
* @var ContentDocument
23
23
*/
24
24
protected $ document = null ;
25
+
26
+ /**
27
+ * Filtered document tokens, cached by getTokens().
28
+ * @var array
29
+ */
30
+ protected $ tokens = [];
25
31
26
32
/**
27
33
*
@@ -34,6 +40,18 @@ public function __construct(TokensDocument $document, $nGramSize = NGramFactory:
34
40
$ this ->nGramSize = $ nGramSize ;
35
41
}
36
42
43
/**
 * Returns the document's tokens with empty entries removed, re-indexed from 0.
 *
 * The filtered list is stored in $this->tokens and reused by later calls.
 * An empty cache cannot be told apart from "not computed yet", so a document
 * that yields no usable tokens is re-filtered on every call.
 *
 * @return array
 */
public function getTokens()
{
    if (empty($this->tokens)) {
        // array_filter() without a callback drops every falsy value, which
        // would also discard a legitimate "0" token. Only remove entries
        // that are genuinely empty (null, false or the empty string).
        $this->tokens = array_values(array_filter(
            $this->getTokensDocument()->getDocumentData(),
            static function ($token) {
                return $token !== null && $token !== false && $token !== '';
            }
        ));
    }

    return $this->tokens;
}
54
+
37
55
/**
38
56
*
39
57
* @return TokensDocument
@@ -51,19 +69,16 @@ public function getPhrases()
51
69
{
52
70
$ phrases = [];
53
71
54
- // filter empty tokens
55
- $ tokens = array_values (array_filter ($ this ->getTokensDocument ()->getDocumentData ()));
56
-
57
72
for ($ index = $ this ->nGramSize ; $ index >= 2 ; $ index --)
58
73
{
59
- $ phrases = array_merge ($ phrases , NGramFactory::create ($ tokens , $ index ));
74
+ $ phrases = array_merge ($ phrases , NGramFactory::create ($ this -> getTokens () , $ index ));
60
75
}
61
76
62
77
// you cannot use a phrase if it is a substring of a longer phrase
63
78
// we must exclude all of the substring phrases
64
79
$ add = [];
65
80
$ remove = [];
66
- foreach ($ phrases as & $ phrase )
81
+ foreach ($ phrases as $ phrase )
67
82
{
68
83
if (isset ($ remove [$ phrase ])) {
69
84
continue ;
@@ -74,7 +89,7 @@ public function getPhrases()
74
89
//remove the prefix
75
90
$ remove [substr ($ phrase , strpos ($ phrase ," " )+1 )] = true ;
76
91
}
77
- }
92
+ }
78
93
return array_keys ($ add );
79
94
}
80
95
@@ -86,38 +101,30 @@ public function getKeywordScores()
86
101
{
87
102
$ phrases = $ this ->getPhrases ();
88
103
// we must filter the null values before computing the frequencies
89
- $ tokens = array_values ( array_filter ( $ this ->getTokensDocument ()->getDocumentData () ));
90
- $ freqDist = (new FreqDist ($ tokens ))->getKeyValuesByFrequency ();
91
- unset($ tokens );
104
+ $ freqDist = (new FreqDist ($ this ->getTokens ()))->getKeyValuesByFrequency ();
92
105
93
106
$ keywords = array_keys ($ freqDist );
94
107
// track the total degrees for a token
95
108
$ degrees = array_fill_keys ($ keywords , 0 );
96
-
109
+
97
110
// tally the results
98
111
foreach ($ phrases as $ phrase )
99
112
{
100
113
foreach ($ keywords as $ keyword )
101
114
{
102
115
if (strpos ($ phrase , $ keyword ) !== false ) {
103
116
$ degrees [$ keyword ] += substr_count ($ phrase , " " )+1 ;
104
- }
105
-
117
+ }
106
118
}
107
- }
108
- $ tally = [];
109
- foreach ($ freqDist as $ keyword => $ freqValue )
110
- {
111
- $ tally [$ keyword ] = $ degrees [$ keyword ] / $ freqValue ;
112
- }
113
-
114
- $ phraseScores = array_fill_keys ($ phrases , 0 );
119
+ }
120
+
121
+ $ phraseScores = array_fill_keys ($ phrases , 0 );
115
122
foreach ($ phrases as $ phrase )
116
123
{
117
124
$ tokens = explode (" " , $ phrase );
118
125
foreach ($ tokens as $ token )
119
126
{
120
- $ phraseScores [$ phrase ] += $ tally [$ token ];
127
+ $ phraseScores [$ phrase ] += ( $ degrees [$ token ] / $ freqDist [ $ token ]) ;
121
128
}
122
129
}
123
130
@@ -129,6 +136,7 @@ public function __destruct()
129
136
{
130
137
unset($ this ->document );
131
138
unset($ this ->nGramSize );
139
+ unset($ this ->tokens );
132
140
}
133
141
134
142
}
0 commit comments