RAKEJava/RakeModel.java at master · shirsho-12/RAKEJava · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
package src.rake;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;

/*
Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
as described in:
Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
Automatic keyword extraction from individual documents.
In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.

NOTE: The original code (from https://github.com/aneesha/RAKE)
has been extended by a_medelyan (zelandiya)
with a set of heuristics to decide whether a phrase is an acceptable candidate
as well as the ability to set frequency and phrase length parameters
important when dealing with longer documents

NOTE 2: The code published by a_medelyan (https://github.com/zelandiya/RAKE-tutorial)
has been additionally extended by Marco Pegoraro to implement the adjoined candidate
feature described in section 1.2.3 of the original paper. Note that this creates the
need to modify the metric for the candidate score, because the adjoined candidates
have a very high score (because of the nature of the original score metric)

NOTE 3: This repository converts Python code from aneesha (https://github.com/zelandiya/RAKE-tutorial) to
Java and removes the need for external txt files for a self-contained program.
 */
/**
 * RAKE (Rapid Automatic Keyword Extraction) keyword extractor.
 *
 * <p>Pipeline implemented by {@link #run(String)}:
 * <ol>
 *   <li>split the text into sentence fragments at punctuation,</li>
 *   <li>split every fragment into candidate phrases at stop words,</li>
 *   <li>optionally add "adjoined" candidates (keywords joined by interior stop words),</li>
 *   <li>score every word as {@code (degree + frequency) / frequency},</li>
 *   <li>score every candidate as the sum of its word scores.</li>
 * </ol>
 *
 * <p>The tuning parameters are plain public fields and are read on every call to
 * {@link #run(String)}, so they may be changed between runs.
 */
public class RakeModel {
    /** Optionally signed integer or decimal number, e.g. {@code -3}, {@code .5}, {@code 12.75}. */
    private static final Pattern NUMBER_PATTERN = Pattern.compile("[+-]?[0-9]*\\.?[0-9]+");

    /** Characters that end a sentence fragment (pattern taken from the Python implementation). */
    private static final Pattern SENTENCE_DELIMITERS =
            Pattern.compile("[\\[\\]\n.!?,;:\t\\-\\\"\\(\\)\\\'\u2019\u2013]");

    /** One or more whitespace characters; used to tokenise phrases into words. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Stop words that delimit candidate phrases. */
    ArrayList<String> stopWords = new StopWords().getStopWords();

    /** Minimum number of characters a candidate phrase must have (after stripping). */
    public int minCharLength = 2;
    /** Maximum number of words a candidate phrase may have. */
    public int maxWordsLength = 5;

    // An adjoined string refers to a phrase i.e. 1 or more words.
    /** Minimum number of keywords in an adjoined candidate. */
    public int minWords = 1;
    /** Maximum number of keywords in an adjoined candidate. */
    public int maxWords = 1;

    /** Minimum number of occurrences an adjoined candidate needs to be kept. */
    public int minPhraseFreqAdj = 2;
    /** Minimum number of occurrences a regular candidate phrase needs to be kept. */
    public int minKeywordFrequency = 1;

    /** Scores of the most recent {@link #run(String)}; {@code null} until the first run. */
    public HashMap<String, Double> candidateScores;

    /**
     * Tells whether the whole string is a (possibly signed, possibly decimal) number.
     *
     * @param s the token to test
     * @return {@code true} if {@code s} is entirely numeric
     */
    private boolean isNumber(String s) {
        return NUMBER_PATTERN.matcher(s).matches();
    }

    /**
     * Splits a phrase on single spaces and keeps the tokens that count towards scoring.
     * Numbers stay in the phrase but are not returned, as they would invalidate phrase scores.
     *
     * @param text        the phrase to tokenise
     * @param minWordSize a token is kept only if it is strictly longer than this
     * @return the non-empty, non-numeric tokens in their original order
     */
    private ArrayList<String> separateWords(String text, int minWordSize) {
        ArrayList<String> words = new ArrayList<>();
        for (String word : text.split(" ")) {
            String currWord = word.strip();
            if (currWord.length() > minWordSize && !currWord.isEmpty() && !isNumber(currWord)) {
                words.add(currWord);
            }
        }
        return words;
    }

    /**
     * Splits text into sentence fragments at punctuation, brackets, quotes, newlines and tabs.
     *
     * @param text the text to split
     * @return the fragments between delimiters, in order
     * @throws UnsupportedEncodingException never thrown; kept in the signature so that
     *                                      existing callers continue to compile
     */
    public ArrayList<String> separateSentences(String text) throws UnsupportedEncodingException {
        return new ArrayList<>(Arrays.asList(SENTENCE_DELIMITERS.split(text)));
    }

    /**
     * Builds a case-insensitive pattern matching any stop word as a whole word.
     *
     * @return the stop-word pattern, or a pattern that never matches when the stop-word
     *         list is empty
     */
    private Pattern stopWordRegexBuilder() {
        StringBuilder alternation = new StringBuilder();
        for (String word : stopWords) {
            if (alternation.length() > 0) {
                alternation.append('|');
            }
            // \b is the regex word boundary; quoting keeps any metacharacter in a word literal.
            alternation.append("\\b").append(Pattern.quote(word)).append("\\b");
        }
        if (alternation.length() == 0) {
            // An empty regex would match everywhere; "(?!)" matches nowhere.
            return Pattern.compile("(?!)");
        }
        return Pattern.compile(alternation.toString(), Pattern.CASE_INSENSITIVE);
    }

    /**
     * Collects adjoined candidates from all sentences and keeps those occurring at least
     * {@link #minPhraseFreqAdj} times.
     *
     * @param sentences the sentence fragments to scan
     * @return the sufficiently frequent adjoined candidates
     */
    private ArrayList<String> adjoinCandidateExtractor(ArrayList<String> sentences) {
        WordDict adjoinedCandidates = new WordDict();
        for (String sentence : sentences) {
            adjoinedCandidates.addDict(adjoinedSentenceExtractor(sentence));
        }
        return adjoinedCandidateFilter(adjoinedCandidates, minPhraseFreqAdj);
    }

    /**
     * Removes candidates below a frequency threshold.
     *
     * @param candidates the counted candidates
     * @param threshold  the minimum frequency (inclusive) required to keep a candidate
     * @return the candidates whose frequency is at least {@code threshold}
     */
    private ArrayList<String> adjoinedCandidateFilter(WordDict candidates, int threshold) {
        ArrayList<String> filteredCandidates = new ArrayList<>();
        for (String candidate : candidates) {
            if (candidates.getFrequency(candidate) >= threshold) {
                filteredCandidates.add(candidate);
            }
        }
        return filteredCandidates;
    }

    /**
     * Extracts the adjoined candidates of a single sentence: word sequences that start and
     * end with a non-stop word, contain at least one stop word in between, and hold exactly
     * {@code numKeywords} non-stop words for some {@code numKeywords} in
     * [{@link #minWords}, {@link #maxWords}].
     *
     * @param sentence the sentence fragment to scan
     * @return the lower-cased candidates found in the sentence with their counts
     */
    private WordDict adjoinedSentenceExtractor(String sentence) {
        WordDict wordDict = new WordDict();
        // Split on whitespace runs so that consecutive spaces do not yield empty "words".
        String[] words = WHITESPACE.split(sentence.toLowerCase().strip());
        for (int numKeywords = minWords; numKeywords <= maxWords; numKeywords++) {
            for (int j = 0; j < (words.length - numKeywords); j++) {
                if (stopWords.contains(words[j])) {
                    continue;       // a candidate must start with a keyword
                }
                StringBuilder candidate = new StringBuilder(words[j]);
                int k = 1;
                int keywordCounter = 1;           // number of non-stop words in the candidate
                boolean containsStopWord = false;
                while (keywordCounter < numKeywords && (j + k) < words.length) {
                    // Add the next word to the candidate sequence
                    candidate.append(' ').append(words[j + k]);
                    if (stopWords.contains(words[j + k])) {
                        containsStopWord = true;
                    } else {
                        keywordCounter += 1;
                    }
                    k++;
                }
                String lastWord = words[j + k - 1];
                // Candidate is added iff (1) it contains a stop word, (2) its last word is not
                // a stop word and (3) it holds exactly numKeywords keywords - prevents duplicates.
                if (containsStopWord
                        && !stopWords.contains(lastWord)
                        && keywordCounter == numKeywords) {
                    wordDict.add(candidate.toString());
                }
            }
        }
        return wordDict;
    }

    /**
     * Generates the candidate phrases of a document: the stop-word-delimited phrases that
     * pass {@link #phraseCheck(String)} and occur at least {@link #minKeywordFrequency}
     * times, followed by the adjoined candidates.
     *
     * @param sentences the sentence fragments of the document
     * @return the candidate phrases
     */
    private ArrayList<String> generateCandidateKeywords(ArrayList<String> sentences) {
        ArrayList<String> phraseList = new ArrayList<>();
        WordDict phraseCounter = new WordDict();
        // Built once per call. The compiled Pattern must be used directly: going through
        // String.replaceAll(pattern.pattern(), ...) would drop the CASE_INSENSITIVE flag.
        Pattern stopWordPattern = stopWordRegexBuilder();
        for (String sentence : sentences) {
            String marked = stopWordPattern.matcher(sentence.strip()).replaceAll("|");
            // "|" is a regex metacharacter and has to be escaped for split.
            for (String phrase : marked.split("\\|")) {
                if (phraseCheck(phrase)) {
                    phraseCounter.add(phrase);
                }
            }
        }
        // Add phrase to the result only if it passes the minimum frequency
        for (String phrase : phraseCounter) {
            if (phraseCounter.getFrequency(phrase) >= minKeywordFrequency) {
                phraseList.add(phrase);
            }
        }
        phraseList.addAll(adjoinCandidateExtractor(sentences));
        return phraseList;
    }

    /**
     * Decides whether a phrase is an acceptable candidate. Surrounding whitespace (left over
     * from stop-word removal) is ignored for all checks.
     *
     * @param phrase the raw phrase
     * @return {@code true} if the stripped phrase has at least {@link #minCharLength}
     *         characters, at most {@link #maxWordsLength} words, at least one letter, and
     *         no more digits than letters
     */
    private boolean phraseCheck(String phrase) {
        String trimmed = phrase.strip();
        if (trimmed.isEmpty() || trimmed.length() < minCharLength) {
            return false;                                   // minimum character length check
        }
        if (WHITESPACE.split(trimmed).length > maxWordsLength) {
            return false;                                   // maximum phrase length check
        }

        int alphabets = 0;
        int digits = 0;
        for (int i = 0; i < trimmed.length(); i++) {
            char c = trimmed.charAt(i);
            if (Character.isDigit(c)) {
                digits++;
            } else if (Character.isAlphabetic(c)) {
                alphabets++;
            }
        }
        // The phrase needs at least one letter and must not have more digits than letters.
        return alphabets != 0 && digits <= alphabets;
    }

    /**
     * Computes the RAKE score of every word appearing in the candidate phrases:
     * {@code (degree + frequency) / frequency}, where the degree of a word is the summed
     * number of other words it co-occurs with inside the phrases.
     *
     * @param phraseArray the candidate phrases
     * @return a map from word to score
     */
    private HashMap<String, Double> calculateWordScore(ArrayList<String> phraseArray) {
        WordDict wordFrequencies = new WordDict();
        WordDict wordDegree = new WordDict();
        for (String phrase : phraseArray) {
            ArrayList<String> words = separateWords(phrase, 0);
            int listDegree = words.size() - 1;
            for (String word : words) {
                wordFrequencies.add(word, 1);
                wordDegree.add(word, listDegree);
            }
        }
        // A word also co-occurs with itself once per occurrence.
        for (String word : wordFrequencies) {
            wordDegree.add(word, wordFrequencies.getFrequency(word));
        }

        HashMap<String, Double> wordScores = new HashMap<>();
        for (String word : wordFrequencies) {
            Double value = wordDegree.getFrequency(word) / (1.0 * wordFrequencies.getFrequency(word));
            wordScores.put(word, value);
        }
        return wordScores;
    }

    /**
     * Scores every candidate phrase as the sum of the scores of its words.
     *
     * @param phraseArray the candidate phrases
     * @param wordScores  the per-word scores; a word without a score contributes 0
     * @return a map from stripped phrase to score
     */
    private HashMap<String, Double> getCandidateScores(ArrayList<String> phraseArray,
                                                      HashMap<String, Double> wordScores) {
        HashMap<String, Double> scores = new HashMap<>();
        for (String phrase : phraseArray) {
            double cScore = 0.0;
            for (String word : separateWords(phrase, 0)) {
                cScore += wordScores.getOrDefault(word, 0.0);
            }
            scores.put(phrase.strip(), cScore);
        }
        return scores;
    }

    /**
     * Runs keyword extraction on a text and stores the resulting scores in
     * {@link #candidateScores}.
     *
     * @param text the document to analyse
     * @return the candidate phrases as ordered by {@code Sorting.Sort}
     * @throws UnsupportedEncodingException never thrown; kept in the signature so that
     *                                      existing callers continue to compile
     */
    public ArrayList<String> run(String text) throws UnsupportedEncodingException {
        ArrayList<String> sentences = separateSentences(text);
        ArrayList<String> phraseArray = generateCandidateKeywords(sentences);
        HashMap<String, Double> wordScores = calculateWordScore(phraseArray);

        candidateScores = getCandidateScores(phraseArray, wordScores);
        return Sorting.Sort(candidateScores);
    }
}