UCRTextClassifier/RegexClassifier.py at master · UC-Riverside-DatabaseLab/UCRTextClassifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import re
from enum import Enum
from AbstractTextClassifier import AbstractTextClassifier


class ScoringMethod(Enum):
    """Enumeration of regular expression scoring methods. Information gain has
    not yet been implemented.
    """
    accuracy = 1
    information_gain = 2


class RegexRule(object):
    """Container for a regular expression, its associated class, etc.

    Constructor arguments:
    regex - The regular expression
    phrase - The phrase the regular expression was based on
    matched - A list of instance objects (defined in TextDataSetFileParser.py)
        whose text is matched by the regular expression
    distribution - The class distribution among matched instances
    class_value - The class that matched instances will be classified as
    num_correct - The number of classified instances in matched of class
        class_value
    score - Accuracy or information gain, according to the scoring method
    """
    def __init__(self, regex, phrase, matched, distribution, class_value,
                 num_correct, score):
        self.regex = regex
        pattern = regex.pattern.replace("(^|^.* )", "").replace("($| .*$)", "")
        self.partial_regex = re.compile(pattern)
        self.phrase = phrase
        self.matched = matched
        self.distribution = distribution
        self.class_value = class_value
        self.num_correct = num_correct
        self.score = score

    def matches(self, text):
        """Determine whether the regular expression matches the given text.

        Arguments:
        text - The text to compare to the regular expression

        Returns:
        True if the regular expression matched the given text, False otherwise.
        """
        return self.regex.match(text)


class RegexClassifier(AbstractTextClassifier):
    """Classifier that generates regular expressions based on training data.

    Constructor arguments:
    scoring_method (default accuracy) - The scoring method to use while
        expanding regular expressions
    score_threshold (default 1) - The minimum score of maximally-expanded
        regular expressions to accept for classification
    jump_length (default 2) - The maximum number of consecutive nearby words to
        skip when expanding a regular expression
    root_words (default 1) - The number of initial words to expand regular
        expressions from
    min_root_word_frequency (default 0.25) - The minimum frequency of words (as
        a percentage of the sum of the dataset's weights) to consider as root
        words
    """
    def __init__(self, scoring_method=ScoringMethod.accuracy,
                 score_threshold=1, jump_length=2, root_words=1,
                 min_root_word_frequency=0.25):
        self.regex_prefix = "(^|^.* )"
        self.regex_suffix = "($| .*$)"
        self.gap = " (\\S+ )"
        self.regex_tokens = [".", "^", "$", "(", ")", "[", "]", "{", "}", "?",
                             "+", "|", "*"]
        self.jump_length = max(0, jump_length)
        self.score_threshold = score_threshold
        self.root_words = max(1, root_words)
        self.min_root_word_frequency = min(1, max(0, min_root_word_frequency))
        self.scoring_method = scoring_method
        self.regex_rules = []

    def train(self, data):
        self.regex_rules = []
        prefixes = set()
        suffixes = set()
        current_regex_rules = []
        new_regex_rules = []
        improved = False

        for word in self.__find_top_words(data):
            current_regex_rules.append(self.__create_regex_rule(word, data))

            while len(current_regex_rules) > 0:
                new_regex_rules.clear()

                for regex_rule in current_regex_rules:
                    if regex_rule is None or regex_rule.phrase is None:
                        continue

                    improved = False
                    candidates = []

                    for i in range(0, self.jump_length + 1):
                        self.__find_prefixes_and_suffixes(regex_rule,
                                                          regex_rule.matched,
                                                          prefixes, suffixes,
                                                          i)
                        self.__expand_regex(regex_rule.phrase, candidates,
                                            prefixes, regex_rule.matched, True,
                                            i)
                        self.__expand_regex(regex_rule.phrase, candidates,
                                            suffixes, regex_rule.matched,
                                            False, i)

                    for new_regex_rule in candidates:
                        new_score = new_regex_rule.score
                        score = regex_rule.score

                        if (new_score > score or new_score == score and
                                new_regex_rule.num_correct >
                                regex_rule.num_correct):
                            new_regex_rules.append(new_regex_rule)

                            improved = True

                    if (not improved and regex_rule.score >=
                            self.score_threshold):
                        self.regex_rules.append(regex_rule)

                current_regex_rules = new_regex_rules.copy()

        for regex_rule in self.regex_rules:
            regex_rule.matched = None

        self.__create_default_regex_rule(data)

    def classify(self, instance):
        for regex_rule in self.regex_rules:
            if regex_rule.matches(instance.text):
                return regex_rule.distribution

        return {}

    def __create_default_regex_rule(self, data):
        distribution = {}
        max_class = None
        max_value = 0
        total = 0

        for instance in data:
            matched = False

            for regex_rule in self.regex_rules:
                if regex_rule.matches(instance.text):
                    matched = True

            if not matched:
                if instance.class_value in distribution:
                    distribution[instance.class_value] += instance.weight
                else:
                    distribution[instance.class_value] = instance.weight

                total += instance.weight

                if distribution[instance.class_value] > max_value:
                    max_value = distribution[instance.class_value]
                    max_class = instance.class_value

        for c_value, count in distribution.items():
            distribution[c_value] = count / total

        self.regex_rules.append(RegexRule(re.compile(".*"), ".*", None,
                                          distribution, max_class, max_value,
                                          distribution[max_class]))

    def __create_regex_rule(self, phrase, data):
        distribution = {}
        total = 0
        num_correct = 0
        class_value = None
        regex = re.compile(self.regex_prefix + self.__format_regex(phrase) +
                           self.regex_suffix)
        matched = []

        for instance in data:
            if regex.match(instance.text):
                total += instance.weight

                if instance.class_value in distribution:
                    distribution[instance.class_value] += instance.weight
                else:
                    distribution[instance.class_value] = instance.weight

                if distribution[instance.class_value] > num_correct:
                    num_correct = distribution[instance.class_value]
                    class_value = instance.class_value

                matched.append(instance)

        if total > 0:
            for c_value, count in distribution.items():
                distribution[c_value] = count / total

            return RegexRule(regex, phrase, matched, distribution, class_value,
                             num_correct, distribution[class_value])

        return None

    def __expand_regex(self, phrase, candidates, affixes, data, use_prefixes,
                       gap_size):
        formatted_gap = " "
        regex_rule = None

        if gap_size > 0:
            formatted_gap = self.gap + "{" + str(gap_size) + "}"

        for affix in affixes:
            if use_prefixes:
                formatted_phrase = affix + formatted_gap + phrase
            else:
                formatted_phrase = phrase + formatted_gap + affix

            regex_rule = self.__create_regex_rule(formatted_phrase, data)

            if regex_rule is not None:
                candidates.append(regex_rule)

        return candidates

    def __find_prefixes_and_suffixes(self, regex_rule, data, prefixes,
                                     suffixes, gap_size):
        prefixes.clear()
        suffixes.clear()

        matched_pattern = "MATCHED_PATTERN"
        matched_len = len(matched_pattern)

        for instance in data:
            text = regex_rule.partial_regex.sub(matched_pattern, instance.text)

            while matched_pattern in text:
                matched_pattern_index = text.index(matched_pattern)
                partial_text = text[0:matched_pattern_index].strip().split(" ")
                partial_text_length = len(partial_text)

                if gap_size < partial_text_length:
                    prefix_index = partial_text_length - 1 - gap_size
                    prefixes.add(partial_text[prefix_index])

                text = text[text.index(matched_pattern) + matched_len:].strip()
                partial_text = text.split(" ")

                if gap_size < len(partial_text):
                    suffixes.add(partial_text[gap_size])

    def __find_top_words(self, data):
        top_words = []
        words = {}
        word_accuracy = {}
        sum_of_weights = 0
        acc = "accuracy"
        cnt = "count"

        for instance in data:
            sum_of_weights += instance.weight

            for word in instance.text.split(" "):
                if word == "":
                    continue

                if word in words:
                    if instance.class_value in words[word]:
                        words[word][instance.class_value] += instance.weight
                    else:
                        words[word][instance.class_value] = instance.weight
                else:
                    words[word] = {instance.class_value: instance.weight}

        for word, distribution in words.items():
            max_count = 0
            total = 0

            for class_value, count in distribution.items():
                total += count

                if count > max_count:
                    max_count = count

            if total > sum_of_weights * self.min_root_word_frequency:
                word_accuracy[word] = {acc: max_count / total, cnt: max_count}

        for i in range(0, self.root_words):
            max_word = None
            max_accuracy = 0
            max_count = 0

            for word, stats in word_accuracy.items():
                if (stats[acc] > max_accuracy or stats[acc] == max_accuracy and
                        stats[cnt] > max_count):
                    max_word = word
                    max_accuracy = stats[acc]
                    max_count = stats[cnt]

            if max_word is not None:
                top_words.append(max_word)
                del word_accuracy[max_word]

        return top_words

    def __format_regex(self, phrase):
        if self.gap in phrase:
            gap_index = phrase.index(self.gap)
            gap_string_length = len(self.gap)
            start = gap_index + gap_string_length + 1
            gap_length = phrase[start:gap_index + gap_string_length + 2]
            prefix = self.__format_regex(phrase[0:gap_index])
            start = gap_index + gap_string_length + 3
            suffix = self.__format_regex(phrase[start:])

            return prefix + self.gap + "{" + gap_length + "}" + suffix

        for token in self.regex_tokens:
            phrase = phrase.replace(token, "\\" + token).replace("\\\\", "\\")

        return phrase