FactChecker/StatementClassifier.py at main · benjaminwilen/FactChecker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression
from statsmodels.miscmodels.ordinal_model import OrderedModel
from torch import nn

from DeepNetwork import *

class StatementClassifier:
    """Classifies statement embeddings into corresponding truth classes.

    One of three back-ends is selected by ``modelType``:
        - "LR": scikit-learn ``LogisticRegression``
        - "OR": statsmodels ``OrderedModel`` with ``distr='logit'``
        - "DN": ``DeepNetwork`` (brought in by ``from DeepNetwork import *``)

    Attributes:
        - modelType (str): the back-end selected at construction
        - data (dict): the data dict handed to the constructor
        - model: the underlying model object. For "OR", train() replaces it
          with the fitted results object returned by ``fit``
        - training_history: return value of ``DeepNetwork.train_model`` after
          train() has run for "DN"; None otherwise
    """

    # Back-end identifiers accepted by the constructor.
    VALID_MODEL_TYPES = ("LR", "OR", "DN")

    # Step size handed to the SGD optimizer when training the "DN" back-end.
    DN_LEARNING_RATE = 1e-1

    def __init__(self, modelType, data, embeddings_dim):
        """
        Initializes classifier

        Arguments:
            - modelType (str): either "LR", "OR", or "DN"
            - data (dict): All training, dev, and testing data. This class
              reads the keys "X_train" and "y_train", plus "X_dev" and
              "y_dev" when training the "DN" model
            - embeddings_dim (int): length of statements embeddings

        Raises:
            - ValueError: if modelType is not "LR", "OR", or "DN"

        """
        # Fail fast: previously an unknown type left self.model as None, so
        # train() silently did nothing and predict() returned None.
        if modelType not in self.VALID_MODEL_TYPES:
            raise ValueError(
                f"Unknown modelType {modelType!r}; "
                f"expected one of {self.VALID_MODEL_TYPES}"
            )

        self.modelType = modelType
        self.data = data
        self.training_history = None

        if modelType == "LR":
            self.model = LogisticRegression()
        elif modelType == "OR":
            # OrderedModel takes its training data at construction time;
            # train() only has to call fit().
            self.model = OrderedModel(data["y_train"],
                                      data["X_train"],
                                      distr='logit')
        else:  # "DN"
            # NOTE(review): the meaning of the positional arguments other than
            # embeddings_dim is defined by DeepNetwork, which is not visible
            # here -- confirm against DeepNetwork.__init__.
            self.model = DeepNetwork(6, embeddings_dim, 128, 64, 0.01, 0.1)

    def train(self):
        """
        Trains the classifier on the training data from self.data

        For "OR", self.model is replaced by the fitted results object; that
        object is what predict() reads its parameters from. For "DN", whatever
        ``train_model`` returns is kept in self.training_history.

        Raises:
            - ValueError: if self.modelType is not a supported type
        """
        if self.modelType == "LR":
            self.model.fit(self.data["X_train"], self.data["y_train"])
        elif self.modelType == "OR":
            self.model = self.model.fit(method='bfgs')
        elif self.modelType == "DN":
            loss_fn = nn.CrossEntropyLoss()
            # Stochastic gradient descent over the network's parameters.
            optimizer = torch.optim.SGD(self.model.parameters(),
                                        lr=self.DN_LEARNING_RATE)

            # Features become float tensors, labels integer (long) tensors.
            # NOTE(review): the original code unpacked this result as
            # (loss_history, train_accuracy, dev_accuracy) and discarded it.
            self.training_history = self.model.train_model(
                torch.Tensor(self.data["X_train"]),
                torch.LongTensor(self.data["y_train"]),
                torch.Tensor(self.data["X_dev"]),
                torch.LongTensor(self.data["y_dev"]),
                loss_fn,
                optimizer,
            )
        else:
            raise ValueError(f"Unknown modelType {self.modelType!r}")

    def predict(self, X) -> list:
        """
        Predicts the class for embeddings

        Arguments:
            - X (np.ndarray): All statement dense embeddings to predict a label for

        Returns: (List[int]): Predicted class for each embedding. For "LR" and
            "DN" the value is passed through unchanged from the underlying
            model's ``predict``; for "OR" it is a list of plain ints.

        Raises:
            - ValueError: if self.modelType is not a supported type

        """
        if self.modelType == "LR":
            return self.model.predict(X)
        if self.modelType == "OR":
            # self.model is the fitted results object here (see train()), so
            # the underlying model and its fitted params are reached through it.
            predicted_probs = self.model.model.predict(self.model.params, exog=X)
            # One row of class probabilities per sample: take the most likely
            # class of every row in a single vectorized call.
            return np.asarray(predicted_probs).argmax(axis=1).tolist()
        if self.modelType == "DN":
            # DeepNetwork.predict returns a pair; only the first element is
            # used as the predictions.
            predictions, _ = self.model.predict(torch.Tensor(X))
            return predictions
        raise ValueError(f"Unknown modelType {self.modelType!r}")