overfitting.py
# -*- coding: utf-8 -*-
__author__ = 'wangjz'
"""
Learning From Data
HW 6
In the following problems use the data provided in the files
http://work.caltech.edu/data/in.dta
http://work.caltech.edu/data/out.dta
as a training and test set respectively. Each line of the files corresponds to a two-dimensional
input x = (x1, x2), so that X = R^2, followed by the corresponding label from Y = {-1, 1}.
We are going to apply Linear Regression with a non-linear transformation for classification.
The nonlinear transformation is given by
    phi(x1, x2) = (1, x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|)
"""
import numpy as np


class LinearRegression:
    def __init__(self, training_x, training_y, test_x=None, test_y=None, hasTest=False, lam=0.001):
        """
        Linear regression with weight decay (L2 regularization).
        training_x = [[---x1--],
                      [---x2--],
                      [.......],
                      [---xN--]]
        training_y = [y1, y2, ..., yN]
        """
        self.__N = training_x.shape[0]
        self.__X = training_x
        self.__Y = training_y
        if hasTest:
            self.__N_test = test_x.shape[0]
            self.__X_test = test_x
            self.__Y_test = test_y
        self.w = 0
        self.lam = lam
    def learn_w(self):
        """Solve the regularized normal equations: w_reg = (X^T X + lambda * I)^(-1) X^T y."""
        X = self.__X
        Xt = X.transpose()
        # Without weight decay the pseudo-inverse solution would be:
        #   self.w = np.dot(np.linalg.pinv(X), self.__Y)
        tmp = np.dot(Xt, X) + self.lam * np.identity(X.shape[1])
        self.w = np.dot(np.dot(np.linalg.inv(tmp), Xt), self.__Y)
    def evaluate_error_in(self):
        """In-sample classification error: fraction of misclassified training points."""
        guess = np.dot(self.__X, self.w)
        guess[guess >= 0] = 1
        guess[guess < 0] = -1
        correct = np.sum(guess == self.__Y)
        return 1.0 - correct * 1.0 / self.__N

    def evaluate_error_out(self):
        """Out-of-sample classification error: fraction of misclassified test points."""
        guess = np.dot(self.__X_test, self.w)
        guess[guess >= 0] = 1
        guess[guess < 0] = -1
        correct = np.sum(guess == self.__Y_test)
        return 1.0 - correct * 1.0 / self.__N_test


def read_data(file_path):
    """Read a whitespace-separated file of (x1, x2, y) rows and apply the nonlinear
    transformation phi(x1, x2) = (1, x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|)."""
    with open(file_path, 'r') as f:
        lines = f.readlines()
    train_num = len(lines)
    dimension = 8
    X = np.zeros((train_num, dimension))
    Y = np.zeros((train_num, 1))
    for i, l in enumerate(lines):
        items = l.split()
        X[i, 0] = 1.0                     # bias term
        X[i, 1] = float(items[0])         # x1
        X[i, 2] = float(items[1])         # x2
        X[i, 3] = X[i, 1] * X[i, 1]       # x1^2
        X[i, 4] = X[i, 2] * X[i, 2]       # x2^2
        X[i, 5] = X[i, 1] * X[i, 2]       # x1 * x2
        X[i, 6] = abs(X[i, 1] - X[i, 2])  # |x1 - x2|
        X[i, 7] = abs(X[i, 1] + X[i, 2])  # |x1 + x2|
        Y[i, 0] = float(items[2])
    return X, Y


def overfit_exp():
    """
    Run Linear Regression on the training set after performing the non-linear transformation.
    What values are closest (in Euclidean distance) to the in-sample and out-of-sample
    classification errors, respectively?
    The correct results are 0.02857143 and 0.084, respectively.
    """
    trainX, trainY = read_data("in.dta.txt")
    testX, testY = read_data("out.dta.txt")
    lr = LinearRegression(trainX, trainY, testX, testY, True, lam=0.1)
    lr.learn_w()
    print(lr.evaluate_error_in())
    print(lr.evaluate_error_out())
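

# A minimal sketch (not part of the original script) of how the same class could be used
# to sweep the weight-decay strength lambda = 10**k, as the later HW 6 questions ask.
# The data file names follow the ones assumed above; the set of exponents is illustrative.
def lambda_sweep():
    trainX, trainY = read_data("in.dta.txt")
    testX, testY = read_data("out.dta.txt")
    for k in (-3, -2, -1, 0, 1, 2, 3):
        lr = LinearRegression(trainX, trainY, testX, testY, True, lam=10.0 ** k)
        lr.learn_w()
        # Report in-sample and out-of-sample classification error for this lambda.
        print("k = %2d  E_in = %.5f  E_out = %.5f"
              % (k, lr.evaluate_error_in(), lr.evaluate_error_out()))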


if __name__ == "__main__":
    overfit_exp()