forked from buptjz/LearningFromData
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinear_regression.py
More file actions
195 lines (167 loc) · 6.49 KB
/
linear_regression.py
File metadata and controls
195 lines (167 loc) · 6.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# -*- coding: utf-8 -*-
__author__ = 'wangjz'
"""
Learning From Data
HW 1
In these problems, we will explore how Linear Regression for classification works.
As with the Perceptron Learning Algorithm in Homework # 1, you will create your own target function f
and data set D. Take d = 2 so you can visualize the problem, and assume X = [−1, 1] × [−1, 1] with uniform probability
of picking each x ∈ X . In each run, choose a random line in the plane as your target function f (do this by taking
two random, uniformly distributed points in [−1, 1] × [−1, 1] and taking the line passing through them), where one
side of the line maps to +1 and the other maps to −1. Choose the inputs xn of the data set as
random points (uniformly in X ), and evaluate the target function on each xn to get the corresponding output yn.
"""
import numpy as np
import random
from perceptron import Perceptron
def avg(a_list):
    """Return the arithmetic mean of the values in a_list (must be non-empty)."""
    n = len(a_list)
    # divide by a float count so integer inputs still yield a float mean
    return sum(a_list) / (n * 1.0)
class LinearRegression:
    """One-shot linear regression (pseudo-inverse) used as a binary classifier.

    Weights are fit by least squares; classification is sign(X . w) with
    labels in {-1, +1}.
    """

    def __init__(self, training_x, training_y, test_x=None, test_y=None):
        """
        training_x: (N, d) design matrix,
            [[---x1--],
             [---x2--],
             [.......],
             [---xN--]]
        training_y: (N, 1) column of labels in {-1, +1}
        test_x, test_y: optional held-out set for out-of-sample error
        """
        self.__N = training_x.shape[0]
        self.__X = training_x
        self.__Y = training_y
        # BUG FIX: `test_x != None` compares a numpy array elementwise and
        # raises ValueError when truth-tested; identity check is correct.
        if test_x is not None:
            self.__N_test = test_x.shape[0]
            self.__X_test = test_x
            self.__Y_test = test_y
        self.w = 0

    def learn_w(self):
        """Fit w = pinv(X) . y (the least-squares solution)."""
        X_pinv = np.linalg.pinv(self.__X)
        self.w = np.dot(X_pinv, self.__Y)

    def evaluate_error_in(self):
        """Return the in-sample misclassification fraction as a float."""
        return self.__classification_error(self.__X, self.__Y, self.__N)

    def evaluate_error_out(self):
        """Return the out-of-sample misclassification fraction as a float.

        Requires test_x/test_y to have been supplied to __init__.
        """
        return self.__classification_error(self.__X_test, self.__Y_test,
                                           self.__N_test)

    def __classification_error(self, X, Y, n):
        """Fraction of rows of X whose sign(X . w) disagrees with Y."""
        guess = np.dot(X, self.w)
        # threshold to {-1, +1}; note >= 0 maps the boundary itself to +1
        guess[guess >= 0] = 1
        guess[guess < 0] = -1
        guess = np.int16(guess)
        correct = np.count_nonzero(guess == Y)
        return 1.0 - correct * 1.0 / n
def generate_nonlinear_data(n):
    """Sample n points uniformly from [-1,1]^2 and label them with
    f(x1, x2) = sign(x1^2 + x2^2 - 0.6).

    Returns (X, Y): X is (n, 3) with a constant bias column of ones first,
    Y is an (n, 1) int16 column of labels in {-1, +1}.
    """
    X = np.random.rand(n, 3) * 2 - 1
    X[:, 0] = 1
    # evaluate the circular target on every row at once
    score = X[:, 1] ** 2 + X[:, 2] ** 2 - 0.6
    Y = np.where(score >= 0, 1, -1).astype(np.int16).reshape(-1, 1)
    return X, Y
def generate_date(n):
    """Generate a random linear target and n labelled points.

    (Name is a historical typo for "generate_data"; kept for callers.)

    Two points p1, p2 are drawn uniformly from [-1,1]^2 and the line
    through them, written as 1*x1 + b*x2 + c = 0, is the target.

    Returns (w, X, Y):
        w: (3, 1) weight column ordered [constant, x1-coef, x2-coef]
           to match X's column layout [bias, x1, x2]
        X: (n, 3) points with a bias column of ones first
        Y: (n, 1) int16 labels sign(X . w) in {-1, +1}
    """
    p1 = [random.uniform(-1, 1), random.uniform(-1, 1)]
    p2 = [random.uniform(-1, 1), random.uniform(-1, 1)]
    # Line a*x1 + b*x2 + c = 0 through p1 and p2, with a fixed to 1.
    # (p1[1] == p2[1] exactly is a measure-zero event; division by zero
    # is not guarded, matching the original behavior.)
    a = 1.0
    b = (p2[0] - p1[0]) / (p1[1] - p2[1])
    c = -(p1[0] * a + p1[1] * b)
    # BUG FIX: X columns are [1, x1, x2], so the constant term c must be
    # the FIRST weight (it multiplies the bias column). The original code
    # packed w as [a, b, c], mispairing c with x2, so the labels did not
    # actually come from the line through p1 and p2.
    w = np.array([[c], [a], [b]])
    # n random points uniform in [-1,1] x [-1,1]
    X = np.random.rand(n, 3) * 2 - 1
    X[:, 0] = 1
    Y = np.dot(X, w)
    Y[Y >= 0] = 1
    Y[Y < 0] = -1
    Y = np.int16(Y)
    return w, X, Y
def generate_train_and_test(train_num, test_num):
    """Draw one random linear target and split its points into a training
    set of train_num rows and a test set of test_num rows.

    Returns (w, X_train, Y_train, X_test, Y_test).
    """
    w, X, Y = generate_date(train_num + test_num)
    return (w,
            X[:train_num, :], Y[:train_num, :],
            X[train_num:, :], Y[train_num:, :])
def main2():
    """
    generate 1000 fresh points and use them to estimate the out-of-sample error
    Eout of g that you got in Problem 5 (number of misclassified out-of-sample
    points / total number of out-of-sample points).
    Again, run the experiment 1000 times and take the average. Which value is
    closest to the average Eout?
    """
    EXP_TIMEs = 1000
    train_num = 100
    test_num = 1000
    E_in_list = []
    E_out_list = []
    for _ in range(EXP_TIMEs):
        w, X_tr, Y_tr, X_te, Y_te = generate_train_and_test(train_num, test_num)
        model = LinearRegression(X_tr, Y_tr, X_te, Y_te)
        model.learn_w()
        E_in_list.append(model.evaluate_error_in())
        E_out_list.append(model.evaluate_error_out())
    print(avg(E_in_list))
    print(avg(E_out_list))
def main1():
    """
    Now, take N = 10. After finding the weights using Linear Regression,
    use them as a vector of initial weights for the Perceptron Learning
    Algorithm. Run PLA until it converges to a final vector of weights that
    completely separates all the in-sample points. Among the choices below,
    what is the closest value to the average number of iterations (over 1000
    runs) that PLA takes to converge?
    (When implementing PLA, have the algorithm choose a point randomly from
    the set of misclassified points at each iteration)
    """
    EXP_TIMEs = 1000
    train_num = 10
    iterations_list = []
    for _ in range(EXP_TIMEs):
        w, X_train, Y_train = generate_date(train_num)
        model = LinearRegression(X_train, Y_train)
        model.learn_w()
        # seed PLA with the regression weights to speed up convergence
        pla = Perceptron(X_train, Y_train, model.w)
        pla.gd_algorithm()
        iterations_list.append(pla.num_iterations)
    print(avg(iterations_list))
def nonlinear_exp():
    """
    In these problems, we again apply Linear Regression for classification.
    Consider the target function:
    f(x1,x2)=sign(x21 +x2 −0.6)
    Generate a training set of N = 1000 points on X = [−1, 1] × [−1, 1] with
    uniform probability of picking each x ∈ X. Generate simulated noise by
    flipping the sign of the output in a random 10% subset of the generated
    training set.
    """
    EXP_TIMEs = 1000
    train_num = 1000
    test_num = 1000
    e_out_list = []
    for trial in range(EXP_TIMEs):
        print(trial)
        X, Y = generate_nonlinear_data(train_num + test_num)
        # nonlinear feature transform: (1, x1, x2, x1*x2, x1^2, x2^2)
        X_trans = np.column_stack((
            X,
            X[:, 1] * X[:, 2],
            X[:, 1] * X[:, 1],
            X[:, 2] * X[:, 2],
        ))
        # flip each label with probability 1/10 to simulate noise
        for row in range(Y.shape[0]):
            if random.randint(1, 10) == 1:
                Y[row, 0] = -Y[row, 0]
        model = LinearRegression(X_trans[:train_num, :], Y[:train_num, :],
                                 X_trans[train_num:, :], Y[train_num:, :])
        model.learn_w()
        e_out_list.append(model.evaluate_error_out())
    print(avg(e_out_list))
# Script entry point: runs the nonlinear-transform experiment by default
# (main1/main2 cover the earlier homework questions and can be called instead).
if __name__ == "__main__":
    nonlinear_exp()