# sparse_autoencoder.py
# Based on CS294A/CS294W Programming Assignment Starter Code
from numpy import *

from compute_numerical_gradient import computeNumericalGradient
from image_patches import getPatches


def sigmoid(x):
    return 1 / (1 + exp(-x))


def feedForward(thetaParam, hiddenSize, visibleSize, data):
    """Compute the activation of the hidden layer for the Sparse Autoencoder.

    Keyword arguments:
    thetaParam -- trained weights from the autoencoder
    hiddenSize -- the number of hidden units (probably 25)
    visibleSize -- the number of input units (probably 64)
    data -- matrix containing the training data as rows, so data[i,:] is the
            i-th training example
    """
    W1 = thetaParam[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize)
    b1 = thetaParam[2*hiddenSize*visibleSize:2*hiddenSize*visibleSize+hiddenSize]
    return sigmoid(data.dot(W1.T) + b1)
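
# A minimal usage sketch (hypothetical sizes; random weights standing in for a
# trained model -- initializeParameters is defined further down):
#
#   theta = initializeParameters(25, 64)
#   X = random.rand(10, 64)                  # 10 examples as rows
#   hidden = feedForward(theta, 25, 64, X)   # hidden activations, shape (10, 25)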


def cost(thetaParam, visibleSize, hiddenSize, lambdaParam, sparsityParam,
         betaParam, data, corruptionLevel=0.0):
    """Compute the cost/optimization objective J_sparse(W,b) for the Sparse
    Autoencoder and the corresponding gradients W1grad, W2grad, b1grad, b2grad.

    Keyword arguments:
    thetaParam -- a vector of parameters (W1, W2, b1, b2, concatenated)
    visibleSize -- the number of input units (probably 64)
    hiddenSize -- the number of hidden units (probably 25)
    lambdaParam -- weight decay parameter
    sparsityParam -- the desired average activation for the hidden units
    betaParam -- weight of the sparsity penalty term
    data -- matrix containing the training data as rows, so data[i,:] is the
            i-th training example
    corruptionLevel -- fraction of the input to corrupt (denoising autoencoder)
    """
    W1 = thetaParam[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize)
    W2 = thetaParam[hiddenSize*visibleSize:2*hiddenSize*visibleSize].reshape(visibleSize, hiddenSize)
    b1 = thetaParam[2*hiddenSize*visibleSize:2*hiddenSize*visibleSize+hiddenSize]
    b2 = thetaParam[2*hiddenSize*visibleSize+hiddenSize:]

    m = data.shape[0]
    inputData = data

    # Corrupt the input (so that the denoising autoencoder learns to undo it)
    if corruptionLevel > 0.0:
        corruptionMatrix = random.binomial(1, 1 - corruptionLevel, size=inputData.shape)
        inputData = inputData * corruptionMatrix

    # Forward propagation
    a2 = sigmoid(inputData.dot(W1.T) + b1)
    a3 = sigmoid(a2.dot(W2.T) + b2)
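
    # a2 has shape (m, hiddenSize); a3 has shape (m, visibleSize) and is the
    # network's reconstruction of the (uncorrupted) input.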

    # Back propagation
    mean_a2 = mean(a2, 0)
    sparsity_delta = (-sparsityParam / mean_a2) + (1 - sparsityParam) / (1 - mean_a2)

    delta3 = -(data - a3) * (a3 * (1 - a3))
    delta2 = (delta3.dot(W2) + betaParam * sparsity_delta) * (a2 * (1 - a2))

    W1grad = (delta2.T.dot(inputData)) / m + lambdaParam * W1
    b1grad = sum(delta2, 0) / m
    W2grad = (delta3.T.dot(a2)) / m + lambdaParam * W2
    b2grad = sum(delta3, 0) / m

    cost = sum((a3 - data)**2) / 2
    weight_decay = sum(W1**2) + sum(W2**2)
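    # Sparsity penalty: the KL divergence between the target average activation
    # rho = sparsityParam and the observed average activation rho_hat = mean_a2,
    #   KL(rho || rho_hat) = rho*log(rho/rho_hat) + (1-rho)*log((1-rho)/(1-rho_hat)),
    # summed over the hidden units (sparsity_delta above is its derivative
    # with respect to rho_hat).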
    sparsity_penalty = sparsityParam * log(sparsityParam / mean_a2) + \
        (1 - sparsityParam) * log((1 - sparsityParam) / (1 - mean_a2))
    cost = cost / m + (lambdaParam / 2) * weight_decay + betaParam * sum(sparsity_penalty)

    grad = concatenate([W1grad.ravel(), W2grad.ravel(), b1grad.ravel(), b2grad.ravel()])
    return (cost, grad)
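
# A minimal training sketch (an illustration, not part of the assignment code):
# because cost() returns a (cost, grad) pair, it can be handed directly to
# scipy.optimize.minimize with jac=True. Assumes hiddenSize, visibleSize, the
# regularization parameters, and a `patches` data matrix are in scope.
#
#   from scipy.optimize import minimize
#   theta0 = initializeParameters(hiddenSize, visibleSize)
#   result = minimize(cost, theta0,
#                     args=(visibleSize, hiddenSize, lambdaParam,
#                           sparsityParam, betaParam, patches),
#                     jac=True, method='L-BFGS-B', options={'maxiter': 400})
#   trainedTheta = result.x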


def initializeParameters(hiddenSize, visibleSize):
    # Initialize parameters randomly based on the layer sizes;
    # weights are chosen uniformly from the interval [-r, r].
    r = sqrt(6) / sqrt(hiddenSize + visibleSize + 1)
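    # (This matches the Xavier/Glorot-style uniform range sqrt(6/(fan_in+fan_out));
    # the +1 in the denominator presumably accounts for the bias unit.)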
    W1 = random.rand(hiddenSize, visibleSize) * 2 * r - r
    W2 = random.rand(visibleSize, hiddenSize) * 2 * r - r
    b1 = zeros((hiddenSize, 1))
    b2 = zeros((visibleSize, 1))

    # Convert the weights and biases to vector form. This step "unrolls"
    # (flattens and concatenates) all the parameters into a single vector,
    # which can then be passed to an off-the-shelf optimizer.
    theta = concatenate([W1.ravel(), W2.ravel(), b1.ravel(), b2.ravel()])
    return theta


if __name__ == "__main__":
    # Check the correctness of the sparse autoencoder cost function
    # implementation using a gradient check.
    patchSize = 8
    visibleSize = patchSize * patchSize  # number of input units
    hiddenSize = 25                      # number of hidden units
    sparsityParam = 0.01                 # desired average activation of the hidden units
    lambdaParam = 0.0001                 # weight decay parameter
    betaParam = 3                        # weight of the sparsity penalty term

    patches = getPatches(numPatches=10, patchSize=patchSize)

    # Obtain random parameters theta
    thetaParam = initializeParameters(hiddenSize, visibleSize)

    def sparseAutoencoderCostCallback(x):
        return cost(x, visibleSize, hiddenSize, lambdaParam, sparsityParam,
                    betaParam, patches)

    (cost_value, grad) = sparseAutoencoderCostCallback(thetaParam)
    numgrad = computeNumericalGradient(sparseAutoencoderCostCallback, thetaParam)
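
    # Compare the gradients via the scale-invariant relative difference
    # ||numgrad - grad|| / ||numgrad + grad||.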
    diff = linalg.norm(numgrad - grad) / linalg.norm(numgrad + grad)
    print('Norm of the difference between numerical and analytical gradient '
          '(should be < 1e-9): %s' % diff)