# sparse_autoencoder.py
# Based on CS294A/CS294W Programming Assignment Starter Code
from numpy import *

from compute_numerical_gradient import computeNumericalGradient
from image_patches import getPatches


def sigmoid(x):
    return 1 / (1 + exp(-x))


def feedForward(thetaParam, hiddenSize, visibleSize, data):
    """Compute the activation of the hidden layer for the Sparse Autoencoder.

    Keyword arguments:
    thetaParam -- trained weights from the autoencoder
    hiddenSize -- the number of hidden units (probably 25)
    visibleSize -- the number of input units (probably 64)
    data -- matrix containing the training data as rows, so data[i,:] is the
            i-th training example
    """
    W1 = thetaParam[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize)
    b1 = thetaParam[2*hiddenSize*visibleSize:2*hiddenSize*visibleSize+hiddenSize]
    return sigmoid(data.dot(W1.T) + b1)
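
# A minimal usage sketch (hypothetical sizes; random weights standing in for a
# trained model -- initializeParameters is defined further down):
#
#   theta = initializeParameters(25, 64)
#   X = random.rand(10, 64)                  # 10 examples as rows
#   hidden = feedForward(theta, 25, 64, X)   # hidden activations, shape (10, 25)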


def cost(thetaParam, visibleSize, hiddenSize, lambdaParam, sparsityParam,
         betaParam, data, corruptionLevel=0.0):
    """Compute the cost/optimization objective J_sparse(W,b) for the Sparse
    Autoencoder and the corresponding gradients W1grad, W2grad, b1grad, b2grad.

    Keyword arguments:
    thetaParam -- a vector of parameters (W1, W2, b1, b2, concatenated)
    visibleSize -- the number of input units (probably 64)
    hiddenSize -- the number of hidden units (probably 25)
    lambdaParam -- weight decay parameter
    sparsityParam -- the desired average activation for the hidden units
    betaParam -- weight of the sparsity penalty term
    data -- matrix containing the training data as rows, so data[i,:] is the
            i-th training example
    corruptionLevel -- fraction of the input to corrupt (denoising autoencoder)
    """
    W1 = thetaParam[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize)
    W2 = thetaParam[hiddenSize*visibleSize:2*hiddenSize*visibleSize].reshape(visibleSize, hiddenSize)
    b1 = thetaParam[2*hiddenSize*visibleSize:2*hiddenSize*visibleSize+hiddenSize]
    b2 = thetaParam[2*hiddenSize*visibleSize+hiddenSize:]

    m = data.shape[0]
    inputData = data

    # Corrupt the input (so that the denoising autoencoder learns to undo it)
    if corruptionLevel > 0.0:
        corruptionMatrix = random.binomial(1, 1 - corruptionLevel, size=inputData.shape)
        inputData = inputData * corruptionMatrix

    # Forward propagation
    a2 = sigmoid(inputData.dot(W1.T) + b1)
    a3 = sigmoid(a2.dot(W2.T) + b2)
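
    # a2 has shape (m, hiddenSize); a3 has shape (m, visibleSize) and is the
    # network's reconstruction of the (uncorrupted) input.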

    # Back propagation
    mean_a2 = mean(a2, 0)
    sparsity_delta = (-sparsityParam / mean_a2) + (1 - sparsityParam) / (1 - mean_a2)

    delta3 = -(data - a3) * (a3 * (1 - a3))
    delta2 = (delta3.dot(W2) + betaParam * sparsity_delta) * (a2 * (1 - a2))

    W1grad = (delta2.T.dot(inputData)) / m + lambdaParam * W1
    b1grad = sum(delta2, 0) / m
    W2grad = (delta3.T.dot(a2)) / m + lambdaParam * W2
    b2grad = sum(delta3, 0) / m

    cost = sum((a3 - data)**2) / 2
    weight_decay = sum(W1**2) + sum(W2**2)
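    # Sparsity penalty: the KL divergence between the target average activation
    # rho = sparsityParam and the observed average activation rho_hat = mean_a2,
    #   KL(rho || rho_hat) = rho*log(rho/rho_hat) + (1-rho)*log((1-rho)/(1-rho_hat)),
    # summed over the hidden units (sparsity_delta above is its derivative
    # with respect to rho_hat).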
    sparsity_penalty = sparsityParam * log(sparsityParam / mean_a2) + \
        (1 - sparsityParam) * log((1 - sparsityParam) / (1 - mean_a2))
    cost = cost / m + (lambdaParam / 2) * weight_decay + betaParam * sum(sparsity_penalty)

    grad = concatenate([W1grad.ravel(), W2grad.ravel(), b1grad.ravel(), b2grad.ravel()])
    return (cost, grad)
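
# A minimal training sketch (an illustration, not part of the assignment code):
# because cost() returns a (cost, grad) pair, it can be handed directly to
# scipy.optimize.minimize with jac=True. Assumes hiddenSize, visibleSize, the
# regularization parameters, and a `patches` data matrix are in scope.
#
#   from scipy.optimize import minimize
#   theta0 = initializeParameters(hiddenSize, visibleSize)
#   result = minimize(cost, theta0,
#                     args=(visibleSize, hiddenSize, lambdaParam,
#                           sparsityParam, betaParam, patches),
#                     jac=True, method='L-BFGS-B', options={'maxiter': 400})
#   trainedTheta = result.x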


def initializeParameters(hiddenSize, visibleSize):
    # Initialize parameters randomly based on the layer sizes;
    # weights are chosen uniformly from the interval [-r, r].
    r = sqrt(6) / sqrt(hiddenSize + visibleSize + 1)
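    # (This matches the Xavier/Glorot-style uniform range sqrt(6/(fan_in+fan_out));
    # the +1 in the denominator presumably accounts for the bias unit.)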
    W1 = random.rand(hiddenSize, visibleSize) * 2 * r - r
    W2 = random.rand(visibleSize, hiddenSize) * 2 * r - r
    b1 = zeros((hiddenSize, 1))
    b2 = zeros((visibleSize, 1))

    # Convert the weights and biases to vector form. This step "unrolls"
    # (flattens and concatenates) all the parameters into a single vector,
    # which can then be passed to an off-the-shelf optimizer.
    theta = concatenate([W1.ravel(), W2.ravel(), b1.ravel(), b2.ravel()])
    return theta


if __name__ == "__main__":
    # Check the correctness of the sparse autoencoder cost function
    # implementation using a gradient check.
    patchSize = 8
    visibleSize = patchSize * patchSize  # number of input units
    hiddenSize = 25                      # number of hidden units
    sparsityParam = 0.01                 # desired average activation of the hidden units
    lambdaParam = 0.0001                 # weight decay parameter
    betaParam = 3                        # weight of the sparsity penalty term

    patches = getPatches(numPatches=10, patchSize=patchSize)

    # Obtain random parameters theta
    thetaParam = initializeParameters(hiddenSize, visibleSize)

    def sparseAutoencoderCostCallback(x):
        return cost(x, visibleSize, hiddenSize, lambdaParam, sparsityParam,
                    betaParam, patches)

    (cost_value, grad) = sparseAutoencoderCostCallback(thetaParam)
    numgrad = computeNumericalGradient(sparseAutoencoderCostCallback, thetaParam)
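
    # Compare the gradients via the scale-invariant relative difference
    # ||numgrad - grad|| / ||numgrad + grad||.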
    diff = linalg.norm(numgrad - grad) / linalg.norm(numgrad + grad)
    print('Norm of the difference between numerical and analytical gradient '
          '(should be < 1e-9): %s' % diff)