From e20cfb1fc730adf10fd6c2685521f87aca2ed379 Mon Sep 17 00:00:00 2001
From: ItayXD
Date: Wed, 24 Aug 2022 09:54:12 +0300
Subject: [PATCH] Add support for Apple Silicon and switch to Aesara

---
 network3.py | 87 +++++++++++++++++++++++++++--------------------------
 1 file changed, 45 insertions(+), 42 deletions(-)

diff --git a/network3.py b/network3.py
index ff8afa8..87b3ce3 100644
--- a/network3.py
+++ b/network3.py
@@ -1,7 +1,7 @@
 """network3.py
 ~~~~~~~~~~~~~~
 
-A Theano-based program for training and running simple neural
+An Aesara-based program for training and running simple neural
 networks.
 
 Supports several layer types (fully connected, convolutional, max
@@ -12,14 +12,14 @@
 network2.py.  However, unlike network.py and network2.py it can also
 be run on a GPU, which makes it faster still.
 
-Because the code is based on Theano, the code is different in many
+Because the code is based on Aesara, the code is different in many
 ways from network.py and network2.py.  However, where possible I have
 tried to maintain consistency with the earlier programs.  In
 particular, the API is similar to network2.py.  Note that I have
 focused on making the code simple, easily readable, and easily
 modifiable.  It is not optimized, and omits many desirable features.
 
-This program incorporates ideas from the Theano documentation on
+This program incorporates ideas from the Aesara documentation on
 convolutional neural nets (notably,
 http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's
 implementation of dropout (https://github.com/mdenil/dropout ), and
@@ -34,18 +34,18 @@
 
 # Third-party libraries
 import numpy as np
-import theano
-import theano.tensor as T
-from theano.tensor.nnet import conv
-from theano.tensor.nnet import softmax
-from theano.tensor import shared_randomstreams
-from theano.tensor.signal.pool import pool_2d
+import aesara
+import aesara.tensor as T
+from aesara.tensor.nnet import conv2d
+from aesara.tensor.nnet import softmax
+from aesara.tensor.random.utils import RandomStream
+from aesara.tensor.signal.pool import pool_2d
 
 # Activation functions for neurons
 def linear(z): return z
-def ReLU(z): return T.maximum(0.0, z)
-from theano.tensor.nnet import sigmoid
-from theano.tensor import tanh
+from aesara.tensor.nnet import relu as ReLU
+from aesara.tensor import sigmoid
+from aesara.tensor import tanh
 
 
 #### Constants
@@ -53,27 +53,30 @@ def ReLU(z): return T.maximum(0.0, z)
 if GPU:
     print("Trying to run under a GPU.  If this is not desired, then modify "+\
         "network3.py\nto set the GPU flag to False.")
-    try: theano.config.device = 'gpu'
+    try: aesara.config.device = 'gpu'
     except: pass # it's already set
-    theano.config.floatX = 'float32'
+    aesara.config.floatX = 'float32'
 else:
     print("Running with a CPU.  If this is not desired, then the modify "+\
         "network3.py to set\nthe GPU flag to True.")
 
+aesara.config.gcc__cxxflags = "-Wno-c++11-narrowing" # Required for Apple Silicon
+
+
 #### Load the MNIST data
 def load_data_shared(filename="mnist.pkl.gz"):
     f = gzip.open(filename, 'rb')
     training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
     f.close()
     def shared(data):
-        """Place the data into shared variables.  This allows Theano to copy
+        """Place the data into shared variables.  This allows Aesara to copy
         the data to the GPU, if one is available.
 
         """
-        shared_x = theano.shared(
-            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
-        shared_y = theano.shared(
-            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
+        shared_x = aesara.shared(
+            np.asarray(data[0], dtype=aesara.config.floatX), borrow=True)
+        shared_y = aesara.shared(
+            np.asarray(data[1], dtype=aesara.config.floatX), borrow=True)
         return shared_x, T.cast(shared_y, "int32")
     return [shared(training_data), shared(validation_data), shared(test_data)]
 
@@ -123,7 +126,7 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
         # define functions to train a mini-batch, and to compute the
         # accuracy in validation and test mini-batches.
         i = T.lscalar() # mini-batch index
-        train_mb = theano.function(
+        train_mb = aesara.function(
            [i], cost, updates=updates,
            givens={
                self.x:
@@ -131,7 +134,7 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
                self.y:
                training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
-        validate_mb_accuracy = theano.function(
+        validate_mb_accuracy = aesara.function(
            [i], self.layers[-1].accuracy(self.y),
            givens={
                self.x:
@@ -139,7 +142,7 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
                self.y:
                validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
-        test_mb_accuracy = theano.function(
+        test_mb_accuracy = aesara.function(
            [i], self.layers[-1].accuracy(self.y),
            givens={
                self.x:
@@ -147,7 +150,7 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
                self.y:
                test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
-        self.test_mb_predictions = theano.function(
+        self.test_mb_predictions = aesara.function(
            [i], self.layers[-1].y_out,
            givens={
                self.x:
@@ -210,23 +213,23 @@ def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
         self.activation_fn=activation_fn
         # initialize weights and biases
         n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize))
-        self.w = theano.shared(
+        self.w = aesara.shared(
             np.asarray(
                 np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape),
-                dtype=theano.config.floatX),
+                dtype=aesara.config.floatX),
             borrow=True)
-        self.b = theano.shared(
+        self.b = aesara.shared(
             np.asarray(
                 np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)),
-                dtype=theano.config.floatX),
+                dtype=aesara.config.floatX),
             borrow=True)
         self.params = [self.w, self.b]
 
     def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
         self.inpt = inpt.reshape(self.image_shape)
-        conv_out = conv.conv2d(
+        conv_out = conv2d(
             input=self.inpt, filters=self.w, filter_shape=self.filter_shape,
-            image_shape=self.image_shape)
+            input_shape=self.image_shape)
         pooled_out = pool_2d(
             input=conv_out, ws=self.poolsize, ignore_border=True)
         self.output = self.activation_fn(
@@ -241,15 +244,15 @@ def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0):
         self.activation_fn = activation_fn
         self.p_dropout = p_dropout
         # Initialize weights and biases
-        self.w = theano.shared(
+        self.w = aesara.shared(
             np.asarray(
                 np.random.normal(
                     loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
-                dtype=theano.config.floatX),
+                dtype=aesara.config.floatX),
             name='w', borrow=True)
-        self.b = theano.shared(
+        self.b = aesara.shared(
             np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)),
-                       dtype=theano.config.floatX),
+                       dtype=aesara.config.floatX),
             name='b', borrow=True)
         self.params = [self.w, self.b]
 
@@ -274,21 +277,21 @@ def __init__(self, n_in, n_out, p_dropout=0.0):
         self.n_out = n_out
         self.p_dropout = p_dropout
         # Initialize weights and biases
-        self.w = theano.shared(
-            np.zeros((n_in, n_out), dtype=theano.config.floatX),
+        self.w = aesara.shared(
+            np.zeros((n_in, n_out), dtype=aesara.config.floatX),
             name='w', borrow=True)
-        self.b = theano.shared(
-            np.zeros((n_out,), dtype=theano.config.floatX),
+        self.b = aesara.shared(
+            np.zeros((n_out,), dtype=aesara.config.floatX),
             name='b', borrow=True)
         self.params = [self.w, self.b]
 
     def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
         self.inpt = inpt.reshape((mini_batch_size, self.n_in))
-        self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
+        self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b, axis=-1)
         self.y_out = T.argmax(self.output, axis=1)
         self.inpt_dropout = dropout_layer(
             inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
-        self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b)
+        self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b, axis=-1)
 
     def cost(self, net):
         "Return the log-likelihood cost."
@@ -305,7 +308,7 @@ def size(data):
     return data[0].get_value(borrow=True).shape[0]
 
 def dropout_layer(layer, p_dropout):
-    srng = shared_randomstreams.RandomStreams(
+    srng = RandomStream(
         np.random.RandomState(0).randint(999999))
-    mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape)
-    return layer*T.cast(mask, theano.config.floatX)
+    mask = srng.binomial(1, 1-p_dropout, size=layer.shape)
+    return layer*T.cast(mask, aesara.config.floatX)
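
Not part of the diff: a quick standalone check that the Aesara C++ toolchain builds on an Apple Silicon machine once the narrowing warning is silenced. It reuses only the aesara.config.gcc__cxxflags setting introduced above; the tiny squaring function is illustrative and simply forces a compile of the backend.

    import aesara
    import aesara.tensor as T

    # Same workaround the patch adds near the top of network3.py; set it
    # before the first aesara.function() call so the generated C++ compiles.
    aesara.config.gcc__cxxflags = "-Wno-c++11-narrowing"

    x = T.dscalar("x")
    square = aesara.function([x], x ** 2)  # triggers compilation of the graph
    print(square(3.0))                     # expected: 9.0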
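For reviewers who want to exercise the migrated module end to end, a minimal training sketch follows. It assumes the parts of network3.py not shown in this diff keep the book's API (the Network, FullyConnectedLayer, and SoftmaxLayer classes and the remaining SGD arguments) and that mnist.pkl.gz sits in the working directory; treat it as an illustration, not part of the change.

    import network3
    from network3 import Network, FullyConnectedLayer, SoftmaxLayer

    # Load the MNIST splits as Aesara shared variables (load_data_shared above).
    training_data, validation_data, test_data = network3.load_data_shared()

    mini_batch_size = 10
    net = Network([
        FullyConnectedLayer(n_in=784, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
    # 60 epochs at learning rate 0.1, matching the SGD signature in the hunks above.
    net.SGD(training_data, 60, mini_batch_size, 0.1,
            validation_data, test_data)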