feature-store · narang-amit · Oct 27, 2021 · Oct 27, 2021 · Oct 27, 2021 · Oct 29, 2021
diff --git a/recsys/als-half.ipynb b/recsys/als-half.ipynb
diff --git a/recsys/als-jg-edits.ipynb b/recsys/als-jg-edits.ipynb
diff --git a/recsys/als-stream-2.ipynb b/recsys/als-stream-2.ipynb
@@ -0,0 +1,255 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "8e0e6a4f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import time\n",
+    "from collections import defaultdict\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.metrics import mean_squared_error"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "5b033873",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Location of the MovieLens ratings CSV. Set the RATINGS_PATH environment variable\n",
+    "# to point somewhere else; the fallback is the original machine-specific location.\n",
+    "ratings_path = os.environ.get(\n",
+    "    \"RATINGS_PATH\", \"/Users/amitnarang/Downloads/ml-latest-small/ratings.csv\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "a00a310f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "        userId  movieId  rating   timestamp\n",
+      "0            1        1     4.0   964982703\n",
+      "1            1        3     4.0   964981247\n",
+      "2            1        6     4.0   964982224\n",
+      "3            1       47     5.0   964983815\n",
+      "4            1       50     5.0   964982931\n",
+      "...        ...      ...     ...         ...\n",
+      "100831     610   166534     4.0  1493848402\n",
+      "100832     610   168248     5.0  1493850091\n",
+      "100833     610   168250     5.0  1494273047\n",
+      "100834     610   168252     5.0  1493846352\n",
+      "100835     610   170875     3.0  1493846415\n",
+      "\n",
+      "[100836 rows x 4 columns]\n",
+      "     userId  movieId  rating\n",
+      "0         1        1     4.0\n",
+      "1         1       50     5.0\n",
+      "2         1      151     5.0\n",
+      "3         1      223     3.0\n",
+      "4         1      296     3.0\n",
+      "..      ...      ...     ...\n",
+      "180     601   112556     4.0\n",
+      "181     601   122916     3.5\n",
+      "182     601   152081     4.5\n",
+      "183     601   170705     5.0\n",
+      "184     601   177765     4.5\n",
+      "\n",
+      "[185 rows x 3 columns]\n",
+      "     userId  movieId  rating\n",
+      "0         1        3     4.0\n",
+      "1         1        6     4.0\n",
+      "2         1       47     5.0\n",
+      "3         1       70     3.0\n",
+      "4         1      101     5.0\n",
+      "..      ...      ...     ...\n",
+      "545     601   168326     4.0\n",
+      "546     601   170697     4.0\n",
+      "547     601   172591     4.5\n",
+      "548     601   174055     4.0\n",
+      "549     601   176371     4.0\n",
+      "\n",
+      "[550 rows x 3 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(ratings_path, sep=',')\n",
+    "\n",
+    "user_vector_matrix = dict()\n",
+    "movie_vector_matrix = dict()\n",
+    "\n",
+    "columns = ['userId', 'movieId', 'rating']\n",
+    "\n",
+    "# Keep only the users whose id leaves remainder 1 when divided by 100 (1, 101, 201, ...),\n",
+    "# selected with a boolean mask instead of a row-by-row Python loop.\n",
+    "sampled = df.loc[df['userId'] % 100 == 1, columns]\n",
+    "\n",
+    "# Rows whose position in the original file is a multiple of 4 are held out for testing;\n",
+    "# the remaining sampled rows are used for training.\n",
+    "is_test = sampled.index % 4 == 0\n",
+    "\n",
+    "test_df = sampled[is_test].reset_index(drop=True)\n",
+    "train_df = sampled[~is_test].reset_index(drop=True)\n",
+    "\n",
+    "# Largest movieId seen in training; the model below sizes its movie table from it.\n",
+    "max_train_movie = max(train_df['movieId'])\n",
+    "print(df)\n",
+    "print(test_df)\n",
+    "print(train_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "a6a8fb84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ALSStreamingModel:\n",
+    "    \"\"\"Streaming ALS-style recommender that re-solves one user vector per incoming rating.\n",
+    "\n",
+    "    Movie factors are drawn randomly in the constructor and are not modified by any\n",
+    "    method of this class; only the per-user vectors are (re)computed, by regularised\n",
+    "    least squares, each time a rating for that user is recorded.\n",
+    "\n",
+    "    Attributes:\n",
+    "        l: regularisation strength added to the diagonal of the normal equations.\n",
+    "        num_features: number of latent features per user / movie.\n",
+    "        alpha: stored, but not read by any method of this class.\n",
+    "        user_features: dict mapping userId -> (1, num_features) user vector.\n",
+    "        movie_features: (num_movies, num_features) array; row ``movieId - 1`` holds\n",
+    "            the factors of ``movieId``.\n",
+    "        ratings: dict mapping userId -> (1, num_movies) dense rating row, 0 = unrated.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, l, num_features, alpha, num_movies=None):\n",
+    "        \"\"\"Create a model with random movie factors and no known users.\n",
+    "\n",
+    "        Args:\n",
+    "            l: regularisation strength.\n",
+    "            num_features: latent dimensionality.\n",
+    "            alpha: kept for interface compatibility; currently unused.\n",
+    "            num_movies: number of movie rows to allocate (the largest movieId the\n",
+    "                model can store). When omitted, the notebook-level variable\n",
+    "                ``max_train_movie`` is used.\n",
+    "        \"\"\"\n",
+    "        if num_movies is None:\n",
+    "            # Fall back to the notebook global so existing three-argument calls keep working.\n",
+    "            num_movies = max_train_movie\n",
+    "        self.l = l\n",
+    "        self.num_features = num_features\n",
+    "        self.alpha = alpha\n",
+    "        self.user_features = dict()\n",
+    "        self.movie_features = np.random.randint(100, size=(num_movies, num_features))\n",
+    "        print(self.movie_features.shape)\n",
+    "        self.ratings = dict()\n",
+    "\n",
+    "    def fit(self, train):\n",
+    "        \"\"\"Feed every row of ``train`` through ``update_user_vector`` in order.\n",
+    "\n",
+    "        Args:\n",
+    "            train: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.\n",
+    "\n",
+    "        Returns:\n",
+    "            self, so calls can be chained.\n",
+    "        \"\"\"\n",
+    "        for row in train.itertuples():\n",
+    "            self.update_user_vector(row)\n",
+    "        return self\n",
+    "\n",
+    "    def _als_step(self, ratings, solve_vecs, fixed_vecs):\n",
+    "        \"\"\"Solve the regularised least-squares problem for one side of the factorisation.\n",
+    "\n",
+    "        Computes ``R F (F^T F + l I)^-1`` where ``R`` is ``ratings`` and ``F`` is\n",
+    "        ``fixed_vecs``.\n",
+    "\n",
+    "        Args:\n",
+    "            ratings: (n, num_items) array of rating rows.\n",
+    "            solve_vecs: previous value of the vectors being solved for. Unused: the\n",
+    "                closed-form solution does not depend on it. Kept for the interface.\n",
+    "            fixed_vecs: (num_items, num_features) array of the factors held fixed.\n",
+    "\n",
+    "        Returns:\n",
+    "            (n, num_features) array of solved vectors.\n",
+    "        \"\"\"\n",
+    "        gram = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.num_features) * self.l\n",
+    "        rhs = ratings.dot(fixed_vecs)\n",
+    "        # gram is symmetric, so rhs @ inv(gram) equals solve(gram, rhs.T).T;\n",
+    "        # solving the system avoids forming the explicit inverse.\n",
+    "        return np.linalg.solve(gram, rhs.T).T\n",
+    "\n",
+    "    def update_user_vector(self, row):\n",
+    "        \"\"\"Record one rating and re-solve that user's feature vector.\n",
+    "\n",
+    "        The user's rating row is dense: movies the user has not rated stay at 0 and\n",
+    "        still take part in the least-squares fit.\n",
+    "\n",
+    "        Args:\n",
+    "            row: object exposing ``userId``, ``movieId`` and ``rating`` attributes\n",
+    "                (for example a row from ``DataFrame.itertuples()``).\n",
+    "\n",
+    "        Raises:\n",
+    "            IndexError: if ``movieId`` is not in ``1..num_movies``.\n",
+    "        \"\"\"\n",
+    "        userId = row.userId\n",
+    "        movieId = row.movieId\n",
+    "        num_movies = self.movie_features.shape[0]\n",
+    "\n",
+    "        if not 1 <= movieId <= num_movies:\n",
+    "            # Without this check a movieId of 0 or below would silently wrap around\n",
+    "            # and overwrite a slot at the end of the rating row.\n",
+    "            raise IndexError(f\"movieId {movieId} is outside 1..{num_movies}\")\n",
+    "\n",
+    "        rating_vector = self.ratings.get(userId)\n",
+    "        if rating_vector is None:\n",
+    "            rating_vector = np.zeros((1, num_movies))\n",
+    "        rating_vector[0, movieId - 1] = row.rating\n",
+    "        self.ratings[userId] = rating_vector\n",
+    "\n",
+    "        self.user_features[userId] = self._als_step(\n",
+    "            rating_vector, self.user_features.get(userId), self.movie_features\n",
+    "        )\n",
+    "\n",
+    "    def predict_set(self, data):\n",
+    "        \"\"\"Predict every (userId, movieId) pair in ``data`` and score the result.\n",
+    "\n",
+    "        Args:\n",
+    "            data: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.\n",
+    "\n",
+    "        Returns:\n",
+    "            Mean squared error between the ``rating`` column and the predictions.\n",
+    "        \"\"\"\n",
+    "        actual = []\n",
+    "        predicted = []\n",
+    "        for row in data.itertuples():\n",
+    "            predicted.append(self.predict_rating(row.userId, row.movieId))\n",
+    "            actual.append(row.rating)\n",
+    "        return self.compute_mse(actual, predicted)\n",
+    "\n",
+    "    def predict_rating(self, userId, movieId):\n",
+    "        \"\"\"Predict a single rating as a float clipped to the range [0, 5].\n",
+    "\n",
+    "        Returns 0.0 when the user has no fitted vector or when ``movieId`` is not in\n",
+    "        ``1..num_movies``. A NaN prediction is reported as 5.0.\n",
+    "        \"\"\"\n",
+    "        if userId not in self.user_features:\n",
+    "            return 0.0\n",
+    "        # Range-check the id. ``movieId in self.movie_features`` would instead test\n",
+    "        # whether the number occurs as a value inside the factor matrix.\n",
+    "        if not 1 <= movieId <= self.movie_features.shape[0]:\n",
+    "            return 0.0\n",
+    "\n",
+    "        user_vector = np.ravel(self.user_features[userId])\n",
+    "        movie_vector = self.movie_features[movieId - 1]\n",
+    "        # Always hand back a plain float so the list of predictions is homogeneous.\n",
+    "        prediction = float(user_vector.dot(movie_vector))\n",
+    "        if np.isnan(prediction) or prediction > 5:\n",
+    "            return 5.0\n",
+    "        if prediction < 0:\n",
+    "            return 0.0\n",
+    "        return prediction\n",
+    "\n",
+    "    def compute_mse(self, y_true, y_pred):\n",
+    "        \"\"\"Return the mean squared error between ``y_true`` and ``y_pred``.\n",
+    "\n",
+    "        All entries are compared; zero values are not filtered out.\n",
+    "        \"\"\"\n",
+    "        return mean_squared_error(np.asarray(y_true), np.asarray(y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "285ebde1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(176371, 40)\n",
+      "16.894003342096422\n",
+      "16.5193922235925\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Seed NumPy's global RNG so the randomly drawn movie factors, and therefore the\n",
+    "# MSE values printed below, are the same on every Restart & Run All.\n",
+    "np.random.seed(0)\n",
+    "\n",
+    "REGULARIZATION = .01\n",
+    "NUM_FEATURES = 100\n",
+    "ALPHA = .1  # accepted by the constructor but not read by any model method\n",
+    "\n",
+    "als = ALSStreamingModel(REGULARIZATION, NUM_FEATURES, ALPHA)\n",
+    "als.fit(train_df)\n",
+    "print(als.predict_set(test_df))\n",
+    "print(als.predict_set(train_df))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.7.4 64-bit ('base': conda)",
+   "language": "python",
+   "name": "python37464bitbaseconda9114583a17cf498dbdf9713d49f5bef8"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}