Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
657 changes: 657 additions & 0 deletions recsys/als-half.ipynb

Large diffs are not rendered by default.

438 changes: 438 additions & 0 deletions recsys/als-jg-edits.ipynb

Large diffs are not rendered by default.

255 changes: 255 additions & 0 deletions recsys/als-stream-2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"id": "8e0e6a4f",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"from collections import defaultdict\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5b033873",
"metadata": {},
"outputs": [],
"source": [
"# Location of the ratings CSV that the loading cell passes to pd.read_csv.\n",
"# Set the RATINGS_PATH environment variable to point at your own copy; the\n",
"# fallback below is the machine-specific path this notebook was written with.\n",
"ratings_path = os.environ.get(\n",
"    \"RATINGS_PATH\",\n",
"    \"/Users/amitnarang/Downloads/ml-latest-small/ratings.csv\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "a00a310f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" userId movieId rating timestamp\n",
"0 1 1 4.0 964982703\n",
"1 1 3 4.0 964981247\n",
"2 1 6 4.0 964982224\n",
"3 1 47 5.0 964983815\n",
"4 1 50 5.0 964982931\n",
"... ... ... ... ...\n",
"100831 610 166534 4.0 1493848402\n",
"100832 610 168248 5.0 1493850091\n",
"100833 610 168250 5.0 1494273047\n",
"100834 610 168252 5.0 1493846352\n",
"100835 610 170875 3.0 1493846415\n",
"\n",
"[100836 rows x 4 columns]\n",
" userId movieId rating\n",
"0 1 1 4.0\n",
"1 1 50 5.0\n",
"2 1 151 5.0\n",
"3 1 223 3.0\n",
"4 1 296 3.0\n",
".. ... ... ...\n",
"180 601 112556 4.0\n",
"181 601 122916 3.5\n",
"182 601 152081 4.5\n",
"183 601 170705 5.0\n",
"184 601 177765 4.5\n",
"\n",
"[185 rows x 3 columns]\n",
" userId movieId rating\n",
"0 1 3 4.0\n",
"1 1 6 4.0\n",
"2 1 47 5.0\n",
"3 1 70 3.0\n",
"4 1 101 5.0\n",
".. ... ... ...\n",
"545 601 168326 4.0\n",
"546 601 170697 4.0\n",
"547 601 172591 4.5\n",
"548 601 174055 4.0\n",
"549 601 176371 4.0\n",
"\n",
"[550 rows x 3 columns]\n"
]
}
],
"source": [
"df = pd.read_csv(ratings_path, sep=',')\n",
"\n",
"user_vector_matrix = dict()\n",
"movie_vector_matrix = dict()\n",
"\n",
"columns = ['userId', 'movieId', 'rating']\n",
"\n",
"# Work on a small sample: only users whose id satisfies userId % 100 == 1.\n",
"sampled = df[df['userId'] % 100 == 1]\n",
"# Rows whose original index label is a multiple of 4 are held out for testing;\n",
"# the mask replaces a Python-level loop over every row of df.\n",
"held_out = sampled.index % 4 == 0\n",
"\n",
"test_df = sampled.loc[held_out, columns].reset_index(drop=True)\n",
"train_df = sampled.loc[~held_out, columns].reset_index(drop=True)\n",
"\n",
"# Plain-list views of both splits, kept under their original names.\n",
"test_data = [list(rec) for rec in test_df.itertuples(index=False, name=None)]\n",
"train_data = [list(rec) for rec in train_df.itertuples(index=False, name=None)]\n",
"\n",
"# Largest movieId seen in training; the model cell sizes its movie-factor matrix from it.\n",
"max_train_movie = int(train_df['movieId'].max())\n",
"print(df)\n",
"print(test_df)\n",
"print(train_df)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "a6a8fb84",
"metadata": {},
"outputs": [],
"source": [
"class ALSStreamingModel:\n",
"    \"\"\"Streaming ALS-style recommender with fixed movie factors.\n",
"\n",
"    Movie factors are drawn at random once, at construction, and are never\n",
"    updated. Each incoming rating triggers a regularised least-squares\n",
"    re-solve of that user's factor vector against the fixed movie factors.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, l, num_features, alpha, num_movies=None, seed=None):\n",
"        \"\"\"\n",
"        l: regularisation weight added to the diagonal of the normal equations\n",
"        num_features: length of every latent factor vector\n",
"        alpha: stored on the instance; no method of this class reads it\n",
"        num_movies: rows of the movie-factor matrix (row i belongs to movieId i + 1);\n",
"            defaults to the notebook-level max_train_movie\n",
"        seed: optional seed for the random movie factors, for reproducible runs\n",
"        \"\"\"\n",
"        if num_movies is None:\n",
"            # Backward-compatible default: size taken from the data-loading cell.\n",
"            num_movies = max_train_movie\n",
"        self.l = l\n",
"        self.num_features = num_features\n",
"        self.alpha = alpha\n",
"        self.user_features = dict()\n",
"        rng = np.random.RandomState(seed)\n",
"        self.movie_features = rng.randint(100, size=(num_movies, num_features))\n",
"        print(self.movie_features.shape)\n",
"        self.ratings = dict()\n",
"\n",
"    def fit(self, train):\n",
"        \"\"\"Pass every row of train (userId, movieId, rating) to update_user_vector; returns self.\"\"\"\n",
"        for row in train.itertuples():\n",
"            self.update_user_vector(row)\n",
"        return self\n",
"\n",
"    def _als_step(self, ratings, solve_vecs, fixed_vecs):\n",
"        \"\"\"Solve x (F^T F + l I) = R F for x.\n",
"\n",
"        ratings: R, shape (n, num_items)\n",
"        solve_vecs: previous solution; ignored because the solve is closed-form,\n",
"            kept so existing calls keep working\n",
"        fixed_vecs: F, shape (num_items, num_features)\n",
"        returns: array of shape (n, num_features)\n",
"\n",
"        Every column of R takes part in the solve, so entries that are still\n",
"        zero are fitted as ratings of 0.\n",
"        \"\"\"\n",
"        gram = fixed_vecs.T.dot(fixed_vecs) + np.eye(fixed_vecs.shape[1]) * self.l\n",
"        rhs = ratings.dot(fixed_vecs)\n",
"        # gram is symmetric, so x gram = rhs is the same system as gram x^T = rhs^T;\n",
"        # solving it directly avoids forming an explicit inverse.\n",
"        return np.linalg.solve(gram, rhs.T).T\n",
"\n",
"    def update_user_vector(self, row):\n",
"        \"\"\"Record one rating and re-solve that user's factor vector.\n",
"\n",
"        row: object with userId, movieId and rating attributes; movieId is 1-based\n",
"        raises IndexError: movieId has no row in the movie-factor matrix\n",
"        \"\"\"\n",
"        userId, movieId, rating = row.userId, row.movieId, row.rating\n",
"        num_movies = self.movie_features.shape[0]\n",
"        if not 1 <= movieId <= num_movies:\n",
"            # Without this check movieId 0 would index -1 and overwrite the last slot.\n",
"            raise IndexError('movieId %r is outside 1..%d' % (movieId, num_movies))\n",
"\n",
"        rating_vector = self.ratings.get(userId)\n",
"        if rating_vector is None:\n",
"            rating_vector = np.zeros((1, num_movies))\n",
"        rating_vector[0, movieId - 1] = rating\n",
"        self.ratings[userId] = rating_vector\n",
"\n",
"        self.user_features[userId] = self._als_step(\n",
"            rating_vector, self.user_features.get(userId), self.movie_features\n",
"        )\n",
"\n",
"    def predict_set(self, data):\n",
"        \"\"\"Return the mean squared error of predict_rating over the rows of data.\"\"\"\n",
"        correct_results = []\n",
"        predicted_results = []\n",
"        for row in data.itertuples():\n",
"            predicted_results.append(self.predict_rating(row.userId, row.movieId))\n",
"            correct_results.append(row.rating)\n",
"        return self.compute_mse(correct_results, predicted_results)\n",
"\n",
"    def predict_rating(self, userId, movieId):\n",
"        \"\"\"Predict one rating as a plain float clamped to [0, 5].\n",
"\n",
"        Returns 0.0 for a user with no factor vector yet or a movieId with no\n",
"        row in the movie-factor matrix; a NaN score is reported as 5.0.\n",
"        \"\"\"\n",
"        num_movies = self.movie_features.shape[0]\n",
"        # Bounds-check the index: `movieId in ndarray` would instead compare\n",
"        # movieId against the stored factor values.\n",
"        if userId not in self.user_features or not 1 <= movieId <= num_movies:\n",
"            return 0.0\n",
"        user_vector = self.user_features[userId]\n",
"        movie_vector = self.movie_features[movieId - 1]\n",
"        # user_vector is (1, num_features); collapse the one-element product to a scalar.\n",
"        prediction = float(np.ravel(user_vector.dot(movie_vector.T))[0])\n",
"        if np.isnan(prediction) or prediction > 5:\n",
"            return 5.0\n",
"        if prediction < 0:\n",
"            return 0.0\n",
"        return prediction\n",
"\n",
"    def compute_mse(self, y_true, y_pred):\n",
"        \"\"\"Mean squared error over all pairs; zero predictions are not filtered out.\"\"\"\n",
"        return mean_squared_error(np.asarray(y_true), np.asarray(y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "285ebde1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(176371, 40)\n",
"16.894003342096422\n",
"16.5193922235925\n"
]
}
],
"source": [
"# Fit on the training split (fit returns the model itself), then print the\n",
"# MSE on the held-out split followed by the MSE on the training split.\n",
"als = ALSStreamingModel(l=.01, num_features=100, alpha=.1).fit(train_df)\n",
"print(*(als.predict_set(split) for split in (test_df, train_df)), sep='\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.4 64-bit ('base': conda)",
"language": "python",
"name": "python37464bitbaseconda9114583a17cf498dbdf9713d49f5bef8"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading