Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Development image for the devcontainer: Python 3.8 with a Japanese UTF-8
# locale, vim, and the Poetry tool chain.
FROM python:3.8
USER root

# Refresh the package index and install in the SAME layer: a separate
# `RUN apt-get update` layer can be served from the build cache, leaving later
# `apt-get install` layers to run against a stale index.
RUN apt-get update && \
    apt-get -y install locales vim && \
    localedef -f UTF-8 -i ja_JP ja_JP.UTF-8

# Locale / time-zone settings (key=value is the non-legacy ENV syntax).
ENV LANG=ja_JP.UTF-8
ENV LANGUAGE=ja_JP:ja
ENV LC_ALL=ja_JP.UTF-8
ENV TZ=JST-9

# Packaging tools used to install and build the project.
RUN pip install --upgrade pip
RUN pip install --upgrade setuptools
RUN pip install poetry poetry-dynamic-versioning
10 changes: 10 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
    "name": "PyTorchCML",
    "dockerComposeFile": "docker-compose.yml",
    "extensions": [
        "ms-python.python"
    ],
    "service": "python",
    "workspaceFolder": "/work",
    "shutdownAction": "stopCompose"
}
10 changes: 10 additions & 0 deletions .devcontainer/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
version: "3"

services:
  # Single development service, built from the Dockerfile in this directory.
  python:
    build: .
    volumes:
      # Mount the repository root at /work inside the container.
      - ../:/work
    # Keep the container alive so an editor can attach to it.
    command: sleep infinity

# NOTE(review): the line below is a leftover, unfinished draft of a build
# command; it is not executed. Kept verbatim for reference:
# chmod 777 ./work/build && .work/build.sh &&
47 changes: 45 additions & 2 deletions PyTorchCML/models/BaseEmbeddingModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
import torch
from torch import nn

import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

from ..adaptors import BaseAdaptor


Expand Down Expand Up @@ -44,15 +48,17 @@ def __init__(
)

else:
self.user_embedding = nn.Embedding.from_pretrained(user_embedding_init)
self.user_embedding = nn.Embedding.from_pretrained(
user_embedding_init)
self.user_embedding.weight.requires_grad = True

if item_embedding_init is None:
self.item_embedding = nn.Embedding(
n_item, n_dim, sparse=False, max_norm=max_norm
)
else:
self.item_embedding = nn.Embedding.from_pretrained(item_embedding_init)
self.item_embedding = nn.Embedding.from_pretrained(
item_embedding_init)
self.item_embedding.weight.requires_grad = True

def forward(
Expand Down Expand Up @@ -112,3 +118,40 @@ def get_item_weight(self, users: torch.Tensor) -> torch.Tensor:
torch.Tensor: Tensor of weight size (n, n_item)
"""
raise NotImplementedError

def get_topk_items(
    self,
    users: torch.Tensor,
    k: int,
    num_batch: int = 100,
    n_jobs: int = -1,
) -> pd.DataFrame:
    """Return the top-k highest-scored items for each user.

    Users are split into chunks of ``num_batch``; for every chunk, all
    (user, item) pairs over ``self.n_item`` items are scored with
    ``self.predict`` and the ``k`` best-scored rows per user are kept.

    Args:
        users (torch.Tensor): 1d tensor of user_id size (n).
        k (int): number of top items to keep per user.
        num_batch (int): number of users scored together in one batch.
        n_jobs (int): number of joblib workers (passed to ``Parallel``).

    Returns:
        pd.DataFrame: top-k items for each user with columns
            ["user", "item", "score"]. Rows are ordered by user (ascending)
            then score (descending) within each batch, and batches are
            concatenated in input order. The index is the per-batch row
            position, so it is not unique across batches.
    """
    batches = torch.split(users, num_batch)
    # Create the item ids on the same device as `users`; torch.cat below
    # raises if its inputs live on different devices.
    items = torch.arange(self.n_item, device=users.device)

    def predict_user(batch_users, k):
        # Repeat each user n_item times and tile the item ids once per
        # user, giving one row per (user, item) pair.
        users_expand = batch_users.expand(self.n_item, -1).T.reshape(-1, 1)
        items_expand = items.expand(len(batch_users), -1).reshape(-1, 1)
        pairs_tensor = torch.cat([users_expand, items_expand], dim=1)
        pairs_array = pairs_tensor.cpu().detach().numpy()
        pairs_df = pd.DataFrame(pairs_array, columns=['user', 'item'])
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            score_tensor = self.predict(pairs_tensor)
        pairs_df['score'] = score_tensor.cpu().detach().numpy()
        pairs_df = pairs_df.sort_values(
            by=["user", "score"], ascending=[True, False])
        return pairs_df.groupby("user").head(k)

    # joblib returns results in the order the tasks were submitted, so no
    # explicit index bookkeeping / re-sorting of batches is needed.
    scored = Parallel(n_jobs=n_jobs)(
        delayed(predict_user)(batch_users=batch_users, k=k)
        for batch_users in tqdm(batches)
    )
    return pd.concat(scored, axis=0)
15 changes: 10 additions & 5 deletions PyTorchCML/samplers/BaseSampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,17 @@ def __init__(
neutral_cpu = neutral.cpu()
not_negative = torch.cat([train_set_cpu, neutral_cpu])
self.not_negative_flag = csr_matrix(
(np.ones(not_negative.shape[0]), (not_negative[:, 0], not_negative[:, 1])),
(np.ones(not_negative.shape[0]),
(not_negative[:, 0], not_negative[:, 1])),
[n_user, n_item],
)
self.not_negative_flag.sum_duplicates()
self.not_negative_flag.data[:] = 1

# device
if device is None:
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
self.device = torch.device(
"cuda:0" if torch.cuda.is_available() else "cpu")
else:
self.device = device

Expand All @@ -86,7 +88,8 @@ def __init__(
pos_weight_pair = pos_weight[train_set[:, 0].cpu()]

else:
raise NotImplementedError
raise ValueError(
"The length of pos_weight does not match any of n_user, n_item, or n_positive_pair.")

else: # uniform
pos_weight_pair = torch.ones(train_set.shape[0])
Expand All @@ -108,7 +111,8 @@ def __init__(
self.neg_item_weight = torch.Tensor(neg_weight).to(self.device)

else:
raise NotImplementedError
raise ValueError(
"The length of neg_weight does not match any of n_user or n_item.")

def get_pos_batch(self) -> torch.Tensor:
"""Method for positive sampling.
Expand Down Expand Up @@ -152,6 +156,7 @@ def get_neg_batch(self, users: torch.Tensor) -> torch.Tensor:

else:
neg_sampler = Categorical(probs=self.neg_item_weight)
neg_samples = neg_sampler.sample([self.batch_size, self.n_neg_samples])
neg_samples = neg_sampler.sample(
[self.batch_size, self.n_neg_samples])

return neg_samples
20 changes: 13 additions & 7 deletions PyTorchCML/trainers/BaseTrainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm.auto import tqdm

from ..evaluators import BaseEvaluator
from ..losses import BaseLoss
Expand Down Expand Up @@ -66,13 +66,17 @@ def fit(
for b in pbar:
# batch sampling
batch = self.sampler.get_pos_batch()
users = batch[:, self.column_names["user_id"]].reshape(-1, 1)
pos_items = batch[:, self.column_names["item_id"]].reshape(-1, 1)
users = batch[:, self.column_names["user_id"]
].reshape(-1, 1)
pos_items = batch[:,
self.column_names["item_id"]].reshape(-1, 1)

if self.sampler.two_stage:
neg_candidates = self.sampler.get_and_set_candidates()
dist = self.model.spreadout_distance(pos_items, neg_candidates)
self.sampler.set_candidates_weight(dist, self.model.n_dim)
dist = self.model.spreadout_distance(
pos_items, neg_candidates)
self.sampler.set_candidates_weight(
dist, self.model.n_dim)

neg_items = self.sampler.get_neg_batch(users.reshape(-1))

Expand All @@ -83,7 +87,8 @@ def fit(
embeddings_dict = self.model(users, pos_items, neg_items)

# compute loss
loss = self.criterion(embeddings_dict, batch, self.column_names)
loss = self.criterion(
embeddings_dict, batch, self.column_names)

# adding loss for domain adaptation
if self.model.user_adaptor is not None:
Expand Down Expand Up @@ -117,4 +122,5 @@ def fit(
valid_scores_sub = valid_evaluator.score(self.model)
valid_scores_sub["epoch"] = ep + 1
valid_scores_sub["loss"] = accum_loss / n_batch
self.valid_scores = pd.concat([self.valid_scores, valid_scores_sub])
self.valid_scores = pd.concat(
[self.valid_scores, valid_scores_sub])
3 changes: 3 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Install the project's dependencies into an in-project virtualenv and build
# the distribution with Poetry.

# Stop at the first failing command (and on unset variables) so a failed
# install is not followed by a build.
set -eu

# Create the virtualenv inside the project directory.
poetry config virtualenvs.in-project true
poetry install
poetry build
Loading