From 92fe68471bc2b04ee2420f8c52f88ee2f1d420b3 Mon Sep 17 00:00:00 2001
From: Starry-Hu
Date: Fri, 16 Jun 2023 15:18:48 +0800
Subject: [PATCH 1/2] add `bash preprocess.sh` script line for `-t user` partition

---
 fedlab_benchmarks/leaf/README_zh_cn.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fedlab_benchmarks/leaf/README_zh_cn.md b/fedlab_benchmarks/leaf/README_zh_cn.md
index 395bec48..8301be17 100644
--- a/fedlab_benchmarks/leaf/README_zh_cn.md
+++ b/fedlab_benchmarks/leaf/README_zh_cn.md
@@ -80,6 +80,7 @@ bash preprocess.sh -s niid --sf 0.05 -k 0 -t sample
 cd fedlab_benchmarks/datasets/data/shakespeare
 bash preprocess.sh -s niid --sf 0.2 -k 0 -t sample
 # bash preprocess.sh -s niid --sf 1.0 -k 0 -t sample  # get 660 users (with default --tf 0.9)
+# bash preprocess.sh -s niid --sf 1.0 -k 0 -t user  # get 1129 users (with default --tf 0.9)
 # bash preprocess.sh -s iid --iu 1.0 --sf 1.0 -k 0 -t sample  # get all 1129 users
 
 cd fedlab_benchmarks/datasets/data/sent140

From bb7b9a15aeaf0adf4ca9b50955bdb7ad5769ed56 Mon Sep 17 00:00:00 2001
From: Starry-Hu
Date: Fri, 16 Jun 2023 15:21:42 +0800
Subject: [PATCH 2/2] Fixed bug with huge extra saving cost of attribute `Vocab` in Sent140Dataset

---
 .../leaf/dataset/sent140_dataset.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/fedlab_benchmarks/leaf/dataset/sent140_dataset.py b/fedlab_benchmarks/leaf/dataset/sent140_dataset.py
index e1548d70..abcdd73b 100644
--- a/fedlab_benchmarks/leaf/dataset/sent140_dataset.py
+++ b/fedlab_benchmarks/leaf/dataset/sent140_dataset.py
@@ -44,9 +44,7 @@ def __init__(self, client_id: int, client_str: str, data: list, targets: list,
         self.data_token = []
         self.data_tokens_tensor = []
         self.targets_tensor = []
-        self.vocab = None
         self.tokenizer = tokenizer if tokenizer else Tokenizer()
-        self.fix_len = None
 
         self._process_data_target()
         if is_to_tokens:
@@ -76,16 +74,14 @@ def encode(self, vocab: 'Vocab', fix_len: int):
         if len(self.data_tokens_tensor) > 0:
             self.data_tokens_tensor.clear()
             self.targets_tensor.clear()
-        self.vocab = vocab
-        self.fix_len = fix_len
-        pad_idx = self.vocab.get_index('<pad>')
+        pad_idx = vocab.get_index('<pad>')
         assert self.data_token is not None
         for tokens in self.data_token:
-            self.data_tokens_tensor.append(self.__encode_tokens(tokens, pad_idx))
+            self.data_tokens_tensor.append(self.__encode_tokens(tokens, vocab, pad_idx, fix_len))
         for target in self.targets:
             self.targets_tensor.append(torch.tensor(target))
 
-    def __encode_tokens(self, tokens, pad_idx) -> torch.Tensor:
+    def __encode_tokens(self, tokens, vocab, pad_idx, fix_len) -> torch.Tensor:
         """encode `fix_len` length for token_data to get indices list in `self.vocab`
         if one sentence length is shorter than fix_len, it will use pad word for padding to fix_len
         if one sentence length is longer than fix_len, it will cut the first max_words words
@@ -96,9 +92,9 @@ def __encode_tokens(self, tokens, pad_idx) -> torch.Tensor:
         Returns:
             integer list of indices with `fix_len` length for tokens input
         """
-        x = [pad_idx for _ in range(self.fix_len)]
-        for idx, word in enumerate(tokens[:self.fix_len]):
-            x[idx] = self.vocab.get_index(word)
+        x = [pad_idx for _ in range(fix_len)]
+        for idx, word in enumerate(tokens[:fix_len]):
+            x[idx] = vocab.get_index(word)
         return torch.tensor(x)
 
     def __len__(self):
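
After the second patch, `Sent140Dataset` no longer keeps `vocab` and `fix_len` as attributes; `encode()` takes them as arguments, so pickling a client dataset no longer serializes a copy of the shared `Vocab`. A minimal usage sketch of the new call pattern (the `vocab` object, `fix_len=300`, and the save path are illustrative assumptions, not taken from the patch):

import torch

# `vocab` is assumed to be the shared Vocab built once from the training corpus
# (its construction is not part of this patch); `dataset` is a Sent140Dataset
# for one client that has already been tokenized.
dataset.encode(vocab, fix_len=300)  # fix_len=300 is an assumed value

# After encode(), the dataset holds only index/label tensors, so saving it
# no longer drags a per-client copy of the vocabulary into the pickle.
torch.save(dataset, "sent140_client_0.pkl")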