-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdownsample.py
More file actions
33 lines (23 loc) · 855 Bytes
/
downsample.py
File metadata and controls
33 lines (23 loc) · 855 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Usage:
$ spark-submit downsample.py k random_seed
'''
import sys
from pyspark.sql import SparkSession
def downsample(spark, k, rand_seed, filename='full_interactions.parquet'):
    '''
    Write the interactions of a random sample of users to a parquet file.

    The distinct user_id values found in the input are sampled without
    replacement at a fraction of k / 100, and every interaction row that
    belongs to a sampled user is written to
    '{k}_percent/{k}_percent_subsamples.parquet', overwriting any existing
    output. Spark's sample() does not guarantee an exact fraction, so the
    number of users kept is only approximately k percent.

    Parameters
    ----------
    spark : SparkSession used to read the input and write the output
    k : percentage of users to keep, from 0 to 100
    rand_seed : seed handed to the sampler so a run can be reproduced
    filename : path of the parquet data holding the full interactions;
        it must contain a user_id column

    Raises
    ------
    ValueError : if k is outside the range [0, 100]
    '''
    # Reject an impossible percentage before any Spark work is started
    if not 0 <= k <= 100:
        raise ValueError(f'k must be between 0 and 100, got {k}')

    # Load entire data set
    full_interactions = spark.read.parquet(filename)
    full_users = full_interactions.select('user_id').distinct()

    # Sample k % of users
    k_percent_users = full_users.sample(
        withReplacement=False, fraction=k / 100, seed=rand_seed)

    # Joining back on user_id keeps every interaction of each sampled user
    k_percent_users_interactions = k_percent_users.join(
        full_interactions, on='user_id', how='left')

    k_percent_users_interactions.write.mode('overwrite').parquet(
        f'{k}_percent/{k}_percent_subsamples.parquet')
if __name__ == '__main__':
    # Two arguments are required after the script name: k and random_seed
    if len(sys.argv) < 3:
        sys.exit('Usage: spark-submit downsample.py k random_seed')

    # Parse the arguments first so a bad value fails before Spark starts
    k = int(sys.argv[1])
    rand_seed = int(sys.argv[2])

    spark = SparkSession.builder \
        .appName('downsample') \
        .getOrCreate()

    # The input path is passed explicitly because downsample takes it as
    # its fourth argument
    downsample(spark, k, rand_seed, 'full_interactions.parquet')