diff --git a/README.md b/README.md index 8d74be0..0ca4b04 100644 --- a/README.md +++ b/README.md @@ -18,31 +18,31 @@ eg: eg: `python slackscrape.py -c C193MSB9J -will write channel messages to `general.json` in `output/channels/general/messages/` +will write channel messages to `general.json` in `output/channels/<channel_name>/messages/` ## get channels messages -`python get\_channels\_messages.py -u [optional update existing] -a [optional include archived]` +`python get_channels_messages.py -u [optional update existing] -a [optional include archived]` -eg: `python get\_channels\_messages.py -u +eg: `python get_channels_messages.py -u Will get all channels messages and update with any new messages it finds and write to `output/channels/<channel_name>/messages/<channel_name>.json` ## get channels info -`python get\_channels\_info.py -u [optional update existing]` +`python get_channels_info.py -u [optional update existing]` -eg: `python get\_channels\_info.py -u +eg: `python get_channels_info.py -u Will get all channels metadata and write to `output/channels/<channel_name>/info/<channel_name>.json` ## get users -`python get\_users.py -u [optional update existing]` +`python get_users.py -u [optional update existing]` -eg: `python get\_users.py -u +eg: `python get_users.py -u Will get all users and write to `output/users/members/<user>.json` diff --git a/get_channels_info.py b/get_channels_info.py index da1a72b..e771ba2 100755 --- a/get_channels_info.py +++ b/get_channels_info.py @@ -1,24 +1,12 @@ #!/usr/bin/env python -from json_utils import load_json, dump_json -from slackscrape import scrape_slack +from json_utils import * from slackclient import SlackClient import operator import argparse -import os config = load_json('./env.json') -def ensure_dir(directory): - if not os.path.exists(directory): - os.makedirs(directory) - - return directory - -if __name__ == '__main__': - ap = argparse.ArgumentParser() - ap.add_argument('-u', '--update', help = 'update channels', action="store_true") - args = vars(ap.parse_args()) - +def 
all_channels_info(args): channel_args = { 'exclude_archived': 0, } @@ -27,15 +15,22 @@ def ensure_dir(directory): response = sc.api_call('channels.list', **channel_args) channels = response['channels'] + return channels + +def store_channel_info(args): + sc = SlackClient(config['token']) + channels = all_channels_info(args) + for idx, channel in enumerate(channels): - chan_name = channel['name'] + chan_name = channel['name'].encode('utf-8') print('{} | {} - {} MEMBERS'.format(idx, chan_name, channel['num_members'])) chan_path = ensure_dir('./output/channels/{}'.format(chan_name)) info_path = ensure_dir('./output/channels/{}/info'.format(chan_name)) + output_path = '{}/{}.json'.format(info_path, chan_name) try: - old_json = load_json('{}/{}.json'.format(info_path, chan_name)) + old_json = load_json(output_path) if not args['update']: print('Already have channel {}, skipping ...'.format(chan_name)) continue @@ -48,7 +43,17 @@ def ensure_dir(directory): channel_info = sc.api_call('channels.info', **slack_args) try: - dump_json('{}/{}.json'.format(info_path, chan_name), channel_info) + dump_json(output_path, channel_info) except Exception as e: print('ERROR DUMPING {}'.format(chan_name)) print(e) + + return channels + +if __name__ == '__main__': + ap = argparse.ArgumentParser() + ap.add_argument('-u', '--update', help = 'update channels', action="store_true") + args = vars(ap.parse_args()) + + store_channel_info(args) + diff --git a/get_channels_messages.py b/get_channels_messages.py index aebd9f1..f8d20bb 100755 --- a/get_channels_messages.py +++ b/get_channels_messages.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from json_utils import load_json, dump_json +from json_utils import * from slackscrape import scrape_slack from slackclient import SlackClient import argparse @@ -8,12 +8,6 @@ config = load_json('./env.json') -def ensure_dir(directory): - if not os.path.exists(directory): - os.makedirs(directory) - - return directory - if __name__ == '__main__': ap = 
argparse.ArgumentParser() ap.add_argument('-a', '--archived', help = 'include archived channels', action="store_true") @@ -31,14 +25,14 @@ def ensure_dir(directory): sorted_channels = sorted(channels, key=lambda x: x['num_members'], reverse=True) for idx, channel in enumerate(sorted_channels): - chan_name = channel['name'] + chan_name = channel['name'].encode('utf-8') print('{} | {} - {} MEMBERS'.format(idx, chan_name, channel['num_members'])) chan_path = ensure_dir('./output/channels/{}'.format(chan_name)) - msg_path = ensure_dir('./output/channels/{}/messages'.format(chan_name)) - output = './output/channels/{}/messages/{}.json'.format(chan_name, chan_name) + msg_path = ensure_dir('{}/messages'.format(chan_path)) + dump_path = '{}/{}.json'.format(msg_path, chan_name) try: - old_json = load_json(output) + old_json = load_json(dump_path) if not args['update']: print('Aready have messages, skipping...') continue @@ -55,4 +49,4 @@ def ensure_dir(directory): if len(new_messages): all_messages = new_messages + old_json - dump_json(output, all_messages) + dump_json(dump_path, all_messages) diff --git a/json_utils.py b/json_utils.py index 01bad59..e4f4570 100644 --- a/json_utils.py +++ b/json_utils.py @@ -3,6 +3,7 @@ from operator import itemgetter from datetime import datetime import time +import os def load_json(path): with io.open(path, encoding='utf-8') as f: @@ -11,3 +12,9 @@ def load_json(path): def dump_json(path, data): with open(path, mode='w') as f: json.dump(data, f, indent = 2) + +def ensure_dir(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + return directory diff --git a/slackscrape.py b/slackscrape.py index f320647..45fc5c8 100755 --- a/slackscrape.py +++ b/slackscrape.py @@ -1,12 +1,14 @@ #!/usr/bin/env python -from json_utils import load_json, dump_json +from json_utils import * from slackclient import SlackClient +from get_channels_info import * import argparse def get_messages(sc, slack_args, messages, filter_func): history 
= sc.api_call("channels.history", **slack_args) - last_ts = history['messages'][-1]['ts'] if history['has_more'] else False - filtered = list(filter(filter_func, history['messages'])) + last_ts = history['messages'][-1]['ts'] if ('has_more' in history and history['has_more']) else False + hist_messages = history['messages'] if ('messages' in history) else [] + filtered = list(filter(filter_func, hist_messages)) all_messages = messages + filtered print('Fetched {} messages. {} Total now.'.format(len(filtered), len(all_messages))) @@ -26,6 +28,12 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): print('Done fetching messages. Found {} in total.'.format(len(results['messages']))) return results['messages'] +def find_channel_by(key, val, return_key='name'): + channels = all_channels_info('') + for chan in channels: + if chan[key] == val: + return chan[return_key] + if __name__ == '__main__': config = load_json('./env.json') @@ -34,16 +42,22 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): ap.add_argument('-o', '--output', help = 'file to save out') args = vars(ap.parse_args()) channel = args['channel'] - output = args['output'] + + channel_name = find_channel_by('id', channel) + print channel_name + output = args['output'] or channel_name + + chan_path = ensure_dir('./output/channels/{}/messages/'.format(channel_name)) + dump_path = '{}/{}.json'.format(chan_path, output) try: - old_json = load_json(output) + old_json = load_json(dump_path) except Exception as e: old_json = [] print('No existing messages, starting from scratch...') slack_args = { - 'channel': config['channel_id'], + 'channel': channel, 'oldest': old_json[0]['ts'] if len(old_json) else '', } @@ -51,4 +65,4 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): if len(new_messages): all_messages = new_messages + old_json - dump_json(output, all_messages) + dump_json(dump_path, all_messages)