From a494dd9d5c3f69fef1028b07346bf7c297ff1fd6 Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 18:27:56 -0400 Subject: [PATCH 01/12] correctly call channel --- slackscrape.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/slackscrape.py b/slackscrape.py index f320647..13d5b81 100755 --- a/slackscrape.py +++ b/slackscrape.py @@ -36,6 +36,7 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): channel = args['channel'] output = args['output'] + try: old_json = load_json(output) except Exception as e: @@ -43,7 +44,7 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): print('No existing messages, starting from scratch...') slack_args = { - 'channel': config['channel_id'], + 'channel': channel, 'oldest': old_json[0]['ts'] if len(old_json) else '', } From 1e1a898a342d444ecac439153503b0b4c54fe9ef Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 18:41:37 -0400 Subject: [PATCH 02/12] enable output arg to work for channel dumps --- json_utils.py | 7 +++++++ slackscrape.py | 10 ++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/json_utils.py b/json_utils.py index 01bad59..e4f4570 100644 --- a/json_utils.py +++ b/json_utils.py @@ -3,6 +3,7 @@ from operator import itemgetter from datetime import datetime import time +import os def load_json(path): with io.open(path, encoding='utf-8') as f: @@ -11,3 +12,9 @@ def load_json(path): def dump_json(path, data): with open(path, mode='w') as f: json.dump(data, f, indent = 2) + +def ensure_dir(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + return directory diff --git a/slackscrape.py b/slackscrape.py index 13d5b81..6e2786b 100755 --- a/slackscrape.py +++ b/slackscrape.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from json_utils import load_json, dump_json +from json_utils import * from slackclient import SlackClient import argparse @@ -34,11 +34,13 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): 
ap.add_argument('-o', '--output', help = 'file to save out') args = vars(ap.parse_args()) channel = args['channel'] - output = args['output'] + output = args['output'] or 'general' + channel_path = ensure_dir('./output/channels/{}/messages/'.format(channel)) + dump_path = '{}/{}.json'.format(channel_path, output) try: - old_json = load_json(output) + old_json = load_json(dump_path) except Exception as e: old_json = [] print('No existing messages, starting from scratch...') @@ -52,4 +54,4 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): if len(new_messages): all_messages = new_messages + old_json - dump_json(output, all_messages) + dump_json(dump_path, all_messages) From eda6e0754cc1536da83a2f544c67ed047c28dcc4 Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 18:47:44 -0400 Subject: [PATCH 03/12] update info script to use common utils --- get_channels_info.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/get_channels_info.py b/get_channels_info.py index da1a72b..0513eea 100755 --- a/get_channels_info.py +++ b/get_channels_info.py @@ -1,19 +1,12 @@ #!/usr/bin/env python -from json_utils import load_json, dump_json +from json_utils import * from slackscrape import scrape_slack from slackclient import SlackClient import operator import argparse -import os config = load_json('./env.json') -def ensure_dir(directory): - if not os.path.exists(directory): - os.makedirs(directory) - - return directory - if __name__ == '__main__': ap = argparse.ArgumentParser() ap.add_argument('-u', '--update', help = 'update channels', action="store_true") From 0392bbe9c9ffc757c981c0f6928e2277a4edc730 Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 19:03:37 -0400 Subject: [PATCH 04/12] correctly handle channels with UTF-8 names --- get_channels_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/get_channels_info.py b/get_channels_info.py index 0513eea..98dc866 100755 --- a/get_channels_info.py +++ 
b/get_channels_info.py @@ -21,7 +21,7 @@ channels = response['channels'] for idx, channel in enumerate(channels): - chan_name = channel['name'] + chan_name = channel['name'].encode('utf-8') print('{} | {} - {} MEMBERS'.format(idx, chan_name, channel['num_members'])) chan_path = ensure_dir('./output/channels/{}'.format(chan_name)) From 82294285d9e5b80b25157e54ae3921306f6762a8 Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 19:50:29 -0400 Subject: [PATCH 05/12] make get_channel_info an importable CLI --- get_channels_info.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/get_channels_info.py b/get_channels_info.py index 98dc866..689fa52 100755 --- a/get_channels_info.py +++ b/get_channels_info.py @@ -7,11 +7,7 @@ config = load_json('./env.json') -if __name__ == '__main__': - ap = argparse.ArgumentParser() - ap.add_argument('-u', '--update', help = 'update channels', action="store_true") - args = vars(ap.parse_args()) - +def channel_info(args): channel_args = { 'exclude_archived': 0, } @@ -45,3 +41,13 @@ except Exception as e: print('ERROR DUMPING {}'.format(chan_name)) print(e) + + return channels + +if __name__ == '__main__': + ap = argparse.ArgumentParser() + ap.add_argument('-u', '--update', help = 'update channels', action="store_true") + args = vars(ap.parse_args()) + + channel_info(args) + From 9c2c5dac9a09ee00ea5f0c59b163816ce7b8c62f Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 20:22:44 -0400 Subject: [PATCH 06/12] channel getter correctly does lookup of chan name --- slackscrape.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/slackscrape.py b/slackscrape.py index 6e2786b..3b733f4 100755 --- a/slackscrape.py +++ b/slackscrape.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from json_utils import * from slackclient import SlackClient +from get_channels_info import * import argparse def get_messages(sc, slack_args, messages, filter_func): @@ -26,6 +27,12 @@ def
scrape_slack(token, slack_args, filter_func = lambda x: x): print('Done fetching messages. Found {} in total.'.format(len(results['messages']))) return results['messages'] +def find_channel_by(key, val): + channels = channel_info('') + for chan in channels: + if chan['id'] == channel: + return chan['name'] + if __name__ == '__main__': config = load_json('./env.json') @@ -36,7 +43,10 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): channel = args['channel'] output = args['output'] or 'general' - channel_path = ensure_dir('./output/channels/{}/messages/'.format(channel)) + channel_name = find_channel_by('id', channel) + print channel_name + + channel_path = ensure_dir('./output/channels/{}/messages/'.format(channel_name)) dump_path = '{}/{}.json'.format(channel_path, output) try: From 85226a371afc9a196153e85c1822562fcc54a97c Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 20:38:06 -0400 Subject: [PATCH 07/12] import of channel info module does not invoke storage --- get_channels_info.py | 11 ++++++++--- slackscrape.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/get_channels_info.py b/get_channels_info.py index 689fa52..d4d39e0 100755 --- a/get_channels_info.py +++ b/get_channels_info.py @@ -1,13 +1,12 @@ #!/usr/bin/env python from json_utils import * -from slackscrape import scrape_slack from slackclient import SlackClient import operator import argparse config = load_json('./env.json') -def channel_info(args): +def all_channels_info(args): channel_args = { 'exclude_archived': 0, } @@ -16,6 +15,12 @@ def channel_info(args): response = sc.api_call('channels.list', **channel_args) channels = response['channels'] + return channels + +def store_channel_info(args): + sc = SlackClient(config['token']) + channels = all_channels_info(args) + for idx, channel in enumerate(channels): chan_name = channel['name'].encode('utf-8') print('{} | {} - {} MEMBERS'.format(idx, chan_name, channel['num_members'])) @@ -49,5 +54,5 @@ def 
channel_info(args): ap.add_argument('-u', '--update', help = 'update channels', action="store_true") args = vars(ap.parse_args()) - channel_info(args) + store_channel_info(args) diff --git a/slackscrape.py b/slackscrape.py index 3b733f4..1c578e6 100755 --- a/slackscrape.py +++ b/slackscrape.py @@ -28,7 +28,7 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): return results['messages'] def find_channel_by(key, val): - channels = channel_info('') + channels = all_channels_info('') for chan in channels: if chan['id'] == channel: return chan['name'] From 98decad3e3b7a0b8d9e851603a491156678e3b5f Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 20:39:29 -0400 Subject: [PATCH 08/12] correct info in README --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8d74be0..0ca4b04 100644 --- a/README.md +++ b/README.md @@ -18,31 +18,31 @@ eg: eg: `python slackscrape.py -c C193MSB9J -will write channel messages to `general.json` in `output/channels/general/messages/` +will write channel messages to `general.json` in `output/channels//messages/` ## get channels messages -`python get\_channels\_messages.py -u [optional update existing] -a [optional include archived]` +`python get_channels_messages.py -u [optional update existing] -a [optional include archived]` -eg: `python get\_channels\_messages.py -u +eg: `python get_channels_messages.py -u Will get all channels messages and update with any new messages it finds and write to `output/channels//messages/.json` ## get channels info -`python get\_channels\_info.py -u [optional update existing]` +`python get_channels_info.py -u [optional update existing]` -eg: `python get\_channels\_info.py -u +eg: `python get_channels_info.py -u Will get all channels metadata and write to `output/channels//info/.json` ## get users -`python get\_users.py -u [optional update existing]` +`python get_users.py -u [optional update existing]` -eg: `python 
get\_users.py -u +eg: `python get_users.py -u Will get all users and write to `output/users/members/.json` From d835bc4bd4812cced965e082089aca9758a4ee56 Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 20:58:37 -0400 Subject: [PATCH 09/12] find_channel_by correctly uses input. Custom return key --- slackscrape.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/slackscrape.py b/slackscrape.py index 1c578e6..562e3e4 100755 --- a/slackscrape.py +++ b/slackscrape.py @@ -27,11 +27,11 @@ def scrape_slack(token, slack_args, filter_func = lambda x: x): print('Done fetching messages. Found {} in total.'.format(len(results['messages']))) return results['messages'] -def find_channel_by(key, val): +def find_channel_by(key, val, return_key='name'): channels = all_channels_info('') for chan in channels: - if chan['id'] == channel: - return chan['name'] + if chan[key] == val: + return chan[return_key] if __name__ == '__main__': config = load_json('./env.json') From d1010402f06365077855822f796c1b5d31867db0 Mon Sep 17 00:00:00 2001 From: ZJ Date: Fri, 23 Aug 2019 21:00:18 -0400 Subject: [PATCH 10/12] get_channels_messages uses common lib --- get_channels_messages.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/get_channels_messages.py b/get_channels_messages.py index aebd9f1..f5e8331 100755 --- a/get_channels_messages.py +++ b/get_channels_messages.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from json_utils import load_json, dump_json +from json_utils import * from slackscrape import scrape_slack from slackclient import SlackClient import argparse @@ -8,12 +8,6 @@ config = load_json('./env.json') -def ensure_dir(directory): - if not os.path.exists(directory): - os.makedirs(directory) - - return directory - if __name__ == '__main__': ap = argparse.ArgumentParser() ap.add_argument('-a', '--archived', help = 'include archived channels', action="store_true") From b708b3cecd583100fa0c55a991363af8665424f7 Mon Sep 17 00:00:00 2001 
From: ZJ Date: Fri, 23 Aug 2019 21:12:33 -0400 Subject: [PATCH 11/12] pre-check certain key lookups --- slackscrape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slackscrape.py b/slackscrape.py index 562e3e4..0d962ab 100755 --- a/slackscrape.py +++ b/slackscrape.py @@ -6,8 +6,8 @@ def get_messages(sc, slack_args, messages, filter_func): history = sc.api_call("channels.history", **slack_args) - last_ts = history['messages'][-1]['ts'] if history['has_more'] else False - filtered = list(filter(filter_func, history['messages'])) + last_ts = history['messages'][-1]['ts'] if ('has_more' in history and history['has_more']) else False + filtered = list(filter(filter_func, ('messages' in history and history['messages']))) all_messages = messages + filtered print('Fetched {} messages. {} Total now.'.format(len(filtered), len(all_messages))) From 25f11e7a3b3115ff3cad815e88cebd853d65f1b9 Mon Sep 17 00:00:00 2001 From: ZJ Date: Sat, 24 Aug 2019 00:10:11 -0400 Subject: [PATCH 12/12] fix "in" assertions. 
normalize names --- get_channels_info.py | 5 +++-- get_channels_messages.py | 10 +++++----- slackscrape.py | 9 +++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/get_channels_info.py b/get_channels_info.py index d4d39e0..e771ba2 100755 --- a/get_channels_info.py +++ b/get_channels_info.py @@ -27,9 +27,10 @@ def store_channel_info(args): chan_path = ensure_dir('./output/channels/{}'.format(chan_name)) info_path = ensure_dir('./output/channels/{}/info'.format(chan_name)) + output_path = '{}/{}.json'.format(info_path, chan_name) try: - old_json = load_json('{}/{}.json'.format(info_path, chan_name)) + old_json = load_json(output_path) if not args['update']: print('Already have channel {}, skipping ...'.format(chan_name)) continue @@ -42,7 +43,7 @@ def store_channel_info(args): channel_info = sc.api_call('channels.info', **slack_args) try: - dump_json('{}/{}.json'.format(info_path, chan_name), channel_info) + dump_json(output_path, channel_info) except Exception as e: print('ERROR DUMPING {}'.format(chan_name)) print(e) diff --git a/get_channels_messages.py b/get_channels_messages.py index f5e8331..f8d20bb 100755 --- a/get_channels_messages.py +++ b/get_channels_messages.py @@ -25,14 +25,14 @@ sorted_channels = sorted(channels, key=lambda x: x['num_members'], reverse=True) for idx, channel in enumerate(sorted_channels): - chan_name = channel['name'] + chan_name = channel['name'].encode('utf-8') print('{} | {} - {} MEMBERS'.format(idx, chan_name, channel['num_members'])) chan_path = ensure_dir('./output/channels/{}'.format(chan_name)) - msg_path = ensure_dir('./output/channels/{}/messages'.format(chan_name)) - output = './output/channels/{}/messages/{}.json'.format(chan_name, chan_name) + msg_path = ensure_dir('{}/messages'.format(chan_path)) + dump_path = '{}/{}.json'.format(msg_path, chan_name) try: - old_json = load_json(output) + old_json = load_json(dump_path) if not args['update']: print('Aready have messages, skipping...') continue @@ -49,4 
+49,4 @@ if len(new_messages): all_messages = new_messages + old_json - dump_json(output, all_messages) + dump_json(dump_path, all_messages) diff --git a/slackscrape.py b/slackscrape.py index 0d962ab..45fc5c8 100755 --- a/slackscrape.py +++ b/slackscrape.py @@ -7,7 +7,8 @@ def get_messages(sc, slack_args, messages, filter_func): history = sc.api_call("channels.history", **slack_args) last_ts = history['messages'][-1]['ts'] if ('has_more' in history and history['has_more']) else False - filtered = list(filter(filter_func, ('messages' in history and history['messages']))) + hist_messages = history['messages'] if ('messages' in history) else [] + filtered = list(filter(filter_func, hist_messages)) all_messages = messages + filtered print('Fetched {} messages. {} Total now.'.format(len(filtered), len(all_messages))) @@ -41,13 +42,13 @@ def find_channel_by(key, val, return_key='name'): ap.add_argument('-o', '--output', help = 'file to save out') args = vars(ap.parse_args()) channel = args['channel'] - output = args['output'] or 'general' channel_name = find_channel_by('id', channel) print channel_name + output = args['output'] or channel_name - channel_path = ensure_dir('./output/channels/{}/messages/'.format(channel_name)) - dump_path = '{}/{}.json'.format(channel_path, output) + chan_path = ensure_dir('./output/channels/{}/messages/'.format(channel_name)) + dump_path = '{}/{}.json'.format(chan_path, output) try: old_json = load_json(dump_path)