From 2f10ca335c071cc802dc858bc4b16dcfc9dd1f0b Mon Sep 17 00:00:00 2001 From: Elodie Date: Fri, 5 Jun 2020 17:42:17 +0200 Subject: [PATCH 1/6] plugin oncrawl v1.0.0 --- oncrawl/code-env/python/desc.json | 7 + oncrawl/code-env/python/spec/requirements.txt | 2 + .../custom-recipes/data_queries/recipe.json | 30 + oncrawl/custom-recipes/data_queries/recipe.py | 127 ++++ oncrawl/js/data_queries_controller.js | 234 ++++++ oncrawl/plugin.json | 15 + oncrawl/python-lib/oncrawl/__init__.py | 55 ++ oncrawl/python-lib/oncrawl/oncrawlDataAPI.py | 202 ++++++ oncrawl/resource/behave.js | 673 ++++++++++++++++++ oncrawl/resource/data_queries.html | 142 ++++ oncrawl/resource/data_queries.py | 156 ++++ oncrawl/resource/functions.js | 161 +++++ oncrawl/resource/styles.css | 109 +++ 13 files changed, 1913 insertions(+) create mode 100644 oncrawl/code-env/python/desc.json create mode 100644 oncrawl/code-env/python/spec/requirements.txt create mode 100644 oncrawl/custom-recipes/data_queries/recipe.json create mode 100644 oncrawl/custom-recipes/data_queries/recipe.py create mode 100644 oncrawl/js/data_queries_controller.js create mode 100644 oncrawl/plugin.json create mode 100644 oncrawl/python-lib/oncrawl/__init__.py create mode 100644 oncrawl/python-lib/oncrawl/oncrawlDataAPI.py create mode 100644 oncrawl/resource/behave.js create mode 100644 oncrawl/resource/data_queries.html create mode 100644 oncrawl/resource/data_queries.py create mode 100644 oncrawl/resource/functions.js create mode 100644 oncrawl/resource/styles.css diff --git a/oncrawl/code-env/python/desc.json b/oncrawl/code-env/python/desc.json new file mode 100644 index 00000000..a233995d --- /dev/null +++ b/oncrawl/code-env/python/desc.json @@ -0,0 +1,7 @@ +{ + "pythonInterpreter": "PYTHON36", + "acceptedPythonInterpreters": ["PYTHON36"], + "forceConda": false, + "installCorePackages": true, + "installJupyterSupport": false +} \ No newline at end of file diff --git a/oncrawl/code-env/python/spec/requirements.txt b/oncrawl/code-env/python/spec/requirements.txt new file mode 100644 index 00000000..978bb47d --- /dev/null +++ b/oncrawl/code-env/python/spec/requirements.txt @@ -0,0 +1,2 @@ +prison +pendulum \ No newline at end of file diff --git a/oncrawl/custom-recipes/data_queries/recipe.json b/oncrawl/custom-recipes/data_queries/recipe.json new file mode 100644 index 00000000..4b47f320 --- /dev/null +++ b/oncrawl/custom-recipes/data_queries/recipe.json @@ -0,0 +1,30 @@ +{ + "meta": { + "label": "OnCrawl data queries", + "description": "Export URLs or aggregations from crawls or log monitoring events", + "icon": "icon-globe", + "iconColor": "sky" + }, + + "kind": "PYTHON", + + "inputRoles": [], + + "outputRoles": [ + { + "name": "output", + "label": "output", + "description": "output", + "arity": "UNARY", + "required": true, + "acceptsDataset": true + } + ], + + 'paramsModule' : 'oncrawl-data_queries.module', + 'paramsPythonSetup' : 'data_queries.py', + 'paramsTemplate' : 'data_queries.html', + + "params": [] + +} diff --git a/oncrawl/custom-recipes/data_queries/recipe.py b/oncrawl/custom-recipes/data_queries/recipe.py new file mode 100644 index 00000000..fe941a3f --- /dev/null +++ b/oncrawl/custom-recipes/data_queries/recipe.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- +import dataiku +from dataiku.customrecipe import * +import requests +import json +from oncrawl import oncrawlDataAPI as ocd + +output_names = get_output_names_for_role('output') +output_datasets = [dataiku.Dataset(name) for name in output_names] +output = output_datasets[0] + 
+#------------------------------config & vars +config = get_recipe_config() + +headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + 'Authorization' : 'Bearer {}'.format(config['api_key']) +} + +#--according index, ids are related to projects or crawls - each id represent a crawl or a project +if config['index'] != 'logs': + ids = config['list_configs_crawls'][config['crawl_config']] + if config['crawls_id'] != 'all': + ids = [config['crawls_id']] +else: + ids = config['list_projects_id_name'].keys() + if config['projects_id'] != 'all': + ids = [config['projects_id'].split(',')[0]] + +#------------------------------schema +#fields not returned by oncrawl API +metadata = { + 'project_id': 'string', + 'project_name': 'string', + 'crawl_id': 'string', + 'config_name': 'string', + 'crawl_start_timestamp': 'bigint' + } + +metadata_fields = ocd.build_schema_from_metadata(config, metadata) + +schema = { + 'dataset_schema': metadata_fields['schema'], + 'dataset_schema_field_list': metadata_fields['list'] +} +fields_to_request_by_ids = {} + +for i, id in enumerate(ids): + + progress = '#{} {} {}/{}'.format(id, 'crawls' if config['index'] != 'logs' else 'projects', (i+1), len(ids)) + + #when aggregating data, all items have the same schema + if config['data_action'] == 'aggs': + + if i == 0: + f = ocd.build_schema_from_config(config=config) + schema['dataset_schema'] = schema['dataset_schema'] + f + print('############################\r\n############################\r\nBuild dataset schema with: ', progress) + else: + break + else: + + print('############################\r\n############################\r\nBuild dataset schema with: ', progress) + #when exporting data, for many reasons items might not all have the same schema + #return new fields to add to dataset schema and all fields to request for this item + f = ocd.build_schema_from_oncrawl(config=config, id=id, headers=headers, schema=schema) + + if 'item_schema' not in f.keys() or len(f['item_schema']) == 0: + continue + + schema['dataset_schema'] = schema['dataset_schema'] + f['dataset_schema'] + schema['dataset_schema_field_list'] = schema['dataset_schema_field_list'] + f['dataset_schema_field_list'] + + fields_to_request_by_ids[id] = f['item_schema'] + +output.write_schema(schema['dataset_schema']) + +#------------------------------data & writer +total_count = 0 +with output.get_writer() as writer: + + for i, id in enumerate(ids): + + #this case happens when a project has no log feature or an unexpected ES issue occurs + if config['data_action'] == 'export' and id not in fields_to_request_by_ids.keys(): + continue + + progress = '#{} {} {}/{}'.format(id, 'crawls' if config['index'] != 'logs' else 'projects', (i+1), len(ids)) + print('############################\r\n############################\r\nGet data for: ', progress) + + metadata_value = ocd.fill_metadata(config, id) + if config['data_action'] == 'export': + data = ocd.export(config_oql=config['oql'], fields=fields_to_request_by_ids[id], config_index=config['index'], id=id, headers=headers) + else: + data = ocd.aggs(config_oql=config['oql'], config_index=config['index'], id=id, headers=headers) + + count_result = 0 + try: + for json_line in data: + + row = metadata_value + [] + + if config['data_action'] == 'export': + #oncrawl export API sends values not in the same order as the schema... 
+ for field in schema['dataset_schema_field_list']: + if field not in list(metadata.keys()): + if field in list(json_line.keys()): + if field in ['title', 'meta_description', 'h1'] and json_line[field] is not None: + row.append(json_line[field].encode(encoding = 'utf8', errors = 'replace')) + else: + row.append(json_line[field]) + else: + row.append(None) + else: + row = row + list(json_line.values()) + + writer.write_row_array(row) + count_result += 1 + print(progress, 'row: ',count_result) + print(progress, ': total row recorded: ', count_result, '\r\n############################\r\n############################') + + except Exception as e: + raise Exception('{}'.format(e)) + + total_count += 1 + diff --git a/oncrawl/js/data_queries_controller.js b/oncrawl/js/data_queries_controller.js new file mode 100644 index 00000000..c4a261cb --- /dev/null +++ b/oncrawl/js/data_queries_controller.js @@ -0,0 +1,234 @@ +var app = angular.module('oncrawl-data_queries.module', []); + +app.controller('oncrawl_data_queries', function($scope) { + + //Behave.js is a lightweight library for adding IDE style behaviors to plain text areas, making it much more enjoyable to write code in. + var editor = new Behave({ + textarea: document.getElementById('oql') + }); + + //init default vars + $scope.api_error = null + $scope.oql_error = null + + if(!$scope.config.date_kind) + { + $scope.config.date_kind = 'relative'; + } + $scope.toggle_date = false; + if($scope.config.date_kind == 'absolute') + $scope.toggle_date = true; + + if(!$scope.config.date_filter_time_cursor) + { + $scope.config.date_filter_time_cursor = 'current'; + $scope.config.date_filter_unit = 'month'; + $scope.config.date_filter_include_today = true; + $scope.config.date_filter_type = true; + + } + if(!$scope.config.date_filter_num_unit) + { + $scope.config.date_filter_num_unit = 1; + } + if(!$scope.config.data_action) + { + $scope.config.data_action = 'aggs'; + } + $scope.toggle_action = false; + if($scope.config.data_action == 'export') + $scope.toggle_action = true; + + if(!$scope.config.index) + { + $scope.config.index = 'pages'; + } + + $scope.selectDefaultCrawls = function() + { + $scope.config.crawls_id = selectDefaultCrawls($scope); + } + $scope.$watchGroup(['date_start_yyyy_mm_dd', 'date_end_yyyy_mm_dd'], updateDatesRange); + function updateDatesRange(o, n) + { + if (!$scope.date_start_yyyy_mm_dd && !$scope.date_end_yyyy_mm_dd) + return + + $scope.config.oql = build_oql($scope); + if($scope.config.index != 'logs') + { + $scope.get_crawls(); + } + } + $scope.build_date_range = function() + { + $scope.config.date_kind ='relative' + if($scope.toggle_date) + { + $scope.config.date_kind = 'absolute' + } + + $scope.date_start_yyyy_mm_dd = ""; + $scope.date_end_yyyy_mm_dd = ""; + + + if($scope.config.date_kind == 'absolute') + { + if(!$scope.config.override_date_start_yyyy_mm_dd || !$scope.config.override_date_end_yyyy_mm_dd) + { + return; + } + } + if($scope.config.date_kind == 'relative') + { + if(!$scope.config.date_filter_num_unit) + { + return; + } + } + + $scope.callPythonDo({'method': 'build_date_range' + }).then(function(response) { + try + { + $scope.date_start_yyyy_mm_dd = response.start + $scope.date_end_yyyy_mm_dd = response.end + } + catch(e) { + $scope.api_error = response.error + } + + }, function(response) { + $scope.api_error = "Unexpected error occurred" + }); + } + $scope.build_date_range(); + + + $scope.build_oql = function(reset=false) + { + $scope.config.data_action ='aggs' + if($scope.toggle_action) + { + 
$scope.config.data_action = 'export' + } + + $scope.config.oql = build_oql($scope, reset) + } + $scope.check_oql = function() + { + $scope.oql_error = null; + $scope.config.oql = document.getElementById('oql').value + try + { + if($scope.config.oql) + { + JSON.parse($scope.config.oql) + } + + //build oql if empty and add default required missing fields + $scope.config.oql = prettyPrint(build_oql($scope)) + document.getElementById('oql').value = $scope.config.oql + } + catch(e) + { + $scope.oql_error = e; + } + } + + + if($scope.config.list_projects_id_name) + { + $scope.num_projects = Object.keys($scope.config.list_projects_id_name).length; + } + + if($scope.config.list_configs_crawls) + { + $scope.num_configs = Object.keys($scope.config.list_configs_crawls).length; + } + + + $scope.get_projects = function() + { + + $scope.callPythonDo({'method': 'get_projects', + 'offset': $scope.config.projects_filter_offset || 0, + 'limit': $scope.config.projects_filter_limit || null, + 'sort': $scope.config.projects_filter_sort || 'name:asc' + }).then(function(response) { + try + { + + $scope.api_error = null + + $scope.config.list_projects_id_name = response.projects; + + $scope.num_projects = Object.keys($scope.config.list_projects_id_name).length; + + if(Object.keys(response.projects).length > 1 && !$scope.config.projects_id) + { + $scope.config.projects_id = 'all'; + } + + $scope.get_crawls(); + + } + catch(e) + { + $scope.api_error = response.error + } + },function(response) { + $scope.api_error = "Unexpected error occurred" + }); + } + + + $scope.get_crawls = function() + { + + if(!$scope.config.projects_id) + { + return; + } + + if(!$scope.date_start_yyyy_mm_dd || !$scope.date_end_yyyy_mm_dd) + { + return; + } + + if($scope.config.index == 'logs') + { + return; + } + + $scope.callPythonDo({'method': 'get_crawls', + 'projects_id': $scope.config.projects_id, + 'date_start_yyyy_mm_dd' : $scope.date_start_yyyy_mm_dd, + 'date_end_yyyy_mm_dd' : $scope.date_end_yyyy_mm_dd, + 'index': $scope.config.index + }).then(function(response) { + try + { + + $scope.config.list_configs_crawls = response.configs; + $scope.config.list_crawls_project = response.crawls; + $scope.num_configs = Object.keys($scope.config.list_configs_crawls).length; + if(!$scope.config.crawl_config) + { + $scope.config.crawl_config = Object.keys(response.configs)[0]; + } + + $scope.selectDefaultCrawls(); + + } + catch(e) { + $scope.api_error = response.error + } + + }, function(response) { + $scope.api_error = "Unexpected error occurred "+response + }); + + } + +}); \ No newline at end of file diff --git a/oncrawl/plugin.json b/oncrawl/plugin.json new file mode 100644 index 00000000..700ded51 --- /dev/null +++ b/oncrawl/plugin.json @@ -0,0 +1,15 @@ +{ + "id": "oncrawl", + "version": "1.0.0", + + "meta": { + + "label": "Oncrawl", + "description": "Export URLs or aggregations from crawls or log monitoring events", + "author": "Cogniteev", + "icon": "icon-globe", + "tags": ["SEO", "Logs", "Crawler"], + "url": "https://www.oncrawl.com", + "licenseInfo": "Apache 2" + } +} diff --git a/oncrawl/python-lib/oncrawl/__init__.py b/oncrawl/python-lib/oncrawl/__init__.py new file mode 100644 index 00000000..10f472e8 --- /dev/null +++ b/oncrawl/python-lib/oncrawl/__init__.py @@ -0,0 +1,55 @@ +#from datetime import datetime, timedelta +from calendar import monthrange +import pendulum +import json + +def build_date_range(config): + + # work with date string to support manual date override + # do not forget that range requested is [[ => always add 1 day 
!! + date_start_yyyy_mm_dd = "" + date_end_yyyy_mm_dd = "" + + if config['date_kind'] == 'relative': + + # use user timezone + datetime_reference = pendulum.now() + #user_tz = datetime_reference.timezone.name + + datetime_reference_first_day_month = datetime_reference.start_of('month') + datetime_reference_last_day_month = datetime_reference.end_of('month') + + datetime_reference_first_day_week = datetime_reference.start_of('week') + datetime_reference_last_day_week = datetime_reference.end_of('week') + + if config['date_filter_time_cursor'] == 'current': + if config['date_filter_unit'] == 'month': + date_start_yyyy_mm_dd = datetime_reference_first_day_month.strftime('%Y-%m-%d') + date_end_yyyy_mm_dd = datetime_reference_last_day_month.add(days=1).strftime('%Y-%m-%d') + + if config['date_filter_unit'] == 'day': + date_start_yyyy_mm_dd = datetime_reference.strftime('%Y-%m-%d') + date_end_yyyy_mm_dd = datetime_reference.add(days=1).strftime('%Y-%m-%d') + + if config['date_filter_time_cursor'] == 'previous': + if config['date_filter_unit'] == 'month': + date_start_yyyy_mm_dd = datetime_reference_first_day_month.subtract(months=config['date_filter_num_unit']).strftime('%Y-%m-%d') + date_end_yyyy_mm_dd = datetime_reference_first_day_month.strftime('%Y-%m-%d') + + if config['date_filter_include_today']: + date_end_yyyy_mm_dd = datetime_reference_last_day_month.add(days=1).strftime('%Y-%m-%d') + + if config['date_filter_unit'] == 'day': + date_start_yyyy_mm_dd = datetime_reference.subtract(days=config['date_filter_num_unit']).strftime('%Y-%m-%d') + date_end_yyyy_mm_dd = datetime_reference.strftime('%Y-%m-%d') + + if config['date_filter_include_today']: + date_end_yyyy_mm_dd = datetime_reference.add(days=1).strftime('%Y-%m-%d') + + else: + + date_start_yyyy_mm_dd = config['override_date_start_yyyy_mm_dd'] + date_end_yyyy_mm_dd = config['override_date_end_yyyy_mm_dd'] + + return {'start': date_start_yyyy_mm_dd, 'end': date_end_yyyy_mm_dd} + diff --git a/oncrawl/python-lib/oncrawl/oncrawlDataAPI.py b/oncrawl/python-lib/oncrawl/oncrawlDataAPI.py new file mode 100644 index 00000000..8c3971c6 --- /dev/null +++ b/oncrawl/python-lib/oncrawl/oncrawlDataAPI.py @@ -0,0 +1,202 @@ +import requests +import json + +endpoint_by_index = { + 'pages': {'end_point' : 'crawl/__id__/pages'}, + 'links': {'end_point' : 'crawl/__id__/links'}, + 'logs': {'end_point' : 'project/__id__/log_monitoring/events'} +} + +def map_dss_storage_type(field_type): + + mapping = { + 'bool':'boolean', + 'float':'double', + 'ratio':'double', + 'object':'object' + } + + dss_type = mapping.get(field_type, 'string') + + return dss_type + +def fill_metadata(config, id): + + p_id = id + if config['index'] != 'logs': + p_id = config['list_crawls_project'][id]['project_id'] + + v = [p_id, config['list_projects_id_name'][p_id]] + if config['index'] != 'logs': + v = v + [id, config['crawl_config'], config['list_crawls_project'][id]['created_at']] + + return v + + +def build_schema_from_metadata(config, metadata): + + fields = list(metadata.keys()) + if config['index'] == 'logs': + fields = list(metadata.keys())[:2] + + f = { + 'list': fields, + 'schema': [{'name': field, 'type': metadata[field]} for field in fields] + } + + return f + +def build_schema_from_oncrawl(config, id, headers, schema): + + f = {'dataset_schema': [], 'dataset_schema_field_list': [], 'item_schema': []} + + fields = get_fields(config_index=config['index'], id=id, headers=headers) + + try: + for field in fields['fields']: + + if field['can_display']: + + #item field list + 
f['item_schema'].append(field['name']) + + #look for new fields to add to dataset schema + if field['name'] not in schema['dataset_schema_field_list']: + field_type = map_dss_storage_type(field['type']) + + f['dataset_schema'].append({ + "name": field['name'], + "type": field_type, + }) + + f['dataset_schema_field_list'].append(field['name']) + + except Exception as e: + print('############################\r\nProject {} has no logs monitoring feature\n\r############################'.format(id)) + + return f + +def build_schema_from_config(config): + + f = [] + oql = json.loads(config['oql']) + + for i, agg in enumerate(oql['aggs']): + + field_type = 'bigint' + if 'fields' in agg.keys(): + field_type = 'object' + + f.append({ + "name":agg['name'], + "type": field_type, + }) + + return f + +def get_fields(id, config_index, headers): + + endpoint = endpoint_by_index[config_index]['end_point'].replace('__id__', id) + + try: + + #get fields = dataset cols + get_fields = requests.request('GET', 'https://app.oncrawl.com/api/v2/data/{}/fields'.format(endpoint), headers=headers) + get_fields.raise_for_status() + + fields = get_fields.json() + + return fields + + except requests.exceptions.HTTPError as e: + if config_index != 'logs' and get_fields.status_code != 403: + raise Exception('{}-{}'.format(str(e), get_fields.text)) + else: + return + + except Exception as e: + raise Exception(e) + + +def export(config_oql, fields, config_index, id, headers): + + endpoint = endpoint_by_index[config_index]['end_point'].replace('__id__', id) + + #oql = oncrawl query language - interface to query our ES + oql = json.loads(config_oql)['oql'] + body = { + + 'oql' : oql, + 'fields' : fields, + 'file_type':'json' + } + + #get urls = dataset rows + try: + export = requests.request('POST', 'https://app.oncrawl.com/api/v2/data/{}?export=true'.format(endpoint), json=body, headers=headers, stream=True) + export.raise_for_status() + + for line in export.iter_lines(): + json_line = json.loads(line) + + yield json_line + + except requests.exceptions.HTTPError as e: + if config_index != 'logs' and export.status_code != 403: + raise Exception('{}-{}'.format(str(e), export.text)) + + except Exception as e: + raise Exception(e) + + +def aggs(config_oql, config_index, id, headers): + + endpoint = endpoint_by_index[config_index]['end_point'].replace('__id__', id) + + oql = json.loads(config_oql)['aggs'] + + body = { + + 'aggs' : oql + } + + try: + + get_data = requests.request('POST', 'https://app.oncrawl.com/api/v2/data/{}/aggs?fmt=row_objects'.format(endpoint), json=body, headers=headers) + get_data.raise_for_status() + + data = get_data.json() + + agg_value = {} + for j, agg in enumerate(data['aggs']): + cols = agg['cols'] + col_name = cols[-1] + agg_name = oql[j].get('name') if oql[j].get('name') else col_name + + if agg_name and agg_name in agg_value: + agg_name = '{}_{}'.format(agg_name,j) + + if len(agg['rows']) == 1: + agg_value[agg_name] = agg['rows'][0][col_name] + else: + agg_value[agg_name] = agg['rows'] + + json_line = agg_value + + yield json_line + + except requests.exceptions.HTTPError as e: + if config_index != 'logs' and get_data.status_code != 403: + raise Exception('{}-{}'.format(str(e), get_data.text)) + + except Exception as e: + error = e + + if data.get('aggs')[0].get('error'): + error = data.get('aggs')[0].get('error') + + if config_index != 'logs': + raise Exception(error) + + + \ No newline at end of file diff --git a/oncrawl/resource/behave.js b/oncrawl/resource/behave.js new file mode 100644 index 
00000000..2d735bf6 --- /dev/null +++ b/oncrawl/resource/behave.js @@ -0,0 +1,673 @@ +/* + * Behave.js + * + * Copyright 2013, Jacob Kelley - http://jakiestfu.com/ + * Released under the MIT Licence + * http://opensource.org/licenses/MIT + * + * Github: http://github.com/jakiestfu/Behave.js/ + * Version: 1.5 + */ + + +(function(undefined){ + + 'use strict'; + + var BehaveHooks = BehaveHooks || (function(){ + var hooks = {}; + + return { + add: function(hookName, fn){ + if(typeof hookName == "object"){ + var i; + for(i=0; i>> 0; + if (typeof func != "function"){ + throw new TypeError(); + } + var res = [], + thisp = arguments[1]; + for (var i = 0; i < len; i++) { + if (i in t) { + var val = t[i]; + if (func.call(thisp, val, i, t)) { + res.push(val); + } + } + } + return res; + }; + } + + var defaults = { + textarea: null, + replaceTab: true, + softTabs: true, + tabSize: 4, + autoOpen: true, + overwrite: true, + autoStrip: true, + autoIndent: true, + fence: false + }, + tab, + newLine, + charSettings = { + + keyMap: [ + { open: "\"", close: "\"", canBreak: false }, + { open: "'", close: "'", canBreak: false }, + { open: "(", close: ")", canBreak: false }, + { open: "[", close: "]", canBreak: true }, + { open: "{", close: "}", canBreak: true } + ] + + }, + utils = { + + _callHook: function(hookName, passData){ + var hooks = BehaveHooks.get(hookName); + passData = typeof passData=="boolean" && passData === false ? false : true; + + if(hooks){ + if(passData){ + var theEditor = defaults.textarea, + textVal = theEditor.value, + caretPos = utils.cursor.get(), + i; + + for(i=0; i -1) { + start = end = len; + } else { + start = -textInputRange.moveStart("character", -len); + start += normalizedValue.slice(0, start).split(newLine).length - 1; + + if (textInputRange.compareEndPoints("EndToEnd", endRange) > -1) { + end = len; + } else { + end = -textInputRange.moveEnd("character", -len); + end += normalizedValue.slice(0, end).split(newLine).length - 1; + } + } + } + } + + return start==end ? false : { + start: start, + end: end + }; + } + }, + editor: { + getLines: function(textVal){ + return (textVal).split("\n").length; + }, + get: function(){ + return defaults.textarea.value.replace(/\r/g,''); + }, + set: function(data){ + defaults.textarea.value = data; + } + }, + fenceRange: function(){ + if(typeof defaults.fence == "string"){ + + var data = utils.editor.get(), + pos = utils.cursor.get(), + hacked = 0, + matchedFence = data.indexOf(defaults.fence), + matchCase = 0; + + while(matchedFence>=0){ + matchCase++; + if( pos < (matchedFence+hacked) ){ + break; + } + + hacked += matchedFence+defaults.fence.length; + data = data.substring(matchedFence+defaults.fence.length); + matchedFence = data.indexOf(defaults.fence); + + } + + if( (hacked) < pos && ( (matchedFence+hacked) > pos ) && matchCase%2===0){ + return true; + } + return false; + } else { + return true; + } + }, + isEven: function(_this,i){ + return i%2; + }, + levelsDeep: function(){ + var pos = utils.cursor.get(), + val = utils.editor.get(); + + var left = val.substring(0, pos), + levels = 0, + i, j; + + for(i=0; i=0 ? 
finalLevels : 0; + }, + deepExtend: function(destination, source) { + for (var property in source) { + if (source[property] && source[property].constructor && + source[property].constructor === Object) { + destination[property] = destination[property] || {}; + utils.deepExtend(destination[property], source[property]); + } else { + destination[property] = source[property]; + } + } + return destination; + }, + addEvent: function addEvent(element, eventName, func) { + if (element.addEventListener){ + element.addEventListener(eventName,func,false); + } else if (element.attachEvent) { + element.attachEvent("on"+eventName, func); + } + }, + removeEvent: function addEvent(element, eventName, func){ + if (element.addEventListener){ + element.removeEventListener(eventName,func,false); + } else if (element.attachEvent) { + element.detachEvent("on"+eventName, func); + } + }, + + preventDefaultEvent: function(e){ + if(e.preventDefault){ + e.preventDefault(); + } else { + e.returnValue = false; + } + } + }, + intercept = { + tabKey: function (e) { + + if(!utils.fenceRange()){ return; } + + if (e.keyCode == 9) { + utils.preventDefaultEvent(e); + + var toReturn = true; + utils._callHook('tab:before'); + + var selection = utils.cursor.selection(), + pos = utils.cursor.get(), + val = utils.editor.get(); + + if(selection){ + + var tempStart = selection.start; + while(tempStart--){ + if(val.charAt(tempStart)=="\n"){ + selection.start = tempStart + 1; + break; + } + } + + var toIndent = val.substring(selection.start, selection.end), + lines = toIndent.split("\n"), + i; + + if(e.shiftKey){ + for(i = 0; i 1 && value.some(function(v) { return !Number.isInteger(v); })) { + return value; + } + } + for (var index in value) { + if (typeof(value[index]) === 'object') { + return value; // Don't support arrays that contain objects (little too tricky of a shot) + } else if (typeof(value[index]) === 'string') { // Keep the double quotes + value[index] = '###"###' + value[index] + '###"###'; + } + } + return '###[###' + value.join('###,### ') + '###]###'; +}; + +function prettyStringify(json, extraFormatting = null, spacing = 4) { + const results = JSON.stringify(json, function(key, value) { let result = formatJsonVtStandard(key, value); if (extraFormatting && extraFormatting !== null) { result = extraFormatting(key, result); } return result; }, spacing); + const results1 = results.split('\"###[###').join('['); // string.replace is sidelined: does not replace all occurrences + const results2 = results1.split('###]###\"').join(']'); // string.replace is sidelined: does not replace all occurrences + const results3 = results2.split('###\\"###').join('\"'); // string.replace is sidelined: does not replace all occurrences + const results4 = results3.split('###,###').join(','); // string.replace is sidelined: does not replace all occurrences + return results4; +} \ No newline at end of file diff --git a/oncrawl/resource/data_queries.html b/oncrawl/resource/data_queries.html new file mode 100644 index 00000000..7643fc2a --- /dev/null +++ b/oncrawl/resource/data_queries.html @@ -0,0 +1,142 @@ + + + + + + +
+
+ {{api_error}} +
+ +
+ Authentication +
+ +
+ + + +
+
+
+ +
+ Data + +
+ +
+ + {{num_projects}} + +
+
+
+ +
+ + +
+
+
+ +
+ + +
+
+ +
+ +
+ + + + + + +
+
+
+ +
+ + + +
+
+ +
+ +
+ + {{num_configs}} + +
+
+
+ +
+ + {{config.list_configs_crawls[config.crawl_config].length}} + +
+
+
+ +
+ Output + +
+ +
+ + +
+
+
+ +
+ +
+ {{oql_error}} +
+ + ! How to write OQL + Clear the field to reset +
+
+ +
+
diff --git a/oncrawl/resource/data_queries.py b/oncrawl/resource/data_queries.py new file mode 100644 index 00000000..e480e422 --- /dev/null +++ b/oncrawl/resource/data_queries.py @@ -0,0 +1,156 @@ +import requests +import prison +import pendulum +import oncrawl as oc +import json + +def do(payload, config): + + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + } + + if 'method' not in payload: + return {} + + #handle date range + if payload['method'] == 'build_date_range': + + dates = oc.build_date_range(config=config) + + date_start_yyyy_mm_dd = dates['start'] + date_end_yyyy_mm_dd = dates['end'] + + return {'start': date_start_yyyy_mm_dd, 'end': date_end_yyyy_mm_dd} + + #get projects + if payload["method"] == "get_projects": + + headers['Authorization'] = 'Bearer {}'.format(config['api_key']) + + filters = () + offset = 0 + limit = 1000 + sort = 'name:asc' + + if 'offset' in payload: + offset = payload['offset'] + + if 'limit' in payload and payload['limit'] is not None: + limit = payload['limit'] + + if 'sort' in payload: + sort = payload['sort'].replace(" ", "") + + response = { + 'projects':{}, + } + + while offset is not None: + + try: + + get_projects = requests.get('https://app.oncrawl.com/api/v2/projects?filters={}&offset={}&limit={}&sort={}'.format(filters, offset, limit, sort), headers = headers) + get_projects.raise_for_status() + projects = get_projects.json() + + offset = projects['meta']['offset'] + projects['meta']['limit'] + + for project in projects['projects']: + response['projects'][project['id']] = project['name'] + + assert offset <= projects['meta']['total'] + + except AssertionError: + offset = None + + except requests.exceptions.HTTPError: + offset = None + response = {'error' : 'error : {}'.format(get_projects)} + + except Exception as e: + offset = None + response = {'error' : get_projects} + + return response + + #get crawls + if payload["method"] == "get_crawls": + + headers['Authorization'] = 'Bearer {}'.format(config['api_key']) + + offset = 0 + limit = 1000 + + try: + assert payload['projects_id'] == 'all' + projects_id = list(config['list_projects_id_name'].keys()) + + except AssertionError as error: + projects_id = [config['projects_id'].split(',')[0]] + + + # work with date as string to support manual date override + # do not forget that range requested is [[ => always add 1 day !! 
+ dates = oc.build_date_range(config) + date_start_yyyy_mm_dd = dates['start'] + date_end_yyyy_mm_dd = dates['end'] + + user_tz = pendulum.now().timezone.name + + crawl_start_timestamp = pendulum.parse(date_start_yyyy_mm_dd, tz=user_tz).timestamp() * 1000 + crawl_end_timestamp = pendulum.parse(date_end_yyyy_mm_dd, tz=user_tz).timestamp() * 1000 + + filters = { + "and" : [ + {"field": ["status", "equals", "done"]}, + {"field": ["created_at", "gte", crawl_start_timestamp]}, + {"field": ["created_at", "lt", crawl_end_timestamp]}, + {"field": ["project_id", "one_of", projects_id]} + ] + } + + response = { + 'configs':{}, + 'crawls':{}, + } + + try: + + while offset is not None: + + get_crawls = requests.get('https://app.oncrawl.com/api/v2/crawls?filters={}&offset={}&limit={}&sort=created_at:desc'.format(prison.dumps(filters), offset, limit), headers = headers) + get_crawls.raise_for_status() + + crawls = get_crawls.json() + offset = crawls['meta']['offset'] + crawls['meta']['limit'] + + for crawl in crawls['crawls']: + + if ( config['index'] == 'pages' and crawl['status'] == 'done') or (config['index'] == 'links' and crawl['link_status'] == 'live'): + + if crawl['crawl_config']['name'] not in response['configs']: + response['configs'][crawl['crawl_config']['name']] = [] + + response['configs'][crawl['crawl_config']['name']].append(crawl['id']) + response['crawls'][crawl['id']] = { + "project_id": crawl['project_id'], + "created_at": crawl['created_at'], + "ended_at": crawl['ended_at'], + } + + assert offset <= crawls['meta']['total'] + + except AssertionError: + offset = None + + except requests.exceptions.HTTPError as e: + offset = None + response = {'error' : 'merguez error : {} {}'.format(str(e), get_crawls.text)} + + except Exception as e: + response = {'error' : get_crawls} + + return response + \ No newline at end of file diff --git a/oncrawl/resource/functions.js b/oncrawl/resource/functions.js new file mode 100644 index 00000000..f4f863d2 --- /dev/null +++ b/oncrawl/resource/functions.js @@ -0,0 +1,161 @@ +function prettyPrint(oql) { + + try + { + var pretty_oql = JSON.parse(oql); + } + catch(e) + { + var pretty_oql = oql + } + + pretty_oql = JSON.stringify(pretty_oql, undefined, 4); + + return pretty_oql; +} + +function build_oql(scope, reset) +{ + let oql_templates = { + 'aggs' : { + 'pages' : '{"aggs":[{"oql":{"field":["fetched","equals","true"]},"name":"agg_name"}]}', + 'links' : '{"aggs":[{"oql":{"field":["target_fetched","equals", "true"]},"name":"agg_name"}]}', + 'logs': '{"aggs":[{"oql":{"and":[{"field":["event_is_bot_hit","equals","true"]}]},"name":"agg_name"}]}' + }, + 'export' : { + 'pages' : '{"oql":{"field":["fetched","equals", "true"]}}', + 'links' : '{"oql":{"field":["target_fetched","equals", "true"]}}', + 'logs' : '{"oql":{"field":["event_is_bot_hit","equals", "true"]}}' + } + } + + let oql = scope.config.oql; + if(!oql || reset) + { + oql = oql_templates[scope.config.data_action][scope.config.index] + } + + let oql_parsed = JSON.parse(oql); + + //add agg name if missing or rebuild aggs if missing oql node + if(scope.config.data_action == 'aggs') + { + + if(Object.keys(oql_parsed).indexOf('aggs') < 0 || oql_parsed.aggs.length == 0) + { + oql = oql_templates[scope.config.data_action][scope.config.index]; + oql_parsed = JSON.parse(oql); + } + else + { + //alert(JSON.stringify(oql_parsed.aggs)) + let agg_name = [] + for(let i=0; i add if null + if(Object.keys(oql)[0] != 'and') + { + if(oql.field[0] != 'event_day' && oql.field[0] != 'event_datetime') + { + 
oql_content.push(oql) + } + } + else + { + //update oql to update date_start_yyyy_mm_dd and date_end_yyyy_mm_dd + for (let i=0; i Date: Tue, 30 Jun 2020 15:59:37 +0200 Subject: [PATCH 2/6] add a way to get last crawl, add a link to contact us to get an api key, fix PR reviewer feedbacks --- .../python-lib/oncrawl/oncrawlProjectAPI.py | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 oncrawl/python-lib/oncrawl/oncrawlProjectAPI.py diff --git a/oncrawl/python-lib/oncrawl/oncrawlProjectAPI.py b/oncrawl/python-lib/oncrawl/oncrawlProjectAPI.py new file mode 100644 index 00000000..0f4f3f9a --- /dev/null +++ b/oncrawl/python-lib/oncrawl/oncrawlProjectAPI.py @@ -0,0 +1,112 @@ +import requests +import prison + +headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', +} + +def build_human_error(response): + + r = 'Please apologize, something bad happened' + + if response.status_code == 401: + r = 'Your API key seems to be invalid. Please check it and contact us if the error persists.' + + return r + +def get_projects(api_key): + + headers['Authorization'] = 'Bearer {}'.format(api_key) + + offset = 0 + limit = 1000 + sort = 'name:asc' + + response = [] + + while offset is not None: + + try: + + r = requests.get('https://app.oncrawl.com/api/v2/projects?&offset={}&limit={}&sort={}'.format(offset, limit, sort), headers = headers) + r.raise_for_status() + items = r.json() + + offset = items['meta']['offset'] + items['meta']['limit'] + + for item in items['projects']: + response.append({'id': item['id'], 'name':item['name']}) + + assert offset <= items['meta']['total'] + + except AssertionError: + offset = None + + except requests.exceptions.HTTPError: + offset = None + response = {'error': build_human_error(r)} + + except Exception as e: + offset = None + response = {'error' : e} + + return response + +def get_live_crawls(config, projects_id, timestamp_range, limit=None): + + headers['Authorization'] = 'Bearer {}'.format(config['api_key']) + + offset = 0 + + if limit is None: + limit = 1000 + + filters = { + "and" : [ + {"field": ["status", "equals", "done"]}, + {"field": ["created_at", "gte", timestamp_range['start']]}, + {"field": ["created_at", "lt", timestamp_range['end']]}, + {"field": ["project_id", "one_of", projects_id]} + ] + } + + try: + + response = [] + while offset is not None: + + r = requests.get('https://app.oncrawl.com/api/v2/crawls?filters={}&offset={}&limit={}&sort=created_at:desc'.format(prison.dumps(filters), offset, limit), headers = headers) + r.raise_for_status() + + items = r.json() + offset = items['meta']['offset'] + items['meta']['limit'] + + for item in items['crawls']: + + if ( config['index'] == 'pages' and item['status'] == 'done') or (config['index'] == 'links' and item['link_status'] == 'live'): + + response.append( + { + 'id': item['id'], + 'config_name': item['crawl_config']['name'], + 'project_id': item['project_id'], + 'created_at': item['created_at'], + 'ended_at': item['ended_at'] + } + ) + + assert offset <= items['meta']['total'] + + except AssertionError: + offset = None + + except requests.exceptions.HTTPError as e: + offset = None + response = {'error': build_human_error(r)} + + except Exception as e: + response = {'error' : e} + + return response + From c5e05136c61e5e16c3f45be1d91376e3a0fddbdf Mon Sep 17 00:00:00 2001 From: Elodie Date: Tue, 30 Jun 2020 16:01:52 +0200 Subject: [PATCH 3/6] add a way to get last crawl, add a link to contact us to get an api key, fix PR reviewer feedbacks --- 
.../custom-recipes/data_queries/recipe.json | 8 +- oncrawl/custom-recipes/data_queries/recipe.py | 80 +++++++++-- oncrawl/js/data_queries_controller.js | 7 +- oncrawl/python-lib/oncrawl/__init__.py | 14 ++ oncrawl/resource/data_queries.html | 5 +- oncrawl/resource/data_queries.py | 131 +++++------------- oncrawl/resource/functions.js | 5 +- 7 files changed, 134 insertions(+), 116 deletions(-) diff --git a/oncrawl/custom-recipes/data_queries/recipe.json b/oncrawl/custom-recipes/data_queries/recipe.json index 4b47f320..66c05036 100644 --- a/oncrawl/custom-recipes/data_queries/recipe.json +++ b/oncrawl/custom-recipes/data_queries/recipe.json @@ -1,7 +1,7 @@ { "meta": { "label": "OnCrawl data queries", - "description": "Export URLs or aggregations from crawls or log monitoring events", + "description": "

Export URLs or aggregations from crawls or log monitoring events.

Contact us to get your API key

", "icon": "icon-globe", "iconColor": "sky" }, @@ -21,9 +21,9 @@ } ], - 'paramsModule' : 'oncrawl-data_queries.module', - 'paramsPythonSetup' : 'data_queries.py', - 'paramsTemplate' : 'data_queries.html', + "paramsModule" : "oncrawl-data_queries.module", + "paramsPythonSetup" : "data_queries.py", + "paramsTemplate" : "data_queries.html", "params": [] diff --git a/oncrawl/custom-recipes/data_queries/recipe.py b/oncrawl/custom-recipes/data_queries/recipe.py index fe941a3f..30ec2a51 100644 --- a/oncrawl/custom-recipes/data_queries/recipe.py +++ b/oncrawl/custom-recipes/data_queries/recipe.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- import dataiku -from dataiku.customrecipe import * -import requests -import json +from dataiku.customrecipe import get_output_names_for_role, get_recipe_config +import oncrawl as oc from oncrawl import oncrawlDataAPI as ocd +from oncrawl import oncrawlProjectAPI as ocp output_names = get_output_names_for_role('output') output_datasets = [dataiku.Dataset(name) for name in output_names] @@ -12,22 +12,80 @@ #------------------------------config & vars config = get_recipe_config() +#config checker to raise better error +e = None +if 'api_key' not in config.keys(): + e = 'Please add your API key' + +if 'list_projects_id_name' not in config.keys() or len(config['list_projects_id_name'].keys()) == 0: + e = 'Your Oncrawl account seems to have no projects available. Please check with your Oncrawl account.' + +if 'list_configs_crawls' not in config.keys() or len(config['list_configs_crawls'].keys()) == 0 or 'list_crawls_project' not in config.keys() or len(config['list_crawls_project'].keys()) == 0: + e = 'Your Oncrawl account seems to have no crawls available. Please check the choosen project and date range with your Oncrawl account.' 
+ +if e is not None: + raise Exception(e) + headers = { 'Content-Type': 'application/json', 'Accept': 'application/json', 'Authorization' : 'Bearer {}'.format(config['api_key']) } -#--according index, ids are related to projects or crawls - each id represent a crawl or a project -if config['index'] != 'logs': - ids = config['list_configs_crawls'][config['crawl_config']] - if config['crawls_id'] != 'all': +#list project ids +p_ids = [] + +#if getting all projects : rebuild an up to date ids list +if config['projects_id'] == 'all': + try: + p_ids_uptodate = ocp.get_projects(config['api_key']) + + for p in p_ids_uptodate: + config['list_projects_id_name'][p['id']] = p['name'] + p_ids.append(p['id']) + + except Exception as e: + raise Exception(p_ids_uptodate) +else: + p_ids = [config['projects_id'].split(',')[0]] + +#--list ids to get data : according config['index'], ids are related to projects or crawls - each id represents a crawl when index = pages or links and a project when index = logs +if config['index'] == 'logs': + + ids = p_ids + +else: + + if config['crawls_id'] not in ['all', 'last']: ids = [config['crawls_id']] -else: - ids = config['list_projects_id_name'].keys() - if config['projects_id'] != 'all': - ids = [config['projects_id'].split(',')[0]] + + else: + + #if getting all or last crawls : rebuild an up to date ids list + try: + dates = oc.build_date_range(config) + date_start_yyyy_mm_dd = dates['start'] + date_end_yyyy_mm_dd = dates['end'] + + crawl_start_timestamp = oc.datestring_to_miltimestamp_with_tz(dates['start']) + crawl_end_timestamp = oc.datestring_to_miltimestamp_with_tz(dates['end']) + + limit = None + + c_ids_uptodate = ocp.get_live_crawls(projects_id=p_ids, config=config, timestamp_range={'start': crawl_start_timestamp, 'end': crawl_end_timestamp}, limit=limit) + + ids = [] + count_crawls_by_projects = [] + for c in c_ids_uptodate: + if c['config_name'] == config['crawl_config']: + if (config['crawls_id'] == 'last' and c['project_id'] not in count_crawls_by_projects) or config['crawls_id'] != 'last': + count_crawls_by_projects.append(c['project_id']) + ids.append(c['id']) + except Exception as e: + raise + + #------------------------------schema #fields not returned by oncrawl API metadata = { diff --git a/oncrawl/js/data_queries_controller.js b/oncrawl/js/data_queries_controller.js index c4a261cb..c2cd9903 100644 --- a/oncrawl/js/data_queries_controller.js +++ b/oncrawl/js/data_queries_controller.js @@ -211,13 +211,16 @@ app.controller('oncrawl_data_queries', function($scope) { { $scope.config.list_configs_crawls = response.configs; + //list_crawls_project to allow recipe to build crawls metadata (project_id, start_date...) $scope.config.list_crawls_project = response.crawls; $scope.num_configs = Object.keys($scope.config.list_configs_crawls).length; + + //by default take first crawl config... if(!$scope.config.crawl_config) { $scope.config.crawl_config = Object.keys(response.configs)[0]; } - + //... 
and return 'all' or a crawl id if there is only 1 crawl for the chosen config $scope.selectDefaultCrawls(); } catch(e) { $scope.api_error = response.error } }, function(response) { - $scope.api_error = "Unexpected error occurred "+response + $scope.api_error = "Unexpected error occurred " }); } diff --git a/oncrawl/python-lib/oncrawl/__init__.py b/oncrawl/python-lib/oncrawl/__init__.py index 10f472e8..008b7227 100644 --- a/oncrawl/python-lib/oncrawl/__init__.py +++ b/oncrawl/python-lib/oncrawl/__init__.py @@ -53,3 +53,17 @@ def build_date_range(config): return {'start': date_start_yyyy_mm_dd, 'end': date_end_yyyy_mm_dd} + +def datestring_to_miltimestamp_with_tz(date): + + user_tz = pendulum.now().timezone.name + + d = pendulum.parse(date, tz=user_tz).timestamp() * 1000 + + return d + + + + + + diff --git a/oncrawl/resource/data_queries.html b/oncrawl/resource/data_queries.html index 7643fc2a..4509da20 100644 --- a/oncrawl/resource/data_queries.html +++ b/oncrawl/resource/data_queries.html @@ -26,9 +26,9 @@ 
-
+
{{num_projects}} @@ -102,6 +102,7 @@
{{config.list_configs_crawls[config.crawl_config].length}} diff --git a/oncrawl/resource/data_queries.py b/oncrawl/resource/data_queries.py index e480e422..00d10afd 100644 --- a/oncrawl/resource/data_queries.py +++ b/oncrawl/resource/data_queries.py @@ -1,8 +1,5 @@ -import requests -import prison -import pendulum import oncrawl as oc -import json +from oncrawl import oncrawlProjectAPI as ocp def do(payload, config): @@ -14,6 +11,7 @@ def do(payload, config): if 'method' not in payload: return {} + #handle date range if payload['method'] == 'build_date_range': @@ -24,93 +22,50 @@ def do(payload, config): return {'start': date_start_yyyy_mm_dd, 'end': date_end_yyyy_mm_dd} + #get projects if payload["method"] == "get_projects": - headers['Authorization'] = 'Bearer {}'.format(config['api_key']) - - filters = () - offset = 0 - limit = 1000 - sort = 'name:asc' - - if 'offset' in payload: - offset = payload['offset'] - - if 'limit' in payload and payload['limit'] is not None: - limit = payload['limit'] - - if 'sort' in payload: - sort = payload['sort'].replace(" ", "") - response = { 'projects':{}, } - - while offset is not None: - - try: - - get_projects = requests.get('https://app.oncrawl.com/api/v2/projects?filters={}&offset={}&limit={}&sort={}'.format(filters, offset, limit, sort), headers = headers) - get_projects.raise_for_status() - projects = get_projects.json() - - offset = projects['meta']['offset'] + projects['meta']['limit'] - - for project in projects['projects']: - response['projects'][project['id']] = project['name'] - - assert offset <= projects['meta']['total'] - - except AssertionError: - offset = None - - except requests.exceptions.HTTPError: - offset = None - response = {'error' : 'error : {}'.format(get_projects)} - - except Exception as e: - offset = None - response = {'error' : get_projects} + try: + + projects = ocp.get_projects(config['api_key']) + + for p in projects: + response['projects'][p['id']] = p['name'] + + except Exception as e: + response = {'error' : projects['error']} + return response + #get crawls if payload["method"] == "get_crawls": - - headers['Authorization'] = 'Bearer {}'.format(config['api_key']) - - offset = 0 - limit = 1000 + #project list try: assert payload['projects_id'] == 'all' projects_id = list(config['list_projects_id_name'].keys()) except AssertionError as error: projects_id = [config['projects_id'].split(',')[0]] - - + + + #dates ranges: need timestamp # work with date as string to support manual date override - # do not forget that range requested is [[ => always add 1 day !! 
dates = oc.build_date_range(config) date_start_yyyy_mm_dd = dates['start'] date_end_yyyy_mm_dd = dates['end'] - user_tz = pendulum.now().timezone.name - - crawl_start_timestamp = pendulum.parse(date_start_yyyy_mm_dd, tz=user_tz).timestamp() * 1000 - crawl_end_timestamp = pendulum.parse(date_end_yyyy_mm_dd, tz=user_tz).timestamp() * 1000 + crawl_start_timestamp = oc.datestring_to_miltimestamp_with_tz(dates['start']) + crawl_end_timestamp = oc.datestring_to_miltimestamp_with_tz(dates['end']) + + crawls = ocp.get_live_crawls(projects_id=projects_id, config=config, timestamp_range={'start': crawl_start_timestamp, 'end': crawl_end_timestamp}) - filters = { - "and" : [ - {"field": ["status", "equals", "done"]}, - {"field": ["created_at", "gte", crawl_start_timestamp]}, - {"field": ["created_at", "lt", crawl_end_timestamp]}, - {"field": ["project_id", "one_of", projects_id]} - ] - } - response = { 'configs':{}, 'crawls':{}, @@ -118,39 +73,23 @@ def do(payload, config): try: - while offset is not None: + for c in crawls: - get_crawls = requests.get('https://app.oncrawl.com/api/v2/crawls?filters={}&offset={}&limit={}&sort=created_at:desc'.format(prison.dumps(filters), offset, limit), headers = headers) - get_crawls.raise_for_status() - - crawls = get_crawls.json() - offset = crawls['meta']['offset'] + crawls['meta']['limit'] - - for crawl in crawls['crawls']: - - if ( config['index'] == 'pages' and crawl['status'] == 'done') or (config['index'] == 'links' and crawl['link_status'] == 'live'): + if c['config_name'] not in response['configs']: + response['configs'][c['config_name']] = [] - if crawl['crawl_config']['name'] not in response['configs']: - response['configs'][crawl['crawl_config']['name']] = [] + response['configs'][c['config_name']].append(c['id']) + response['crawls'][c['id']] = { + "project_id": c['project_id'], + "created_at": c['created_at'], + "ended_at": c['ended_at'], + } - response['configs'][crawl['crawl_config']['name']].append(crawl['id']) - response['crawls'][crawl['id']] = { - "project_id": crawl['project_id'], - "created_at": crawl['created_at'], - "ended_at": crawl['ended_at'], - } - - assert offset <= crawls['meta']['total'] - - except AssertionError: - offset = None - - except requests.exceptions.HTTPError as e: - offset = None - response = {'error' : 'merguez error : {} {}'.format(str(e), get_crawls.text)} - except Exception as e: - response = {'error' : get_crawls} + response = {'error' : crawls['error']} + return response + + \ No newline at end of file diff --git a/oncrawl/resource/functions.js b/oncrawl/resource/functions.js index f4f863d2..3e7d0f53 100644 --- a/oncrawl/resource/functions.js +++ b/oncrawl/resource/functions.js @@ -143,13 +143,16 @@ function add_logsdate_field(oql, scope) return oql } +// return all except if there is only one crawl ==> return its ids function selectDefaultCrawls(scope) { let list = null if(scope.config.list_configs_crawls[scope.config.crawl_config]) { list = 'all' - if(scope.config.list_configs_crawls[scope.config.crawl_config].length == 1) + if(scope.config.crawls_id) + list = scope.config.crawls_id + if(scope.config.list_configs_crawls[scope.config.crawl_config].length == 1 && list != 'last') { list = scope.config.list_configs_crawls[scope.config.crawl_config][0]; } From fb350d2bd5bffeef8857f1b88e960287af45b597 Mon Sep 17 00:00:00 2001 From: chiktika Date: Thu, 2 Jul 2020 14:34:52 +0200 Subject: [PATCH 4/6] Update requirements.txt add package version --- oncrawl/code-env/python/spec/requirements.txt | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/oncrawl/code-env/python/spec/requirements.txt b/oncrawl/code-env/python/spec/requirements.txt index 978bb47d..c1905881 100644 --- a/oncrawl/code-env/python/spec/requirements.txt +++ b/oncrawl/code-env/python/spec/requirements.txt @@ -1,2 +1,2 @@ -prison -pendulum \ No newline at end of file +prison==0.1.3 +pendulum==2.1.0 From 7677eea6dcfe571e3263fe1f46a048fd806ef319 Mon Sep 17 00:00:00 2001 From: chiktika Date: Wed, 8 Jul 2020 09:35:34 +0200 Subject: [PATCH 5/6] Update plugin url --- oncrawl/plugin.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oncrawl/plugin.json b/oncrawl/plugin.json index 700ded51..74fdb605 100644 --- a/oncrawl/plugin.json +++ b/oncrawl/plugin.json @@ -9,7 +9,7 @@ "author": "Cogniteev", "icon": "icon-globe", "tags": ["SEO", "Logs", "Crawler"], - "url": "https://www.oncrawl.com", + "url": "https://www.dataiku.com/product/plugins/oncrawl/", "licenseInfo": "Apache 2" } } From f190a0c0c5debd56dc6e4032d7494e0124ead667 Mon Sep 17 00:00:00 2001 From: Chiktika Date: Wed, 8 Jul 2020 18:46:51 +0200 Subject: [PATCH 6/6] update tags --- oncrawl/plugin.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oncrawl/plugin.json b/oncrawl/plugin.json index 74fdb605..0dbb323b 100644 --- a/oncrawl/plugin.json +++ b/oncrawl/plugin.json @@ -8,7 +8,7 @@ "description": "Export URLs or aggregations from crawls or log monitoring events", "author": "Cogniteev", "icon": "icon-globe", - "tags": ["SEO", "Logs", "Crawler"], + "tags": ["API", "Cloud", "Logs"], "url": "https://www.dataiku.com/product/plugins/oncrawl/", "licenseInfo": "Apache 2" }