From bf95681aa1974a60e069075fa60973af5e482dad Mon Sep 17 00:00:00 2001 From: Jelte Date: Sun, 21 Apr 2013 22:42:58 +0300 Subject: [PATCH 1/2] Update scraper.js Only parse body when there is response. Don't do anything before you've checked if there is an error --- lib/scraper.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scraper.js b/lib/scraper.js index 6ca44ab..628db87 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -47,12 +47,12 @@ module.exports = function scrape(requestOptions, callback, fetchOptions) { } request(requestOptions, function (err, response, body) { - body = body.replace(/<(\/?)script/g, '<$1nobreakage'); setTimeout(runNextFetch, timeSpacing); if (err) { callback(err, null, null); } if (response && response.statusCode == 200) { + body = body.replace(/<(\/?)script/g, '<$1nobreakage'); var window = jsdom.jsdom().createWindow(); jsdom.jQueryify(window, __dirname+'/../deps/jquery-1.6.1.min.js', function(win, $) { $('head').append($(body).find('head').html()); From 3757d988aa898cc33b53ef52df6957035b66fd5c Mon Sep 17 00:00:00 2001 From: j3lte Date: Fri, 16 Aug 2013 12:03:57 +0200 Subject: [PATCH 2/2] JSLinted, fixing some code --- lib/scraper.js | 59 ++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/lib/scraper.js b/lib/scraper.js index 628db87..57fa1dc 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -1,8 +1,13 @@ -var request = require('request'); -var jsdom = require('jsdom'); +/*jslint node:true sub:true */ +/*global require */ + + +var request = require('request'), + jsdom = require('jsdom'); + var requestDefaults = { - 'uri': null - , 'headers': { + 'uri' : null, + 'headers' : { 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)' } }; @@ -10,20 +15,21 @@ var fetchDefaults = { 'reqPerSec': 0 }; module.exports = function scrape(requestOptions, callback, fetchOptions) { + 'use strict'; if (!fetchOptions) { fetchOptions = {}; } if (!callback) { - callback = function(){}; + callback = function () {}; } - Object.keys(fetchDefaults).forEach(function(key) { + Object.keys(fetchDefaults).forEach(function (key) { if (fetchOptions[key] === undefined) { - fetchOptions[key] = fetchDefaults[key] + fetchOptions[key] = fetchDefaults[key]; } }); - var fetches = []; - var queue = []; + var fetches = [], + queue = []; if (!Array.isArray(requestOptions)) { fetches.push(requestOptions); @@ -31,15 +37,15 @@ module.exports = function scrape(requestOptions, callback, fetchOptions) { fetches = requestOptions; } - fetches.forEach(function(requestOptions, index) { - queue.push(function() { - Object.keys(requestDefaults).forEach(function(key) { + fetches.forEach(function (requestOptions, index) { + queue.push(function () { + Object.keys(requestDefaults).forEach(function (key) { requestOptions[key] = requestOptions[key] || requestDefaults[key]; }); if (typeof requestOptions === 'string') { requestOptions = { 'uri': requestOptions - } + }; } if (!requestOptions['uri']) { @@ -51,29 +57,22 @@ module.exports = function scrape(requestOptions, callback, fetchOptions) { if (err) { callback(err, null, null); } - if (response && response.statusCode == 200) { + if (response && response.statusCode === 200) { body = body.replace(/<(\/?)script/g, '<$1nobreakage'); var window = jsdom.jsdom().createWindow(); - jsdom.jQueryify(window, __dirname+'/../deps/jquery-1.6.1.min.js', function(win, $) { + jsdom.jQueryify(window, __dirname + '/../deps/jquery-1.6.1.min.js', function (win, $) { $('head').append($(body).find('head').html()); $('body').append($(body).find('body').html()); callback(null, $); }); } else { - callback(new Error('Request to '+requestOptions['uri']+' ended with status code: '+(typeof response !== 'undefined' ? response.statusCode : 'unknown')), null, null); + callback(new Error('Request to ' + requestOptions['uri'] + ' ended with status code: ' + (typeof response !== 'undefined' ? response.statusCode : 'unknown')), null, null); } }); - }) + }); }); - - var concurrentConnections = !fetchOptions['reqPerSec'] ? queue.length : (Math.floor(fetchOptions['reqPerSec']) || 1); - var timeSpacing = !fetchOptions['reqPerSec'] ? 0 : 1000/fetchOptions['reqPerSec']; - - for (var i=0; i < concurrentConnections; i++) { - runNextFetch(i); - }; - - function runNextFetch(i) { + + function runNextFetch(i) { if (!i) { i = 0; } @@ -82,4 +81,12 @@ module.exports = function scrape(requestOptions, callback, fetchOptions) { queue.shift(); } } + + var concurrentConnections = !fetchOptions['reqPerSec'] ? queue.length : (Math.floor(fetchOptions['reqPerSec']) || 1); + var timeSpacing = !fetchOptions['reqPerSec'] ? 0 : 1000 / fetchOptions['reqPerSec']; + + for (var i=0; i < concurrentConnections; i++) { + runNextFetch(i); + }; + };