From a8b77687b980c41dbf3c2ed5a583a8d05b73aac1 Mon Sep 17 00:00:00 2001
From: pawelrychlik
Date: Thu, 6 Feb 2014 23:35:40 +0100
Subject: [PATCH 1/2] updated examples to current twitter ui

---
 examples/advanced.js |  4 ++--
 examples/parallel.js | 10 +++++-----
 examples/simple.js   |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/advanced.js b/examples/advanced.js
index 96b7b53..1f3c092 100644
--- a/examples/advanced.js
+++ b/examples/advanced.js
@@ -1,14 +1,14 @@
 var scraper = require('scraper');
 
 scraper({
-  'uri': 'http://search.twitter.com/search?q=nodejs'
+  'uri': 'https://twitter.com/search?q=nodejs'
   , 'headers': {
       'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
     }}
 , function(err, $) {
   if (err) {throw err;}
 
-  $('.msg').each(function() {
+  $('p.js-tweet-text.tweet-text').each(function() {
     console.log($(this).text().trim()+'\n');
   });
 });
\ No newline at end of file
diff --git a/examples/parallel.js b/examples/parallel.js
index 4e3e9f3..80342e2 100644
--- a/examples/parallel.js
+++ b/examples/parallel.js
@@ -1,20 +1,20 @@
 var scraper = require('scraper');
 
 scraper([
-  'http://search.twitter.com/search?q=javascript'
-  , 'http://search.twitter.com/search?q=css'
+  'https://twitter.com/search?q=javascript'
+  , 'https://twitter.com/search?q=css'
   , {
-      'uri': 'http://search.twitter.com/search?q=nodejs'
+      'uri': 'https://twitter.com/search?q=nodejs'
       , 'headers': {
           'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
         }
     }
-  , 'http://search.twitter.com/search?q=html5'
+  , 'https://twitter.com/search?q=html5'
 ]
 , function(err, $) {
   if (err) {throw err;}
 
-  $('.msg').each(function() {
+  $('p.js-tweet-text.tweet-text').each(function() {
     console.log($(this).text().trim()+'\n');
   });
 }, {
diff --git a/examples/simple.js b/examples/simple.js
index 5083a64..61afaff 100644
--- a/examples/simple.js
+++ b/examples/simple.js
@@ -1,9 +1,9 @@
 var scraper = require('scraper');
 
-scraper('http://search.twitter.com/search?q=javascript', function(err, $) {
+scraper('https://twitter.com/search?q=javascript', function(err, $) {
   if (err) {throw err;}
 
-  $('.msg').each(function() {
+  $('p.js-tweet-text.tweet-text').each(function() {
     console.log($(this).text().trim()+'\n');
   });
 });
\ No newline at end of file

From 2060bdaf88e62d715d16f4a6e5c97c73565b28d7 Mon Sep 17 00:00:00 2001
From: pawelrychlik
Date: Thu, 6 Feb 2014 23:48:06 +0100
Subject: [PATCH 2/2] updated readme to align with twitter ui; introduced Running Examples section

---
 README.md | 59 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 7f0daaa..70c3f4f 100644
--- a/README.md
+++ b/README.md
@@ -10,68 +10,71 @@ Via [npm](http://github.com/isaacs/npm):
 
 ## Examples
 
+### Running examples
+
+    $ node ./examples/simple.js
+
 ### Simple
 
 First argument is an url as a string, second is a callback which exposes a jQuery object with your scraped site as "body" and third is an object from the request containing info about the url.
 
     var scraper = require('scraper');
 
-    scraper('http://search.twitter.com/search?q=javascript', function(err, jQuery) {
-      if (err) {throw err}
+    scraper('https://twitter.com/search?q=javascript', function(err, jQuery) {
+      if (err) {throw err;}
 
-      jQuery('.msg').each(function() {
+      jQuery('p.js-tweet-text.tweet-text').each(function() {
         console.log(jQuery(this).text().trim()+'\n');
       });
     });
+
 ### "Advanced"
 
 First argument is an object containing settings for the "request" instance used internally, second is a callback which exposes a jQuery object with your scraped site as "body" and third is an object from the request containing info about the url.
 
     var scraper = require('scraper');
     scraper(
-      {
-        'uri': 'http://search.twitter.com/search?q=nodejs'
-        , 'headers': {
-            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
-          }
-      }
-      , function(err, $) {
-        if (err) {throw err}
+      {
+        'uri': 'https://twitter.com/search?q=nodejs',
+        'headers': {
+          'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
+        }
+      },
+      function(err, $) {
+        if (err) {throw err;}
 
-        $('.msg').each(function() {
+        $('p.js-tweet-text.tweet-text').each(function() {
           console.log($(this).text().trim()+'\n');
         });
       }
     );
+
 ### Parallel
 
 First argument is an array containing either strings or objects, second is a callback which exposes a jQuery object with your scraped site as "body" and third is an object from the request containing info about the url. **You can also add rate limiting to the fetcher by adding an options object as the third argument containing 'reqPerSec': float.**
 
     var scraper = require('scraper');
-    scraper(
-      [
-        'http://search.twitter.com/search?q=javascript'
-        , 'http://search.twitter.com/search?q=css'
-        , {
-            'uri': 'http://search.twitter.com/search?q=nodejs'
-            , 'headers': {
+    scraper([
+      'https://twitter.com/search?q=javascript',
+      'https://twitter.com/search?q=css',
+      {
+        'uri': 'https://twitter.com/search?q=nodejs',
+        'headers': {
           'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
         }
-      }
-      , 'http://search.twitter.com/search?q=html5'
-      ]
-      , function(err, $) {
+      },
+      'https://twitter.com/search?q=html5'
+    ],
+    function(err, $) {
       if (err) {throw err;}
 
-      $('.msg').each(function() {
+      $('p.js-tweet-text.tweet-text').each(function() {
         console.log($(this).text().trim()+'\n');
       });
-    }
-    , {
+    },
+    {
      'reqPerSec': 0.2 // Wait 5sec between each external request
     }
   );
 
-
-
 ## Arguments
 
 ### First (required)
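
Note (not part of the patches above): the new 'p.js-tweet-text.tweet-text' selector is tied to Twitter's current markup, and if that markup changes again the examples will simply match nothing rather than fail loudly. The sketch below is an illustrative sanity check, not existing project code; it uses only the scraper(uri, callback) API shown in the examples, and it assumes the jQuery-style collection returned by $() exposes the usual length property.

    var scraper = require('scraper');

    scraper('https://twitter.com/search?q=nodejs', function(err, $) {
      if (err) {throw err;}

      // Collect the matches first so an empty result can be reported explicitly.
      var tweets = $('p.js-tweet-text.tweet-text');

      if (tweets.length === 0) {
        console.warn('Selector matched no tweets; Twitter markup may have changed again.');
        return;
      }

      tweets.each(function() {
        console.log($(this).text().trim()+'\n');
      });
    });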