Promise Based Scraper in Node.js
- Published 28th Nov 2015
Last edited 27th Nov 2016

Promises/A+ in JavaScript
I love automation. And I love scraping. I ended up writing a quick Node.js script to Scrape Magento’s Certification Directory to put together a list of Australian & Melbournian Magento Developers / Specialists that are certified.
It definitely was a bunch of fun because I tried a new technique based on promises. The promise library I used is a Promises/A+ compliant Bluebird library.
Without further ado, here is the code:
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
var Promise = require('bluebird'); var MongoDB = Promise.promisifyAll(require("mongodb")); var MongoClient = Promise.promisifyAll(MongoDB.MongoClient); var cheerio = require('cheerio'); var http = require('http'); var urls = [ 'http://www.magentocommerce.com/certification/directory/index/?q=&country_id=AU®ion_id=®ion=vic&certificate_type=', 'http://www.magentocommerce.com/certification/directory/index/?q=&country_id=AU®ion_id=®ion=victoria&certificate_type=' ]; var i = 0; var pause = 10 * 1000; var fetch = function (url) { console.log('Processing', url); return new Promise(function (resolve, reject) { http.get(url, function (res) { var body = ""; res.setEncoding('utf8'); res.on('data', function (chunk) { body += chunk; }); res.on('end', function () { resolve(body); }) }); }); }; var process = function (url) { var devs = []; fetch(url) .then(function (body) { $ = cheerio.load(body); $('.results > .row').eq(0).remove(); // remove header row return $('.results > .row'); }) .then(function (rows) { rows.each(function () { var dev = { name: $(this).find('div').eq(0).text().trim(), company: $(this).find('div').eq(1).text().trim(), location: $(this).find('div').eq(2).text().trim().replace(',Australia', ', Australia'), certificates: $(this).find('div').eq(3).html().replace('\n', '').trim(), href: $(this).attr('onclick').match(/'(.*?)'/)[1] }; devs.push(dev); }); }) .then(function () { return MongoClient.connectAsync('mongodb://localhost:27017/magento'); }) .then(function (db) { this.db = db; return Promise.all(devs.map(function (dev) { return db.collection('melbourne').updateAsync({'name': dev.name}, {$set: dev}, {upsert: true}); })) }) .then(function (result) { console.log('Upserted', result.length, 'records'); if ($('.main-section .pager a.next').length > 0) { setTimeout(function () { process($('.main-section .pager a.next').attr('href')); }, pause); } else if (++i < urls.length) { setTimeout(function () { process(urls[i]); }, pause); } else { console.log('No more to process, exiting.'); } this.db.close(); }) .catch(function (err) { throw err; }); } process(urls[i]); |
So the code basically goes through each address in the array urls, with each page that it visits returning a promise object. A chain of .then() goes through each row of developer information, creates a JSON object per developer and stores it in a MongoDB.
If a ‘next page’ button is found, it will go to the next page before visiting the next URL in the urls array.
In my opinion, the coolest parts of the code is:
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
var MongoDB = Promise.promisifyAll(require("mongodb")); var MongoClient = Promise.promisifyAll(MongoDB.MongoClient); // -- SNIP -- .then(function () { return MongoClient.connectAsync('mongodb://localhost:27017/magento'); }) .then(function (db) { this.db = db; return Promise.all(devs.map(function (dev) { return db.collection('melbourne').updateAsync({'name': dev.name}, {$set: dev}, {upsert: true}); })) }) // -- SNIP -- |
…which pretty much turns a callback based MongoDB native Node.js driver to a promise based one. It’ll do a bunch of inserts (upsert) as promises and only when those promises resolve, goes to the next .then()
You can see the result of this code here or view it as a GitHub Gist.
Comments
Ben P
December 1, 2015 at 7:43 pmFrancis, this looks very clear. Thanks for posting it.
I’m wondering what your take would be on the problem of the link-following request failing or timing out? No need for code, of course, but would you mind sharing how you would probably modify the scraper to handle this?
Thanks again!
Francis Kim
December 1, 2015 at 8:19 pmHi Ben,
I haven’t made it deliberately fail to test but
.catch()should catch any errors and you could also place a.reject()when the http call errors.Vince
May 3, 2017 at 3:21 pmdoesnt pull anything into the DB
Write a comment