Crawling links on a page, then visiting and inspecting each link with node and zombie.js

Question

Crawling links on a page, then visiting and inspecting each link with node and zombie.js

I'm trying to build a simple utility in Node with zombie.js to visit a page, find and open all links on the page, and ensure that each child page returns a 200 successfully.

Here's an example of this code (written in CoffeeScript), crawling the home page of stackoverflow.com

# Crawl the Stack Overflow home page: collect the question links, then open
# each one in its own zombie Browser and report what came back.
Browser = require('zombie')

# Options shared by the home-page browser and every per-link browser.
# `site` is the base URL, so the relative paths passed to `visit` below
# ('/' and each link's href) are presumably resolved against it.
browserOpts =
  runScripts: false
  site: 'http://www.stackoverflow.com'

home = new Browser browserOpts

# NOTE: the error argument `e` is never inspected in either callback.
home.visit '/', (e, browser) ->
  # Anchor elements for the question titles listed on the home page.
  questions = browser.queryAll '#question-mini-list .summary h3 a'
  # Every iteration starts its visit immediately; nothing waits for an earlier
  # visit to finish, so all of the page loads are in flight at the same time.
  for q in questions
    qUrl = q.getAttribute 'href'
    # A fresh Browser per link.
    page = new Browser browserOpts
    # `browser` here shadows the outer callback's `browser` and refers to the
    # browser that performed this particular visit.
    page.visit qUrl, (e, browser, statusCode, errors) ->
      console.log "Arrived at page #{browser.window.location} and found " + browser.html().length + " bytes"
      console.log statusCode
      browser.dump()
      # Explicit empty return so the callback does not return the value of
      # the last expression.
      return
  return

If you attempt to run this code, you'll notice the first handful of links are loaded correctly, and the number of bytes in the page is displayed.

However, after the first batch of successful page loads (the size of which seems random), every subsequent page load seems to invoke the callback passed to visit prematurely. The document is empty (it's just <html><head></head><body></body></html>) and the statusCode argument to the callback is undefined.

I can't explain or figure out why this is happening. Any tips would be greatly appreciated.

node.js
zombie.js

Answer 1

Excuse my answering a CoffeeScript question in JavaScript:

// Crawl the Stack Overflow home page, then visit every question link found
// there and report the size and HTTP status of each page. Visits are
// throttled so that only a few requests are in flight at any time.
var async = require('async');
var Browser = require('zombie');

// Options shared by the home-page browser and every per-link browser.
var browserOpts = {
  runScripts: false,
  site: 'http://www.stackoverflow.com'
};

// Maximum number of question pages loaded concurrently. Kept below the
// point (about 4, as far as observed) at which the site starts cutting
// off simultaneous requests.
var MAX_CONCURRENT_VISITS = 3;

var home = new Browser(browserOpts);

home.visit('/', function(e, browser) {
  // Without the home page there is nothing to crawl, so report and stop.
  if (e) {
    console.error('OOPS', e);
    return;
  }

  var questions = browser.queryAll('#question-mini-list .summary h3 a');

  // eachLimit starts at most MAX_CONCURRENT_VISITS visits at once and starts
  // the next one only after a running visit has called `cb`.
  async.eachLimit(questions, MAX_CONCURRENT_VISITS, function (question, cb) {
    var qUrl = question.getAttribute('href');
    // A fresh Browser per link.
    var page = new Browser(browserOpts);
    page.visit(qUrl, function(e, browser, statusCode, errors) {
      console.log(("Arrived at page " + browser.window.location + " and found ") + browser.html().length + " bytes");
      console.log(statusCode);
      browser.dump();
      // Passing a truthy error stops the iteration and hands the error to
      // the final callback below.
      cb(e);
    });
  }, function (err) {
    // The final callback also runs after a fully successful crawl, with no
    // error; only report when something actually failed.
    if (err) {
      console.error('OOPS', err);
    }
  });
});

try it out here: http://runnable.com/UWh05t96qlJ8AAAC

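You're making too many requests at once and Stack Overflow is cutting you off. Its cutoff is 4 concurrent requests, as far as I can tell, which is why the code above uses async.eachLimit to keep at most 3 page loads in flight at a time.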

If you really need data from Stack Overflow, use the API: https://api.stackexchange.com/docs