I'm trying to build a simple utility in Node with zombie.js to visit a page, find and open all links on the page, and ensure that each child page returns a 200 successfully.
Here's an example of this code (written in CoffeeScript), crawling the home page of stackoverflow.com
# Crawl the Stack Overflow home page and open every question link found on it.
# NOTE: each link is visited in its own Browser and all visits are started at
# once, with no limit on how many run concurrently.
Browser = require('zombie')

# Options shared by the home-page browser and every per-link browser.
browserOpts =
  runScripts: false
  site: 'http://www.stackoverflow.com'

home = new Browser(browserOpts)

home.visit('/', (e, homeBrowser) ->
  # Anchor elements for the questions listed on the home page.
  links = homeBrowser.queryAll('#question-mini-list .summary h3 a')
  for link in links
    href = link.getAttribute('href')
    child = new Browser(browserOpts)
    # Report where the child browser ended up, the size of the markup it
    # received, and the status code handed to the callback.
    child.visit(href, (e, browser, statusCode, errors) ->
      console.log("Arrived at page #{browser.window.location} and found " + browser.html().length + " bytes")
      console.log(statusCode)
      browser.dump()
      return
    )
  # Explicit return so the loop's results are not collected and returned.
  return
)
If you attempt to run this code, you'll notice the first handful of links are loaded correctly, and the number of bytes in the page is displayed.
However, after the first batch of successful page loads (the size of which seems random), all subsequent page loads seem to execute the callback to visit prematurely. The document is empty (it's just <html><head></head><body></body></html>) and the statusCode argument to the callback is undefined.
I can't explain or figure out why this is happening. Any tips would be greatly appreciated.
Excuse my answering a CoffeeScript question in JavaScript:
var async = require('async');
var Browser = require('zombie');
// Options shared by the home-page browser and every per-question browser.
// `site` is the base against which the relative paths passed to visit()
// (such as '/') are resolved.
var browserOpts = {
  runScripts: false,
  site: 'http://www.stackoverflow.com'
};

var home = new Browser(browserOpts);

// Load the home page, collect the question links on it, and visit each one
// with a bounded number of visits in flight at any time.
home.visit('/', function(e, browser) {
  // If the home page itself failed to load there is nothing to crawl.
  if (e) {
    console.error('OOPS', e);
    return;
  }

  var questions = browser.queryAll('#question-mini-list .summary h3 a');

  // Keep at most 3 visits running concurrently; starting every visit at once
  // is what led to empty documents and an undefined status code.
  async.eachLimit(questions, 3, function (question, cb) {
    var qUrl = question.getAttribute('href');
    var page = new Browser(browserOpts);
    page.visit(qUrl, function(e, browser, statusCode, errors) {
      console.log(("Arrived at page " + browser.window.location + " and found ") + browser.html().length + " bytes");
      console.log(statusCode);
      browser.dump();
      // Passing the error on stops the remaining visits at the first failure.
      cb(e);
    });
  }, function (err) {
    // This final callback also runs when every visit succeeded, in which
    // case `err` is empty, so only report when there is an actual error.
    if (err) {
      console.error('OOPS', err);
    }
  });
});
Try it out here: http://runnable.com/UWh05t96qlJ8AAAC
You're making too many requests at once and Stack Overflow is cutting you off. Its cutoff is 4 concurrent requests, as far as I can tell.
If you really need data from Stack Overflow, use the API: https://api.stackexchange.com/docs