I'm new to Node.js. I'm using zombie.js to scrape a web page title from a few websites. Below is my code:
var Browser = require("zombie");
var util = require("util");
halt = require('delayed');
title = [];
url = [ 'http://www.apple.com', 'http://www.microsoft.com', 'http://www.dell.com' ];
function getTitles(url){
//console.log('Start scraping title');
var length = url.length;
console.log('Total Site to Scrape: '+length);
label = 1;
for(var i=0;i<length;i++){
browser = new Browser()
browser.runScripts = false
browser.setMaxListeners(0);
browser.visit(url[i], function(e, browser, status, errors) {
browser.wait(function(){
title[i] = browser.text('html > head > title');
console.log(label+': '+title[i]);
browser.close();
label++;
});
});
};
}
getTitles(url);
halt.delay(function () {
console.log('Array Length: '+title.length)
console.log('Array Content: '+title)
}, 10)
Below is the output of the code:
Total Site to Scrape: 3
1: Apple
2: Dell Official Site - The Power To Do More | Dell
3: Microsoft Home Page | Devices and Services
Array Length: 4
Array Content: ,,,Microsoft Home Page | Devices and Services
The part that I don't understand:
I'm not familiar with zombie but I am pretty sure this is a closure problem. i
isn't what you think it is when you think it is. See this answer: Javascript closure inside loops - simple practical example for some information on closures. Basically what's happening is your loop continues even though your requests are async, they're not done so when they come back, you've looped through all 3 URLS, now you've got three elements... well then it inserts all 3 values into the 4th element. Last one wins, so Microsoft is all you see.