I have built a web crawler using node.js. One of the main features of node.js is non-blocking code, which is great, but in my case it is breaking my program. Here is how my program used to work:
1. a `request` fetches the website and finds the links to all companies listed on the page;
2. a `for` loop modifies that array of company URLs;
3. a `request` inside a `for` loop goes through the array of company URLs to find all product URLs on each company page;
4. a `for` loop modifies that array of product URLs;
5. a `for` loop goes through the array of products, gets the price of each product, and stores it in a dictionary (object) where the key is the name of the product and the value is the price.

As you can see, each of my steps depends on the step before it, so I need to make my program run in the order I listed. I have tried to use callbacks, but it did not end well. Here is a simple example of a callback:
/**
 * Minimal callback example: computes (arg1 - arg2) * arg2 and passes the
 * result to `callback` instead of returning it.
 *
 * @param {number} arg1 - Minuend of the difference.
 * @param {number} arg2 - Subtrahend, also used as the multiplier.
 * @param {function(number): void} callback - Receives the computed number.
 */
function some_function(arg1, arg2, callback) {
  var difference = arg1 - arg2;
  callback(difference * arg2);
}

// Logs "callback called! 75".
some_function(20, 15, function (result) {
  console.log("callback called! " + result);
});
But I cannot imagine how to write a callback that would involve 6 functions. Maybe there is a way, but not with my current knowledge of callbacks. Here is a demo version of my program without callbacks:
// Demo crawler written as six consecutive top-level steps.
// request() is asynchronous: its callback runs only after the synchronous
// top-level code below has finished, so steps 2, 4 and 6 execute before the
// arrays/objects they read have been filled by steps 1, 3 and 5.
var request = require('request');
var cheerio = require('cheerio');
// Hrefs collected from the companies index page (filled by step 1).
var companiesUrls = [];
// Not read or written by any step below (the callbacks declare their own `url`).
var url = '';
// Every second entry of companiesUrls (filled by step 2).
var companiesUrls2 = [];
// Hrefs collected from each company page (filled by step 3).
var carsUrls = [];
// Maps a page title to the vote text found on its review page (filled by step 5).
var carsOwnerReview = {};
// Review-page paths derived from carsUrls (filled by step 4).
var carReviewUrl = [];
var site = '...'
var companiesPath = '/companies'
///step 1\\\
// Requests site + companiesPath and, on a 200 response without error, pushes
// the href of every `a` matched with '#group-content' as context.
request(site+companiesPath, function(err, resp, body){
if(!err && resp.statusCode == 200){
var $ = cheerio.load(body);
$('a', '#group-content').each(function(){
var url = $(this).attr('href');
companiesUrls.push(url);
});
};
});
///step 2\\\
// Copies the entries at even indices (0, 2, 4, ...) into companiesUrls2.
// Runs before the step 1 callback, so companiesUrls is still empty here.
for(var i=0;i<companiesUrls.length;i+=2){
companiesUrls2.push(companiesUrls[i]);
};
///step 3\\\
// Issues one request per entry of companiesUrls2; each callback pushes the
// href of the children of the children of every h3.edition-title element.
for(var i=0;i<companiesUrls2.length;i++){
request(site+companiesUrls2[i], function(err, resp, body){
if(!err && resp.statusCode == 200){
var $ = cheerio.load(body);
$('h3.edition-title').children().children().each(function(){
var url = $(this).attr('href');
carsUrls.push(url);
});
};
});
};
///step 4\\\
// Builds review paths by dropping the last 7 characters and appending
// '/owner-reviews'.
// NOTE(review): every iteration reads carsUrls[carsUrls.length-1] (the last
// entry), not carsUrls[i], so all pushed paths are identical - presumably
// carsUrls[i] was intended; confirm.
for(var i=0;i<carsUrls.length;i++){
carReviewUrl.push(carsUrls[carsUrls.length-1].slice(0,-7)+'/owner-reviews');
};
///step 5\\\
// Issues one request per review path; each callback reads the '#page-title'
// text as the key and stores the text of each child of span.total-votes under
// it (a later child overwrites an earlier one for the same key).
for(var i=0;i<carReviewUrl.length;i++){
request(site+carReviewUrl[i], function(err, resp, body){
if(!err && resp.statusCode == 200){
var $ = cheerio.load(body);
var model = $('#page-title').text();
$('span.total-votes').children().each(function(){
var reviewNum = $(this).text();
carsOwnerReview[model] = reviewNum;
});
};
});
}
///step 6\\\
// Sorts the keys ascending by their stored value (the subtraction coerces the
// stored text to numbers), then reverses the result.
// NOTE(review): reverse() works in place and returns the same array, so
// keysSorted and keysSortedReversed refer to one (descending) array.
var keysSorted = Object.keys(carsOwnerReview).sort(function(a,b){return carsOwnerReview[a]-carsOwnerReview[b]});
var keysSortedReversed = keysSorted.reverse();
So my question is: what should I do to make node.js run my code in the order I listed?
If you want to program seriously with JavaScript/node.js, you have to deeply understand that JS is asynchronous: everything happens in parallel except your own code.
In your case this means that your callbacks are not called until all of your currently running code has finished. So calling an async function inside a for loop should always turn all your warning lamps RED!
Here is your code, correctly designed:
// Crawler restructured so that every step starts only after the asynchronous
// work of the previous step has completed: each later step lives inside the
// completion callback of the step it depends on.
var request = require('request');
var cheerio = require('cheerio');
var companiesUrls = [];
var url = '';
var companiesUrls2 = [];
var carsUrls = [];
var carsOwnerReview = {};
var carReviewUrl = [];
var site = '...';
var companiesPath = '/companies';

/**
 * Reports a request that failed or returned a non-200 status, so that a
 * skipped page is visible instead of silently missing from the results.
 *
 * @param {string} target - The URL that was requested.
 * @param {Error|null} err - Transport error passed to the request callback.
 * @param {Object} [resp] - Response object; only read when `err` is falsy.
 */
function logFailedRequest(target, err, resp) {
  var reason = err ? err.message : 'HTTP ' + resp.statusCode;
  console.error('Request failed: ' + target + ' (' + reason + ')');
}

///step 1\\\
request(site + companiesPath, function (err, resp, body) {
  if (!err && resp.statusCode === 200) {
    var $ = cheerio.load(body);
    $('a', '#group-content').each(function () {
      companiesUrls.push($(this).attr('href'));
    });
  } else {
    logFailedRequest(site + companiesPath, err, resp);
  }

  ///step 2\\\
  // Keep only the entries at even indices (0, 2, 4, ...).
  for (var i = 0; i < companiesUrls.length; i += 2) {
    companiesUrls2.push(companiesUrls[i]);
  }

  ///step 3\\\
  /**
   * Requests companiesUrls2[i], collects the car links found on that page and
   * then moves on to index i + 1. The next request is started from inside the
   * callback of the current one, so the pages are processed strictly in order.
   *
   * @param {number} i - Index of the company URL to process.
   * @param {function(): void} callback_pcu2 - Called once after the last URL.
   */
  function processCompaniesUrls2(i, callback_pcu2) {
    if (i < companiesUrls2.length) {
      var target = site + companiesUrls2[i];
      request(target, function (err, resp, body) {
        if (!err && resp.statusCode === 200) {
          var $ = cheerio.load(body);
          $('h3.edition-title').children().children().each(function () {
            var href = $(this).attr('href');
            // Elements without an href would make step 4 throw on .slice().
            if (href) {
              carsUrls.push(href);
            }
          });
        } else {
          logFailedRequest(target, err, resp);
        }
        // Continue with the next URL even if this one failed.
        processCompaniesUrls2(i + 1, callback_pcu2);
      });
    } else {
      callback_pcu2();
    }
  }

  processCompaniesUrls2(0, function () {
    ///step 4\\\
    // Derive each car's review path from its own URL: drop the last 7
    // characters and append '/owner-reviews'. (Indexing with `i` instead of
    // the last element, which produced the same path for every car.)
    for (var i = 0; i < carsUrls.length; i++) {
      carReviewUrl.push(carsUrls[i].slice(0, -7) + '/owner-reviews');
    }

    ///step 5\\\
    /**
     * Requests carReviewUrl[i], stores the vote text found on that page under
     * the page title, and then moves on to index i + 1.
     *
     * @param {number} i - Index of the review URL to process.
     * @param {function(): void} callback_pcru - Called once after the last URL.
     */
    function processCarReviewUrl(i, callback_pcru) {
      if (i < carReviewUrl.length) {
        var target = site + carReviewUrl[i];
        request(target, function (err, resp, body) {
          if (!err && resp.statusCode === 200) {
            var $ = cheerio.load(body);
            var model = $('#page-title').text();
            $('span.total-votes').children().each(function () {
              carsOwnerReview[model] = $(this).text();
            });
          } else {
            logFailedRequest(target, err, resp);
          }
          processCarReviewUrl(i + 1, callback_pcru);
        });
      } else {
        callback_pcru();
      }
    }

    processCarReviewUrl(0, function () {
      ///step 6\\\
      // Ascending by review count; the subtraction coerces the stored text
      // to numbers.
      var keysSorted = Object.keys(carsOwnerReview).sort(function (a, b) {
        return carsOwnerReview[a] - carsOwnerReview[b];
      });
      // reverse() mutates in place, so reverse a copy to keep keysSorted
      // in ascending order.
      var keysSortedReversed = keysSorted.slice().reverse();
    });
  });
});
Because things can get even more complicated, I strongly recommend studying this article. It is really worth reading.