I am making anywhere from 1 to 10 web requests using jsdom (a web-scraping library for Node.js). It goes something like this:
// Express route that kicks off three jsdom scrapes and then renders a page.
//
// NOTE: this snippet illustrates the problem being asked about. Each
// jsdom.env() call only *starts* a request; its callback is invoked later.
// The res.render() call at the bottom therefore executes before any of the
// scrape callbacks has had a chance to run.
app.get('/results', function(req, res) {
  jsdom.env(
    "http://website1.com",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // scrape website #1
    }
  );
  jsdom.env(
    "http://website2.com",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // scrape website #2
    }
  );
  jsdom.env(
    "http://website3.com",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // scrape website #3
    }
  );
  // `items` is presumably meant to be filled in by the scrape callbacks
  // above (it is not declared in this snippet) -- none of them has run yet.
  res.render('results', { items: items });
});
How do I run res.render() ONLY after all jsdom requests have completed and I have gathered all the information that I need? In a synchronous world this obviously would not be a problem, but since these JavaScript calls are asynchronous, res.render() will run before any of the jsdom callbacks have finished.
The "naive" solution you could employ for a small number of scrapes is to nest everything: start each scrape in the callback of the previous scrape, and have the last callback contain the render call.
scrape
cb: scrape
cb: scrape
cb: render all results
That becomes tedious and illegible, of course. (And everything runs in series, not parallel, which won't be very fast.)
The better solution would be to write a function that counts the number of returned results and calls render when all have returned. Here is one implementation:
/**
 * Create a shared completion callback for a fixed number of parallel tasks.
 *
 * Every task calls the returned function exactly once with its result. When
 * the number of calls reaches `total`, `finalCallback` is invoked with an
 * array of all results. Results are stored in the order the tasks finish,
 * which is not necessarily the order in which they were started.
 *
 * @param {number} total - Number of results to wait for.
 * @param {function(Array)} finalCallback - Receives the collected results.
 * @returns {function(*)} Callback each task invokes with its own result.
 */
function parallel_cb(total, finalCallback) {
  // Coerce once so numeric strings such as "3" keep matching, as they did
  // under the loose equality check this replaces.
  var expected = Number(total);
  var done = 0;
  var results = [];

  // With nothing to wait for, the returned callback would never be invoked
  // and finalCallback would never fire; report the empty result right away.
  if (expected === 0) finalCallback(results);

  return function(result) {
    done += 1;
    results.push(result);
    if (done === expected) finalCallback(results);
  };
}
To use it in your example:
// Express route that runs three jsdom scrapes in parallel and renders the
// page only once every scrape has reported back through myCallback.
app.get('/results', function(req, res) {
  var myCallback = parallel_cb(
    // Must equal the number of jsdom.env() calls below. When the scrapes are
    // driven by an array of sites, use that array's length instead.
    3,
    function(items) {res.render('results', { items: items })});
  jsdom.env(
    "http://nodejs.org/dist/",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // do some scraping
      myCallback(result_from_scrape);
    }
  );
  jsdom.env(
    "http://nodejs.org/dist/",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // more scraping
      myCallback(result_from_scrape);
    }
  );
  jsdom.env(
    "http://nodejs.org/dist/",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // even more scraping
      myCallback(result_from_scrape);
    }
  );
});
Instead of writing your own, you should really learn to use an existing parallel / async library as suggested by @almypal in the comment to your question.
With async you could do something much neater as described in the docs: https://github.com/caolan/async#parallel
Or if all your scrapes actually look for the same elements in the resulting pages, you could even do a parallel map over an array of URLs to scrape: https://github.com/caolan/async#maparr-iterator-callback
Each of your scrapes can use the callback function provided by async's parallel method, to return the results of its scrape. The final [optional] callback will contain your call to render with all the items.
This is your code, directly translated to the async library:
var async = require("async");
// Express route that runs three jsdom scrapes through async.parallel and
// renders either the collected results or an error page.
app.get('/results', function(req, res) {
  async.parallel( // the first argument is an array of functions
    [
      // this cb (callback) is what you use to let the async
      // function know that you're done, and give it your result
      function (cb) {
        jsdom.env(
          "http://nodejs.org/dist/",
          ["http://code.jquery.com/jquery.js"],
          function (errors, window) {
            // Forward a jsdom failure so the final callback can render the
            // error template instead of the scrape running on a bad window.
            if (errors) return cb(errors);
            // do some scraping
            // async's callback expects an error for the first
            // param and the result as the second param
            cb(null, result_from_scrape); // no error
          }
        );
      },
      function (cb) {
        jsdom.env(
          "http://nodejs.org/dist/",
          ["http://code.jquery.com/jquery.js"],
          function (errors, window) {
            if (errors) return cb(errors);
            // more scraping
            cb(null, result_from_scrape);
          }
        );
      },
      function (cb) {
        jsdom.env(
          "http://nodejs.org/dist/",
          ["http://code.jquery.com/jquery.js"],
          function (errors, window) {
            if (errors) return cb(errors);
            // even more scraping
            cb(null, result_from_scrape);
          }
        );
      }
    ],
    // This is the "optional callback". We need it to render.
    function (err, results) {
      // If any of the parallel calls returned an error instead
      // of null, it's now in the err variable.
      if (err) return res.render('error_template', {error: err});
      res.render('results', { items: results });
    });
});