I'm fetching a lot of data asynchronously using "request" in Node.js.
First of all, the request callback does not contain the original request post parameters (which is horrible) so I had to implement this myself by adding an x-header to the request (because headers from the original request are available within the callback response).
However, the request headers often mismatch the response result. I.e. I request two URL's in quick succession and the callback will mix them up. It will claim that the request headers for one request belong to the result of the other request.
var getActiveGames = function() {
console.log(activeGamePlayer);
if (activeGamePlayer >= Object.keys(players).length-1) {
activeGamePlayer = 0;
}
var player = Object.keys(players)[activeGamePlayer];
var ign = players[player].ign;
if (ign) {
request.post({
headers: {
'content-type' : 'application/x-www-form-urlencoded',
'x-summoner' : player
},
url: URL,
body: 'userName=' + ign + '&force=true'
}, function(error, response, body){
if (!error) {
if (response.statusCode == 200) {
jsdom.env(body, ['http://code.jquery.com/jquery.js'], function(err, window) {
if (window.jQuery) {
activeGamePlayer += 1;
var $ = window.jQuery;
var isPlaying = $('div:first').hasClass('SpectatorBig');
//var playerID = response.client._httpMessage._headers['x-summoner']; //doesn't work due to request and response headers mismatch
var playername = isPlaying ? $('tr.mine .summonerName').html() : $('div.nBoxContent b').html();
if (playername) {
var playerID;
for (var p in players) {
if (players[p].ign) {
if (players[p].ign == playername) {
playerID = p;
}
}
}
if (isPlaying) {
var champion = $('tr.mine div.__spc32').removeClass('__spc32 img').attr('class');
champion = champion.replace('__spc32-', '');
var gameURL = $('div.Spectate a').attr('href');
var timestamp;
if ($('._countdown').length) {
timestamp = $('._countdown').attr('data-timestamp');
} else {
timestamp =$('._timeago').attr('data-datetime');
}
players[playerID].activeGame = {
'timestamp' : timestamp,
'champion' : champion,
'finished' : false,
};
} else {
if (players[playerID].hasOwnProperty('activeGame')) {
players[playerID].activeGame.finished = true;
// make sure no game is in players json object
}
}
} else {
console.log ("Error: Faulty data from op.gg");
faultyIGN = Object.keys(players).length;
}
} else {
console.log("Error: No jQuery object in jsdom body.");
}
});
} else {
console.log("GetActiveGame: Status Code not 200");
}
} else {
console.log("GetActiveGame: " + error);
}
setTimeout(getActiveGames, 100);
});
} else {
//no ign
activeGamePlayer += 1;
setTimeout(getActiveGames, 100);
}
}
The issue is that the response object in this instance contains incorrect information. The request and response information it contains do not match up, they are from separate requests and responses.
http://blog.miguelgrinberg.com/post/easy-web-scraping-with-nodejs
The trick is to wrap the callback in a self-executing function which is passed the information you wish to save. This information then becomes available to the callback functions closure.
for (pool in pools) {
var url = 'http://www.thprd.org/schedules/schedule.cfm?cs_id=' + pools[pool];
request(url, ( function(pool) {
return function(err, resp, body) {
if (err)
throw err;
$ = cheerio.load(body);
console.log(pool);
// TODO: scraping goes here!
}
} )(pool));
}