Using Node.js, when one makes an HTTP request, in optimal circumstances the request comes back with an HTTP response.
However, sometimes the request fails because, for example, the site returns a 404 status code, or the site does not exist at all. When requesting a batch of URLs, if this happens on, say, the 200th URL out of 1,000 URLs requested, the entire script breaks. Here is my code:
// Requests '/' from every host in hostNames and saves the status code and
// headers of each response. A host whose request emits 'error' is logged and
// skipped; the remaining requests are unaffected.
var hostNames = ['www.gizmodo.com','www.sitethatdoesnotexist123.com','www.google.com'];
// The counter has to be declared and initialised: a bare `i` is either a
// ReferenceError or undefined, and `undefined < length` is false, so no
// request would ever be issued.
for (var i = 0; i < hostNames.length; i++){
  var options = {
    host: hostNames[i],
    path: '/'
  };
  // The IIFE captures the current index for the asynchronous callbacks,
  // which run after the loop has already finished.
  (function (i){
    http.get(options, function(res) {
      var obj = {};
      obj.url = hostNames[i];
      obj.statusCode = res.statusCode;
      obj.headers = res.headers;
      db.scrape.save(obj);
      // Only the headers are needed; discard the body so the connection is
      // released instead of waiting for data nobody reads.
      res.resume();
    }).on('error',function(e){
      console.log("Error Site: " + hostNames[i]);
    });
  })(i);
}
Is there a way to simply skip to the next URL when, for example, a site does not exist, instead of having the whole script break?
EDIT: Fixed. Thanks user DavidKemp
Use a try/catch block to catch any errors that might occur, and then continue on from there.
For example:
var hostNames = ['www.gizmodo.com','www.sitethatdoesnotexist123.com','www.google.com'];
//moved the function out so we do not have to keep redefining it:
// Issues a GET for '/' on hostNames[i] and prints the status code and headers.
var get_url = function (i){
  // Build the options here so the function does not depend on an outer
  // variable that the loop keeps overwriting.
  var options = {
    host: hostNames[i],
    path: '/'
  };
  http.get(options, function(res) {
    var obj = {};
    obj.url = hostNames[i];
    obj.statusCode = res.statusCode;
    obj.headers = res.headers;
    console.log(JSON.stringify(obj, null, 4));
    // The body is not used; drain it so the connection is released.
    res.resume();
  }).on('error', function(err){
    // Request failures are reported asynchronously through the 'error'
    // event, after get_url has returned, so a try/catch around the call
    // cannot intercept them. Without this listener the error is thrown.
    console.log('Failed to fetch ' + hostNames[i] + ': ' + err.message);
  });
};
// `i` must be declared and initialised, otherwise the loop never issues a request.
for (var i = 0; i < hostNames.length; i++){
  try {
    get_url(i);
  }
  catch(err){
    // Only errors thrown synchronously while creating the request end up
    // here; report them and carry on with the next host.
    console.log('Could not start request for ' + hostNames[i] + ': ' + err.message);
  }
}
You need to bind an error handler to your request. I also cleaned up the code a bit.
// Requests every host in hostNames and prints its status code and headers.
// The 'error' listener on each request logs the failing host.
// Fixes: the stray comma after the parameter list was a syntax error, and the
// parameter is now spelled `hostName` to match the name used in the body.
hostNames.forEach(function(hostName) {
  var req = http.get({host: hostName}, function(res) {
    var obj = {
      url: hostName,
      statusCode: res.statusCode,
      headers: res.headers
    };
    console.log(JSON.stringify(obj, null, 4));
    // The body is not used; drain it so the connection is released.
    res.resume();
  });
  req.on('error', function(err){
    console.log('Failed to fetch', hostName);
  });
});
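You can use the `uncaughtException` event of `process`. This lets the script keep running even after an exception is thrown (see the Node.js `process` documentation for the 'uncaughtException' event).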
// Registers a listener for the process-level 'uncaughtException' event that
// writes the error it receives to the console.
process.on('uncaughtException', function (error) {
  var message = 'Caught exception: ' + error;
  console.log(message);
});
// Requests '/' from every host in hostNames and saves the status code and
// headers of each response. The 'error' listener logs a failing host; the
// other requests carry on.
var hostNames = ['www.gizmodo.com','www.sitethatdoesnotexist123.com','www.google.com'];
// Declare and initialise the counter: with a bare `i` the loop either throws
// a ReferenceError or its condition is false from the start.
for (var i = 0; i < hostNames.length; i++){
  var options = {
    host: hostNames[i],
    path: '/'
  };
  // Capture the current index for the callbacks, which fire after the loop ends.
  (function (i){
    http.get(options, function(res) {
      var obj = {};
      obj.url = hostNames[i];
      obj.statusCode = res.statusCode;
      obj.headers = res.headers;
      db.scrape.save(obj);
      // Only the headers are stored; discard the body so the connection is released.
      res.resume();
    }).on('error',function(e){
      console.log("Error Site: " + hostNames[i]);
    });
  })(i);
}
Added a callback for when there's an error. It logs the site that returned an error to the console. This error is usually triggered by a 404 or by a request that takes too long.
The full docs are at http://nodejs.org/api/http.html#http_http_get_options_callback at the time of writing. loganfsmyth's answer provides a useful example.