I am running a script in Node.js that uses the request package to help me make HTTP requests. The script makes anywhere from 100 to 1000 requests in order to download PDFs and put them in a certain folder on my local machine. When I only make 100 requests, the program works fine and all the PDFs show up in my folder, but when I make many more requests the program begins throwing this error (ENOTFOUND) and the files that throw this error are blank in the folder. When I load the links that threw the error manually, they work fine (so I know it is not an issue with the host) and I can download the files like that, but I would like to automate it.
Here is the bit of the function that gives the error. I have begun trying to push all links that give errors to an array, but have had no luck fixing the issue yet.
var year = process.argv[3] % 2000;
var url = yearURL[year];
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html, {xmlMode: true});
$('link', 'item').each(function(){
var link = $(this).text();
PDFscrape(link);
});
}
});
function PDFscrape(link){
request(link, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var num = $('#ctl00_ContentPlaceHolder1_lblFile2', '#ctl00_ContentPlaceHolder1_pageDetails').text();
console.log('Document ' + num + ' has been loaded.');
var i = 1;
$('a', '#ctl00_ContentPlaceHolder1_lblAttachments2').each(function(){
var pdf = 'https://phila.legistar.com/' + $(this).attr('href');
AddPDF(pdf, num, i);
i++;
});
}
})
}
function AddPDF(link, file, i){
var name = file;
var fileName;
if (year !== 100 && year !== -2){
fileName = fs.createWriteStream(__dirname + '/pdfs/20' + year + '/' + file + '_' + i + '.pdf');
} else if (year === 100){
fileName = fs.createWriteStream(__dirname + '/pdfs/recent100/' + file + '_' + i + '.pdf');
} else if (year === -2){
fileName = fs.createWriteStream(__dirname + '/pdfs/recent10/' + file + '_' + i + '.pdf');
}
request({
uri: link,
headers: {
'Host': 'phila.legistar.com',
'User-Agent': 'request'
}
}, function(err) {
if (err){
var errLink = {url: link, file: name, num: i}
var count = errors.push(errLink);
console.log('--------- Error: ' + count + ' ---------');
}
}).pipe(fileName);
}
The error that is being thrown is:
stream.js:94
throw er; // Unhandled stream error in pipe.
^
Error: getaddrinfo ENOTFOUND
at errnoException (dns.js:37:11)
at Object.onanswer [as oncomplete] (dns.js:124:16)
Note: I am handling the error, but the documents that give this error are piped through as blank when they should not be. I just can't figure out why the files are giving the error.
A few missing semicolons, somewhat inelegant use of != instead of !==, bad double-meaning use of file, but nothing to explain what you're seeing. I am guessing that year is somehow bad. Posting my version of your code here with some comments.
(I am posting here and not in a comment b/c can't post long code in comments):
function AddPDF(link, filename, i) {
var file;
if (year !== 100 && year !== -2) {
// probably want "!==" rather than "=="
file = fs.createWriteStream(__dirname + '/' + file + '_' + i + '.pdf');
} else {
// WHAT HAPPENS HERE?
// what is "file", that is later piped to?
// this may be your issue.
// Where is this "year" coming from? Can you show relevant code?
}
request({
uri: link,
headers: {
'Host': 'phila.legistar.com',
'User-Agent': 'request'
}
}, function(err) {
var errLink, count;
if (err) {
errLink = {url: link, file: filename, num: i};
count = errors.push(errLink);
console.log('--------- Error: ' + count + ' ---------');
}
}).pipe(file);
}
Finally, you still haven't provided the actual trace.
Good thing you posted the error... It seems like what you have is a DNS error. Hostname lookup is failing sometimes.
I suggest you print all of the URL's, without actually downloading them. If they all look OK, then it's a temporary DNS issue. If you have a local/close DNS server, check its logs. Otherwise, try changing the DNS to something robust, like 8.8.8.8 (Google's DNS), just for the test.
If that fails as well, for different URLs every time, then it's something in your local machine. One way around it is to try/catch this and re-try the offending URL.