I've written a scraper using the scrap module and the queue function of the async module.
I read the list of URLs to scrape from a JSON file and write the information to another JSON file.
This is my script:
var fs = require("fs");
var scrap = require("scrap"),
async = require("async");
var errors = [];
// Queue a list of URLs
var queue = JSON.parse(fs.readFileSync("products.json", "utf8"));
var len = queue.products.length;
var q = async.queue(function (url, done) {
scrap(url, function(err, $) {
var product = {};
product.name = $("#page-body h2").first().text().trim();
product.myarr = [];
product.picture = $(".content img").first().attr("src");
try {
if (product.picture.indexOf("someword") > 1) {
delete product.picture;
}
}
catch (e) {
console.error(e);
}
$(".content [style^=\"color: #\"] [style=\"font-weight: bold\"], .content [style=\"font-weight: bold\"] [style^=\"color: #\"]").each(function() {
product.myarr.push($(this).text().trim().toLowerCase());
});
if (product.myarr.length) {
fs.appendFile("products-parsed.json", JSON.stringify(product) + ",\n", function (err) {
console.log(queue.products.indexOf(url), len, err);
if (err) { errors.push(queue.products.indexOf(url)); }
done();
});
}
});
}, 20);
q.drain = function() {
console.log(errors);
};
q.push(queue.products);
When I run it, it stops (quits) after about 3,000 pages without giving any error, and I have to restart from the last worked page using:
q.push(queue.products.slice(lastWorkedPage, queue.products.length - 1));
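(lastWorkedPage isn't computed anywhere in the script above; just for clarity, here is a rough sketch of one way it could be recorded, with the progress file name being my own choice, and keeping in mind that with a concurrency of 20 the completion order is only approximate:)

// hypothetical progress file, written after each product has been appended:
fs.writeFile("last-worked-page.txt", String(queue.products.indexOf(url) + 1), function () {});

// on the next run, resume from the recorded index:
var lastWorkedPage = parseInt(fs.readFileSync("last-worked-page.txt", "utf8"), 10) || 0;
q.push(queue.products.slice(lastWorkedPage));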
How can I fix this problem?
Not sure why, but it seems the problem was caused by this line:
console.log(queue.products.indexOf(url), len, err);
Commenting it out solved the problem; feel free to give a more accurate answer that explains the solution and I'll mark it as accepted.
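For what it's worth, here is a rough sketch of how the same index could be logged without scanning queue.products on every page, by pushing the index together with each URL (the task shape and names are my own):

// build tasks that carry their own index
var tasks = queue.products.map(function (url, i) {
    return { url: url, index: i };
});

var q = async.queue(function (task, done) {
    scrap(task.url, function (err, $) {
        // ...same parsing as above, using task.url instead of url...
        console.log(task.index, len, err); // no indexOf() scan per page
        done();
    });
}, 20);

q.push(tasks);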