I've made a scraping program using phantomjs for a website I frequently need to get information from and the problem was I would get these "Â" characters throughout the scraped data. Now if I write this data to a file, and in a separate node script I use iconv to encode to ascii and then use a regexp pattern to get rid of the resulting "?"'s, that works fine. But when I try to incorporate that same exact function inside my scraping program, I get this error:
ReferenceError: Can't find variable: Buffer
C:/javascript/Phantom/node_modules/iconv-lite/encodings/internal.js:4
C:/javascript/Phantom/node_modules/iconv-lite/encodings/internal.js:82
ReferenceError: Can't find variable: Buffer
C:/javascript/Phantom/node_modules/iconv-lite/encodings/sbcs-codec.js:20
C:/javascript/Phantom/node_modules/iconv-lite/lib/index.js:98 in getCodec
C:/javascript/Phantom/node_modules/iconv-lite/lib/index.js:16 in encode
siteScraper.js:72 in replaceDataAndWrite
siteScraper.js:58
This is the scraping program (with replaced url's and credentials for privacy):
var page = require('webpage').create();
var fs = require('fs');
var iconv = require('iconv-lite');
console.log('before page.open'),
url = "xxxxxxxxxxxxxxxxxxxxxxxxx",
url2 = "xxxxxxxxxxxxxxxxxxxxxxxx";
var credentials = {username: 'xxxxxxxxx', password: 'xxxxxxxx'}
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
console.log('inside page.open callback');
page.evaluate(function (credentials) {
document.querySelector('input[id=username]').value = credentials.username;
document.querySelector('input[id=password]').value = credentials.password;
document.querySelector('input[id=button_submit]').click();
console.log('finished querying selectors');
}, credentials);
window.setTimeout(function () {
page.render('postLogin.png');
console.log('rendered post-login');
page.open(url2, function (status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
console.log('INSIDE frame');
window.setTimeout(function () {
page.render('framePic.png');
console.log('rendered framePic.png');
var output = page.evaluate(function () {
var output;
// get all table data
tables = document.getElementsByTagName('table');
// go through table data
for(i=0; i < tables.length; i++) {
cells = tables[i].getElementsByTagName('td');
// assign data cells to output var
for (j=0; j < cells.length; j++) {
output += cells[j].innerText + '\n';
}
}
return output;
});
// get rid of /'s
while (output.indexOf("/") != -1)
{
replacedOutput = output.replace("/", "")
output = replacedOutput
}
// function call to encode output to ascii and replace a few things
replaceDataAndWrite(output);
// var path = 'scrapedData.txt';
// fs.write(path, output, 'w');
console.log('function complete');
phantom.exit();
}, 10000); // inner setTimeout
} // second else
});
}, 10000); // outer setTimeout
} // first else
}); // first page.open
function replaceDataAndWrite (data) {
encodedData = iconv.encode(data, 'ascii');
newData = encodedData.toString('ascii')
replacedData = newData.replace(/\?/gi,"")
// replacedData = replacedData.replace(/undefined/gi,"")
fs.writeFile('scrapedData.txt', replacedData, function (err) {
if (err) throw err;
});
}
I literally just copied and pasted the code from the following working program iconvTest.js to the function "replaceDataAndWrite()" in my scraping script (but taking the file read part out of it obviously):
var iconv = require('iconv-lite');
var fs = require('fs');
fs.readFile('scrapedData.txt', function(err, data) {
if (err) throw err;
encodedData = iconv.encode(data, 'ascii');
newData = encodedData.toString('ascii')
replacedData = newData.replace(/\?/gi,"")
// replacedData = replacedData.replace(/undefined/gi,"")
fs.writeFile('message.txt', replacedData, function (err) {
if (err) throw err;
});
});
Iconv works perfectly in this last example but not in my scraping program.. Is this a bug in iconv or is there some other explanation?