I am trying to index a large file (1,500,000 lines) and push it to Elasticsearch. To do this, I am using Node.js streams; however, I keep running out of memory. What am I doing wrong?
var fs = require('fs');
var readline = require('readline');

var rl = null;

initialize(function() {
    var stream = fs.createReadStream(process.argv[2]);
    rl = readline.createInterface({ input: stream, terminal: false });

    var i = 0;
    rl.on('line', function(line) {
        // Pause reading until the current line has been indexed.
        rl.pause();
        processObject(++i, extractObject(line));
    });
    rl.on('close', function() {
        console.log('\nRefreshed index;');
        process.exit();
    });
});

function processObject(number, input) {
    client.index({
        index: INDEX,
        type: TYPE,
        id: number,
        body: input
    }, function(error, response) {
        // Resume reading once Elasticsearch has acknowledged the document.
        rl.resume();
        if (number % 1000 === 0) process.stdout.write('.');
    });
}
OK, so here is the solution. The code I wrote was fine; the problem was with the readline package. rl.pause() was not actually pausing the line reads: readline pauses the underlying stream but keeps emitting 'line' events for data it has already buffered, so the pending index callbacks pile up in memory. I solved it by switching to the line-by-line package, which exposes the same interface but honors pause(). With the same code, the process runs within 60 MB.
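For reference, here is a minimal sketch of the same loop using line-by-line, assuming the same client, INDEX, TYPE, extractObject, and initialize helpers from the question. With this package, pause() stops further 'line' events until resume() is called, so at most one line is in flight at a time:

var LineByLineReader = require('line-by-line');

initialize(function() {
    var lr = new LineByLineReader(process.argv[2]);
    var i = 0;

    lr.on('line', function(line) {
        // Unlike readline, pause() here stops further 'line' events.
        lr.pause();
        processObject(++i, extractObject(line));
    });

    lr.on('end', function() {
        console.log('\nRefreshed index;');
        process.exit();
    });

    function processObject(number, input) {
        client.index({
            index: INDEX,
            type: TYPE,
            id: number,
            body: input
        }, function(error, response) {
            // Read the next line only after this document is acknowledged.
            lr.resume();
            if (number % 1000 === 0) process.stdout.write('.');
        });
    }
});

The pause/resume pair is doing the backpressure work here: memory stays flat because the reader never gets ahead of Elasticsearch, which is exactly what the original readline version failed to guarantee.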