I am writing an API using node.js with express. Part of the API will allow users to POST large payloads of binary data (perhaps hundreds of MB) to be stored in the server database.
As it stands now, the express request handler does not get called until the entire upload has been received and buffered in memory on the server (req.body). Only then can it be saved to a database. There are two things I don't like about this. The first is that it takes a lot of server memory to hold all of that binary data at once. The second is that many storage systems like MongoDB and S3 support streaming writes, so there is no real need to have all the data in place before you start writing it, and no reason to wait around.
So my question is, can node (through express or some other way) be configured to start streaming to the database before the entire request has come in?
After further research, I have found that the native "http" module does in fact support streaming in the way I described. I'm not sure whether express exposes this directly. I would guess that it does, but for uploads you probably cannot use the bodyParser middleware, since it buffers the entire request body before your handler ever runs.
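For what it's worth, here is a rough, untested sketch of what that might look like in express. It assumes that req inside an express handler is still the underlying http.IncomingMessage and that no body-parsing middleware is mounted on the route; the /upload path is just a placeholder:

var express = require('express');
var app = express();

// Note: no bodyParser on this route, so the body is not buffered for us
app.post('/upload', function(req, res) {
  req.on('data', function(chunk) {
    // hand each chunk to the database driver as it arrives
  });
  req.on('end', function() {
    res.end();
  });
});

app.listen(3000);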
Anyway, here is some code that shows how you can stream an incoming request to MongoDB's GridFS:
var http = require('http');
var mongo = require('mongodb');

var db = new mongo.Db('somedb', new mongo.Server("localhost", 27017), { safe: true });

db.open(function(err) {
  if (err)
    console.log(err);

  http.createServer(function(req, res) {
    var numToSave = 0;     // writes handed to GridFS that have not completed yet
    var endCalled = false; // set once the request body has been fully received

    // Open a new GridFS file in write mode for this upload
    new mongo.GridStore(db, new mongo.ObjectID(), "w", { root: "fs", filename: "test" }).open(function(err, gridStore) {
      if (err)
        console.log(err);

      gridStore.chunkSize = 1024 * 256;

      // Each "data" event delivers one chunk of the request body;
      // write it to GridFS as soon as it arrives.
      req.on("data", function(chunk) {
        numToSave++;
        gridStore.write(chunk, function(err, gridStore) {
          if (err)
            console.log(err);
          numToSave--;
          // Only finish once the request has ended AND every pending write is done
          if (numToSave === 0 && endCalled)
            finishUp(gridStore, res);
        });
      });

      // "end" fires when the client has sent the entire body
      req.on("end", function() {
        endCalled = true;
        console.log("end called");
        if (numToSave === 0)
          finishUp(gridStore, res);
      });
    });
  }).listen(8000);
});

function finishUp(gridStore, res) {
  // Close the GridFS file (flushing its metadata) before responding
  gridStore.close(function(err) {
    if (err)
      console.log(err);
    res.end();
    console.log("finishing up");
  });
}
The gist is that the req object is actually a stream with "data" and "end" events. Every time a "data" event occurs, you write a chunk of data to mongo. When the "end" event occurs, you close the GridFS file and send out the response.
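To illustrate that point on its own: because req is a readable stream, it can be piped anywhere a writable stream is accepted. For example (just a sketch, with a made-up temp file path), you could dump an upload straight to disk:

var fs = require('fs');
var http = require('http');

http.createServer(function(req, res) {
  // req is a readable stream, so pipe it into any writable stream
  var out = fs.createWriteStream('/tmp/upload.bin');
  req.pipe(out);
  out.on('close', function() {
    res.end();
  });
}).listen(8001);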
There is some yuckiness related to coordinating all the different async activities. You don't want to close the GridFS file before you have had a chance to actually write out all the data. I achieve this with a counter and a boolean, but there might be a better way using some library or the stream machinery itself.
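One option I have not tried: newer Node versions (0.10+) let you subclass stream.Writable, which would let req.pipe() handle the ordering and backpressure instead of the counter and boolean. Something like this, assuming the same GridStore API as above:

var stream = require('stream');
var util = require('util');

function GridStoreStream(gridStore) {
  stream.Writable.call(this);
  this.gridStore = gridStore;
}
util.inherits(GridStoreStream, stream.Writable);

// The stream machinery calls _write for each chunk and waits for the
// callback, so writes are serialized without any manual bookkeeping.
GridStoreStream.prototype._write = function(chunk, encoding, callback) {
  this.gridStore.write(chunk, callback);
};

// Inside the request handler:
//   req.pipe(new GridStoreStream(gridStore)).on('finish', function() {
//     finishUp(gridStore, res);
//   });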