I'm using the Node.js XML parser sax-js to get content from XML. The structure of the XML is as follows:
<item>
<title>Some title</title>
<guid isPermaLink="false">http://example.com</guid>
</item>
<item>
<title>VIDEO: Some title</title>
<guid isPermaLink="false">http://example1.com</guid>
</item>
I want the URLs under guid for every item whose title does not start with VIDEO.
Currently, it's giving me all the urls.
My code currently is:
'use strict';
var sax = require('sax-js');
var request = require('request');

var href = 'http://some-xml-url.xml';

// Every guid text captured so far.
var urls = [];

// True from the moment a <guid isPermaLink="false"> opens until the next
// text event has been consumed.
var isTextPending = false;

var saxStream = sax.createStream(true);

saxStream.on('error', function (err) {
  console.error(err);
});

// Stores the first text event seen after a matching <guid> was opened.
// Note: the title is never inspected here, so every guid URL is collected.
saxStream.ontext = function (chunk) {
  if (!isTextPending) {
    return;
  }
  isTextPending = false;
  urls.push(chunk);
};

// Arms the text handler whenever a <guid isPermaLink="false"> tag opens.
saxStream.on('opentag', function (tag) {
  var isGuid = tag.name === 'guid';
  if (isGuid && tag.attributes.isPermaLink === 'false') {
    isTextPending = true;
  }
});

// Assigns an empty function to saxStream.end.
saxStream.end = function () {};

request(href).pipe(saxStream);
You will need to handle more states than just 'isTextPending'.
Here is an example (note that this also handles the 'closetag' event to exclude text between tags from the processing).
'use strict';
var sax = require('sax-js');
var request = require('request');

var href = 'http://some-xml-url.xml';

// guid URLs of all items whose title does not start with "VIDEO".
var urls = [];

// Name of the element whose text is currently being read. It is cleared on
// every closing tag so that text between sibling elements is ignored.
var tagName;

// Per-item state; reset each time an <item> opens.
var isValidGuid = false;   // the last <guid> seen had isPermaLink="false"
var isValidTitle = false;  // a <title> was seen and does not start with "VIDEO"
var guidUrl;               // text of the accepted <guid>, if any

var saxStream = sax.createStream(true);

saxStream.on('error', function (e) {
  console.error(e);
});

// Record the pieces of the current item. The decision whether to keep the
// URL is deferred to </item>, so it does not depend on the order of <title>
// and <guid> and each item is evaluated exactly once.
saxStream.on('text', function (text) {
  if (tagName === 'guid' && isValidGuid) {
    guidUrl = text;
  } else if (tagName === 'title') {
    isValidTitle = text.indexOf('VIDEO') !== 0;
  }
});

saxStream.on('opentag', function (node) {
  tagName = node.name;
  switch (node.name) {
    case 'guid':
      isValidGuid = node.attributes.isPermaLink === 'false';
      break;
    case 'item':
      // Start of a new item: forget everything about the previous one.
      isValidGuid = false;
      isValidTitle = false;
      guidUrl = undefined;
      break;
  }
});

// sax passes the name of the closed tag (a string) to 'closetag' listeners.
saxStream.on('closetag', function (name) {
  tagName = undefined;
  if (name === 'item' && guidUrl !== undefined && isValidTitle) {
    urls.push(guidUrl);
  }
});

// Listen for the 'end' event instead of assigning to saxStream.end:
// overwriting end() replaces the method that pipe() calls to finish the
// stream, so the underlying parser would never be told the input is done.
saxStream.on('end', function () {
  console.log('Result: ' + JSON.stringify(urls));
});

request(href).pipe(saxStream);