I am attempting to grab the description of a website in a node.js web application. It seems to be working perfectly, however node.js appears to be having issues with the NCR characters (http://en.wikipedia.org/wiki/Numeric_character_reference). The code I have for the link grabber is shown below
getInfo:(url) ->
errorMessage = 'Invalid Link'
request(url, (error, response, body)->
if (!error && response.statusCode == 200)
handler = new htmlparser.DefaultHandler((err, dom) ->
if (err)
res(error: errorMessage)
else
imgs = select(dom, 'img')
titletags = select(dom,'title')
descripTags = select(dom,'meta')
filteredTags = _.filter(descripTags,(tag) -> tag.attribs.name? && tag.attribs.name == 'description')
uri = response.request.uri.href
mapFunc =(imgSrc) ->
pattern = /^((http|https|ftp):\/\/)/
img = imgSrc.attribs.src
if (!pattern.test(img)) then uri.substring(0,uri.length-1) + img else img
res(
images: _.filter(_.map(imgs,mapFunc),(img)-> (img != '')) || []
title: titletags[0].children[0].raw || ''
description: if filteredTags.length != 0 then filteredTags[0].attribs.content || '' else ''
)
)
parser = new htmlparser.Parser(handler)
parser.parseComplete(body)
else
res(error: errorMessage)
)
As an example, if I put in the following URL for to grab info form (http://www.zdnet.com), the description will be ZDNet's breaking news, analysis, and research keeps business technology professionals in touch with the latest IT trends, issues and events.
. The apostrophe is the issue (being represented as '
)
My question is, why aren't any of the libraries properly parsing the valid HTML NCR's and converting them to the string equivalent, and if there isn't a way to fix this, is it just safe to replace all occurrences of NCR's using some other library?
The libraries I am using are described below
request = require 'request'
htmlparser = require 'htmlparser'
select = require('soupselect').select
_ = require 'underscore'
Ended up using the https://github.com/minchenkov/node-html-encoder library to decode the strings, worked fine (not sure why the node.js standard libraries don't html decode strings by default)