I am attempting to grab the description of a website in a node.js web application. It seems to be working perfectly, however node.js appears to be having issues with the NCR characters (http://en.wikipedia.org/wiki/Numeric_character_reference). The code I have for the link grabber is shown below
getInfo:(url) ->
errorMessage = 'Invalid Link'
request(url, (error, response, body)->
if (!error && response.statusCode == 200)
handler = new htmlparser.DefaultHandler((err, dom) ->
if (err)
res(error: errorMessage)
else
imgs = select(dom, 'img')
titletags = select(dom,'title')
descripTags = select(dom,'meta')
filteredTags = _.filter(descripTags,(tag) -> tag.attribs.name? && tag.attribs.name == 'description')
uri = response.request.uri.href
mapFunc =(imgSrc) ->
pattern = /^((http|https|ftp):\/\/)/
img = imgSrc.attribs.src
if (!pattern.test(img)) then uri.substring(0,uri.length-1) + img else img
res(
images: _.filter(_.map(imgs,mapFunc),(img)-> (img != '')) || []
title: titletags[0].children[0].raw || ''
description: if filteredTags.length != 0 then filteredTags[0].attribs.content || '' else ''
)
)
parser = new htmlparser.Parser(handler)
parser.parseComplete(body)
else
res(error: errorMessage)
)
As an example, if I put in the following URL for to grab info form (http://www.zdnet.com), the description will be ZDNet's breaking news, analysis, and research keeps business technology professionals in touch with the latest IT trends, issues and events.. The apostrophe is the issue (being represented as ')
My question is, why aren't any of the libraries properly parsing the valid HTML NCR's and converting them to the string equivalent, and if there isn't a way to fix this, is it just safe to replace all occurrences of NCR's using some other library?
The libraries I am using are described below
request = require 'request'
htmlparser = require 'htmlparser'
select = require('soupselect').select
_ = require 'underscore'
Ended up using the https://github.com/minchenkov/node-html-encoder library to decode the strings, worked fine (not sure why the node.js standard libraries don't html decode strings by default)