Error with apostrophe when using tagsoup

Question

Error with apostrophe when using tagsoup

I am attempting to grab the description of a website in a node.js web application. It mostly works; however, node.js appears to have issues with NCR characters (numeric character references, see http://en.wikipedia.org/wiki/Numeric_character_reference). The code I have for the link grabber is shown below.

# Fetches `url` and reports the page's image URLs, title and meta description.
#
# url - address of the page to fetch (handed straight to `request`).
#
# The outcome is delivered through `res`, which is not defined in this snippet
# and is assumed to be a callback available in the enclosing scope:
#   res(error: 'Invalid Link')                      on request/parse failure
#   res(images: [...], title: '', description: '')  on success
# Text values are passed on exactly as the parser returns them.
getInfo:(url) ->
  errorMessage = 'Invalid Link'
  # Node's built-in URL module; required inline because the `url` parameter
  # shadows the conventional module variable name.
  resolveUrl = require('url').resolve

  request(url, (error, response, body)->
    if (!error && response.statusCode == 200)
      handler = new htmlparser.DefaultHandler((err, dom) ->
        if (err)
          res(error: errorMessage)
        else
          imgs = select(dom, 'img')
          titletags = select(dom,'title')
          descripTags = select(dom,'meta')
          # `?.` guards against tags that carry no `attribs` object at all.
          filteredTags = _.filter(descripTags,(tag) -> tag.attribs?.name == 'description')

          # Final URL after any redirects; used as the base for relative srcs.
          uri = response.request.uri.href

          # Turns an <img> node into an absolute URL, or '' when it has no src.
          mapFunc =(imgSrc) ->
            img = imgSrc.attribs?.src
            if img then resolveUrl(uri, img) else ''

          res(
            images: _.filter(_.map(imgs,mapFunc),(img)-> (img != ''))
            # A page may have no <title>, or an empty one with no children.
            title: titletags[0]?.children?[0]?.raw || ''
            description: filteredTags[0]?.attribs.content || ''
          )
      )
      parser = new htmlparser.Parser(handler)
      parser.parseComplete(body)
    else
      res(error: errorMessage)
  )

As an example, if I put in the following URL to grab info from (http://www.zdnet.com), the description will be "ZDNet&#39;s breaking news, analysis, and research keeps business technology professionals in touch with the latest IT trends, issues and events." The apostrophe is the issue: it comes back as the literal text `&#39;` instead of `'`.

My question is: why aren't any of the libraries properly parsing the valid HTML NCRs and converting them to their string equivalents? And if there isn't a way to fix this, is it safe to simply replace all occurrences of NCRs using some other library?

The libraries I am using are described below

request = require 'request'
htmlparser = require 'htmlparser'
select = require('soupselect').select
_ = require 'underscore'

node.js
coffeescript
html-parsing
tag-soup

Answer 1

I ended up using the https://github.com/minchenkov/node-html-encoder library to decode the strings, and it worked fine. (I'm not sure why the node.js standard libraries don't HTML-decode strings by default.)