I'm scraping an external webpage and saving it to a file, but sometimes the page contains relative URLs, which I can't follow because the referenced files won't be found locally. I want to replace them with absolute URLs. I'm interested in a module or function that replaces all relative URLs in an HTML string with absolute ones. Any ideas?
var request = require('request');
var WEBSITE = 'http://somewebsite.com/';

// Fetch the page, rewrite its relative URLs to absolute ones, and print the result.
request.get(WEBSITE, function (error, response, body) {
  // On a failed request there is no usable body to rewrite, so report and stop.
  if (error) {
    console.error(error);
    return;
  }
  var absoluteBody = replace_all_relative_by_absolute(body);
  console.log(absoluteBody);
});
You can iterate over the links and add the domain yourself.
Using jQuery, install with:
npm install -S 'jquery@>=2.1'
npm install -S 'jsdom@latest'
Example implementation (barely tested):
var env = require('jsdom').env;
function addDomainToLinks(domain, html, callback) {
env(html, function (err, window) {
var $ = require('jquery')(window);
$.each($('a'), function(i, v) {
var href = $(v).attr('href');
// Match links starting with /, but not //
// You probably want to do handle './', and do the same for images, etc.
if (href.match(/^\/[^\/]/)) {
$(v).attr('href', domain + href);
}
});
callback($('html')[0].outerHTML);
});
}
Usage:
// Rewrite the root-relative anchors in `html`, then print the resulting document.
addDomainToLinks('http://example.com', html, function (rewrittenHtml) {
  console.log(rewrittenHtml);
});
In Python, import `urljoin` (shown here for Python 2; in Python 3 it lives in `urllib.parse`):
from urlparse import urljoin
and then call:
# Join file_url onto base_url using the urljoin function imported above.
urljoin(base_url, file_url)