Finding the coordinates of an image in every HTML file in a directory?

I have a library of flat HTML files with similar image tags. How should I go through all of them and find the specific x, y coordinates on the page of a specific image tag?

I'm thinking that I'll need to either render each page as an image (replacing the image tag that I'm looking for with a specific color that I can then match on), or render the page headlessly with something like PhantomJS and find the coordinates that way (though I don't know if that will work). Any thoughts on which will be easier?

I'd prefer to use either a LAMP stack or Node.js.

Thanks!

I think using PhantomJS will be the easiest. No need for node.js.

You can combine examples/scandir.js and examples/phantomwebintro.js to get what you want.

// PhantomJS built-in modules: `fs` for filesystem access and `system` for
// the command-line arguments.
var fs = require('fs');
var system = require('system');

// Exactly one user-supplied argument (the directory to scan) is expected in
// addition to the first entry of system.args; otherwise print the usage line
// and request a non-zero exit status.
if (system.args.length !== 2) {
    console.log("Usage: phantomjs scandir.js DIRECTORY_TO_SCAN");
    phantom.exit(1);
}

/**
 * Recursively walks `path` and reports every regular file found.
 *
 * @param {string} path - File or directory to inspect.
 * @param {function(string)} cb - Called once per file with that file's path.
 *     Child paths are built by joining the parent path and the entry name
 *     with '/'.
 */
function scanDirectory(path, cb) {
    var entries;
    var index;
    var name;

    // A regular file is a leaf: report it and stop.
    if (fs.exists(path) && fs.isFile(path)) {
        cb(path);
        return;
    }

    // Anything that is neither a file nor a directory is ignored.
    if (!fs.isDirectory(path)) {
        return;
    }

    entries = fs.list(path);
    for (index = 0; index < entries.length; index += 1) {
        name = entries[index];
        // Skip the self/parent entries so the walk does not loop forever.
        if (name === "." || name === "..") {
            continue;
        }
        scanDirectory(path + '/' + name, cb);
    }
}

function parsePage(path) {
    var page = require('webpage').create();
    page.open(path, function(status) {
        if (status === "success") {
           page.includeJs("http://code.jquery.com/jquery-latest.js", function() {
               var images = page.evaluate(function() {
                    var images = [];
                    $('img').each(function() {
                        images.push({ src: $(this).attr('src'), pos: $(this).position() });
                    });
                    return images;
               });
               console.log(images);
           });
         }
    });
}

// Entry point: walk the directory named on the command line and hand every
// file that is found to parsePage.
var targetDirectory = system.args[1];
scanDirectory(targetDirectory, parsePage);

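This script (run as `phantomjs img.js kittens`) scans the given directory, loads every file in it and in its subdirectories (you can change this behavior in scanDirectory), finds all <img> tags on each page, and returns an array with their src attributes and their .position(). Note that jQuery's .position() is relative to the element's offset parent; if you need coordinates relative to the whole document, use .offset() instead.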

Took me about 20 minutes to get this to work, so I think this is the easiest way.