Making a while loop asynchronous in Node.js

There are several similar questions here on Stack Overflow, but I can't get any of the answers working for me. I'm completely new to Node and to asynchronous programming, so please bear with me.

I'm building a scraper that currently has a 4 step process:

  1. I give it a collection of links
  2. It goes to each of those links and finds every relevant img src on the page
  3. It finds the "next page" link, gets its href, retrieves the DOM from that href, and repeats step 2
  4. All of the img src values are collected into an array and returned

Here's the code. getLinks can be called asynchronously, but the while loop inside it currently cannot work that way:

var request = require('request');
var cheerio = require('cheerio');

function scrape(url, oncomplete) {
    console.log("Scrape Function: " + url);
    request(url, function(err, resp, body) {
        if (err) {
            console.log("UHOH");
            throw err;
        }
        var html = cheerio.load(body);
        oncomplete(html);
    });
}
function getLinks(url, prodURL, baseURL, next_select) {
    var urls = [];
    while (url) {
        console.log("GetLinks Indexing: " + url);
        scrape(url, function(data) {
            var $ = data;
            $(prodURL).each(function() {
                var theHref = $(this).attr('href');
                urls.push(baseURL + theHref);
            });
            var next = $(next_select).first().attr('href');
            url = next ? baseURL + next : null;
        });
    }
    console.log(urls);
    return urls;
}

At present this goes into an infinite loop without scraping anything. If I move url = next ? baseURL + next : null; outside of the callback, I get a "next is not defined" error.

Any ideas on how I can rework this to make it Node-friendly? It seems like, by this problem's very nature, it needs to be blocking, no?

This is a frequent pattern: you want to perform a loop, but each iteration depends on an asynchronous function with a callback. Since you cannot block while waiting for an asynchronous function to finish, a plain while loop will not work: it keeps spinning without ever yielding to the event loop, so the callback that is supposed to end it never gets a chance to run.
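To make that concrete, here is a stripped-down sketch of what the question's loop boils down to (fetchNextUrl is a hypothetical stand-in for the cheerio parsing):

while (url) {
    request(url, function(err, resp, body) {
        // this callback can never run: the event loop is blocked
        // by the very while loop that is waiting on it
        url = fetchNextUrl(body); // hypothetical parse-the-next-link helper
    });
    // url is unchanged here, so the condition never becomes false
}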

One solution is to use a "stack" (really just an array). Fill it with the initial elements you want to process; as you discover more elements, push them onto the stack. Then have your function call itself recursively, incrementing the index to process, until the index passes the end of the array.

e.g. (sketched here with the request and cheerio calls from the question; the 'a' selector is a placeholder for whatever matches your links):

function do_scrape( stack, this_url, callback ) {
    // get list of URLs from webpage at this_url
    request( this_url, function ( err, resp, body ) {
        if ( !err ) {
            var $ = cheerio.load( body );
            $( 'a' ).each( function () {                // placeholder selector
                stack.push( $( this ).attr( 'href' ) ); // adding new element to array
            } );
        }
        callback(); // process callback
    } );
}

function process_stack( stack_of_urls, idx ) {
    var this_url = stack_of_urls[idx];

    do_scrape(
        stack_of_urls,
        this_url,
        function () {
            if ( idx + 1 < stack_of_urls.length ) {
                process_stack( stack_of_urls, (idx + 1) ); // move on to the next URL
            } else {
                process.exit( 0 ); // nothing left to process
            }
        }
    );
}

var stack_of_urls = [ "http://www.yahoo.com/" ];
process_stack( stack_of_urls, 0 ); // start from index zero

Note that there are many ways of approaching this. For efficiency's sake you could remove processed elements from the stack instead of keeping an index. You also have a choice of whether to process the stack from beginning to end or from end to beginning. Finally, note that if do_scrape never actually calls an asynchronous function, the callbacks all fire synchronously in a tight recursive loop, and node.js will abort with a "Maximum call stack size exceeded" error once the call stack grows too deep.
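For example, here is a minimal sketch (assuming the do_scrape above) of a variant that consumes the queue from the front and uses setImmediate to keep the call stack flat even if a callback ever fires synchronously:

function process_queue( queue, done ) {
    if ( queue.length === 0 ) {
        return done(); // nothing left to process
    }
    var this_url = queue.shift(); // remove the processed element up front

    do_scrape( queue, this_url, function () {
        // setImmediate defers the recursive call to the next turn of the
        // event loop, so synchronous callbacks cannot grow the call stack
        setImmediate( function () {
            process_queue( queue, done );
        } );
    } );
}

process_queue( [ "http://www.yahoo.com/" ], function () {
    console.log( "all pages scraped" );
} );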