// checkredir: reads URLs from stdin (one per line), opens each one in headless
// Chrome through puppeteer, and prints whether the page that finally loads is
// on a different host than the one that was requested.

const puppeteer = require('puppeteer');
const readline = require('readline');

// Upper bound on concurrently open pages, so a long URL list does not crash chrome.
const MAX_WORKERS = 20;

const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  // NOTE(review): the patch context ends right after `output`, so any further
  // options of the original call are not visible. `terminal: false` (treat
  // stdin as a plain stream of lines) is an assumption -- confirm.
  terminal: false,
});

const urls = []; // URLs read from stdin that no worker has picked up yet
const idleWorkers = []; // resolve callbacks of workers parked on an empty queue
let reading = true; // flips to false once stdin has been fully consumed

// Start queuing URLs right away; hand a URL directly to a parked worker if
// there is one, otherwise buffer it until a worker asks for it.
rl.on('line', (line) => {
  const url = line.trim();
  if (url === '') {
    return; // nothing to check on a blank line
  }
  const wake = idleWorkers.shift();
  if (wake) {
    wake(url);
  } else {
    urls.push(url);
  }
});

// End of input: release every parked worker so the pool can drain and exit.
rl.on('close', () => {
  reading = false;
  for (const wake of idleWorkers.splice(0)) {
    wake(null);
  }
});

/**
 * Take the next URL off the queue.
 *
 * Unlike a plain `while (urls.length)` loop, this waits while the queue is
 * empty but stdin is still open, so a worker never quits just because it ran
 * ahead of the input.
 *
 * @returns {Promise<string|null>} the next URL, or null once stdin is closed
 *   and the queue is empty.
 */
function nextUrl() {
  if (urls.length > 0) {
    return Promise.resolve(urls.shift());
  }
  if (!reading) {
    return Promise.resolve(null);
  }
  return new Promise((resolve) => {
    idleWorkers.push(resolve);
  });
}

/**
 * Load `url` in `page` and print whether it ended up on another host.
 *
 * Failures (bad URL, navigation error, ...) are reported and do not propagate,
 * so one bad URL does not stop the worker.
 *
 * @param {object} page - puppeteer page reused for every URL of one worker.
 * @param {string} url - the URL to check.
 * @returns {Promise<void>}
 */
async function checkRedirect(page, url) {
  try {
    await page.goto(url);
    const destination = await page.evaluate(() => ({
      hostname: document.location.hostname,
      href: document.location.href,
    }));

    // Compare hostnames on both sides: `URL.host` carries the port, so using
    // it here would flag every URL with an explicit port as a redirect.
    const requested = new URL(url);
    if (requested.hostname !== destination.hostname) {
      console.log(`${url} redirects to ${destination.href}`);
    } else {
      console.log(`${url} does not redirect`);
    }
  } catch (err) {
    console.log(`error checking ${url}`);
    // Keep the stdout line stable for consumers; the reason goes to stderr.
    console.error(`error checking ${url}: ${err instanceof Error ? err.message : String(err)}`);
  }
}

/**
 * One worker ("thread"): checks URLs one after another on a single page until
 * the input is exhausted.
 *
 * @param {object} browser - the shared puppeteer browser.
 * @returns {Promise<void>}
 */
async function worker(browser) {
  let url = await nextUrl();
  if (url === null) {
    return; // fewer URLs than workers: do not open a page for nothing
  }

  const page = await browser.newPage();
  try {
    while (url !== null) {
      await checkRedirect(page, url);
      url = await nextUrl();
    }
  } finally {
    try {
      await page.close();
    } catch (err) {
      // A crashed page may refuse to close; the browser shutdown cleans it up.
      console.error(`could not close page: ${err instanceof Error ? err.message : String(err)}`);
    }
  }
}

/**
 * Launch the browser once, run at most MAX_WORKERS workers, and close the
 * browser exactly once after every worker has finished.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const browser = await puppeteer.launch({ignoreHTTPSErrors: true});
  try {
    await Promise.all(Array.from({length: MAX_WORKERS}, () => worker(browser)));
  } finally {
    await browser.close();
  }
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});