
Puppeteer page.on('request') fires before waitForNavigation picks up the redirect chain


I'm building a Puppeteer Node.js script that needs to pick up the pages in a URL redirect chain, including pages that redirect with a 301 or 302 as well as pages/sites that use setTimeout to navigate to a new page.
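
By a JS redirect I mean something along these lines running on the page itself (hypothetical snippet; the URL is a placeholder):

    // hypothetical page script: navigate away after a short delay
    setTimeout(() => {
      window.location.href = 'https://example.com/301-redirection-page';
    }, 2000);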

My current script implements await page.waitForNavigation, which works fine for picking up the sites in the chain that use a JS method to redirect from one site to another, but it does not pick up 301 or 302 redirects. So I implemented page.on('request') and page.on('response') as well and combined the two, because I also need the status code of each redirect.
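
Stripped of the surrounding logic, the combination I'm describing looks roughly like this (placeholder start URL, error handling omitted; the full script is further down):

    const puppeteer = require('puppeteer');

    (async () => {
      const browser = await puppeteer.launch();
      const page = await browser.newPage();

      await page.goto('https://example.com/starting-page');

      // document requests that are part of a redirect chain
      page.on('request', (request) => {
        if (request.resourceType() === 'document' && request.redirectChain().length >= 1) {
          console.log('request:', request.url());
        }
      });

      // 301/302 responses, to get the status codes
      page.on('response', (response) => {
        if ([301, 302].includes(response.status())) {
          console.log('response:', response.status(), response.url());
        }
      });

      // wait for the JS-driven navigation away from the start page
      await page.waitForNavigation();
      console.log('landed on:', page.url());

      await browser.close();
    })();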

The problem is that the 301 and 302 redirects seem to come through before the starting URL. For example, given the following redirect chain journey:

  • https://example.com/starting-page <-- uses JS to redirect
  • https://example.com/301-redirection-page
  • https://example.com/end-page <-- does not redirect

...in my code I seem to get the following order (a bare-bones way to watch this ordering is sketched after the list):

  • https://example.com/301-redirection-page
  • https://example.com/starting-page
  • https://example.com/end-page
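
The sketch I mean, using nothing beyond stock Puppeteer (placeholder start URL), is to timestamp every document-level event the moment it fires:

    const puppeteer = require('puppeteer');

    (async () => {
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      const seen = [];

      // stamp each document-level request as it fires
      page.on('request', (request) => {
        if (request.resourceType() === 'document') {
          seen.push({ at: Date.now(), event: 'request', url: request.url() });
        }
      });

      // stamp each document-level response along with its status code
      page.on('response', (response) => {
        if (response.request().resourceType() === 'document') {
          seen.push({ at: Date.now(), event: 'response', status: response.status(), url: response.url() });
        }
      });

      await page.goto('https://example.com/starting-page');
      await page.waitForNavigation();

      // print the events in arrival order
      console.table(seen);
      await browser.close();
    })();
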
Why does this happen, and how can I achieve a reliable page redirect chain that includes sites using 301s and 302s as well as sites redirecting with JS? My current code is below.

Apologies for the amount of code here; I've tried to keep it as compact as I can. All I'm trying to do is get the collection of URLs into the order in which they should appear.

    const dayjs = require('dayjs');
    const AdvancedFormat = require('dayjs/plugin/advancedFormat');
    dayjs.extend(AdvancedFormat);
    const puppeteer = require('puppeteer');
    const { config } = require('./config');
    const helpers = require('./helpers');
    const logs = require('./logs');
    const runEmulation = async (body) => {
      logs.debug('starting emulation');
      // vars
      const journeyCodes = [301, 302];
      const journey = [], requestUrls = [], responseUrls = [];
      let mergedRequestResponses = [];
      let evalTimeout, evalMaxJourneyTimeout, hopDataToReturn;
      // launcher arg options
      const argOptions = [];
      // sandbox config
      if (config.puppeteer.run_in_sandbox === 'true') {
        argOptions.push('--no-sandbox');
      }
      // initiate a Puppeteer instance with options and launch
      const browser = await puppeteer.launch({
        args: argOptions,
        headless: config.puppeteer.run_in_headless === 'true'
      });
      // launch a new page
      logs.debug('launching new page');
      const page = await browser.newPage();
      // hold the page times, and start with the first page
      const times = [{
        loaded_at: dayjs().valueOf()
      }];
      logs.debug(`setting UserAgent to ${body.userAgent ? body.userAgent : 'unknown'}`);
      await page.setUserAgent(body.userAgent);
      logs.debug(`going to ${body.url ? body.url : 'unknown'}`);
      await page.goto(body.url);
      // expose a function to close the browser
      async function closeBrowser () {
        try {
          logs.debug('closing browser');
          await browser.close();
        } catch (err) {
          await logs.generateLog(err.message ? err.message : 'error thrown in closing browser', 'ERROR');
        }
      }
      // get loggable hop data
      function getHopDataLoggable (hops, deduped) {
        return {
          hops: hops ? hops : 1,
          all_hops: deduped ? deduped : [],
          storage: deduped.storage ? deduped.storage : [],
          destination: deduped.length > 1 ? deduped[deduped.length - 1].url : deduped[0].url,
          redirect_id: body.redirect_id ? body.redirect_id : null,
          user_agent: body.userAgent ? body.userAgent : 'Unavailable',
          date_iso: dayjs().toISOString(),
          date: dayjs().format('YYYY-MM-DD HH:mm:ss')
        };
      }
      // set hop data
      async function setHopData (deduped, body) {
        let hops = deduped.length - 1
        if (hops < 1) hops = 0
        hopDataToReturn = {
          hops: hops,
          hop_destination: deduped.length > 1 ? deduped[deduped.length - 1].url : deduped[0].url,
          all_hops: deduped ? deduped : [],
          redirect_id: body.redirect_id ? body.redirect_id : null,
          storage: deduped.storage ? deduped.storage : []
        };
        await logs.generateLog(JSON.stringify(getHopDataLoggable(hops, deduped)), 'LOG');
      }
      // duration function: work out how long each hop was on screen
      function durationFromData (data, times) {
        for (const [index, redirect] of times.entries()) {
          if (index === 0) continue
          let pos = index - 1
          // calculate the difference
          const previous = dayjs(times[pos].loaded_at)
          const current = dayjs(redirect.loaded_at)
          // on page for
          const pageDurationInMs = current.diff(previous, 'millisecond')
          const pageDurationInSec = current.diff(previous, 'second')
          // update durations
          if (data[pos] != null && data[pos].duration_in_ms != null && data[pos].duration_in_sec != null) {
            data[pos].duration_in_ms = pageDurationInMs
            data[pos].duration_in_sec = pageDurationInSec
          } else {
            logs.debug('no index at data position or duration not available to set');
          }
        }
        return data;
      }
      // clean data
      function cleanData (data) {
        for (const [index, redirect] of data.entries()) {
          if (redirect.duration_in_ms != null && redirect.duration_in_ms === 0) {
            delete redirect.duration_in_ms;
          }
          if (redirect.duration_in_sec != null && redirect.duration_in_sec === 0) {
            delete redirect.duration_in_sec;
          }
        }
        return data;
      }
      // return our destinations after a defined time
      return new Promise((resolve, reject) => {
        // create a function to inject into the page to scrape data
        const scrapablePageData = async () => {
          // read localStorage
          function getLocalStorage () {
            const values = [];
            const keys = Object.keys(localStorage);
            let index = keys.length;
            while (index--) {
              values.push({
                key: keys[index],
                value: localStorage.getItem(keys[index])
              });
            }
            return values ? values : [];
          }
          // read sessionStorage
          function getSessionStorage () {
            const values = [];
            const keys = Object.keys(sessionStorage);
            let index = keys.length;
            while (index--) {
              values.push({
                key: keys[index],
                value: sessionStorage.getItem(keys[index])
              });
            }
            return values ? values : [];
          }
          return {
            localStorage: getLocalStorage(),
            sessionStorage: getSessionStorage(),
            url: window.location.href
          };
        };
        // scrape the page
        async function scrapePageUrl (runOnce = false) {
          try {
            if (!runOnce) {
              evalTimeout = setTimeout(() => {
                scrapePageUrl(true)
              }, parseInt(config.emulation.max_redirect_window * 1000));
            }
            const scrapable = await page.evaluate(scrapablePageData);
            const cookies = await page.cookies();
            await page.setRequestInterception(true);
            page.on('request', async (request) => {
              try {
                if (request.resourceType() === 'document' && request.redirectChain().length >= 1) {
                  requestUrls.push({
                    url: request.url(),
                    duration_in_ms: 0,
                    duration_in_sec: 0,
                    loaded_at: dayjs().valueOf()
                  });
                }
                await request.continue();
              } catch (err) {}
            });
            page.on('response', async (response) => {
              try {
                const status = response.status()
                if (status && journeyCodes.includes(status)) {
                  responseUrls.push({
                    url: response.url(),
                    type: status,
                    duration_in_ms: 0,
                    duration_in_sec: 0
                  });
                }
              } catch (err) {}
            });
            if (!runOnce) {
              await page.waitForNavigation();
              // each page loaded at this time
              times.push({
                loaded_at: dayjs().valueOf()
              });
            }
            journey.push({
              url: scrapable.url,
              storage: {
                cookies: cookies,
                local: scrapable.localStorage,
                session: scrapable.sessionStorage
              },
              type: 'JS redirection',
              duration_in_ms: 0,
              duration_in_sec: 0,
              loaded_at: dayjs().valueOf(),
              loaded_at_friendly: dayjs().format('YYYY-MM-DD HH:mm:ss')
            });
            if (!runOnce) {
              clearTimeout(evalTimeout);
              scrapePageUrl();
            }
            // final URL
            if (runOnce) {
              clearTimeout(evalMaxJourneyTimeout)
              // the last page loaded at this time
              times.push({
                loaded_at: dayjs().valueOf()
              });
              const dedupedReqUrls = helpers.dedupe(requestUrls, it => it.url);
              const dedupedResUrls = helpers.dedupe(responseUrls, it => it.url);
              // merge each 301/302 response into the matching request entry to attach its status code
              for (const [index, mainHop] of dedupedReqUrls.entries()) {
                mergedRequestResponses.push({
                  ...dedupedReqUrls[index],
                  ...(dedupedResUrls.find((item) => item.url === dedupedReqUrls[index].url))
                });
              }
              // keep only hops that picked up a redirect status code ('type')
              mergedRequestResponses = mergedRequestResponses.filter(obj => Object.keys(obj).includes('type'));
              for (const [index, responseHop] of mergedRequestResponses.entries()) {
                journey.push(responseHop);
              }
              let deduped = helpers.dedupe(journey, it => it.url);
              deduped = durationFromData(deduped, times);
              deduped = cleanData(deduped);
              // order the hops by when they were recorded
              deduped.sort(function (a, b) {
                return new Date(a.loaded_at * 1000) - new Date(b.loaded_at * 1000);
              });
              console.log('=== DEDUPED - AFTER SORT ===')
              console.log(deduped)
              closeBrowser();
              setHopData(deduped, body);
              logs.debug('SUCCESS, returning hop data');
              resolve(hopDataToReturn);
            }
          } catch (err) {
            await logs.generateLog(err.message ? err.message : 'error thrown in scraping', 'ERROR');
          }
        }
        // max journey time failsafe
        evalMaxJourneyTimeout = setTimeout(async () => {
          closeBrowser();
          clearTimeout(evalTimeout)
          resolve(hopDataToReturn);
        }, parseInt(config.emulation.max_journey_time * 1000))
        // begin scraping pages
        scrapePageUrl()
      });
    };
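
For context, runEmulation is invoked elsewhere with a body shaped roughly like this (the shape is assumed from the fields the script reads; all values are placeholders):

    runEmulation({
      url: 'https://example.com/starting-page',
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
      redirect_id: null
    }).then((hopData) => console.log(hopData));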