I'm building a Puppeteer Node.js script that needs to pick up the pages in a URL redirect chain, including 301 and 302 redirects, as well as pages/sites that use setTimeout to navigate to a new page.
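By a setTimeout redirect I mean a page whose own script navigates after a delay, something like this (a made-up example of the pattern, not code from my script):

// page-side script that fires a JS redirect after a delay
setTimeout(() => {
  window.location.href = 'https://example.com/next-hop';
}, 3000);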
My current script implements await page.waitForNavigation(), which works well for picking up sites in the chain that redirect from one site to another via JS, but it doesn't pick up 301 or 302 redirects. So I also implemented page.on('request') and page.on('response') and combined the two, because I need the status code of each redirect as well.
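Stripped down to just the pieces I'm describing, the combination looks roughly like this (a simplified sketch of the full script further down; requestUrls and responseUrls are the arrays I collect into):

// enable interception so document requests in the redirect chain can be inspected
await page.setRequestInterception(true);

page.on('request', (request) => {
  // top-level document requests that are part of a redirect chain
  if (request.resourceType() === 'document' && request.redirectChain().length >= 1) {
    requestUrls.push({ url: request.url() });
  }
  request.continue();
});

page.on('response', (response) => {
  // keep the 301/302 hops so their status codes are available
  if ([301, 302].includes(response.status())) {
    responseUrls.push({ url: response.url(), type: response.status() });
  }
});

// picks up the JS-driven redirects (location changes fired from setTimeout etc.)
await page.waitForNavigation();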
The problem is that the 301 and 302 redirects seem to come before the starting URL. For example, given the following redirect chain journey:

In my code I seem to get the following order instead:

Why does this happen, and how can I build a reliable chain of page redirects that covers sites using 301/302 redirects as well as JS-based redirects? My current code is:

Apologies for the amount of code here; I've tried to keep it as compact as possible. I'm simply trying to assemble the collection of URLs in the order they should appear.
const dayjs = require('dayjs');
const AdvancedFormat = require('dayjs/plugin/advancedFormat');
dayjs.extend(AdvancedFormat);
const puppeteer = require('puppeteer');
const { config } = require('./config');
const helpers = require('./helpers');
const logs = require('./logs');
const runEmulation = async (body) => {
  logs.debug('starting emulation');

  // vars
  const journeyCodes = [301, 302];
  const journey = [], requestUrls = [], responseUrls = [];
  let mergedRequestResponses = [];
  let evalTimeout, evalMaxJourneyTimeout, hopDataToReturn;

  // launcher arg options
  const argOptions = [];

  // sandbox config
  if ((config.puppeteer.run_in_sandbox === 'true')) {
    argOptions.push('--no-sandbox');
  }

  // initiate a Puppeteer instance with options and launch
  const browser = await puppeteer.launch({
    args: argOptions,
    headless: (config.puppeteer.run_in_headless === 'true') ? true : false
  });

  // launch a new page
  logs.debug('launching new page');
  const page = await browser.newPage();

  // hold the page times, and start with the first page
  const times = [{
    loaded_at: dayjs().valueOf()
  }];

  logs.debug(`setting UserAgent to ${body.userAgent ? body.userAgent : 'unknown'}`);
  await page.setUserAgent(body.userAgent);

  logs.debug(`going to ${body.url ? body.url : 'unknown'}`);
  await page.goto(body.url);
  // expose a function to close the browser
  async function closeBrowser () {
    try {
      logs.debug('closing browser');
      await browser.close();
    } catch (err) {
      await logs.generateLog(err.message ? err.message : 'error thrown in closing browser', 'ERROR');
    }
  }
  // get loggable hop data
  function getHopDataLoggable (hops, deduped) {
    return {
      hops: hops ? hops : 1,
      all_hops: deduped ? deduped : [],
      storage: deduped.storage ? deduped.storage : [],
      destination: deduped.length > 1 ? deduped[deduped.length - 1].url : deduped[0].url,
      redirect_id: body.redirect_id ? body.redirect_id : null,
      user_agent: body.userAgent ? body.userAgent : 'Unavailable',
      date_iso: dayjs().toISOString(),
      date: dayjs().format('YYYY-MM-DD HH:mm:ss')
    }
  }
  // set hop data
  async function setHopData (deduped, body) {
    let hops = deduped.length - 1
    if (hops < 1) hops = 0

    hopDataToReturn = {
      hops: hops,
      hop_destination: deduped.length > 1 ? deduped[deduped.length - 1].url : deduped[0].url,
      all_hops: deduped ? deduped : [],
      redirect_id: body.redirect_id ? body.redirect_id : null,
      storage: deduped.storage ? deduped.storage : []
    }

    await logs.generateLog(JSON.stringify(getHopDataLoggable(hops, deduped)), 'LOG');
  }
  // calculate the duration spent on each hop
  function durationFromData (data, times) {
    for (const [index, redirect] of times.entries()) {
      if (index === 0) continue
      let pos = index - 1

      // calculate the difference
      const previous = dayjs(times[pos].loaded_at)
      const current = dayjs(redirect.loaded_at)

      // on page for
      const pageDurationInMs = current.diff(previous, 'millisecond')
      const pageDurationInSec = current.diff(previous, 'second')

      // update durations
      if (data[pos] != null && data[pos].duration_in_ms != null && data[pos].duration_in_sec != null) {
        data[pos].duration_in_ms = pageDurationInMs
        data[pos].duration_in_sec = pageDurationInSec
      } else {
        logs.debug('no index at data position or duration not available to set');
      }
    }

    return data
  }
  // clean data
  function cleanData (data) {
    for (const [index, redirect] of data.entries()) {
      if (redirect.duration_in_ms != null && redirect.duration_in_ms == 0) {
        delete redirect.duration_in_ms
      }
      if (redirect.duration_in_sec != null && redirect.duration_in_sec == 0) {
        delete redirect.duration_in_sec
      }
    }
    return data
  }
  // return our destinations after a defined time
  return new Promise((resolve, reject) => {
    // create a function to inject into the page to scrape data
    const scrapablePageData = async () => {
      /*
      ** Read localStorage
      */
      function getLocalStorage () {
        const values = [];
        const keys = Object.keys(localStorage);
        let index = keys.length;
        while (index--) {
          values.push({
            key: keys[index],
            value: localStorage.getItem(keys[index])
          });
        }
        return values ? values : [];
      }

      /*
      ** Read sessionStorage
      */
      function getSessionStorage () {
        const values = [];
        const keys = Object.keys(sessionStorage);
        let index = keys.length;
        while (index--) {
          values.push({
            key: keys[index],
            value: sessionStorage.getItem(keys[index])
          });
        }
        return values ? values : [];
      }

      return {
        localStorage: getLocalStorage(),
        sessionStorage: getSessionStorage(),
        url: window.location.href
      }
    }
    // scrape the page
    async function scrapePageUrl (runOnce = false) {
      try {
        if (!runOnce) {
          evalTimeout = setTimeout(() => {
            scrapePageUrl(true)
          }, parseInt(config.emulation.max_redirect_window * 1000))
        }

        const scrapable = await page.evaluate(scrapablePageData);
        const cookies = await page.cookies();

        await page.setRequestInterception(true);

        await page.on('request', async (request) => {
          try {
            if (request.resourceType() == 'document' && request.redirectChain().length >= 1) {
              requestUrls.push({
                url: request.url(),
                duration_in_ms: 0,
                duration_in_sec: 0,
                loaded_at: dayjs().valueOf()
              })
            }
            await request.continue()
          } catch (err) {}
        })

        await page.on('response', async (response) => {
          try {
            const status = response.status()
            if (status && journeyCodes.includes(status)) {
              responseUrls.push({
                url: response.url(),
                type: status,
                duration_in_ms: 0,
                duration_in_sec: 0
              })
            }
          } catch (err) {}
        })
        if (!runOnce) {
          await page.waitForNavigation();
        }

        // each page loaded at this time
        times.push({
          loaded_at: dayjs().valueOf()
        })

        journey.push({
          url: scrapable.url,
          storage: {
            cookies: cookies,
            local: scrapable.localStorage,
            session: scrapable.sessionStorage
          },
          type: 'JS redirection',
          duration_in_ms: 0,
          duration_in_sec: 0,
          loaded_at: dayjs().valueOf(),
          loaded_at_friendly: dayjs().format('YYYY-MM-DD HH:mm:ss')
        })

        if (!runOnce) {
          clearTimeout(evalTimeout)
          scrapePageUrl()
        }
        // final URL
        if (runOnce) {
          clearTimeout(evalMaxJourneyTimeout)

          // the last page loaded at this time
          times.push({
            loaded_at: dayjs().valueOf()
          })

          const dedupedReqUrls = helpers.dedupe(requestUrls, it => it.url)
          const dedupedResUrls = helpers.dedupe(responseUrls, it => it.url)

          // merge the intercepted requests with their matching responses
          for (const [index, mainHop] of dedupedReqUrls.entries()) {
            mergedRequestResponses.push({
              ...dedupedReqUrls[index],
              ...(dedupedResUrls.find((item) => item.url === dedupedReqUrls[index].url))
            })
          }

          // keep only the entries that picked up a status code
          mergedRequestResponses = mergedRequestResponses.filter(obj => Object.keys(obj).includes('type'))

          for (const [index, responseHop] of mergedRequestResponses.entries()) {
            journey.push(responseHop)
          }

          let deduped = helpers.dedupe(journey, it => it.url)
          deduped = durationFromData(deduped, times)
          deduped = cleanData(deduped)

          deduped.sort(function (a, b) {
            return new Date(a.loaded_at * 1000) - new Date(b.loaded_at * 1000);
          })

          console.log('=== DEDUPED - AFTER SORT ===')
          console.log(deduped)

          closeBrowser();
          setHopData(deduped, body);

          logs.debug('SUCCESS, returning hop data');
          resolve(hopDataToReturn);
        }
      } catch (err) {
        await logs.generateLog(err.message ? err.message : 'error thrown in scraping', 'ERROR');
      }
    }

    // max journey time failsafe
    evalMaxJourneyTimeout = setTimeout(async () => {
      closeBrowser();
      clearTimeout(evalTimeout)
      resolve(hopDataToReturn);
    }, parseInt(config.emulation.max_journey_time * 1000))

    // begin scraping pages
    scrapePageUrl()
  })
}
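For context, I call the function roughly like this (the field names are the ones the script reads from body above; the values are just placeholders):

const hopData = await runEmulation({
  url: 'https://example.com/start',                       // first URL of the journey
  userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', // applied via page.setUserAgent
  redirect_id: 123                                        // internal id passed through to the logs
});

console.log(hopData.hops, hopData.hop_destination);
console.log(hopData.all_hops);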