Код: Выделить всё
//**************************************** constants
// Numeric error codes that scrape() throws back to its caller.
// Thrown when the page responds with HTTP 404.
const ERROR_CODE_URL_INVALID = 1002;
// Thrown when the page responds with HTTP 500.
const ERROR_INTERNAL_SERVER_ERROR = 1003;
// Thrown when loading the page fails with a TimeoutError.
const ERROR_TIMEOUT = 1004;
//**************************************** scrape
/**
 * Load `url` in `page` and return the outer HTML of the document's root element.
 *
 * @param {object} page - Puppeteer-style page object; must provide `goto`,
 *   `waitForResponse`, `mouse.click` and `evaluate`.
 * @param {string} url - Address to navigate to.
 * @returns {Promise<string>} Outer HTML of the first element matched by `*`.
 * @throws {number} ERROR_CODE_URL_INVALID when the response status is 404,
 *   ERROR_INTERNAL_SERVER_ERROR when it is 500, ERROR_TIMEOUT when loading
 *   fails with an error named `TimeoutError`.
 * @throws {Error} For any other non-200 status; every other error raised while
 *   loading is rethrown unchanged.
 */
const scrape = async (page, url) => {
  // error code to hand back to the caller, if loading fails in a known way
  let errorContent = null;
  try {
    // deliberately tiny timeout value, to demonstrate problem
    let response = await page.goto(url, { waitUntil: 'load', timeout: 100 });
    if (!response) {
      // sometimes the response is not returned immediately, so we need to wait for it
      response = await page.waitForResponse(() => true, { timeout: 100 });
    }

    const status = response.status();
    if (status === 404) {
      errorContent = ERROR_CODE_URL_INVALID;
    } else if (status === 500) {
      errorContent = ERROR_INTERNAL_SERVER_ERROR;
    } else if (status !== 200) {
      // Throw a real Error (not the `response.text` method reference) so the
      // caller gets a message and a stack trace.
      throw new Error(`Unexpected HTTP status ${status} for ${url}`);
    }
  } catch (error) {
    // Match on the error's name instead of `instanceof TimeoutError`: the class
    // is not in scope in this file, so referencing it raised a ReferenceError
    // that replaced the original error.
    if (error?.name === 'TimeoutError') {
      errorContent = ERROR_TIMEOUT;
    } else {
      // not a timeout: pass the original error through untouched
      throw error;
    }
  }

  // now throw whatever error code we collected above
  if (errorContent !== null) {
    throw errorContent;
  }

  // Click the top-left corner to hopefully dismiss any popups in the way;
  // awaited so the click finishes before the HTML is captured and so a
  // failure surfaces here rather than as an unhandled rejection.
  await page.mouse.click(1, 1);

  // get the page HTML and return it
  return page.evaluate(() => document.querySelector('*').outerHTML);
}; // scrape()
//**************************************** main
// Browser handle lives at module level so the cleanup step can reach it
// even when start-up fails part-way through.
let browser;
(async function main() {
  try {
    // puppeteer-extra wraps the installed puppeteer and adds plugin support
    // (to run without stealth, require plain 'puppeteer' here instead)
    const puppeteer = require('puppeteer-extra');

    // register the stealth plugin with its default set of evasions
    const stealthPlugin = require('puppeteer-extra-plugin-stealth');
    puppeteer.use(stealthPlugin());

    // launch a visible browser and take its first open page
    browser = await puppeteer.launch({ headless: false });
    const openPages = await browser.pages();
    const page = openPages[0];

    // scrape the target page and print its HTML
    const html = await scrape(page, 'https://www.scrapethissite.com/pages/');
    console.log(html);
  } catch (err) {
    // report any failure (including numeric codes thrown by scrape)
    console.log(err);
  } finally {
    // always shut the browser down if it was started
    await browser?.close();
  }
})();
Подробнее здесь: https://stackoverflow.com/questions/796 ... ot-defined
Мобильная версия