I'm trying to scrape some content from a website, and I think a CAPTCHA is blocking the request.
I'm setting a custom user agent with the user-agents package, but it still does not work.
Here is the code:
// helper.js
const puppeteer = require('puppeteer');
const userAgent = require('user-agents');
/**
 * Loads a URL in a headless browser and returns the rendered page markup.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<string|null>} The outerHTML of the document's first
 *   element once the network is idle, or null if any step failed (the error
 *   is logged to the console).
 */
async function getDynamicPageHtml(url) {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    // The `user-agents` export is a class: an instance must be created to
    // obtain a user-agent string. Calling toString() on the class itself
    // yields the constructor's source text instead.
    await page.setUserAgent(new userAgent().toString());
    await page.goto(url, { waitUntil: 'networkidle0' });
    return await page.evaluate(() => document.querySelector('*').outerHTML);
  } catch (err) {
    console.error(err);
    return null;
  } finally {
    // Always shut the browser down, even when navigation or evaluation
    // failed, so no browser process is left running.
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        console.error(closeErr);
      }
    }
  }
}
// Public API of helper.js.
module.exports = { getDynamicPageHtml };
// idealista.js
const cheerio = require('cheerio');
const browser = require('./helper');
/**
 * Fetches the idealista listing page and extracts the text of every listing.
 *
 * @returns {Promise<string[]>} Trimmed text of each matched article element;
 *   an empty array when the page HTML could not be retrieved.
 */
async function getData() {
  const html = await browser.getDynamicPageHtml('https://www.idealista.com/alquiler-habitacion/madrid/chamberi/con-precio-hasta_450,compartido-2-personas/?ordenado-por=fecha-publicacion-desc&ordenado-por=fecha-publicacion-desc');
  console.log(html);
  // The helper resolves to null when loading the page failed; there is
  // nothing to parse in that case.
  if (html == null) {
    console.error('getData: no HTML was returned, skipping parse');
    return [];
  }
  const $ = cheerio.load(html);
  const announce = $('#main-content > section > article')
    .map((index, element) => $(element).text().trim())
    .toArray();
  announce.forEach((element, index) => {
    // do stuff
  });
  return announce;
}
// Public API of idealista.js.
module.exports = { getData };
// app.js
const express = require('express');
const idealista = require('./idealista');
// Build the Express app: the port comes from the PORT environment variable
// (falling back to 3000), and JSON / URL-encoded body parsers are installed.
const app = express();
app.set('port', process.env.PORT || 3000);
app.use(express.json());
app.use(express.urlencoded({ extended: true }));

// Start listening, then run the scraper once the server is up.
app.listen(app.get('port'), async () => {
  console.log('server on port ', app.get('port'));
  try {
    await idealista.getData();
  } catch (err) {
    // Nothing awaits this callback, so a rejection from getData would
    // otherwise surface as an unhandled promise rejection.
    console.error(err);
  }
});
This is the output when I debug puppeteer content:
question from:
https://stackoverflow.com/questions/65890873/avoiding-captcha-with-puppeteer-and-nodejs 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…