I have built a puppeteer web scraper which takes a query, scrapes the result and saves it in a json. I want to create a process/pipeline to do this for >50k queries and save the results - ideally not having to wait 2 weeks for the script to finish.
I did a lot of research and decided to follow your tutorial: https://www.digitalocean.com/community/tutorials/how-to-build-a-concurrent-web-scraper-with-puppeteer-node-js-docker-and-kubernetes to build a concurrent scraper by dockerizing and deploying it to a Kubernetes cluster. It’s a great resource and so and with a lot of work adapted everything to my specific situation.
However, when I tried running it, I kept getting the error connect eaddrnotavail but I couldn’t really understand how best to solve the issue. I finally got it to work by changing my client side code slightly compared to the article by using async/await. It works but it is very slow - I feel like only one request is sent to the cluster and finished and then the next one is sent off, which isn’t really concurrent and I don’t need a cluster for that!
So now I’m asking myself what is wrong or how can I improve my code to speed it up. The way I see it it can be done a lot faster but I must be missing something compared to your code which works so fast. My client side code:
let axios = require('axios')
let ldb = require('./lowdbHelper.js').LowDbHelper
let ldbHelper = new ldb()
let allAddresses = ldbHelper.getData()
const fs = require('fs');
const csv = require('csv-parser');
let fname = 'Altbau_new.csv';
let server = "http://123.456.789.000"
let podsWorkDone = []
let addressesDetails = []
let errors = []
function main() {
getDetails()
}
async function getDetails(){
let begin = Date.now()
for (let j = 0; j < allAddresses.length; j++) {
try{
let data = {
url: 'https://mein.wien.gv.at/Meine-Amtswege/richtwert?subpage=/lagezuschlag/',
addr: allAddresses[j],
commands: [{description: 'scrape', type: 'scrape'}]
}
await sendRequest(data, function (result) {
parseResult(result, begin)
})
} catch (e){
continue
}
}
}
async function sendRequest(payload, cb) {
let address = payload
try {
await axios.post(`${server}/api/addresses`, address).then(response => {
if (Object.keys(response.data).includes('error')) {
let res = {
address: address.addr,
error: response.data.error
}
cb(res)
} else {
cb(response.data)
}
})
} catch (error) {
console.log(error)
let res = {
address: address.addr,
error: error
}
cb({ res })
}
}
function parseResult(result, begin){
try {
let end = Date.now()
let timeSpent = (end - begin) / 1000 + "secs ";
if (!Object.keys(result).includes("error")) {
let wasSuccessful = Object.keys(result.zuschlag).length > 0 ? true : false
if (wasSuccessful) {
let podID = result.hostname
let podsIDs = podsWorkDone.length > 0 ? podsWorkDone.map(pod => { return Object.keys(pod)[0]}) : []
if (!podsIDs.includes(podID)) {
let podWork = {}
podWork[podID] = 1
podsWorkDone.push(podWork)
} else {
for (let pwd = 0; pwd < podsWorkDone.length; pwd++) {
if (Object.keys(podsWorkDone[pwd]).includes(podID)) {
podsWorkDone[pwd][podID] += 1
break
}
}
}
addressesDetails.push(result)
} else {
errors.push(result)
}
} else {
errors.push(result)
}
console.log('podsWorkDone', podsWorkDone, ', retrieved ' + addressesDetails.length + " addresses, ",
"took " + timeSpent + ", ", "used " + podsWorkDone.length + " pods,", " errors: " + errors.length)
ldbHelper.saveData(addressesDetails)
} catch (error) {
console.log(error)
}
}
main()
I included await at the sendRequest function and made the getDetails function async otherwise it would not work. My server side:
const express = require('express');
const bodyParser = require('body-parser')
const os = require('os');
const PORT = 5000;
const app = express();
let timeout = 1500000
app.use(bodyParser.urlencoded({ extended: true }))
app.use(bodyParser.json())
let browsers = 0
let maxNumberOfBrowsers = 5
app.get('/', (req, res) => {
console.log(os.hostname())
let response = {
msg: 'hello world',
hostname: os.hostname().toString()
}
res.send(response);
});
app.post('/api/addresses', async (req, res) => {
req.setTimeout(timeout);
try {
let data = req.body
console.log(req.body.url)
while (browsers == maxNumberOfBrowsers) {
await sleep(1000)
}
await getAddressesHandler(data).then(result => {
let response = {
msg: 'retrieved addresses ',
hostname: os.hostname(),
addr: data.addr,
zuschlag: result
}
console.log('done')
res.send(response)
})
} catch (error) {
res.send({ error: error.toString() })
}
});
async function getAddressesHandler(arg) {
let pMng = require('./puppeteerManager')
let puppeteerMng = new pMng.PuppeteerManager(arg)
browsers += 1
try {
let addresses = await puppeteerMng.getAllAddresses().then(result => {
return result
})
browsers -= 1
return addresses
} catch (error) {
browsers -= 1
console.log(error)
}
}
function sleep(ms) {
console.log(' running maximum number of browsers')
return new Promise(resolve => setTimeout(resolve, ms))
}
app.listen(PORT);
console.log(`Running on port: ${PORT}`);
many thanks in advance
This textbox defaults to using Markdown to format your answer.
You can type !ref in this text area to quickly search our full set of tutorials, documentation & marketplace offerings and insert the link!
These answers are provided by our Community. If you find them useful, show some love by clicking the heart. If you run into issues leave a comment, or add your own answer to help others.
Using
await
before thesendRequest()
function is a big no because when you useawait
the code will be forced to wait for thesendRequest()
function to finishing executing. You mentioned that you have more than 50K queries, so maybe the client can’t send that many requests at once. So my advice would be to remove theawait
in thesendRequest()
function and only send 400 requests at a time to the server, if that succeeds try increasing the number of requests that the client makes to see what is the maximum number of requests that the client can make. Also, check this question I found on StackOverflow , it might help you with the EADDRNOTAVAIL issue. You can reach me at sam.b.russian@gmail.com if you need more help.