Integrate Scrapoxy with Node.js¶
Goal¶
Is it easy to find a good JavaScript developer in Paris? No!
So it’s time to build a scraper with Node.js, Request and Cheerio to find our perfect profile.
The site Scraping Challenge indexes a lot of profiles (fake, for demo purposes). We want to list them.
However, the site is protected against scraping! We must use Scrapoxy to bypass the protection.
Step 1: Create a Node.js project¶
Create a new project¶
Create a directory for the project:
mkdir nodejs-request
cd nodejs-request
Create the package.json:
npm init --yes
Add dependencies:
npm install lodash bluebird cheerio request winston@2.x --save
What are these dependencies?
- lodash is a JavaScript utility library,
- bluebird is a promise library,
- cheerio is a jQuery-like HTML parser,
- request makes HTTP requests,
- winston is a logger.
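After the install, the dependencies section of package.json should list the five packages, roughly like this (a sketch: the version ranges are illustrative and will vary on your machine, except winston, which is pinned to 2.x by the command above):
"dependencies": {
    "bluebird": "3.x",
    "cheerio": "0.x",
    "lodash": "4.x",
    "request": "2.x",
    "winston": "2.x"
}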
Add a scraper¶
Add this content to index.js:
const _ = require('lodash'),
    Promise = require('bluebird'),
    cheerio = require('cheerio'),
    request = require('request'),
    winston = require('winston');

winston.level = 'debug';

const config = {
    // URL of the site
    source: 'http://scraping-challenge-2.herokuapp.com',

    opts: {
    },
};
// Get all URLs
getProfilesUrls(config.source, config.opts)
    .then((urls) => {
        winston.info('Found %d profiles', urls.length);
        winston.info('Wait 120 seconds to scale instances');

        return urls;
    })
    // Wait 2 minutes to scale instances
    .delay(2 * 60 * 1000)
    .then((urls) => {
        // Get profiles one by one
        return Promise.map(urls,
            (url) => getProfile(url, config.opts)
                .then((profile) => {
                    winston.debug('Found %s', profile.name);

                    return profile;
                })
                .catch(() => {
                    winston.debug('Cannot retrieve %s', url);
                }),
            {concurrency: 1})
            .then((profiles) => {
                const results = _.compact(profiles);

                winston.info('Extracted %d of %d profiles', results.length, urls.length);
            });
    })
    .catch((err) => winston.error('Error: ', err));
////////////

/**
 * Get all the URLs of the profiles
 * @param url Main URL
 * @param defaultOpts options for the HTTP request
 * @returns {promise}
 */
function getProfilesUrls(url, defaultOpts) {
    return new Promise((resolve, reject) => {
        // Create options for the HTTP request:
        // add the URL to the default options
        const opts = _.merge({}, defaultOpts, {url});

        request(opts, (err, res, body) => {
            if (err) {
                return reject(err);
            }

            if (res.statusCode !== 200) {
                return reject(body);
            }

            // Load content into a jQuery-like parser
            const $ = cheerio.load(body);

            // Extract all profile URLs and make them absolute
            const urls = $('.profile a')
                .map((i, el) => $(el).attr('href'))
                .get()
                .map((url) => `${config.source}${url}`);

            resolve(urls);
        });
    });
}
/**
 * Get the profile and extract the name
 * @param url URL of the profile
 * @param defaultOpts options for the HTTP request
 * @returns {promise}
 */
function getProfile(url, defaultOpts) {
    return new Promise((resolve, reject) => {
        // Create options for the HTTP request:
        // add the URL to the default options
        const opts = _.merge({}, defaultOpts, {url});

        request(opts, (err, res, body) => {
            if (err) {
                return reject(err);
            }

            if (res.statusCode !== 200) {
                return reject(body);
            }

            // Load content into a jQuery-like parser
            const $ = cheerio.load(body);

            // Extract the name
            const name = $('.profile-info-name').text();

            resolve({name});
        });
    });
}
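Note the {concurrency: 1} option: profiles are fetched one by one. Once Scrapoxy runs with several instances (Step 2 below), you could raise the value to spread requests across them. A minimal sketch, assuming 6 instances are available:
// Hypothetical tweak: fetch several profiles in parallel.
// Keep the concurrency at or below the number of Scrapoxy instances.
return Promise.map(urls,
    (url) => getProfile(url, config.opts),
    {concurrency: 5});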
Run the script¶
Let’s try our new scraper!
Run this command:
node index.js
The script scrapes the site and lists the profiles.
However, Scraping Challenge is protected! All requests fail…
We will integrate Scrapoxy to bypass the protection.
Step 2: Integrate Scrapoxy into the script¶
Install Scrapoxy¶
See Quick Start to install Scrapoxy.
Start Scrapoxy¶
Set the maximum number of instances to 6, and start Scrapoxy (see Change scaling with GUI).
Warning
Don’t forget to set the maximum number of instances!
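If you prefer the configuration file to the GUI, the same limit lives in Scrapoxy’s conf.json. A sketch of the relevant fragment (assuming the standard key layout; the commander, providers and other sections are omitted here):
{
    "instance": {
        "scaling": {
            "min": 1,
            "max": 6
        }
    }
}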
Edit the script¶
Open index.js and modify the config value:
const config = {
    // URL of the site
    source: 'http://scraping-challenge-2.herokuapp.com',

    opts: {
        // URL of Scrapoxy
        proxy: 'http://localhost:8888',

        // HTTPS over HTTP
        tunnel: false,
    },
};
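With tunnel set to false, request sends traffic to the proxy as plain HTTP instead of opening a CONNECT tunnel, which lets Scrapoxy read and route each request. To check that the proxy is reachable before launching the full scrape, here is a minimal standalone sketch (it assumes Scrapoxy’s x-cache-proxyname response header, which reports the instance that answered; drop that line if your version differs):
const request = require('request');

// One-off request through Scrapoxy to verify connectivity
request({
    url: 'http://scraping-challenge-2.herokuapp.com',
    proxy: 'http://localhost:8888',
    tunnel: false,
}, (err, res) => {
    if (err) {
        return console.error('Proxy request failed:', err.message);
    }

    console.log('Status: %d', res.statusCode);
    console.log('Instance: %s', res.headers['x-cache-proxyname']);
});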