Crawleando sites com NodeJS

1
CRAWLEANDO SITES COM
NODEJS
JSDAY CAMPINA GRANDE 2016
Allisson Azevedo

2
ALLISSON AZEVEDO
allissonazevedo.com
youtube.com/user/allissonazevedo
github.com/allisson
twitter.com/allisson
allisson.github.io/slides/
allisson@gmail.com

3
OBJETIVO
Crawlear o site TudoGostoso
Obter as receitas com imagem
Indexar no Elasticsearch
Exibir os resultados com o ExpressJS
http://www.tudogostoso.com.br

4
WEB CRAWLER / ROBOT / SPIDER
Um programa que navega por toda a rede de maneira
automática
Googlebot, BingBot, Yahoo! Slurp, Baiduspider
Opção quando não houver acesso aos dados via Web API

5
FUNCIONAMENTO
1. Carrega url
2. Parser do conteúdo
3. Carrega novas urls a partir dos links da atual

6
FERRAMENTAS
npm install request
npm install cheerio
npm install simplecrawler

7
SPEAKERS DO JSDAY
'use strict';
// Fetches the JSDay speakers page, collects one record per `.people-modal`
// element, and prints the resulting list as pretty-printed JSON.
const request = require('request');
const cheerio = require('cheerio');
request('http://jsday.com.br/speakers/', (error, response, body) => {
// Parse the downloaded HTML so it can be queried with CSS selectors.
// NOTE(review): `error` is not checked before `body` is used.
let $ = cheerio.load(body);
let speakers = [];
$('.people-modal').each((i, element) => {
let speaker = {};
speaker.title = $(element).find('h4').text();
speaker.description = $(element).find('.theme-description').text();
// Uses only the first child node of `.name` (its `data`, trimmed) rather than
// the full text of the element.
speaker.name = $(element).find('.name').contents()[0].data.trim();
speaker.about = $(element).find('.about').text();
// NOTE(review): the next line is cut off in this copy of the source (it ends
// in the middle of a `.replace` call) — restore the full statement from the
// original before running this script.
speaker.image = $(element).find('.people-img').css('background-image').replac
speakers.push(speaker);
});
console.log(JSON.stringify(speakers, null, 2));
});

8
PROGRAMAÇÃO DO JSDAY
'use strict';

// Downloads the JSDay schedule page, extracts every sub-event time slot and
// prints the list as pretty-printed JSON.
// `request` and `cheerio` are not required inside this block; they must
// already be in scope.
request('http://jsday.com.br/schedule/', (error, response, body) => {
  // On a failed request there is no reliable body to parse, so report the
  // failure and stop instead of parsing whatever was passed as `body`.
  if (error) {
    console.error(error);
    return;
  }

  const $ = cheerio.load(body);
  const subEvents = [];

  $('.timeslot[itemtype="http://schema.org/subEvent"]').each((i, element) => {
    const slot = $(element);
    subEvents.push({
      title: slot.find('.slot-title').text(),
      // The start time is read from the `datetime` attribute, not the text.
      time: slot.find('.start-time').attr('datetime'),
    });
  });

  console.log(JSON.stringify(subEvents, null, 2));
});

9
CRAWLER SIMPLES
'use strict';
// URL the crawl is configured with (presumably the first page fetched —
// the code that uses it is not visible here).
const startUrl = 'https://allissonazevedo.com/';
// Regular expression built directly from `startUrl`.
// NOTE(review): the dots in the URL are not escaped, so inside the pattern
// they match any character — confirm whether a literal match was intended.
const hostname = new RegExp(startUrl);
// Presumably the set of URLs already seen and the list of collected pages;
// their usage is outside the visible part of this script — verify.
let urlSet = new Set();
let pages = [];
/**
 * Strips the query string and the fragment from a URL string.
 *
 * Everything from the first `?` onwards is dropped, then everything from the
 * first `#` onwards in what remains.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} The URL without query string and fragment.
 */
function normalizeUrl(url) {
  const [withoutQuery] = url.split('?');
  const [withoutFragment] = withoutQuery.split('#');
  return withoutFragment;
}
function verifyUrl(url) {
if (
url.match(/.xml$/i) ||
url.match(//feed/$/i) ||
url.match(//amp/$/i)
) {

10
SIMPLECRAWLER
'use strict';
// Creates a simplecrawler instance for allissonazevedo.com and tunes it
// before the crawl starts.
const Crawler = require('simplecrawler');
const myCrawler = new Crawler('allissonazevedo.com');
// Presumably filled with the crawled pages; usage is not visible here.
let pages = [];
// Per the simplecrawler docs: request spool-up interval in milliseconds,
// maximum number of simultaneous requests, and whether query strings are
// stripped from queued URLs — TODO confirm against the installed version.
myCrawler.interval = 100;
myCrawler.maxConcurrency = 16;
myCrawler.stripQuerystring = true;
const verifyUrl = myCrawler.addFetchCondition((parsedURL, queueItem) => {
if (
parsedURL.path.match(/.xml$/i) ||
parsedURL.path.match(//feed/$/i) ||
parsedURL.path.match(//amp/$/i)
) {
return false;
}

11
CRAWLEANDO O TUDOGOSTOSO
1. Verificar as urls que são carregadas
2. Identificar as urls que são receitas
3. Parser e indexação no Elasticsearch

12
VERIFICANDO URLS
'use strict';

// Crawls www.tudogostoso.com.br and prints the URL of every resource whose
// fetch completes, to see which URLs the crawler actually loads.
// `Crawler` is not required inside this block; it must already be in scope.
const myCrawler = new Crawler('www.tudogostoso.com.br');

/**
 * `fetchcomplete` handler: prints the URL of the fetched queue item.
 *
 * @param {{url: string}} queueItem - Queue item passed by the crawler.
 */
function logFetchedUrl(queueItem) {
  console.log(queueItem.url);
}

myCrawler.on('fetchcomplete', logFetchedUrl);
myCrawler.start();

13
URLS QUE DEVEMOS EVITAR
http://www.tudogostoso.com.br/favicon-v2.1.ico
http://www.tudogostoso.com.br/app/assets/stylesheets/ie.css
http://www.tudogostoso.com.br/images/layout/logo-v4.png
http://www.tudogostoso.com.br/assets/layout/blank.gif
http://www.tudogostoso.com.br/imagens/renew/footer-bg.jpg
http://www.tudogostoso.com.br/dicas/10-pontos-do-brigadeiro/print
http://www.tudogostoso.com.br/receita/print_recipe.php?recipe_id=2721
http://www.tudogostoso.com.br/receita/4683/comentarios.js

14
EVITANDO O DOWNLOAD DE URLS
'use strict';
if (
parsedURL.path.match(/.ico$/i) ||
parsedURL.path.match(/.css$/i) ||
parsedURL.path.match(/.png$/i) ||
parsedURL.path.match(/.gif$/i) ||
parsedURL.path.match(/.jpg$/i) ||
parsedURL.path.match(/.js$/i) ||
parsedURL.path.match(/print_recipe.php/i) ||
parsedURL.path.match(//print$/i)

15
URLS DE RECEITAS
http://www.tudogostoso.com.br/receita/76147-anchova-assada.html
'use strict';

// Matches recipe page URLs such as /receita/76147-anchova-assada.html:
// "/receita/", a numeric id, a hyphen, a slug made of word characters and
// hyphens, and a literal ".html" at the end of the string.
// The slashes and the dot are escaped and the slug class is `\w`; without the
// backslashes the literal starts with `//` and is parsed as a line comment.
const re = /\/receita\/([0-9]+)-([\w-]+)\.html$/i;

// Sample URLs: the home page, a category listing and a recipe page.
const urls = [
  'http://www.tudogostoso.com.br/',
  'http://www.tudogostoso.com.br/categorias/bolos-e-tortas-doces.php',
  'http://www.tudogostoso.com.br/receita/179236-petit-gateau-de-nutella-perfeito.html',
];

// Report, for each sample URL, whether it looks like a recipe page.
for (const url of urls) {
  if (re.test(url)) {
    console.log(`${url} é uma receita.`);
  } else {
    console.log(`${url} não é uma receita`);
  }
}

16
PARSER DA RECEITA
'use strict';
if (
parsedURL.path.match(/.css$/i) ||
parsedURL.path.match(/.png$/i) ||
parsedURL.path.match(/.gif$/i) ||
parsedURL.path.match(/.jpg$/i) ||
parsedURL.path.match(/.js$/i) ||

17
INDEXANDO NO ELASTICSEARCH
'use strict';
// Creates the Elasticsearch client, pointed at localhost:9200 with the `log`
// option set to 'trace'.
// NOTE(review): 'trace' is presumably the most verbose log level — consider
// lowering it outside of demos; confirm against the client documentation.
const elasticsearch = require('elasticsearch');
const client = new elasticsearch.Client({
host: 'localhost:9200',
log: 'trace'
});
if (

Crawleando sites com NodeJS

Recommended

Recommended

More Related Content

More from Allisson Azevedo

More from Allisson Azevedo (9)

Recently uploaded

Recently uploaded (6)

Crawleando sites com NodeJS