A tiny web crawler in under 1,000 lines of code.
[![Maintenance Status][status-image]][status-url]

[status-image]: https://img.shields.io/badge/status-maintained-brightgreen.svg
[status-url]: https://github.com/bfwg/node-tinycrawler
// Example: create a crawler, optionally configure it, then start crawling.
const Crawler = require('../lib/crawler');

const crawler = new Crawler('http://www.someUrl.com');
// Optional settings (see the options descriptions below):
// crawler.maxDepth = 4;              // maximum depth the crawler may reach
// crawler.crawlInterval = 10;        // interval for dispatching crawlers
// crawler.maxListenerCurrency = 10;  // how many crawlers run at the same time
// crawler.redisQueue = true;         // use a Redis-backed queue instead of in-memory
crawler.start();
Event lifecycle: `start` => `fetcherror` | `timeout` => `fetchcomplete` | `fetchredirect` => `complete`
// Register handlers for the crawler's lifecycle events.

// Emitted once when crawling begins.
crawler.on('start', () => {
  console.log('Start crawling');
});

// Emitted when a fetch fails; the second argument carries the HTTP response
// (only `statusCode` is read here).
crawler.on('fetcherror', (queueItem, response) => {
  console.log(response.statusCode);
});

// Emitted when a request exceeds the configured timeout.
crawler.on('timeout', () => {
  console.log('timeout');
});

// Emitted when a fetched URL responds with a redirect to `targetUrl`.
crawler.on('fetchredirect', (queueItem, targetUrl) => {
  console.log('REDIRECTED', queueItem.url, 'to', targetUrl);
});

// Emitted when a resource has been fully downloaded.
crawler.on('fetchcomplete', (queueItem, buffer) => {
  // Do whatever you want with queueItem or buffer
  console.log('finished fetching', queueItem.url);
});
The maximum depth the crawler is allowed to reach.
The interval for dispatching crawlers.
How many crawlers are dispatched at the same time.
For bigger websites, the default in-memory queue might not be enough. Install Redis first and configure tiny-crawler to use the Redis queue.