😎 nest-crawler 😎
Crawler and Scraper Module for NestJS
Installation
$ npm install --save nest-crawler
Usage
First, register it in the application module so that Nest can handle dependencies:
import { Module } from '@nestjs/common';import { NestCrawlerModule } from 'nest-crawler'; @Module({ imports: [ NestCrawlerModule, ],})export class AppModule {}
Then, just import it and use it:
crawler.module.ts
import { Module } from '@nestjs/common';import { NestCrawlerModule } from 'nest-crawler';@Module({ imports: [ NestCrawlerModule, ],})export class CrawlerModule {}
crawler.service.ts
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} // scraping the specific page public async scrape(): Promise<void> { interface ExampleCom { title: string; info: string; content: string; } const data: ExampleCom = await this.crawler.fetch({ target: 'http://example.com', fetch: { title: 'h1', info: { selector: 'p > a', attr: 'href', }, content: { selector: '.content', how: 'html', }, }, }); console.log(data); // { // title: 'Example Domain', // info: 'http://www.iana.org/domains/example', // content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>' // } } // crawling multi pages is also supported public async crawl(): Promise<void> { interface HackerNewsPage { title: string; } const pages: HackerNewsPage[] = await this.crawler.fetch({ target: { url: 'https://news.ycombinator.com', iterator: { selector: 'span.age > a', convert: (x: string) => `https://news.ycombinator.com/${x}`, }, }, fetch: (data: any, index: number, url: string) => ({ title: '.title > a', }), }); console.log(pages); // [ // { title: 'Post Title 1' }, // { title: 'Post Title 2' }, // ... // ... // { title: 'Post Title 30' } // ] }}
Recipe
Single Page Scraping
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async scrape(): Promise<void> { interface ExampleCom { title: string; info: string; content: string; } const data: ExampleCom = await this.crawler.fetch({ target: 'http://example.com', fetch: { title: 'h1', info: { selector: 'p > a', attr: 'href', }, content: { selector: '.content', how: 'html', } }, }); console.log(data); // { // title: 'Example Domain', // info: 'http://www.iana.org/domains/example', // content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>' // } }}
Multi Pages Crawling
You Already Know the Target URLs
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async crawl(): Promise<void> { interface Site { title: string; } const sites: Site[] = await this.crawler.fetch({ target: [ 'https://example1.com', 'https://example2.com', 'https://example3.com', ], fetch: (data: any, index: number, url: string) => ({ title: 'h1', }), }); console.log(sites); // [ // { title: 'An easiest crawling and scraping module for NestJS' }, // { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' }, // { title: '[Experimental] React SSR as a view template engine' } // ] }}
You Don't Know the Target URLs, so You Want to Crawl Dynamically
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async crawl(): Promise<void> { interface Page { title: string; } const pages: Page[] = await this.crawler.fetch({ target: { url: 'https://news.ycombinator.com', iterator: { selector: 'span.age > a', convert: (x: string) => `https://news.ycombinator.com/${x}`, }, }, // fetch each `https://news.ycombinator.com/${x}` and scrape data fetch: (data: any, index: number, url: string) => ({ title: '.title > a', }), }); console.log(pages); // [ // { title: 'Post Title 1' }, // { title: 'Post Title 2' }, // ... // ... // { title: 'Post Title 30' } // ] }}
You Need to Pass Data Dynamically
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async crawl(): Promise<void> { interface Img { src: string; } const images: Img[] = await this.crawler.fetch({ target: { url: 'https://some.image.com', iterator: { selector: 'span.age > a', convert: (x: string) => `https://some.image.com${x}`, }, fetch: { imageIds: { listItem: 'div.image', data: { id: { selector: 'div.image-wrapper', attr: 'data-image-id', }, }, }, }, }, // fetch each `https://some.image.com${x}`, pass data and scrape data fetch: (data: any, index: number, url: string) => ({ src: { convert: () => `https://some.image.com/images/${data.imageIds[index]}.png`, }, }), }); console.log(images); // [ // { src: 'https://some.image.com/images/1.png' }, // { src: 'https://some.image.com/images/2.png' }, // ... // ... // { src: 'https://some.image.com/images/100.png' } // ] }}
Waitable (by using puppeteer)
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async crawl(): Promise<void> { interface Page { title: string; } const pages: Page[] = await this.crawler.fetch({ target: { url: 'https://news.ycombinator.com', iterator: { selector: 'span.age > a', convert: (x: string) => `https://news.ycombinator.com/${x}`, }, }, waitFor: 3 * 1000, // wait for the content loaded! (like single page apps) fetch: (data: any, index: number, url: string) => ({ title: '.title > a', }), }); console.log(pages); // [ // { title: 'Post Title 1' }, // { title: 'Post Title 2' }, // ... // ... // { title: 'Post Title 30' } // ] }}