😎 nest-crawler 😎
Crawler and Scraper Module for NestJS
Installation
$ npm install --save nest-crawler
Usage
First, register it in the application module so that Nest can handle dependencies:
import { Module } from '@nestjs/common';import { NestCrawlerModule } from 'nest-crawler'; @Module({ imports: [ NestCrawlerModule, ],})export class AppModule {}
Then, just import it and use it:
crawler.module.ts
import { Module } from '@nestjs/common';import { NestCrawlerModule } from 'nest-crawler';@Module({ imports: [ NestCrawlerModule, ],})export class CrawlerModule {}
crawler.service.ts
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} // scraping the specific page public async scrape(): Promise<void> { interface ExampleCom { title: string; info: string; content: string; } const data: ExampleCom = await this.crawler.fetch({ target: 'http://example.com', fetch: { title: 'h1', info: { selector: 'p > a', attr: 'href', }, content: { selector: '.content', how: 'html', }, }, }); console.log(data); // { // title: 'Example Domain', // info: 'http://www.iana.org/domains/example', // content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>' // } } // crawling multi pages is also supported public async crawl(): Promise<void> { interface HackerNewsPage { title: string; } const pages: HackerNewsPage[] = await this.crawler.fetch({ target: { url: 'https://news.ycombinator.com', iterator: { selector: 'span.age > a', convert: (x: string) => `https://news.ycombinator.com/${x}`, }, }, fetch: (data: any, index: number, url: string) => ({ title: '.title > a', }), }); console.log(pages); // [ // { title: 'Post Title 1' }, // { title: 'Post Title 2' }, // ... // ... // { title: 'Post Title 30' } // ] }}
Recipe
Single Page Scraping
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async scrape(): Promise<void> { interface ExampleCom { title: string; info: string; content: string; } const data: ExampleCom = await this.crawler.fetch({ target: 'http://example.com', fetch: { title: 'h1', info: { selector: 'p > a', attr: 'href', }, content: { selector: '.content', how: 'html', } }, }); console.log(data); // { // title: 'Example Domain', // info: 'http://www.iana.org/domains/example', // content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>' // } }}
Multi Pages Crawling
You Already Know the Target URLs
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async crawl(): Promise<void> { interface Site { title: string; } const sites: Site[] = await this.crawler.fetch({ target: [ 'https://example1.com', 'https://example2.com', 'https://example3.com', ], fetch: (data: any, index: number, url: string) => ({ title: 'h1', }), }); console.log(sites); // [ // { title: 'An easiest crawling and scraping module for NestJS' }, // { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' }, // { title: '[Experimental] React SSR as a view template engine' } // ] }}
You Don't Know the Target URLs, so You Want to Crawl Dynamically
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async crawl(): Promise<void> { interface Page { title: string; } const pages: Page[] = await this.crawler.fetch({ target: { url: 'https://news.ycombinator.com', iterator: { selector: 'span.age > a', convert: (x: string) => `https://news.ycombinator.com/${x}`, }, }, // fetch each `https://news.ycombinator.com/${x}` and scrape data fetch: (data: any, index: number, url: string) => ({ title: '.title > a', }), }); console.log(pages); // [ // { title: 'Post Title 1' }, // { title: 'Post Title 2' }, // ... // ... // { title: 'Post Title 30' } // ] }}
You Need to Pass Data Dynamically
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async crawl(): Promise<void> { interface Img { src: string; } const images: Img[] = await this.crawler.fetch({ target: { url: 'https://some.image.com', iterator: { selector: 'span.age > a', convert: (x: string) => `https://some.image.com${x}`, }, fetch: { imageIds: { listItem: 'div.image', data: { id: { selector: 'div.image-wrapper', attr: 'data-image-id', }, }, }, }, }, // fetch each `https://some.image.com${x}`, pass data and scrape data fetch: (data: any, index: number, url: string) => ({ src: { convert: () => `https://some.image.com/images/${data.imageIds[index]}.png`, }, }), }); console.log(images); // [ // { src: 'https://some.image.com/images/1.png' }, // { src: 'https://some.image.com/images/2.png' }, // ... // ... // { src: 'https://some.image.com/images/100.png' } // ] }}
Waitable (by using puppeteer)
import { Injectable } from '@nestjs/common';import { NestCrawlerService } from 'nest-crawler'; @Injectable()export class CrawlerService { constructor( private readonly crawler: NestCrawlerService, ) {} public async crawl(): Promise<void> { interface Page { title: string; } const pages: Page[] = await this.crawler.fetch({ target: { url: 'https://news.ycombinator.com', iterator: { selector: 'span.age > a', convert: (x: string) => `https://news.ycombinator.com/${x}`, }, }, waitFor: 3 * 1000, // wait for the content loaded! (like single page apps) fetch: (data: any, index: number, url: string) => ({ title: '.title > a', }), }); console.log(pages); // [ // { title: 'Post Title 1' }, // { title: 'Post Title 2' }, // ... // ... // { title: 'Post Title 30' } // ] }}