โ† Back

Website crawler (Webcrawler)

Crawler ID: webcrawler/html

The Webcrawler explores web pages by following links, extracting clean text for applications like AI model training, content aggregation, and market research.

$2.00 / 1,000 requests
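At this rate each request costs $0.002, so (assuming one request per crawled page) the 3-page synchronous example below costs $0.006 and the 20-page asynchronous example costs about $0.04.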

Website crawler (Webcrawler) JavaScript integration

Install dependency

npm i webcrawlerapi-js

How to get an access key?

Read the Access Key section of the docs to obtain a key.
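
The examples below hardcode the key for brevity. In practice you might read it from an environment variable instead; a minimal sketch (the WEBCRAWLERAPI_KEY variable name is our own convention, not something the library requires):

import webcrawlerapi from "webcrawlerapi-js";

// WEBCRAWLERAPI_KEY is a hypothetical variable name; any name works.
const apiKey = process.env.WEBCRAWLERAPI_KEY;
if (!apiKey) {
    throw new Error("Set WEBCRAWLERAPI_KEY before running");
}
const client = new webcrawlerapi.WebcrawlerClient(apiKey);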

Usage

Synchronous usage, waiting until all items are done:

import webcrawlerapi from "webcrawlerapi-js";

async function main() {
    const client = new webcrawlerapi.WebcrawlerClient(
        "YOUR API ACCESS KEY HERE",
    )

    // crawl() waits until the whole job has finished and returns the result.
    const response = await client.crawl({
        "items_limit": 3,                      // stop after crawling 3 pages
        "url": "https://books.toscrape.com/",  // starting URL to crawl from
        "scrape_type": "markdown"              // extract page content as Markdown
    })
    console.log(response)
}

main().catch(console.error);
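
The shape of the synchronous result is not spelled out here; assuming it matches the job object that getJob returns in the async example below (a job_items array whose entries carry status, original_url, and markdown_content_url), you could list the crawled pages like this:

// Assumption: the synchronous result has the same job shape as getJob() returns.
response.job_items.forEach((item) => {
    console.log(item.original_url, "->", item.markdown_content_url)
})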

Asynchronous usage, starting a job and polling for its status:

import webcrawlerapi from "webcrawlerapi-js";

async function main() {
    const client = new webcrawlerapi.WebcrawlerClient(
        "YOUR API ACCESS KEY HERE"
    )

    // crawlAsync() returns immediately with a job id instead of waiting for completion.
    const response = await client.crawlAsync({
        "items_limit": 20,
        "url": "https://books.toscrape.com/",
        "scrape_type": "markdown"
    })
    console.log("Job id: " + response.id)
    console.log(`Job Dashboard link: https://dash.webcrawlerapi.com/jobs/job/${response.id}`)

    // Poll the job every 2 seconds (up to 100 attempts) until all items are done.
    for (let i = 0; i < 100; i++) {
        const job = await client.getJob(response.id)
        const doneItemsCount = job.job_items.filter(item => item.status === "done").length

        if (doneItemsCount === job.items_limit) {
            console.log("All items are done")
            job.job_items.forEach((item) => {
                console.log(item.original_url)
                console.log("\t", item.markdown_content_url)
            })
            break
        }

        console.log(`Crawled ${doneItemsCount} out of ${job.items_limit} items`)
        await new Promise(r => setTimeout(r, 2000))
    }
}

main().catch(console.error);
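
Each markdown_content_url points to the extracted content for one page. A minimal sketch for downloading one of them with the standard fetch API (built into Node.js 18+); downloadFirstItem is a hypothetical helper, and the job id is assumed to come from a crawlAsync call like the one above:

import webcrawlerapi from "webcrawlerapi-js";

// Hypothetical helper: fetch the Markdown content of the first finished item.
async function downloadFirstItem(jobId) {
    const client = new webcrawlerapi.WebcrawlerClient("YOUR API ACCESS KEY HERE")
    const job = await client.getJob(jobId)

    // Pick the first item whose status is "done", if any.
    const firstDone = job.job_items.find(item => item.status === "done")
    if (!firstDone) return

    const res = await fetch(firstDone.markdown_content_url)  // fetch is global in Node.js 18+
    const markdown = await res.text()
    console.log(markdown.slice(0, 200))  // preview the first 200 characters
}

downloadFirstItem("YOUR JOB ID HERE").catch(console.error);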