diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 4355321581d6..d024949bdea4 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -2,22 +2,24 @@ import { dirname } from 'node:path'; import type { Log } from '@apify/log'; import defaultLog, { LogLevel } from '@apify/log'; -import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout'; +import { TimeoutError, addTimeoutToPromise, tryCancel } from '@apify/timeout'; import { cryptoRandomObjectId } from '@apify/utilities'; import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, CrawlingContext, + DatasetExportOptions, EnqueueLinksOptions, EventManager, - DatasetExportOptions, FinalStatistics, GetUserDataFromRequest, IRequestList, + LoadedContext, ProxyInfo, Request, RequestOptions, + RestrictedCrawlingContext, RouterHandler, RouterRoutes, Session, @@ -25,30 +27,29 @@ import type { Source, StatisticState, StatisticsOptions, - LoadedContext, - RestrictedCrawlingContext, } from '@crawlee/core'; import { AutoscaledPool, Configuration, CriticalError, Dataset, - enqueueLinks, EnqueueStrategy, EventType, KeyValueStore, - mergeCookies, + Monitor, NonRetryableError, - purgeDefaultStorages, RequestProvider, - RequestQueueV1, RequestQueue, + RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, + enqueueLinks, + mergeCookies, + purgeDefaultStorages, validators, } from '@crawlee/core'; import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types'; @@ -56,7 +57,7 @@ import { ROTATE_PROXY_ERRORS, gotScraping } from '@crawlee/utils'; import { stringify } from 'csv-stringify/sync'; import { ensureDir, writeFile, writeJSON } from 'fs-extra'; // @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import 
types, so its alllll gooooood -import type { OptionsInit, Method } from 'got-scraping'; +import type { Method, OptionsInit } from 'got-scraping'; import ow, { ArgumentError } from 'ow'; import { getDomain } from 'tldts'; import type { SetRequired } from 'type-fest'; @@ -351,6 +352,12 @@ export interface BasicCrawlerOptions> = {}; @@ -542,6 +550,7 @@ export class BasicCrawler { + await this.display(); + }, interval); + } + + stop() { + if (this.intervalId) { + clearInterval(this.intervalId); + this.intervalId = null; + } + } + + private async display() { + const stats = this.statistics.calculate(); + const now = new Date(); + const startTime = this.statistics.state.crawlerStartedAt; + const elapsedTime = now.getTime() - new Date(startTime!).getTime(); + const cpuLoad = os.loadavg()[0]; + const memLoad = (os.totalmem() - os.freemem()) / os.totalmem(); + const { requestsFinished } = this.statistics.state; + const assumedTotalCount = this.requestQueue?.assumedTotalCount ?? 0; + + if (!this.monitorDisplay) { + throw new Error('Start the monitor first'); + } + + this.monitorDisplay.log(`Start: ${startTime ? formatDateTime(new Date(startTime)) : undefined}`); + this.monitorDisplay.log(`Now: ${formatDateTime(now)} (running for ${elapsedTime / 1000}s)`); + this.monitorDisplay.log( + `Progress: ${requestsFinished} / ${assumedTotalCount} (${((requestsFinished / assumedTotalCount) * 100).toFixed(2)}%), failed: ${this.statistics.state.requestsFailed} (${((this.statistics.state.requestsFailed / assumedTotalCount) * 100).toFixed(2)}%)`, + ); + this.monitorDisplay.log( + `Remaining: ${this.estimateRemainingTime(stats)} seconds (${(stats.requestsFinishedPerMinute / 60).toFixed(2)} pages/seconds)`, + ); + this.monitorDisplay.log(`Sys. 
/**
 * ANSI "erase in line" control sequence. Written at the start of a row so that
 * leftovers from the previously drawn frame are cleared before new text is printed.
 */
const CLEAR_LINE = '\x1B[K';

/**
 * Minimal in-place terminal renderer.
 *
 * A frame is produced by one or more `log()` calls followed by `resetCursor()`,
 * which moves the cursor back up to the first row of the frame so the next frame
 * is drawn over the previous one. `close()` moves the cursor back below the last
 * finished frame so that it stays visible.
 *
 * Rows are not truncated to the terminal width.
 * NOTE(review): a row that wraps in the terminal presumably occupies more rows than
 * are counted here, which would make `resetCursor()` move up too little — confirm
 * whether callers can produce rows that long.
 */
class MonitorDisplay {
    /** Row count of the most recently finished frame; consumed by `close()`. */
    private lastLinesCount = 0;

    /** Rows written since the last `resetCursor()`. */
    private linesCount = 0;

    /**
     * Appends `str` to the current frame, clearing every row before drawing it.
     *
     * @param str Text to print. Embedded `\n` characters are emitted (and counted)
     * as separate rows so that the cursor bookkeeping stays correct.
     */
    public log(str: string): void {
        // Every frame starts with an empty row so that any console.log calls
        // made by the script itself end up above our output.
        if (this.linesCount === 0) {
            // eslint-disable-next-line no-console
            console.log(CLEAR_LINE);
            this.linesCount += 1;
        }

        // One console.log per row: each row gets its own CLEAR_LINE and is counted once.
        for (const row of str.split('\n')) {
            // eslint-disable-next-line no-console
            console.log(`${CLEAR_LINE}${row}`);
            this.linesCount += 1;
        }
    }

    /**
     * Finishes the current frame: moves the cursor up to the frame's first row so
     * the next frame is drawn over our output. Does nothing when no row was written.
     */
    public resetCursor(): void {
        // Never send a zero count: VT100/xterm treat `ESC[0A` like `ESC[1A`,
        // which would move the cursor onto a row we do not own.
        if (this.linesCount === 0) {
            return;
        }

        process.stdout.write(`\x1B[${this.linesCount}A`);
        this.lastLinesCount = this.linesCount;
        this.linesCount = 0;
    }

    /**
     * Moves the cursor down past the last finished frame so that the console output
     * stays visible. Does nothing when there is no finished frame, and is safe to
     * call repeatedly.
     */
    public close(): void {
        // Same zero-count guard as in resetCursor().
        if (this.lastLinesCount === 0) {
            return;
        }

        process.stdout.write(`\x1B[${this.lastLinesCount}B`);
        // The cursor is below the frame now; a second close() must not move it again.
        this.lastLinesCount = 0;
    }
}

/**
 * Formats a point in time as `YYYY-MM-DD HH:mm:ss.SSS` using the local time zone.
 *
 * @param datetime A `Date` instance, or a numeric timestamp that is passed to `new Date()`.
 * @returns The formatted date-time string.
 */
function formatDateTime(datetime: Date | number): string {
    const date = typeof datetime === 'number' ? new Date(datetime) : datetime;

    // getMonth() is zero-based, hence the + 1.
    const dateStr = [date.getFullYear(), padDate(date.getMonth() + 1, 2), padDate(date.getDate(), 2)].join('-');
    const timeStr = [date.getHours(), date.getMinutes(), date.getSeconds()]
        .map((part) => padDate(part, 2))
        .join(':');

    return `${dateStr} ${timeStr}.${padDate(date.getMilliseconds(), 3)}`;
}

/**
 * Left-pads `value` with zeroes up to `num` characters.
 *
 * @param value Number or string to pad; converted with `toString()`.
 * @param num Minimum length of the result. Longer values are returned unchanged.
 * @returns The zero-padded string.
 */
function padDate(value: number | string, num: number): string {
    return value.toString().padStart(num, '0');
}