diff --git a/packages/crawler/config/crawler.php b/packages/crawler/config/crawler.php
index 559913ce..23f30f76 100644
--- a/packages/crawler/config/crawler.php
+++ b/packages/crawler/config/crawler.php
@@ -7,4 +7,134 @@
     'connect_timeout' => env('CRAWLER_CONNECT_TIMEOUT', 2),
     'crawls_per_minute' => (int) env('CRAWLER_CRAWLS_PER_MINUTE', 500),
+
+    /*
+     * Preset URL blacklist patterns, grouped by platform. Each entry is a
+     * complete PCRE pattern (delimiters and flags included); most anchor on
+     * the scheme and host, so they are meant to be tested against absolute URLs.
+     */
+    'platform_blacklists' => [
+        'magento1' => [
+            'label' => 'Magento 1',
+            'patterns' => [
+                '~^https?://[^/]+/index\.php/admin~i',
+                '~^https?://[^/]+/admin~i',
+                '~^https?://[^/]+/api/~i',
+                '~^https?://[^/]+/cron\.php~i',
+                '~^https?://[^/]+/index\.php/customer/account~i',
+                '~^https?://[^/]+/customer/account/login~i',
+                '~^https?://[^/]+/customer/account/create~i',
+                '~^https?://[^/]+/customer/account/logout~i',
+                '~^https?://[^/]+/checkout/~i',
+                // Trailing boundary keeps unrelated paths such as /cartridges crawlable.
+                '~^https?://[^/]+/cart(?:[/?#]|$)~i',
+                '~^https?://[^/]+/wishlist/~i',
+                '~^https?://[^/]+/review/product/~i',
+                '~^https?://[^/]+/newsletter/subscriber/~i',
+                '~^https?://[^/]+/contacts/index/post~i',
+                '~^https?://[^/]+/catalogsearch/ajax/~i',
+                '~^https?://[^/]+/sendfriend/~i',
+                '~^https?://[^/]+/catalog/product_compare/~i',
+                '~^https?://[^/]+/tag/~i',
+                '~^https?://[^/]+/rating/~i',
+                '~^https?://[^/]+/poll/~i',
+                '~^https?://[^/]+/paypal/~i',
+            ],
+        ],
+        'magento2' => [
+            'label' => 'Magento 2',
+            'patterns' => [
+                '~^https?://[^/]+/admin~i',
+                '~^https?://[^/]+/rest/~i',
+                '~^https?://[^/]+/graphql~i',
+                '~^https?://[^/]+/soap/~i',
+                '~^https?://[^/]+/cron\.php~i',
+                '~^https?://[^/]+/index\.php/customer/account~i',
+                '~^https?://[^/]+/customer/account/login~i',
+                '~^https?://[^/]+/customer/account/create~i',
+                '~^https?://[^/]+/customer/account/logout~i',
+                '~^https?://[^/]+/checkout/~i',
+                '~^https?://[^/]+/onestepcheckout/~i',
+                // Trailing boundary keeps unrelated paths such as /cartridges crawlable.
+                '~^https?://[^/]+/cart(?:[/?#]|$)~i',
+                '~^https?://[^/]+/wishlist/~i',
+                '~^https?://[^/]+/review/product/~i',
+                '~^https?://[^/]+/newsletter/subscriber/~i',
+                '~^https?://[^/]+/contact/index/post~i',
+                '~^https?://[^/]+/search/ajax/~i',
+                '~^https?://[^/]+/catalogsearch/ajax/~i',
+                '~^https?://[^/]+/page_cache/~i',
+                '~^https?://[^/]+/static/version~i',
+                '~^https?://[^/]+/media/tmp/~i',
+                '~^https?://[^/]+/pub/media/tmp/~i',
+                '~^https?://[^/]+/sendfriend/~i',
+                '~^https?://[^/]+/catalog/product_compare/~i',
+            ],
+        ],
+        'wordpress' => [
+            'label' => 'WordPress',
+            'patterns' => [
+                '~^https?://[^/]+/wp-admin~i',
+                '~^https?://[^/]+/wp-login\.php~i',
+                '~^https?://[^/]+/wp-cron\.php~i',
+                '~^https?://[^/]+/wp-json/~i',
+                '~^https?://[^/]+/xmlrpc\.php~i',
+                '~^https?://[^/]+/\?feed=~i',
+                '~^https?://[^/]+/feed/~i',
+                '~^https?://[^/]+/comments/feed/~i',
+                '~[?&]replytocom=~i',
+                '~[?&]preview=true~i',
+                '~^https?://[^/]+/wp-content/uploads/~i',
+                '~^https?://[^/]+/\?add-to-cart=~i',
+                '~^https?://[^/]+/cart/~i',
+                '~^https?://[^/]+/checkout/~i',
+                '~^https?://[^/]+/my-account/~i',
+                '~^https?://[^/]+/\?wc-ajax=~i',
+                '~^https?://[^/]+/wp-trackback\.php~i',
+            ],
+        ],
+        'joomla' => [
+            'label' => 'Joomla',
+            'patterns' => [
+                '~^https?://[^/]+/administrator/~i',
+                '~^https?://[^/]+/index\.php\?option=com_users&task=user\.login~i',
+                '~^https?://[^/]+/index\.php\?option=com_users&task=user\.logout~i',
+                '~^https?://[^/]+/index\.php\?option=com_users&task=registration~i',
+                '~^https?://[^/]+/index\.php\?option=com_contact&task=contact\.submit~i',
+                '~[?&]format=feed~i',
+                '~[?&]format=json~i',
+                '~[?&]format=raw~i',
+                '~[?&]tmpl=component~i',
+                '~^https?://[^/]+/index\.php\?option=com_search~i',
+                '~^https?://[^/]+/index\.php\?option=com_finder~i',
+                '~^https?://[^/]+/index\.php\?option=com_ajax~i',
+                '~^https?://[^/]+/cache/~i',
+                '~^https?://[^/]+/tmp/~i',
+                '~^https?://[^/]+/logs/~i',
+                '~^https?://[^/]+/cli/~i',
+            ],
+        ],
+        'drupal' => [
+            'label' => 'Drupal',
+            'patterns' => [
+                '~^https?://[^/]+/admin/~i',
+                '~^https?://[^/]+/user/login~i',
+                '~^https?://[^/]+/user/logout~i',
+                '~^https?://[^/]+/user/register~i',
+                '~^https?://[^/]+/user/password~i',
+                '~^https?://[^/]+/\?q=user/~i',
+                '~^https?://[^/]+/\?q=admin/~i',
+                '~^https?://[^/]+/jsonapi/~i',
+                '~^https?://[^/]+/api/~i',
+                '~^https?://[^/]+/batch\b~i',
+                '~[?&]ajax_form=1~i',
+                '~^https?://[^/]+/cron/~i',
+                '~^https?://[^/]+/update\.php~i',
+                '~^https?://[^/]+/install\.php~i',
+                '~^https?://[^/]+/rebuild\.php~i',
+                '~^https?://[^/]+/core/rebuild\.php~i',
+                '~^https?://[^/]+/sites/default/files/~i',
+            ],
+        ],
+    ],
 ];
diff --git a/packages/crawler/resources/views/crawler/index.blade.php b/packages/crawler/resources/views/crawler/index.blade.php
index fa098119..d630c02a 100644
--- a/packages/crawler/resources/views/crawler/index.blade.php
+++ b/packages/crawler/resources/views/crawler/index.blade.php
@@ -23,6 +23,15 @@
+ @if ($crawler->state === \Vigilant\Crawler\Enums\State::Crawling)
+
+

+ {{ __('Crawling') }}

+ + +
+ @endif +

{{ __('Issues') }}

diff --git a/packages/crawler/resources/views/livewire/crawler-form.blade.php b/packages/crawler/resources/views/livewire/crawler-form.blade.php index 85eb452a..fe37a385 100644 --- a/packages/crawler/resources/views/livewire/crawler-form.blade.php +++ b/packages/crawler/resources/views/livewire/crawler-form.blade.php @@ -75,8 +75,57 @@ class="mt-2 block w-full rounded-md border-0 py-1.5 pl-3 pr-10 text-base-100 bg- @if (!$inline) -
- +
+
+
+ + @lang('Pre-fills the URL blacklist for the selected platform.') +
+
+ +
+
+ +
+ + +
+ +
+
+ +
+ +
@endif
diff --git a/packages/crawler/src/Actions/CrawlUrl.php b/packages/crawler/src/Actions/CrawlUrl.php
index 8f45822b..91f00f99 100644
--- a/packages/crawler/src/Actions/CrawlUrl.php
+++ b/packages/crawler/src/Actions/CrawlUrl.php
@@ -5,12 +5,14 @@
 use Illuminate\Http\Client\ConnectionException;
 use Illuminate\Http\Client\Response;
 use Illuminate\Support\Arr;
+use Illuminate\Support\Collection;
 use Illuminate\Support\Facades\Gate;
 use Illuminate\Support\Facades\Http;
 use Illuminate\Support\Str;
 use Vigilant\Core\Services\TeamService;
 use Vigilant\Crawler\Enums\State;
 use Vigilant\Crawler\Models\CrawledUrl;
+use Vigilant\Crawler\Models\Crawler;
 use Vigilant\Crawler\Models\IgnoredUrl;
 use Vigilant\Crawler\Notifications\RatelimitedNotification;
@@ -113,9 +115,16 @@ public function crawl(CrawledUrl $url, int $try = 0): void
             ->pluck('url_hash')
             ->all();
-        $queuedLinks = array_filter($queuedLinks, function (array $record) use ($existingLinks): bool {
-            return ! in_array($record['url_hash'], $existingLinks, true);
-        });
+        $blacklistPatterns = $this->buildBlacklistPatterns($url->crawler);
+
+        $queuedLinks = collect($queuedLinks)
+            ->reject(fn (array $record): bool => in_array($record['url_hash'], $existingLinks, true))
+            ->when($blacklistPatterns->isNotEmpty(), fn (Collection $links): Collection => $links
+                ->reject(fn (array $record): bool => $blacklistPatterns
+                    ->contains(fn (string $pattern): bool => @preg_match($pattern, $record['url']) === 1)
+                )
+            )
+            ->all();
         if ($queuedLinks !== []) {
             $timestamp = now();
@@ -154,6 +163,21 @@ public function crawl(CrawledUrl $url, int $try = 0): void
         ]);
     }
+    /**
+     * Split the crawler's "url_blacklist" setting into individual patterns.
+     *
+     * The setting holds one pattern per line; lines are trimmed and empty ones are discarded.
+     *
+     * @return Collection<int, string>
+     */
+    protected function buildBlacklistPatterns(Crawler $crawler): Collection
+    {
+        return collect(explode("\n", (string) ($crawler->settings['url_blacklist'] ?? '')))
+            ->map(fn (string $line): string => trim($line))
+            ->filter()
+            ->values();
+    }
+
     protected function extractLinks(string $html, array $baseUrl): array
     {
         if ($html === '' || stripos($html, 'authorize('update', $crawler);
             $this->form->fill($crawler->toArray());
+            $this->form->url_blacklist = $crawler->settings['url_blacklist'] ?? '';
         } else {
             $this->authorize('create', Crawler::class);
@@ -56,19 +57,24 @@ public function save(): void
     {
         $this->form->sitemaps = $this->form->sitemaps !== null ? array_filter($this->form->sitemaps) : null;
         $this->form->schedule = $this->getCronSchedule();
+        $this->form->settings = array_merge($this->form->settings ?? [], [
+            'url_blacklist' => $this->form->url_blacklist,
+        ]);
         $this->validate();
+        /** @var array $formData */
+        $formData = $this->form->all();
+        $data = collect($formData)->except('url_blacklist')->all();
+
         if ($this->crawler->exists) {
             $this->authorize('update', $this->crawler);
-            $this->crawler->update($this->form->all());
+            $this->crawler->update($data);
         } else {
             $this->authorize('create', $this->crawler);
-            $this->crawler = Crawler::query()->create(
-                $this->form->all()
-            );
+            $this->crawler = Crawler::query()->create($data);
         }
         if (! $this->inline) {
diff --git a/packages/crawler/src/Livewire/Forms/CrawlerForm.php b/packages/crawler/src/Livewire/Forms/CrawlerForm.php
index 24820c05..16d96513 100644
--- a/packages/crawler/src/Livewire/Forms/CrawlerForm.php
+++ b/packages/crawler/src/Livewire/Forms/CrawlerForm.php
@@ -7,6 +7,7 @@
 use Vigilant\Core\Validation\CanEnableRule;
 use Vigilant\Crawler\Models\Crawler;
 use Vigilant\Crawler\Validation\EqualDomainRule;
+use Vigilant\Crawler\Validation\ValidRegexLines;
 use Vigilant\Frontend\Validation\CronExpression;
 class CrawlerForm extends Form
@@ -22,6 +23,8 @@ class CrawlerForm extends Form
     public ?array $sitemaps = [];
+    public string $url_blacklist = '';
+
     public ?array $settings = [
         'scheduleConfig' => [
             'type' => 'monthly',
@@ -40,6 +43,7 @@ public function rules(): array
             'sitemaps.*' => ['required', 'url'],
             'settings' => ['array'],
             'enabled' => ['boolean', new CanEnableRule(Crawler::class)],
+            'url_blacklist' => ['nullable', 'string', new ValidRegexLines],
         ];
     }
 }
diff --git a/packages/crawler/src/Livewire/Tables/CrawledUrlsTable.php b/packages/crawler/src/Livewire/Tables/CrawledUrlsTable.php
new file mode 100644
index 00000000..91aa9600
--- /dev/null
+++ b/packages/crawler/src/Livewire/Tables/CrawledUrlsTable.php
@@ -0,0 +1,66 @@
+ '',
+    ];
+
+    #[Locked]
+    public int $crawlerId;
+
+    public function mount(int $crawlerId): void
+    {
+        $this->crawlerId = $crawlerId;
+    }
+
+    protected function columns(): array
+    {
+        return [
+            LinkColumn::make(__('URL'), 'url')
+                ->openInNewTab()
+                ->searchable()
+                ->sortable(),
+
+            Column::make(__('Crawled'), 'crawled')
+                ->displayUsing(fn (bool $crawled): string => $crawled ? __('Yes') : __('No'))
+                ->sortable(),
+        ];
+    }
+
+    protected function filters(): array
+    {
+        return [
+            SelectFilter::make(__('Crawled'), 'crawled')
+                ->options([
+                    'yes' => __('Crawled'),
+                    'no' => __('Not crawled'),
+                ])
+                ->filterUsing(function (Builder $builder, ?string $value): void {
+                    if ($value === 'yes') {
+                        $builder->where($builder->qualifyColumn('crawled'), '=', true);
+                    } elseif ($value === 'no') {
+                        $builder->where($builder->qualifyColumn('crawled'), '=', false);
+                    }
+                }),
+        ];
+    }
+
+    protected function query(): Builder
+    {
+        return parent::query()
+            ->where('web_crawled_urls.crawler_id', '=', $this->crawlerId);
+    }
+}
diff --git a/packages/crawler/src/ServiceProvider.php b/packages/crawler/src/ServiceProvider.php
index 111618cf..5dd34a5d 100644
--- a/packages/crawler/src/ServiceProvider.php
+++ b/packages/crawler/src/ServiceProvider.php
@@ -19,6 +19,7 @@
 use Vigilant\Crawler\Livewire\Crawler\Dashboard;
 use Vigilant\Crawler\Livewire\CrawlerForm;
 use Vigilant\Crawler\Livewire\Crawlers;
+use Vigilant\Crawler\Livewire\Tables\CrawledUrlsTable;
 use Vigilant\Crawler\Livewire\Tables\CrawlerTable;
 use Vigilant\Crawler\Livewire\Tables\IssuesTable;
 use Vigilant\Crawler\Models\Crawler;
@@ -105,6 +106,7 @@ protected function bootLivewire(): static
         Livewire::component('crawler-dashboard', Dashboard::class);
         Livewire::component('crawler-issues-table', IssuesTable::class);
+        Livewire::component('crawler-crawled-urls-table', CrawledUrlsTable::class);
         return $this;
     }
diff --git a/packages/crawler/src/Validation/ValidRegexLines.php b/packages/crawler/src/Validation/ValidRegexLines.php
new file mode 100644
index 00000000..1fcb5c42
--- /dev/null
+++ b/packages/crawler/src/Validation/ValidRegexLines.php
@@ -0,0 +1,24 @@
+map(fn (string $line): string => trim($line))
+            ->filter()
+            ->each(function (string $line) use ($fail): bool {
+                if (@preg_match($line, '') === false) {
+                    $fail(__('One or more URL blacklist patterns are not valid regular expressions.'));
+                    // The message already covers every bad line; returning false ends each() so it is reported once.
+                    return false;
+                }
+                return true;
+            });
+    }
+}
diff --git a/packages/crawler/tests/Actions/CrawUrlTest.php b/packages/crawler/tests/Actions/CrawUrlTest.php
index 405be6fb..e034d5d3 100644
--- a/packages/crawler/tests/Actions/CrawUrlTest.php
+++ b/packages/crawler/tests/Actions/CrawUrlTest.php
@@ -127,4 +127,86 @@ public function it_handles_ratelimiting(): void
         $this->assertEquals(State::Ratelimited, $crawler->state);
         $this->assertTrue($crawledUrl->crawled);
     }
+
+    #[Test]
+    public function it_does_not_insert_blacklisted_urls(): void
+    {
+        Http::fake([
+            'https://govigilant.io/url-1' => Http::response('
+
+
+
+
+            '),
+        ])->preventStrayRequests();
+
+        /** @var Crawler $crawler */
+        $crawler = Crawler::query()->create([
+            'start_url' => 'https://govigilant.io',
+            'state' => State::Crawling,
+            'schedule' => '0 0 * * *',
+            'settings' => [
+                'url_blacklist' => implode("\n", [
+                    '~^https?://[^/]+/checkout/~i',
+                    '~^https?://[^/]+/customer/account/login~i',
+                ]),
+            ],
+        ]);
+
+        /** @var CrawledUrl $crawledUrl */
+        $crawledUrl = $crawler->urls()->create([
+            'url' => 'https://govigilant.io/url-1',
+            'crawled' => false,
+        ]);
+
+        /** @var CrawlUrl $action */
+        $action = app(CrawlUrl::class);
+        $action->crawl($crawledUrl);
+
+        $discoveredUrls = $crawler->urls()
+            ->where('crawled', '=', false)
+            ->pluck('url')
+            ->toArray();
+
+        $this->assertContains('https://govigilant.io/products/shoes', $discoveredUrls);
+        $this->assertContains('https://govigilant.io/about-us', $discoveredUrls);
+        $this->assertNotContains('https://govigilant.io/checkout/cart', $discoveredUrls);
+        $this->assertNotContains('https://govigilant.io/customer/account/login', $discoveredUrls);
+    }
+
+    #[Test]
+    public function it_inserts_all_urls_when_blacklist_is_empty(): void
+    {
+        Http::fake([
+            'https://govigilant.io/url-1' => Http::response('
+
+
+            '),
+        ])->preventStrayRequests();
+
+        /** @var Crawler $crawler */
+        $crawler = Crawler::query()->create([
+            'start_url' => 'https://govigilant.io',
+            'state' => State::Crawling,
+            'schedule' => '0 0 * * *',
+        ]);
+
+        /** @var CrawledUrl $crawledUrl */
+        $crawledUrl = $crawler->urls()->create([
+            'url' => 'https://govigilant.io/url-1',
+            'crawled' => false,
+        ]);
+
+        /** @var CrawlUrl $action */
+        $action = app(CrawlUrl::class);
+        $action->crawl($crawledUrl);
+
+        $discoveredUrls = $crawler->urls()
+            ->where('crawled', '=', false)
+            ->pluck('url')
+            ->toArray();
+
+        $this->assertContains('https://govigilant.io/checkout/cart', $discoveredUrls);
+        $this->assertContains('https://govigilant.io/about-us', $discoveredUrls);
+    }
 }
diff --git a/resources/views/components/form/textarea.blade.php b/resources/views/components/form/textarea.blade.php
new file mode 100644
index 00000000..fa701f4a
--- /dev/null
+++ b/resources/views/components/form/textarea.blade.php
@@ -0,0 +1,27 @@
+@props(['field', 'name' => '', 'placeholder' => '', 'description' => '', 'rows' => 4, 'live' => true, 'xRef' => null])
+
+ @if(!blank($name)) +
+ + @if($description) + {{ $description }} + @endif +
+ @endif +
+
+ +
+ + @error($field) {{ $message }} @enderror +
+