diff --git a/packages/crawler/config/crawler.php b/packages/crawler/config/crawler.php
index 559913ce..23f30f76 100644
--- a/packages/crawler/config/crawler.php
+++ b/packages/crawler/config/crawler.php
@@ -7,4 +7,128 @@
'connect_timeout' => env('CRAWLER_CONNECT_TIMEOUT', 2),
'crawls_per_minute' => (int) env('CRAWLER_CRAWLS_PER_MINUTE', 500),
+
+ // Preset URL blacklists, keyed by platform slug. Each preset carries a
+ // human-readable 'label' and a list of PCRE 'patterns' written with "~"
+ // delimiters and the case-insensitive "i" flag.
+ // NOTE(review): presumably these pre-fill the crawler form's URL blacklist
+ // for the selected platform — confirm against the form view.
+ // NOTE(review): patterns that do not end in "/" or "\b" are prefix matches,
+ // e.g. ".../cart" also matches ".../cartoons" — confirm this is intended.
+ 'platform_blacklists' => [
+ 'magento1' => [
+ 'label' => 'Magento 1',
+ 'patterns' => [
+ '~^https?://[^/]+/index\.php/admin~i',
+ '~^https?://[^/]+/admin~i',
+ '~^https?://[^/]+/api/~i',
+ '~^https?://[^/]+/cron\.php~i',
+ '~^https?://[^/]+/index\.php/customer/account~i',
+ '~^https?://[^/]+/customer/account/login~i',
+ '~^https?://[^/]+/customer/account/create~i',
+ '~^https?://[^/]+/customer/account/logout~i',
+ '~^https?://[^/]+/checkout/~i',
+ '~^https?://[^/]+/cart~i',
+ '~^https?://[^/]+/wishlist/~i',
+ '~^https?://[^/]+/review/product/~i',
+ '~^https?://[^/]+/newsletter/subscriber/~i',
+ '~^https?://[^/]+/contacts/index/post~i',
+ '~^https?://[^/]+/catalogsearch/ajax/~i',
+ '~^https?://[^/]+/sendfriend/~i',
+ '~^https?://[^/]+/catalog/product_compare/~i',
+ '~^https?://[^/]+/tag/~i',
+ '~^https?://[^/]+/rating/~i',
+ '~^https?://[^/]+/poll/~i',
+ '~^https?://[^/]+/paypal/~i',
+ ],
+ ],
+ 'magento2' => [
+ 'label' => 'Magento 2',
+ 'patterns' => [
+ '~^https?://[^/]+/admin~i',
+ '~^https?://[^/]+/rest/~i',
+ '~^https?://[^/]+/graphql~i',
+ '~^https?://[^/]+/soap/~i',
+ '~^https?://[^/]+/cron\.php~i',
+ '~^https?://[^/]+/index\.php/customer/account~i',
+ '~^https?://[^/]+/customer/account/login~i',
+ '~^https?://[^/]+/customer/account/create~i',
+ '~^https?://[^/]+/customer/account/logout~i',
+ '~^https?://[^/]+/checkout/~i',
+ '~^https?://[^/]+/onestepcheckout/~i',
+ '~^https?://[^/]+/cart~i',
+ '~^https?://[^/]+/wishlist/~i',
+ '~^https?://[^/]+/review/product/~i',
+ '~^https?://[^/]+/newsletter/subscriber/~i',
+ '~^https?://[^/]+/contact/index/post~i',
+ '~^https?://[^/]+/search/ajax/~i',
+ '~^https?://[^/]+/catalogsearch/ajax/~i',
+ '~^https?://[^/]+/page_cache/~i',
+ '~^https?://[^/]+/static/version~i',
+ '~^https?://[^/]+/media/tmp/~i',
+ '~^https?://[^/]+/pub/media/tmp/~i',
+ '~^https?://[^/]+/sendfriend/~i',
+ '~^https?://[^/]+/catalog/product_compare/~i',
+ ],
+ ],
+ 'wordpress' => [
+ 'label' => 'WordPress',
+ 'patterns' => [
+ '~^https?://[^/]+/wp-admin~i',
+ '~^https?://[^/]+/wp-login\.php~i',
+ '~^https?://[^/]+/wp-cron\.php~i',
+ '~^https?://[^/]+/wp-json/~i',
+ '~^https?://[^/]+/xmlrpc\.php~i',
+ '~^https?://[^/]+/\?feed=~i',
+ '~^https?://[^/]+/feed/~i',
+ '~^https?://[^/]+/comments/feed/~i',
+ // Unanchored: these match the query parameter anywhere in the URL.
+ '~[?&]replytocom=~i',
+ '~[?&]preview=true~i',
+ // NOTE(review): every URL matched by the next pattern is already matched
+ // by the "[?&]preview=true" pattern above, so it looks redundant.
+ '~^https?://[^/]+/\?p=\d+&preview=true~i',
+ '~^https?://[^/]+/wp-content/uploads/~i',
+ '~^https?://[^/]+/\?add-to-cart=~i',
+ '~^https?://[^/]+/cart/~i',
+ '~^https?://[^/]+/checkout/~i',
+ '~^https?://[^/]+/my-account/~i',
+ '~^https?://[^/]+/\?wc-ajax=~i',
+ '~^https?://[^/]+/wp-trackback\.php~i',
+ ],
+ ],
+ 'joomla' => [
+ 'label' => 'Joomla',
+ 'patterns' => [
+ '~^https?://[^/]+/administrator/~i',
+ '~^https?://[^/]+/index\.php\?option=com_users&task=user\.login~i',
+ '~^https?://[^/]+/index\.php\?option=com_users&task=user\.logout~i',
+ '~^https?://[^/]+/index\.php\?option=com_users&task=registration~i',
+ '~^https?://[^/]+/index\.php\?option=com_contact&task=contact\.submit~i',
+ // Unanchored: these match the query parameter anywhere in the URL.
+ '~[?&]format=feed~i',
+ '~[?&]format=json~i',
+ '~[?&]format=raw~i',
+ '~[?&]tmpl=component~i',
+ '~^https?://[^/]+/index\.php\?option=com_search~i',
+ '~^https?://[^/]+/index\.php\?option=com_finder~i',
+ '~^https?://[^/]+/index\.php\?option=com_ajax~i',
+ '~^https?://[^/]+/cache/~i',
+ '~^https?://[^/]+/tmp/~i',
+ '~^https?://[^/]+/logs/~i',
+ '~^https?://[^/]+/cli/~i',
+ ],
+ ],
+ 'drupal' => [
+ 'label' => 'Drupal',
+ 'patterns' => [
+ '~^https?://[^/]+/admin/~i',
+ '~^https?://[^/]+/user/login~i',
+ '~^https?://[^/]+/user/logout~i',
+ '~^https?://[^/]+/user/register~i',
+ '~^https?://[^/]+/user/password~i',
+ '~^https?://[^/]+/\?q=user/~i',
+ '~^https?://[^/]+/\?q=admin/~i',
+ '~^https?://[^/]+/jsonapi/~i',
+ '~^https?://[^/]+/api/~i',
+ '~^https?://[^/]+/batch\b~i',
+ '~[?&]ajax_form=1~i',
+ '~^https?://[^/]+/cron/~i',
+ '~^https?://[^/]+/update\.php~i',
+ '~^https?://[^/]+/install\.php~i',
+ '~^https?://[^/]+/rebuild\.php~i',
+ '~^https?://[^/]+/core/rebuild\.php~i',
+ '~^https?://[^/]+/sites/default/files/~i',
+ ],
+ ],
+ ],
];
diff --git a/packages/crawler/resources/views/crawler/index.blade.php b/packages/crawler/resources/views/crawler/index.blade.php
index fa098119..d630c02a 100644
--- a/packages/crawler/resources/views/crawler/index.blade.php
+++ b/packages/crawler/resources/views/crawler/index.blade.php
@@ -23,6 +23,15 @@
+ @if ($crawler->state === \Vigilant\Crawler\Enums\State::Crawling)
+
+
+ {{ __('Crawling') }}
+
+
+
+ @endif
+
{{ __('Issues') }}
diff --git a/packages/crawler/resources/views/livewire/crawler-form.blade.php b/packages/crawler/resources/views/livewire/crawler-form.blade.php
index 85eb452a..fe37a385 100644
--- a/packages/crawler/resources/views/livewire/crawler-form.blade.php
+++ b/packages/crawler/resources/views/livewire/crawler-form.blade.php
@@ -75,8 +75,57 @@ class="mt-2 block w-full rounded-md border-0 py-1.5 pl-3 pr-10 text-base-100 bg-
@if (!$inline)
-
-
+
+
+
+
+ @lang('Pre-fills the URL blacklist for the selected platform.')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@endif
diff --git a/packages/crawler/src/Actions/CrawlUrl.php b/packages/crawler/src/Actions/CrawlUrl.php
index 8f45822b..91f00f99 100644
--- a/packages/crawler/src/Actions/CrawlUrl.php
+++ b/packages/crawler/src/Actions/CrawlUrl.php
@@ -5,12 +5,14 @@
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Arr;
+use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Gate;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
use Vigilant\Core\Services\TeamService;
use Vigilant\Crawler\Enums\State;
use Vigilant\Crawler\Models\CrawledUrl;
+use Vigilant\Crawler\Models\Crawler;
use Vigilant\Crawler\Models\IgnoredUrl;
use Vigilant\Crawler\Notifications\RatelimitedNotification;
@@ -113,9 +115,16 @@ public function crawl(CrawledUrl $url, int $try = 0): void
->pluck('url_hash')
->all();
- $queuedLinks = array_filter($queuedLinks, function (array $record) use ($existingLinks): bool {
- return ! in_array($record['url_hash'], $existingLinks, true);
- });
+ $blacklistPatterns = $this->buildBlacklistPatterns($url->crawler);
+
+ $queuedLinks = collect($queuedLinks)
+ ->reject(fn (array $record): bool => in_array($record['url_hash'], $existingLinks, true))
+ ->when($blacklistPatterns->isNotEmpty(), fn (Collection $links): Collection => $links
+ ->reject(fn (array $record): bool => $blacklistPatterns
+ ->contains(fn (string $pattern): bool => @preg_match($pattern, $record['url']) === 1)
+ )
+ )
+ ->all();
if ($queuedLinks !== []) {
$timestamp = now();
@@ -154,6 +163,14 @@ public function crawl(CrawledUrl $url, int $try = 0): void
]);
}
+    /**
+     * Build the regex patterns used to drop discovered links before queueing.
+     *
+     * Reads the newline-separated `url_blacklist` entry from the crawler's
+     * settings, trims every line (which also strips the "\r" left behind by
+     * CRLF input), and removes blank lines and duplicate patterns so each
+     * expression is evaluated at most once per link.
+     *
+     * @param  Crawler  $crawler  Crawler whose settings hold the blacklist text.
+     * @return Collection<int, string> Re-indexed list of pattern strings; empty when no blacklist is set.
+     */
+    protected function buildBlacklistPatterns(Crawler $crawler): Collection
+    {
+        $rawBlacklist = (string) ($crawler->settings['url_blacklist'] ?? '');
+
+        return collect(explode("\n", $rawBlacklist))
+            ->map(static fn (string $line): string => trim($line))
+            ->filter()
+            // The same pattern can appear more than once in the text; matching it twice is wasted work.
+            ->unique()
+            ->values();
+    }
+
protected function extractLinks(string $html, array $baseUrl): array
{
if ($html === '' || stripos($html, '
authorize('update', $crawler);
$this->form->fill($crawler->toArray());
+ $this->form->url_blacklist = $crawler->settings['url_blacklist'] ?? '';
} else {
$this->authorize('create', Crawler::class);
@@ -56,19 +57,24 @@ public function save(): void
{
$this->form->sitemaps = $this->form->sitemaps !== null ? array_filter($this->form->sitemaps) : null;
$this->form->schedule = $this->getCronSchedule();
+ $this->form->settings = array_merge($this->form->settings ?? [], [
+ 'url_blacklist' => $this->form->url_blacklist,
+ ]);
$this->validate();
+ /** @var array $formData */
+ $formData = $this->form->all();
+ $data = collect($formData)->except('url_blacklist')->all();
+
if ($this->crawler->exists) {
$this->authorize('update', $this->crawler);
- $this->crawler->update($this->form->all());
+ $this->crawler->update($data);
} else {
$this->authorize('create', $this->crawler);
- $this->crawler = Crawler::query()->create(
- $this->form->all()
- );
+ $this->crawler = Crawler::query()->create($data);
}
if (! $this->inline) {
diff --git a/packages/crawler/src/Livewire/Forms/CrawlerForm.php b/packages/crawler/src/Livewire/Forms/CrawlerForm.php
index 24820c05..16d96513 100644
--- a/packages/crawler/src/Livewire/Forms/CrawlerForm.php
+++ b/packages/crawler/src/Livewire/Forms/CrawlerForm.php
@@ -7,6 +7,7 @@
use Vigilant\Core\Validation\CanEnableRule;
use Vigilant\Crawler\Models\Crawler;
use Vigilant\Crawler\Validation\EqualDomainRule;
+use Vigilant\Crawler\Validation\ValidRegexLines;
use Vigilant\Frontend\Validation\CronExpression;
class CrawlerForm extends Form
@@ -22,6 +23,8 @@ class CrawlerForm extends Form
public ?array $sitemaps = [];
+ public string $url_blacklist = '';
+
public ?array $settings = [
'scheduleConfig' => [
'type' => 'monthly',
@@ -40,6 +43,7 @@ public function rules(): array
'sitemaps.*' => ['required', 'url'],
'settings' => ['array'],
'enabled' => ['boolean', new CanEnableRule(Crawler::class)],
+ 'url_blacklist' => ['nullable', 'string', new ValidRegexLines],
];
}
}
diff --git a/packages/crawler/src/Livewire/Tables/CrawledUrlsTable.php b/packages/crawler/src/Livewire/Tables/CrawledUrlsTable.php
new file mode 100644
index 00000000..91aa9600
--- /dev/null
+++ b/packages/crawler/src/Livewire/Tables/CrawledUrlsTable.php
@@ -0,0 +1,66 @@
+ '',
+ ];
+
+ #[Locked]
+ public int $crawlerId;
+
+ /**
+ * Store the id of the crawler whose URLs this table lists.
+ *
+ * The value is kept in the `#[Locked]` `$crawlerId` property and is later
+ * used by `query()` to scope the result set.
+ */
+ public function mount(int $crawlerId): void
+ {
+ $this->crawlerId = $crawlerId;
+ }
+
+    /**
+     * Define the table columns: a clickable URL and a Yes/No crawled flag.
+     *
+     * @return array<int, mixed>
+     */
+    protected function columns(): array
+    {
+        // URL column: rendered as a link that opens in a new tab.
+        $urlColumn = LinkColumn::make(__('URL'), 'url')
+            ->openInNewTab()
+            ->searchable()
+            ->sortable();
+
+        // Crawled column: the boolean is shown as a translated Yes/No label.
+        $crawledColumn = Column::make(__('Crawled'), 'crawled')
+            ->displayUsing(function (bool $crawled): string {
+                if ($crawled) {
+                    return __('Yes');
+                }
+
+                return __('No');
+            })
+            ->sortable();
+
+        return [$urlColumn, $crawledColumn];
+    }
+
+    /**
+     * Define the table filters: a select that narrows rows by crawled state.
+     *
+     * @return array<int, mixed>
+     */
+    protected function filters(): array
+    {
+        $crawledFilter = SelectFilter::make(__('Crawled'), 'crawled')
+            ->options([
+                'yes' => __('Crawled'),
+                'no' => __('Not crawled'),
+            ])
+            ->filterUsing(function (Builder $builder, ?string $value): void {
+                // Translate the selected option into the boolean stored in the column.
+                $wanted = match ($value) {
+                    'yes' => true,
+                    'no' => false,
+                    default => null,
+                };
+
+                // Any other value (including null) leaves the query untouched.
+                if ($wanted === null) {
+                    return;
+                }
+
+                $builder->where($builder->qualifyColumn('crawled'), '=', $wanted);
+            });
+
+        return [$crawledFilter];
+    }
+
+    /**
+     * Restrict the parent table query to URLs of the mounted crawler.
+     */
+    protected function query(): Builder
+    {
+        $baseQuery = parent::query();
+
+        return $baseQuery->where('web_crawled_urls.crawler_id', '=', $this->crawlerId);
+    }
+}
diff --git a/packages/crawler/src/ServiceProvider.php b/packages/crawler/src/ServiceProvider.php
index 111618cf..5dd34a5d 100644
--- a/packages/crawler/src/ServiceProvider.php
+++ b/packages/crawler/src/ServiceProvider.php
@@ -19,6 +19,7 @@
use Vigilant\Crawler\Livewire\Crawler\Dashboard;
use Vigilant\Crawler\Livewire\CrawlerForm;
use Vigilant\Crawler\Livewire\Crawlers;
+use Vigilant\Crawler\Livewire\Tables\CrawledUrlsTable;
use Vigilant\Crawler\Livewire\Tables\CrawlerTable;
use Vigilant\Crawler\Livewire\Tables\IssuesTable;
use Vigilant\Crawler\Models\Crawler;
@@ -105,6 +106,7 @@ protected function bootLivewire(): static
Livewire::component('crawler-dashboard', Dashboard::class);
Livewire::component('crawler-issues-table', IssuesTable::class);
+ Livewire::component('crawler-crawled-urls-table', CrawledUrlsTable::class);
return $this;
}
diff --git a/packages/crawler/src/Validation/ValidRegexLines.php b/packages/crawler/src/Validation/ValidRegexLines.php
new file mode 100644
index 00000000..1fcb5c42
--- /dev/null
+++ b/packages/crawler/src/Validation/ValidRegexLines.php
@@ -0,0 +1,21 @@
+map(fn (string $line): string => trim($line))
+ ->filter()
+ ->each(function (string $line) use ($fail): void {
+ if (@preg_match($line, '') === false) {
+ $fail(__('One or more URL blacklist patterns are not valid regular expressions.'));
+ }
+ });
+ }
+}
diff --git a/packages/crawler/tests/Actions/CrawUrlTest.php b/packages/crawler/tests/Actions/CrawUrlTest.php
index 405be6fb..e034d5d3 100644
--- a/packages/crawler/tests/Actions/CrawUrlTest.php
+++ b/packages/crawler/tests/Actions/CrawUrlTest.php
@@ -127,4 +127,86 @@ public function it_handles_ratelimiting(): void
$this->assertEquals(State::Ratelimited, $crawler->state);
$this->assertTrue($crawledUrl->crawled);
}
+
+ #[Test]
+ public function it_does_not_insert_blacklisted_urls(): void
+ {
+ Http::fake([
+ 'https://govigilant.io/url-1' => Http::response('
+
+
+
+
+ '),
+ ])->preventStrayRequests();
+
+ /** @var Crawler $crawler */
+ $crawler = Crawler::query()->create([
+ 'start_url' => 'https://govigilant.io',
+ 'state' => State::Crawling,
+ 'schedule' => '0 0 * * *',
+ 'settings' => [
+ 'url_blacklist' => implode("\n", [
+ '~^https?://[^/]+/checkout/~i',
+ '~^https?://[^/]+/customer/account/login~i',
+ ]),
+ ],
+ ]);
+
+ /** @var CrawledUrl $crawledUrl */
+ $crawledUrl = $crawler->urls()->create([
+ 'url' => 'https://govigilant.io/url-1',
+ 'crawled' => false,
+ ]);
+
+ /** @var CrawlUrl $action */
+ $action = app(CrawlUrl::class);
+ $action->crawl($crawledUrl);
+
+ $discoveredUrls = $crawler->urls()
+ ->where('crawled', '=', false)
+ ->pluck('url')
+ ->toArray();
+
+ $this->assertContains('https://govigilant.io/products/shoes', $discoveredUrls);
+ $this->assertContains('https://govigilant.io/about-us', $discoveredUrls);
+ $this->assertNotContains('https://govigilant.io/checkout/cart', $discoveredUrls);
+ $this->assertNotContains('https://govigilant.io/customer/account/login', $discoveredUrls);
+ }
+
+ #[Test]
+ public function it_inserts_all_urls_when_blacklist_is_empty(): void
+ {
+ Http::fake([
+ 'https://govigilant.io/url-1' => Http::response('
+
+
+ '),
+ ])->preventStrayRequests();
+
+ /** @var Crawler $crawler */
+ $crawler = Crawler::query()->create([
+ 'start_url' => 'https://govigilant.io',
+ 'state' => State::Crawling,
+ 'schedule' => '0 0 * * *',
+ ]);
+
+ /** @var CrawledUrl $crawledUrl */
+ $crawledUrl = $crawler->urls()->create([
+ 'url' => 'https://govigilant.io/url-1',
+ 'crawled' => false,
+ ]);
+
+ /** @var CrawlUrl $action */
+ $action = app(CrawlUrl::class);
+ $action->crawl($crawledUrl);
+
+ $discoveredUrls = $crawler->urls()
+ ->where('crawled', '=', false)
+ ->pluck('url')
+ ->toArray();
+
+ $this->assertContains('https://govigilant.io/checkout/cart', $discoveredUrls);
+ $this->assertContains('https://govigilant.io/about-us', $discoveredUrls);
+ }
}
diff --git a/resources/views/components/form/textarea.blade.php b/resources/views/components/form/textarea.blade.php
new file mode 100644
index 00000000..fa701f4a
--- /dev/null
+++ b/resources/views/components/form/textarea.blade.php
@@ -0,0 +1,27 @@
+@props(['field', 'name' => '', 'placeholder' => '', 'description' => '', 'rows' => 4, 'live' => true, 'xRef' => null])
+
+ @if(!blank($name))
+
+
+ @if($description)
+ {{ $description }}
+ @endif
+
+ @endif
+
+
+
+
+
+ @error($field)
{{ $message }} @enderror
+
+