Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions packages/crawler/config/crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,128 @@
'connect_timeout' => env('CRAWLER_CONNECT_TIMEOUT', 2),

'crawls_per_minute' => (int) env('CRAWLER_CRAWLS_PER_MINUTE', 500),

'platform_blacklists' => [
'magento1' => [
    // Magento 1 preset. Every entry is a complete PCRE pattern ("~" delimiter,
    // case-insensitive "i" flag) anchored at the scheme and host, so it
    // matches on the start of the URL path.
    'label' => 'Magento 1',
    'patterns' => [
        '~^https?://[^/]+/index\.php/admin~i',
        '~^https?://[^/]+/admin~i',
        '~^https?://[^/]+/api/~i',
        '~^https?://[^/]+/cron\.php~i',
        '~^https?://[^/]+/index\.php/customer/account~i',
        '~^https?://[^/]+/customer/account/login~i',
        '~^https?://[^/]+/customer/account/create~i',
        '~^https?://[^/]+/customer/account/logout~i',
        '~^https?://[^/]+/checkout/~i',
        // Require a path boundary after "cart": a bare prefix match would
        // also exclude unrelated pages such as /cartridges or /cartoons.
        '~^https?://[^/]+/cart(?:[/?#]|$)~i',
        '~^https?://[^/]+/wishlist/~i',
        '~^https?://[^/]+/review/product/~i',
        '~^https?://[^/]+/newsletter/subscriber/~i',
        '~^https?://[^/]+/contacts/index/post~i',
        '~^https?://[^/]+/catalogsearch/ajax/~i',
        '~^https?://[^/]+/sendfriend/~i',
        '~^https?://[^/]+/catalog/product_compare/~i',
        '~^https?://[^/]+/tag/~i',
        '~^https?://[^/]+/rating/~i',
        '~^https?://[^/]+/poll/~i',
        '~^https?://[^/]+/paypal/~i',
    ],
],
'magento2' => [
    // Magento 2 preset. Every entry is a complete PCRE pattern ("~" delimiter,
    // case-insensitive "i" flag) anchored at the scheme and host, so it
    // matches on the start of the URL path.
    'label' => 'Magento 2',
    'patterns' => [
        '~^https?://[^/]+/admin~i',
        '~^https?://[^/]+/rest/~i',
        '~^https?://[^/]+/graphql~i',
        '~^https?://[^/]+/soap/~i',
        '~^https?://[^/]+/cron\.php~i',
        '~^https?://[^/]+/index\.php/customer/account~i',
        '~^https?://[^/]+/customer/account/login~i',
        '~^https?://[^/]+/customer/account/create~i',
        '~^https?://[^/]+/customer/account/logout~i',
        '~^https?://[^/]+/checkout/~i',
        '~^https?://[^/]+/onestepcheckout/~i',
        // Require a path boundary after "cart": a bare prefix match would
        // also exclude unrelated pages such as /cartridges or /cartoons.
        '~^https?://[^/]+/cart(?:[/?#]|$)~i',
        '~^https?://[^/]+/wishlist/~i',
        '~^https?://[^/]+/review/product/~i',
        '~^https?://[^/]+/newsletter/subscriber/~i',
        '~^https?://[^/]+/contact/index/post~i',
        '~^https?://[^/]+/search/ajax/~i',
        '~^https?://[^/]+/catalogsearch/ajax/~i',
        '~^https?://[^/]+/page_cache/~i',
        '~^https?://[^/]+/static/version~i',
        '~^https?://[^/]+/media/tmp/~i',
        '~^https?://[^/]+/pub/media/tmp/~i',
        '~^https?://[^/]+/sendfriend/~i',
        '~^https?://[^/]+/catalog/product_compare/~i',
    ],
],
'wordpress' => [
    // WordPress preset. Entries are complete PCRE patterns ("~" delimiter,
    // case-insensitive "i" flag). Patterns starting with "^https?://" match on
    // the start of the URL path; the "[?&]..." patterns are unanchored and
    // match the query parameter anywhere in the URL.
    'label' => 'WordPress',
    'patterns' => [
        '~^https?://[^/]+/wp-admin~i',
        '~^https?://[^/]+/wp-login\.php~i',
        '~^https?://[^/]+/wp-cron\.php~i',
        '~^https?://[^/]+/wp-json/~i',
        '~^https?://[^/]+/xmlrpc\.php~i',
        '~^https?://[^/]+/\?feed=~i',
        '~^https?://[^/]+/feed/~i',
        '~^https?://[^/]+/comments/feed/~i',
        '~[?&]replytocom=~i',
        // Covers every preview URL, including "/?p=<id>&preview=true", so no
        // separate anchored pattern for that form is needed.
        '~[?&]preview=true~i',
        '~^https?://[^/]+/wp-content/uploads/~i',
        '~^https?://[^/]+/\?add-to-cart=~i',
        '~^https?://[^/]+/cart/~i',
        '~^https?://[^/]+/checkout/~i',
        '~^https?://[^/]+/my-account/~i',
        '~^https?://[^/]+/\?wc-ajax=~i',
        '~^https?://[^/]+/wp-trackback\.php~i',
    ],
],
// Joomla preset. Entries are complete PCRE patterns ("~" delimiter,
// case-insensitive "i" flag).
'joomla' => [
'label' => 'Joomla',
'patterns' => [
// Anchored at scheme + host: match on the start of the path / query string.
'~^https?://[^/]+/administrator/~i',
'~^https?://[^/]+/index\.php\?option=com_users&task=user\.login~i',
'~^https?://[^/]+/index\.php\?option=com_users&task=user\.logout~i',
'~^https?://[^/]+/index\.php\?option=com_users&task=registration~i',
'~^https?://[^/]+/index\.php\?option=com_contact&task=contact\.submit~i',
// Unanchored: match the given query parameter anywhere in the URL.
'~[?&]format=feed~i',
'~[?&]format=json~i',
'~[?&]format=raw~i',
'~[?&]tmpl=component~i',
// These only match when "option" is the first query parameter of index.php.
'~^https?://[^/]+/index\.php\?option=com_search~i',
'~^https?://[^/]+/index\.php\?option=com_finder~i',
'~^https?://[^/]+/index\.php\?option=com_ajax~i',
// Top-level directories; the trailing slash is part of the pattern.
'~^https?://[^/]+/cache/~i',
'~^https?://[^/]+/tmp/~i',
'~^https?://[^/]+/logs/~i',
'~^https?://[^/]+/cli/~i',
],
],
// Drupal preset. Entries are complete PCRE patterns ("~" delimiter,
// case-insensitive "i" flag).
'drupal' => [
'label' => 'Drupal',
'patterns' => [
// Anchored at scheme + host: match on the start of the path.
// Note the trailing slash: "/admin" on its own is not matched by this entry.
'~^https?://[^/]+/admin/~i',
'~^https?://[^/]+/user/login~i',
'~^https?://[^/]+/user/logout~i',
'~^https?://[^/]+/user/register~i',
'~^https?://[^/]+/user/password~i',
// "?q=" query-string forms of the user and admin paths.
'~^https?://[^/]+/\?q=user/~i',
'~^https?://[^/]+/\?q=admin/~i',
'~^https?://[^/]+/jsonapi/~i',
'~^https?://[^/]+/api/~i',
// \b requires a word boundary after "batch", so e.g. "/batches" is not matched.
'~^https?://[^/]+/batch\b~i',
// Unanchored: matches the query parameter anywhere in the URL.
'~[?&]ajax_form=1~i',
'~^https?://[^/]+/cron/~i',
'~^https?://[^/]+/update\.php~i',
'~^https?://[^/]+/install\.php~i',
'~^https?://[^/]+/rebuild\.php~i',
'~^https?://[^/]+/core/rebuild\.php~i',
'~^https?://[^/]+/sites/default/files/~i',
],
],
],
];
9 changes: 9 additions & 0 deletions packages/crawler/resources/views/crawler/index.blade.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@

<livewire:crawler-dashboard :crawlerId="$crawler->id" wire:key="crawher-dashboard" />

{{-- Only rendered while the crawler's state is State::Crawling: a "Crawling" heading
     followed by the crawled-URLs Livewire table for this crawler. --}}
@if ($crawler->state === \Vigilant\Crawler\Enums\State::Crawling)
<div class="mt-4">
<h2 class="text-xl font-bold leading-7 sm:truncate sm:text-2xl sm:tracking-tight text-neutral-100 mb-2">
{{ __('Crawling') }}</h2>

<livewire:crawler-crawled-urls-table :crawlerId="$crawler->id" wire:key="crawled-urls-table" />
</div>
@endif

<div class="mt-4">
<h2 class="text-xl font-bold leading-7 sm:truncate sm:text-2xl sm:tracking-tight text-neutral-100 mb-2">
{{ __('Issues') }}</h2>
Expand Down
53 changes: 51 additions & 2 deletions packages/crawler/resources/views/livewire/crawler-form.blade.php
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,57 @@ class="mt-2 block w-full rounded-md border-0 py-1.5 pl-3 pr-10 text-base-100 bg-


@if (!$inline)
<div class="flex justify-end gap-4">
<x-form.submit-button dusk="submit-button" :submitText="$updating ? 'Save' : 'Create'" />
{{-- Platform preset picker, collapsible "Advanced" section with the URL blacklist
     textarea, and the submit button.

     Alpine state:
       open            - whether the "Advanced" section is expanded
       platform        - key of the selected preset ('' = none)
       platforms       - presets from config('crawler.platform_blacklists'), each with its
                         label and its patterns joined by newlines
       applyPlatform() - overwrites the textarea referenced as "urlBlacklist" with the
                         selected preset, then fires "input" and "change" events on it
                         (presumably so the wire:model binding picks up the new value;
                         confirm against x-form.textarea). Selecting "None" leaves the
                         textarea untouched. --}}
<div x-data="{
    open: false,
    platform: '',
    platforms: {{ Js::from(collect(config('crawler.platform_blacklists'))->map(fn ($p) => ['label' => $p['label'], 'patterns' => implode("\n", $p['patterns'])])) }},
    applyPlatform() {
        if (this.platform === '') return;
        this.$refs.urlBlacklist.value = this.platforms[this.platform].patterns;
        this.$refs.urlBlacklist.dispatchEvent(new Event('input'));
        this.$refs.urlBlacklist.dispatchEvent(new Event('change'));
    },
}" class="flex flex-col gap-4">
    <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
        <div class="flex flex-col justify-center">
            <label class="block text-base font-semibold leading-6 text-base-50">@lang('Platform preset')</label>
            <span class="text-base-400 text-sm mt-1">@lang('Pre-fills the URL blacklist for the selected platform.')</span>
        </div>
        <div class="flex flex-col justify-center">
            <select x-model="platform" @change="applyPlatform()"
                class="block w-full rounded-lg border-0 py-2.5 px-3 text-base-100 bg-base-900 ring-1 ring-inset ring-base-700 focus-within:ring-2 focus-within:ring-inset focus-within:ring-red transition-all duration-200">
                <option value="">@lang('None')</option>
                <template x-for="[key, p] in Object.entries(platforms)" :key="key">
                    <option :value="key" x-text="p.label"></option>
                </template>
            </select>
        </div>
    </div>

    <div class="border-t border-base-700 pt-4">
        <button type="button" @click="open = !open"
            class="flex items-center justify-between w-full text-sm font-medium text-base-400 hover:text-base-100 transition-colors duration-200">
            @lang('Advanced')
            <div class="transition-transform duration-200" x-bind:class="open ? 'rotate-180' : ''">
                @svg('phosphor-caret-down', 'w-4 h-4')
            </div>
        </button>

        <div x-show="open" x-cloak x-collapse class="mt-4 flex flex-col gap-4">
            {{-- Each line is handed to preg_match() as-is, so the placeholder shows complete,
                 delimited patterns in the same "~...~i" form as the platform presets.
                 An undelimited example like "/private/.*" is not a valid pattern. --}}
            <x-form.textarea
                field="form.url_blacklist"
                name="URL Blacklist"
                description="One regex pattern per line. URLs matching any pattern will not be crawled."
                :rows="6"
                placeholder="~/admin/~i&#10;~^https?://[^/]+/private/~i"
                xRef="urlBlacklist"
            />
        </div>
    </div>

    <div class="flex justify-end gap-4">
        <x-form.submit-button dusk="submit-button" :submitText="$updating ? 'Save' : 'Create'" />
    </div>
</div>
@endif
</div>
Expand Down
23 changes: 20 additions & 3 deletions packages/crawler/src/Actions/CrawlUrl.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\Response;
use Illuminate\Support\Arr;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Gate;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
use Vigilant\Core\Services\TeamService;
use Vigilant\Crawler\Enums\State;
use Vigilant\Crawler\Models\CrawledUrl;
use Vigilant\Crawler\Models\Crawler;
use Vigilant\Crawler\Models\IgnoredUrl;
use Vigilant\Crawler\Notifications\RatelimitedNotification;

Expand Down Expand Up @@ -113,9 +115,16 @@ public function crawl(CrawledUrl $url, int $try = 0): void
->pluck('url_hash')
->all();

$queuedLinks = array_filter($queuedLinks, function (array $record) use ($existingLinks): bool {
return ! in_array($record['url_hash'], $existingLinks, true);
});
$blacklistPatterns = $this->buildBlacklistPatterns($url->crawler);

$queuedLinks = collect($queuedLinks)
->reject(fn (array $record): bool => in_array($record['url_hash'], $existingLinks, true))
->when($blacklistPatterns->isNotEmpty(), fn (Collection $links): Collection => $links
->reject(fn (array $record): bool => $blacklistPatterns
->contains(fn (string $pattern): bool => @preg_match($pattern, $record['url']) === 1)
)
)
->all();

if ($queuedLinks !== []) {
$timestamp = now();
Expand Down Expand Up @@ -154,6 +163,14 @@ public function crawl(CrawledUrl $url, int $try = 0): void
]);
}

/**
 * Turn the crawler's "url_blacklist" setting into a list of patterns.
 *
 * The setting is read as a newline-separated string (missing or null is
 * treated as empty). Each line is trimmed, and lines that are falsy after
 * trimming (the empty string, or the literal "0") are dropped.
 *
 * @return \Illuminate\Support\Collection<int, string> Sequentially indexed patterns.
 */
protected function buildBlacklistPatterns(Crawler $crawler): \Illuminate\Support\Collection
{
    $rawBlacklist = (string) ($crawler->settings['url_blacklist'] ?? '');

    $patterns = [];

    foreach (explode("\n", $rawBlacklist) as $line) {
        $candidate = trim($line);

        // Same truthiness rule as Collection::filter() without a callback.
        if (! $candidate) {
            continue;
        }

        $patterns[] = $candidate;
    }

    return collect($patterns);
}

protected function extractLinks(string $html, array $baseUrl): array
{
if ($html === '' || stripos($html, '<a') === false || ! isset($baseUrl['host'], $baseUrl['scheme'])) {
Expand Down
14 changes: 10 additions & 4 deletions packages/crawler/src/Livewire/CrawlerForm.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public function mount(?Crawler $crawler, ?int $siteId = null): void
$this->authorize('update', $crawler);

$this->form->fill($crawler->toArray());
$this->form->url_blacklist = $crawler->settings['url_blacklist'] ?? '';
} else {
$this->authorize('create', Crawler::class);

Expand Down Expand Up @@ -56,19 +57,24 @@ public function save(): void
{
$this->form->sitemaps = $this->form->sitemaps !== null ? array_filter($this->form->sitemaps) : null;
$this->form->schedule = $this->getCronSchedule();
$this->form->settings = array_merge($this->form->settings ?? [], [
'url_blacklist' => $this->form->url_blacklist,
]);

$this->validate();

/** @var array<string, mixed> $formData */
$formData = $this->form->all();
$data = collect($formData)->except('url_blacklist')->all();

if ($this->crawler->exists) {
$this->authorize('update', $this->crawler);

$this->crawler->update($this->form->all());
$this->crawler->update($data);
} else {
$this->authorize('create', $this->crawler);

$this->crawler = Crawler::query()->create(
$this->form->all()
);
$this->crawler = Crawler::query()->create($data);
}

if (! $this->inline) {
Expand Down
4 changes: 4 additions & 0 deletions packages/crawler/src/Livewire/Forms/CrawlerForm.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use Vigilant\Core\Validation\CanEnableRule;
use Vigilant\Crawler\Models\Crawler;
use Vigilant\Crawler\Validation\EqualDomainRule;
use Vigilant\Crawler\Validation\ValidRegexLines;
use Vigilant\Frontend\Validation\CronExpression;

class CrawlerForm extends Form
Expand All @@ -22,6 +23,8 @@ class CrawlerForm extends Form

public ?array $sitemaps = [];

public string $url_blacklist = '';

public ?array $settings = [
'scheduleConfig' => [
'type' => 'monthly',
Expand All @@ -40,6 +43,7 @@ public function rules(): array
'sitemaps.*' => ['required', 'url'],
'settings' => ['array'],
'enabled' => ['boolean', new CanEnableRule(Crawler::class)],
'url_blacklist' => ['nullable', 'string', new ValidRegexLines],
];
}
}
66 changes: 66 additions & 0 deletions packages/crawler/src/Livewire/Tables/CrawledUrlsTable.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
<?php

namespace Vigilant\Crawler\Livewire\Tables;

use Illuminate\Database\Eloquent\Builder;
use Livewire\Attributes\Locked;
use RamonRietdijk\LivewireTables\Columns\Column;
use RamonRietdijk\LivewireTables\Filters\SelectFilter;
use Vigilant\Crawler\Models\CrawledUrl;
use Vigilant\Frontend\Integrations\Table\BaseTable;
use Vigilant\Frontend\Integrations\Table\LinkColumn;

/**
 * Livewire table listing the CrawledUrl records that belong to one crawler,
 * with a filter on the "crawled" flag.
 */
class CrawledUrlsTable extends BaseTable
{
    protected string $model = CrawledUrl::class;

    /** Initial filter state: no "crawled" filter selected. */
    public array $filters = [
        'crawled' => '',
    ];

    /** Id of the crawler whose URLs are listed; locked against client-side changes. */
    #[Locked]
    public int $crawlerId;

    /**
     * Store the crawler id the table is scoped to.
     */
    public function mount(int $crawlerId): void
    {
        $this->crawlerId = $crawlerId;
    }

    /**
     * Column definitions: the URL as a link, and the crawled flag rendered as Yes/No.
     */
    protected function columns(): array
    {
        $urlColumn = LinkColumn::make(__('URL'), 'url')
            ->openInNewTab()
            ->searchable()
            ->sortable();

        $crawledColumn = Column::make(__('Crawled'), 'crawled')
            ->displayUsing(function (bool $crawled): string {
                if ($crawled) {
                    return __('Yes');
                }

                return __('No');
            })
            ->sortable();

        return [$urlColumn, $crawledColumn];
    }

    /**
     * Filter definitions: a select that narrows the rows to crawled or not-crawled URLs.
     */
    protected function filters(): array
    {
        $crawledFilter = SelectFilter::make(__('Crawled'), 'crawled')
            ->options([
                'yes' => __('Crawled'),
                'no' => __('Not crawled'),
            ])
            ->filterUsing(function (Builder $builder, ?string $value): void {
                // Map the selected option to the boolean to compare against;
                // any other value (including null) applies no constraint.
                $wanted = match ($value) {
                    'yes' => true,
                    'no' => false,
                    default => null,
                };

                if ($wanted === null) {
                    return;
                }

                $builder->where($builder->qualifyColumn('crawled'), '=', $wanted);
            });

        return [$crawledFilter];
    }

    /**
     * Base query, restricted to the rows of the configured crawler.
     */
    protected function query(): Builder
    {
        $scoped = parent::query()->where('web_crawled_urls.crawler_id', '=', $this->crawlerId);

        return $scoped;
    }
}
2 changes: 2 additions & 0 deletions packages/crawler/src/ServiceProvider.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
use Vigilant\Crawler\Livewire\Crawler\Dashboard;
use Vigilant\Crawler\Livewire\CrawlerForm;
use Vigilant\Crawler\Livewire\Crawlers;
use Vigilant\Crawler\Livewire\Tables\CrawledUrlsTable;
use Vigilant\Crawler\Livewire\Tables\CrawlerTable;
use Vigilant\Crawler\Livewire\Tables\IssuesTable;
use Vigilant\Crawler\Models\Crawler;
Expand Down Expand Up @@ -105,6 +106,7 @@ protected function bootLivewire(): static

Livewire::component('crawler-dashboard', Dashboard::class);
Livewire::component('crawler-issues-table', IssuesTable::class);
Livewire::component('crawler-crawled-urls-table', CrawledUrlsTable::class);

return $this;
}
Expand Down
Loading