Skip to content

Extending the methods of an Element: getNode, getSelector #262

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 18 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Or more explicitly, like this:

```php
$converter = new HtmlConverter();
$converter->getConfig()->setOption('strip_tags', true);
$converter->setOptions(['strip_tags' => true]);

$html = '<span>Turnips!</span>';
$markdown = $converter->convert($html); // $markdown now contains "Turnips!"
Expand Down Expand Up @@ -124,14 +124,20 @@ $markdown = $converter->convert($html); // $markdown now contains "Github"
### Style options

By default bold tags are converted using the asterisk syntax, and italic tags are converted using the underlined syntax. Change these by using the `bold_style` and `italic_style` options.
To drop a style's formatting entirely (only the inner text is emitted), set its option to `null`.
To keep the formatting as HTML tags in the output, set its option to an empty string.

```php
$converter = new HtmlConverter();
$converter->getConfig()->setOption('italic_style', '*');
$converter->getConfig()->setOption('bold_style', '__');

$html = '<em>Italic</em> and a <strong>bold</strong>';
$markdown = $converter->convert($html); // $markdown now contains "*Italic* and a __bold__"
$converter->setOptions([
'italic_style' => '*',
'bold_style' => '__',
'underline_style' => null,
'strikethrough_style' => ''
]);

$html = '<u>Underline</u>, <del>Strikethrough</del> and <em>Italic</em> and a <strong>bold</strong>';
$markdown = $converter->convert($html); // $markdown now contains "Underline, <del>Strikethrough</del> and *Italic* and a __bold__"
```

### Line break options
Expand All @@ -142,11 +148,11 @@ By default, `br` tags are converted to two spaces followed by a newline characte
$converter = new HtmlConverter();
$html = '<p>test<br>line break</p>';

$converter->getConfig()->setOption('hard_break', true);
$markdown = $converter->convert($html); // $markdown now contains "test\nline break"
$converter->setOptions(['hard_break' => true]);
$markdown = $converter->convert($html); // $markdown now contains "test\nline break"

$converter->getConfig()->setOption('hard_break', false); // default
$markdown = $converter->convert($html); // $markdown now contains "test \nline break"
$converter->setOptions(['hard_break' => false]); // default
$markdown = $converter->convert($html); // $markdown now contains "test \nline break"
```

### Autolinking options
Expand All @@ -157,10 +163,10 @@ By default, `a` tags are converted to the easiest possible link syntax, i.e. if
$converter = new HtmlConverter();
$html = '<p><a href="https://thephpleague.com">https://thephpleague.com</a></p>';

$converter->getConfig()->setOption('use_autolinks', true);
$converter->setOptions(['use_autolinks' => true]);
$markdown = $converter->convert($html); // $markdown now contains "<https://thephpleague.com>"

$converter->getConfig()->setOption('use_autolinks', false); // default
$converter->setOptions(['use_autolinks' => false]); // default
$markdown = $converter->convert($html); // $markdown now contains "[https://thephpleague.com](https://thephpleague.com)"
```

Expand Down
97 changes: 78 additions & 19 deletions src/Converter/EmphasisConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,35 @@ protected function getNormTag(?ElementInterface $element): string
{
if ($element !== null && ! $element->isText()) {
$tag = $element->getTagName();
if ($tag === 'i' || $tag === 'em') {
return 'em';
}

if ($tag === 'b' || $tag === 'strong') {
return 'strong';
// Collapse equivalent HTML tags onto one canonical tag per style, so that
// adjacent siblings such as <i> and <em> are recognised as the same kind.
switch ($tag) {
    case 'i':
    case 'em':
    case 'cite':
    case 'dfn':
    case 'var':
        return 'em';
    case 'b':
    case 'strong':
        return 'strong';
    case 'strike':
    case 's':
    case 'del':
        return 'del';
    case 'sub':
        return 'sub';
    case 'sup':
        return 'sup';
    case 'u':
    case 'ins':
        return 'u';
    // Must be spelled 'kbd' to match the tag registered in getSupportedTags();
    // any other spelling makes <kbd> fall through and normalize to ''.
    case 'kbd':
        return 'kbd';
    case 'span':
    case 'small':
    case 'abbr':
        // These have no canonical alias; they keep their own tag name.
        return $tag;
}
}

return '';
}

Expand All @@ -42,22 +62,38 @@ public function convert(ElementInterface $element): string
if (! \trim($value)) {
return $value;
}

if ($tag === 'em') {
$style = $this->config->getOption('italic_style');
} else {
$style = $this->config->getOption('bold_style');
// Pick the configured delimiter for this tag. $tag is presumably the value
// returned by getNormTag() (assignment is outside this hunk — confirm), so the
// case labels must use the normalized names that method returns.
switch ($tag) {
    case 'em':
        $style = $this->config->getOption('italic_style');
        break;
    case 'del':
        $style = $this->config->getOption('strikethrough_style');
        break;
    case 'sub':
        $style = $this->config->getOption('subscript_style');
        break;
    case 'sup':
        $style = $this->config->getOption('superscript_style');
        break;
    case 'strong':
        $style = $this->config->getOption('bold_style');
        break;
    case 'u':
        $style = $this->config->getOption('underline_style');
        break;
    // getNormTag() yields 'kbd' for keyboard input; a 'kdb' label would never
    // match and keyboard_style would be unreachable.
    case 'kbd':
        $style = $this->config->getOption('keyboard_style');
        break;
    default:
        // Remaining normalized tags (span, small, abbr) share one option.
        $style = $this->config->getOption('undefined_style');
        break;
}

$prefix = \ltrim($value) !== $value ? ' ' : '';
$suffix = \rtrim($value) !== $value ? ' ' : '';

/* If this node is immediately preceded or followed by one of the same type don't emit
* the start or end $style, respectively. This prevents <em>foo</em><em>bar</em> from
* being converted to *foo**bar* which is incorrect. We want *foobar* instead.
*/
$preStyle = $this->getNormTag($element->getPreviousSibling()) === $tag ? '' : $style;
$postStyle = $this->getNormTag($element->getNextSibling()) === $tag ? '' : $style;
$preStyle = $this->makeDelimiter($element, $tag, $style);
$postStyle = $this->makeDelimiter($element, $tag, $style, false);

return $prefix . $preStyle . \trim($value) . $postStyle . $suffix;
}
Expand All @@ -67,6 +103,29 @@ public function convert(ElementInterface $element): string
*/
public function getSupportedTags(): array
{
return ['em', 'i', 'strong', 'b'];
return [
'em', 'i', 'cite', 'dfn', 'var',
'strong', 'b',
'del', 'strike', 's',
'sub', 'sup',
'u', 'ins',
'kbd',
'span', 'small', 'abbr'
];
}

/**
 * Build the opening or closing delimiter to emit around an element's text.
 *
 * If this node is immediately preceded (opening) or followed (closing) by a
 * sibling that normalizes to the same tag, no delimiter is emitted. This
 * prevents <em>foo</em><em>bar</em> from being converted to *foo**bar*, which
 * is incorrect; we want *foobar* instead.
 *
 * @param ElementInterface $element Element being converted; its siblings are inspected.
 * @param string           $tag     Normalized tag name, compared with the sibling's normalized tag.
 * @param mixed            $style   Configured style. A non-string (e.g. null) emits nothing,
 *                                  '' emits an HTML tag built from $tag, and any other
 *                                  string is emitted verbatim.
 * @param bool             $prev    true builds the opening delimiter (checks the previous
 *                                  sibling); false builds the closing one (checks the next).
 *
 * @return string The delimiter, or '' when none should be emitted.
 */
protected function makeDelimiter($element, string $tag, $style, bool $prev = true): string
{
    // A non-string style (typically null) means "strip this formatting".
    if (! \is_string($style)) {
        return '';
    }

    $sibling = $prev ? $element->getPreviousSibling() : $element->getNextSibling();
    if ($this->getNormTag($sibling) === $tag) {
        return '';
    }

    // Compare against '' explicitly: empty() would also swallow the style '0'.
    if ($style === '') {
        return '<' . ($prev ? '' : '/') . $tag . '>';
    }

    return $style;
}
}
23 changes: 23 additions & 0 deletions src/Element.php
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ public function isWhitespace(): bool
return $this->getTagName() === '#text' && \trim($this->getValue()) === '';
}

/**
 * Return the DOM node stored in this element's `node` property.
 *
 * @return \DOMNode|null The stored node; the signature allows null.
 */
public function getNode(): ?\DOMNode
{
return $this->node;
}

public function getTagName(): string
{
return $this->node->nodeName;
Expand Down Expand Up @@ -221,6 +226,24 @@ public function getAttribute(string $name): string

return '';
}

/**
 * Build a CSS-style selector string that locates this element.
 *
 * When the element has a non-empty `id` attribute the result is simply
 * `#<id>`. Otherwise the result is the chain of tag names from the outermost
 * ancestor below `body` down to this element, joined with ` > `, each
 * suffixed with `:nth-child(N)` when getSiblingPosition() reports N > 0.
 *
 * NOTE(review): the id is inserted as-is (not CSS-escaped), and the index is
 * whatever getSiblingPosition() returns — confirm it counts siblings the same
 * way CSS :nth-child does.
 *
 * @return string The selector; '' when called on the `body` element itself.
 */
public function getSelector(): string
{
    // getAttribute() returns a string, so compare with '' rather than using
    // empty(), which would wrongly reject id="0".
    $id = $this->getAttribute('id');
    if ($id !== '') {
        return '#' . $id;
    }

    $path = [];
    $element = $this;
    while ($element) {
        $tagName = $element->getTagName();
        // Stop at <body>, and also at the document node so that elements
        // outside <body> do not get '#document' prepended to their selector.
        if ($tagName === 'body' || $tagName === '#document') {
            break;
        }

        $index = $element->getSiblingPosition();
        $path[] = $index > 0 ? $tagName . ':nth-child(' . $index . ')' : $tagName;

        $element = $element->getParent();
    }

    // Parts were collected innermost-first; a selector reads outermost-first.
    return \implode(' > ', \array_reverse($path));
}

public function equals(ElementInterface $element): bool
{
Expand Down
4 changes: 4 additions & 0 deletions src/ElementInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ public function isText(): bool;

public function isWhitespace(): bool;

public function getNode(): ?\DOMNode;

public function getTagName(): string;

public function getValue(): string;
Expand Down Expand Up @@ -47,4 +49,6 @@ public function setFinalMarkdown(string $markdown): void;
public function getListItemLevel(): int;

public function getAttribute(string $name): string;

public function getSelector(): string;
}
6 changes: 6 additions & 0 deletions src/HtmlConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ public function __construct($options = [])
'strip_placeholder_links' => false, // Set to true to remove <a> that doesn't have href.
'bold_style' => '**', // DEPRECATED: Set to '__' if you prefer the underlined style
'italic_style' => '*', // DEPRECATED: Set to '_' if you prefer the underlined style
'strikethrough_style' => '~~',
'superscript_style' => '', // Set to '^' to use the superscript style
'subscript_style' => '', // Set to '~' to use the subscript style
'keyboard_style' => '\'',
'underline_style' => '', // Set to null to clear this style
'undefined_style' => '', // Set to null to clear this style
'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
'hard_break' => false, // Set to true to turn <br> into `\n` instead of ` \n`
'list_item_style' => '-', // Set the default character for each <li> in a <ul>. Can be '-', '*', or '+'
Expand Down