Newer
Older
Alex Pott
committed
<?php
namespace Drupal\Component\Utility;
/**
* Provides DOMDocument helpers for parsing and serializing HTML strings.
Jennifer Hodgdon
committed
*
* @ingroup utility
Alex Pott
committed
*/
class Html {
/**
* An array of previously cleaned HTML classes.
*
* @var array
*/
protected static $classes = [];
/**
* An array of the initial IDs used in one request.
*
* @var array
*/
protected static $seenIdsInit;
/**
* An array of IDs, including incremented versions when an ID is duplicated.
* @var array
*/
protected static $seenIds;
/**
* Stores whether the current request was sent via AJAX.
* @var bool
protected static $isAjax = FALSE;
Alex Pott
committed
/**
* All attributes that may contain URIs.
*
* - The attributes 'code' and 'codebase' are omitted, because they only exist
* for the <applet> tag. The time of Java applets has passed.
* - The attribute 'icon' is omitted, because no browser implements the
* <command> tag anymore.
* See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command.
* - The 'manifest' attribute is omitted because it only exists for the <html>
* tag. That tag only makes sense in a HTML-served-as-HTML context, in which
* case relative URLs are guaranteed to work.
*
* @see https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
* @see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
*
* @var string[]
*/
protected static $uriAttributes = ['href', 'poster', 'src', 'cite', 'data', 'action', 'formaction', 'srcset', 'about'];
/**
* Prepares a string for use as a valid class name.
*
* Do not pass one string containing multiple classes as they will be
* incorrectly concatenated with dashes, i.e. "one two" will become "one-two".
*
Lauri Eskola
committed
* @param mixed $class
* The class name to clean. It can be a string or anything that can be cast
* to string.
*
* @return string
* The cleaned class name.
*/
public static function getClass($class) {
Lauri Eskola
committed
$class = (string) $class;
if (!isset(static::$classes[$class])) {
static::$classes[$class] = static::cleanCssIdentifier(Unicode::strtolower($class));
}
return static::$classes[$class];
}
/**
* Prepares a string for use as a CSS identifier (element, class, or ID name).
*
* http://www.w3.org/TR/CSS21/syndata.html#characters shows the syntax for
* valid CSS identifiers (including element names, classes, and IDs in
* selectors.)
*
* @param string $identifier
* The identifier to clean.
* @param array $filter
* An array of string replacements to use on the identifier.
*
* @return string
* The cleaned identifier.
*/
public static function cleanCssIdentifier($identifier, array $filter = [
' ' => '-',
'_' => '-',
'/' => '-',
'[' => '-',
']' => '',
]) {
// We could also use strtr() here but its much slower than str_replace(). In
// order to keep '__' to stay '__' we first replace it with a different
// placeholder after checking that it is not defined as a filter.
$double_underscore_replacements = 0;
if (!isset($filter['__'])) {
$identifier = str_replace('__', '##', $identifier, $double_underscore_replacements);
}
$identifier = str_replace(array_keys($filter), array_values($filter), $identifier);
// Replace temporary placeholder '##' with '__' only if the original
// $identifier contained '__'.
if ($double_underscore_replacements > 0) {
$identifier = str_replace('##', '__', $identifier);
}
// Valid characters in a CSS identifier are:
// - the hyphen (U+002D)
// - a-z (U+0030 - U+0039)
// - A-Z (U+0041 - U+005A)
// - the underscore (U+005F)
// - 0-9 (U+0061 - U+007A)
// - ISO 10646 characters U+00A1 and higher
// We strip out any character not in the above list.
$identifier = preg_replace('/[^\x{002D}\x{0030}-\x{0039}\x{0041}-\x{005A}\x{005F}\x{0061}-\x{007A}\x{00A1}-\x{FFFF}]/u', '', $identifier);
// Identifiers cannot start with a digit, two hyphens, or a hyphen followed by a digit.
$identifier = preg_replace([
'/^[0-9]/',
'/^(-[0-9])|^(--)/'
], ['_', '__'], $identifier);
return $identifier;
}
/**
* Sets if this request is an Ajax request.
* @param bool $is_ajax
* TRUE if this request is an Ajax request, FALSE otherwise.
public static function setIsAjax($is_ajax) {
static::$isAjax = $is_ajax;
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
}
/**
* Prepares a string for use as a valid HTML ID and guarantees uniqueness.
*
* This function ensures that each passed HTML ID value only exists once on
* the page. By tracking the already returned ids, this function enables
* forms, blocks, and other content to be output multiple times on the same
* page, without breaking (X)HTML validation.
*
* For already existing IDs, a counter is appended to the ID string.
* Therefore, JavaScript and CSS code should not rely on any value that was
* generated by this function and instead should rely on manually added CSS
* classes or similarly reliable constructs.
*
* Two consecutive hyphens separate the counter from the original ID. To
* manage uniqueness across multiple Ajax requests on the same page, Ajax
* requests POST an array of all IDs currently present on the page, which are
* used to prime this function's cache upon first invocation.
*
* To allow reverse-parsing of IDs submitted via Ajax, any multiple
* consecutive hyphens in the originally passed $id are replaced with a
* single hyphen.
*
* @param string $id
* The ID to clean.
*
* @return string
* The cleaned ID.
*/
public static function getUniqueId($id) {
// If this is an Ajax request, then content returned by this page request
// will be merged with content already on the base page. The HTML IDs must
// be unique for the fully merged content. Therefore use unique IDs.
if (static::$isAjax) {
return static::getId($id) . '--' . Crypt::randomBytesBase64(8);
}
// @todo Remove all that code once we switch over to random IDs only,
// see https://www.drupal.org/node/1090592.
if (!isset(static::$seenIdsInit)) {
static::$seenIdsInit = [];
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
}
if (!isset(static::$seenIds)) {
static::$seenIds = static::$seenIdsInit;
}
$id = static::getId($id);
// Ensure IDs are unique by appending a counter after the first occurrence.
// The counter needs to be appended with a delimiter that does not exist in
// the base ID. Requiring a unique delimiter helps ensure that we really do
// return unique IDs and also helps us re-create the $seen_ids array during
// Ajax requests.
if (isset(static::$seenIds[$id])) {
$id = $id . '--' . ++static::$seenIds[$id];
}
else {
static::$seenIds[$id] = 1;
}
return $id;
}
/**
* Prepares a string for use as a valid HTML ID.
*
* Only use this function when you want to intentionally skip the uniqueness
* guarantee of self::getUniqueId().
*
* @param string $id
* The ID to clean.
*
* @return string
* The cleaned ID.
*
* @see self::getUniqueId()
*/
public static function getId($id) {
$id = str_replace([' ', '_', '[', ']'], ['-', '-', '-', ''], Unicode::strtolower($id));
// As defined in http://www.w3.org/TR/html4/types.html#type-name, HTML IDs can
// only contain letters, digits ([0-9]), hyphens ("-"), underscores ("_"),
// colons (":"), and periods ("."). We strip out any character not in that
// list. Note that the CSS spec doesn't allow colons or periods in identifiers
// (http://www.w3.org/TR/CSS21/syndata.html#characters), so we strip those two
// characters as well.
$id = preg_replace('/[^A-Za-z0-9\-_]/', '', $id);
// Removing multiple consecutive hyphens.
$id = preg_replace('/\-+/', '-', $id);
return $id;
}
/**
* Resets the list of seen IDs.
*/
public static function resetSeenIds() {
static::$seenIds = NULL;
}
Alex Pott
committed
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
/**
* Normalizes an HTML snippet.
*
* This function is essentially \DOMDocument::normalizeDocument(), but
* operates on an HTML string instead of a \DOMDocument.
*
* @param string $html
* The HTML string to normalize.
*
* @return string
* The normalized HTML string.
*/
public static function normalize($html) {
$document = static::load($html);
return static::serialize($document);
}
/**
* Parses an HTML snippet and returns it as a DOM object.
*
* This function loads the body part of a partial (X)HTML document and returns
* a full \DOMDocument object that represents this document.
*
* Use \Drupal\Component\Utility\Html::serialize() to serialize this
* \DOMDocument back to a string.
*
* @param string $html
* The partial (X)HTML snippet to load. Invalid markup will be corrected on
* import.
*
* @return \DOMDocument
* A \DOMDocument that represents the loaded (X)HTML snippet.
*/
public static function load($html) {
$document = <<<EOD
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>
<body>!html</body>
</html>
EOD;
// PHP's \DOMDocument serialization adds extra whitespace when the markup
// of the wrapping document contains newlines, so ensure we remove all
// newlines before injecting the actual HTML body to be processed.
$document = strtr($document, ["\n" => '', '!html' => $html]);
Alex Pott
committed
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
$dom = new \DOMDocument();
// Ignore warnings during HTML soup loading.
@$dom->loadHTML($document);
return $dom;
}
/**
* Converts the body of a \DOMDocument back to an HTML snippet.
*
* The function serializes the body part of a \DOMDocument back to an (X)HTML
* snippet. The resulting (X)HTML snippet will be properly formatted to be
* compatible with HTML user agents.
*
* @param \DOMDocument $document
* A \DOMDocument object to serialize, only the tags below the first <body>
* node will be converted.
*
* @return string
* A valid (X)HTML snippet, as a string.
*/
public static function serialize(\DOMDocument $document) {
$body_node = $document->getElementsByTagName('body')->item(0);
$html = '';
if ($body_node !== NULL) {
foreach ($body_node->getElementsByTagName('script') as $node) {
static::escapeCdataElement($node);
}
foreach ($body_node->getElementsByTagName('style') as $node) {
static::escapeCdataElement($node, '/*', '*/');
}
foreach ($body_node->childNodes as $node) {
$html .= $document->saveXML($node);
}
Alex Pott
committed
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
}
return $html;
}
/**
* Adds comments around a <!CDATA section in a \DOMNode.
*
* \DOMDocument::loadHTML() in \Drupal\Component\Utility\Html::load() makes
* CDATA sections from the contents of inline script and style tags. This can
* cause HTML4 browsers to throw exceptions.
*
* This function attempts to solve the problem by creating a
* \DOMDocumentFragment to comment the CDATA tag.
*
* @param \DOMNode $node
* The element potentially containing a CDATA node.
* @param string $comment_start
* (optional) A string to use as a comment start marker to escape the CDATA
* declaration. Defaults to '//'.
* @param string $comment_end
* (optional) A string to use as a comment end marker to escape the CDATA
* declaration. Defaults to an empty string.
*/
public static function escapeCdataElement(\DOMNode $node, $comment_start = '//', $comment_end = '') {
foreach ($node->childNodes as $child_node) {
if ($child_node instanceof \DOMCdataSection) {
$embed_prefix = "\n<!--{$comment_start}--><![CDATA[{$comment_start} ><!--{$comment_end}\n";
$embed_suffix = "\n{$comment_start}--><!]]>{$comment_end}\n";
// Prevent invalid cdata escaping as this would throw a DOM error.
// This is the same behavior as found in libxml2.
// Related W3C standard: http://www.w3.org/TR/REC-xml/#dt-cdsection
// Fix explanation: http://wikipedia.org/wiki/CDATA#Nesting
Alex Pott
committed
$data = str_replace(']]>', ']]]]><![CDATA[>', $child_node->data);
$fragment = $node->ownerDocument->createDocumentFragment();
$fragment->appendXML($embed_prefix . $data . $embed_suffix);
$node->appendChild($fragment);
$node->removeChild($child_node);
}
}
}
Alex Pott
committed
/**
* Decodes all HTML entities including numerical ones to regular UTF-8 bytes.
*
* Double-escaped entities will only be decoded once ("&lt;" becomes
* "<", not "<"). Be careful when using this function, as it will revert
* previous sanitization efforts (<script> will become <script>).
*
catch
committed
* This method is not the opposite of Html::escape(). For example, this method
* will convert "é" to "é", whereas Html::escape() will not convert "é"
* to "é".
*
Alex Pott
committed
* @param string $text
* The text to decode entities in.
*
* @return string
* The input $text, with all HTML entities decoded once.
catch
committed
*
* @see html_entity_decode()
* @see \Drupal\Component\Utility\Html::escape()
Alex Pott
committed
*/
public static function decodeEntities($text) {
return html_entity_decode($text, ENT_QUOTES, 'UTF-8');
}
catch
committed
/**
* Escapes text by converting special characters to HTML entities.
*
* This method escapes HTML for sanitization purposes by replacing the
* following special characters with their HTML entity equivalents:
* - & (ampersand) becomes &
* - " (double quote) becomes "
* - ' (single quote) becomes '
* - < (less than) becomes <
* - > (greater than) becomes >
* Special characters that have already been escaped will be double-escaped
Alex Pott
committed
* (for example, "<" becomes "&lt;"), and invalid UTF-8 encoding
* will be converted to the Unicode replacement character ("�").
catch
committed
*
* This method is not the opposite of Html::decodeEntities(). For example,
* this method will not encode "é" to "é", whereas
* Html::decodeEntities() will convert all HTML entities to UTF-8 bytes,
* including "é" and "<" to "é" and "<".
*
catch
committed
* When constructing @link theme_render render arrays @endlink passing the output of Html::escape() to
* '#markup' is not recommended. Use the '#plain_text' key instead and the
* renderer will autoescape the text.
*
catch
committed
* @param string $text
* The input text.
*
* @return string
* The text with all HTML special characters converted.
*
* @see htmlspecialchars()
* @see \Drupal\Component\Utility\Html::decodeEntities()
*
* @ingroup sanitization
*/
public static function escape($text) {
Alex Pott
committed
return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
catch
committed
}
Alex Pott
committed
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
/**
* Converts all root-relative URLs to absolute URLs.
*
* Does not change any existing protocol-relative or absolute URLs. Does not
* change other relative URLs because they would result in different absolute
* URLs depending on the current path. For example: when the same content
* containing such a relative URL (for example 'image.png'), is served from
* its canonical URL (for example 'http://example.com/some-article') or from
* a listing or feed (for example 'http://example.com/all-articles') their
* "current path" differs, resulting in different absolute URLs:
* 'http://example.com/some-article/image.png' versus
* 'http://example.com/all-articles/image.png'. Only one can be correct.
* Therefore relative URLs that are not root-relative cannot be safely
* transformed and should generally be avoided.
*
* Necessary for HTML that is served outside of a website, for example, RSS
* and e-mail.
*
* @param string $html
* The partial (X)HTML snippet to load. Invalid markup will be corrected on
* import.
* @param string $scheme_and_host
* The root URL, which has a URI scheme, host and optional port.
*
* @return string
* The updated (X)HTML snippet.
*/
public static function transformRootRelativeUrlsToAbsolute($html, $scheme_and_host) {
assert(empty(array_diff(array_keys(parse_url($scheme_and_host)), ["scheme", "host", "port"])), '$scheme_and_host contains scheme, host and port at most.');
assert(isset(parse_url($scheme_and_host)["scheme"]), '$scheme_and_host is absolute and hence has a scheme.');
assert(isset(parse_url($scheme_and_host)["host"]), '$base_url is absolute and hence has a host.');
Alex Pott
committed
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
$html_dom = Html::load($html);
$xpath = new \DOMXpath($html_dom);
// Update all root-relative URLs to absolute URLs in the given HTML.
foreach (static::$uriAttributes as $attr) {
foreach ($xpath->query("//*[starts-with(@$attr, '/') and not(starts-with(@$attr, '//'))]") as $node) {
$node->setAttribute($attr, $scheme_and_host . $node->getAttribute($attr));
}
foreach ($xpath->query("//*[@srcset]") as $node) {
// @see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
// @see https://html.spec.whatwg.org/multipage/embedded-content.html#image-candidate-string
$image_candidate_strings = explode(',', $node->getAttribute('srcset'));
$image_candidate_strings = array_map('trim', $image_candidate_strings);
for ($i = 0; $i < count($image_candidate_strings); $i++) {
$image_candidate_string = $image_candidate_strings[$i];
if ($image_candidate_string[0] === '/' && $image_candidate_string[1] !== '/') {
$image_candidate_strings[$i] = $scheme_and_host . $image_candidate_string;
}
}
$node->setAttribute('srcset', implode(', ', $image_candidate_strings));
}
}
return Html::serialize($html_dom);
}