+++ /dev/null
-<?php
-
-/**
- * @file
- * Pathologic text filter for Drupal.
- *
- * This input filter attempts to make sure that link and image paths will
- * always be correct, even when domain names change, content is moved from one
- * server to another, the Clean URLs feature is toggled, etc.
- *
- * @todo for Pathlogic 8.x-2.x
- * - Account for new way dirty URLs are done (no more clean_url variable)
- * - - We can now tell url() to create clean or dirty URLs regardless of current
- * status!
- * - - (lol jk no more url() - but maybe the equivalent code is still there)
- * - - Our path parsing code needs to account for both new and old dirty URL
- * styles
- * - - Option to force output of clean or dirty URLs?
- * - Move _pathologic_filter() code into the actual filter class
- * - Do DOM object manipulation instead of preg_replace(). It's what core
- * filters are doing now.
- * - UrlHelper::Parse() instead of parse_url()?
- * - Alter hook alters Drupal\Core\Url object instead of method parameters.
- */
-
-use Drupal\Component\Utility\SafeMarkup;
-use Drupal\Component\Utility\Unicode;
-use Drupal\Core\Url;
-
-/**
- * Pathologic filter callback.
- *
- * @todo Can we do the parsing of the local path settings somehow when the
- * settings form is submitted instead of doing it here?
- */
-function _pathologic_filter($text, $settings, $hash) {
- // Get the base URL and explode it into component parts. We add these parts
- // to the exploded local paths settings later.
- global $base_url;
- $base_url_parts = parse_url($base_url . '/');
- // Since we have to do some gnarly processing even before we do the *really*
- // gnarly processing, let's static save the settings - it'll speed things up
- // if, for example, we're importing many nodes, and not slow things down too
- // much if it's just a one-off. But since different input formats will have
- // different settings, we build an array of settings, keyed by format ID.
- $cached_settings = &drupal_static(__FUNCTION__, []);
- if (!isset($cached_settings[$hash])) {
- $settings['local_paths_exploded'] = [];
- if ($settings['local_paths'] !== '') {
- // Build an array of the exploded local paths for this format's settings.
- // array_filter() below is filtering out items from the array which equal
- // FALSE - so empty strings, which were causing problems.
- // @see http://drupal.org/node/1727492
- $local_paths = array_filter(array_map('trim', explode("\n", $settings['local_paths'])));
- foreach ($local_paths as $local) {
- $parts = parse_url($local);
- // Okay, what the hellish "if" statement is doing below is checking to
- // make sure we aren't about to add a path to our array of exploded
- // local paths which matches the current "local" path. We consider it
- // not a match, if…
- // @todo: This is pretty horrible. Can this be simplified?
- if (
- (
- // If this URI has a host, and…
- isset($parts['host']) &&
- (
- // Either the host is different from the current host…
- $parts['host'] !== $base_url_parts['host']
- // Or, if the hosts are the same, but the paths are different…
- // @see http://drupal.org/node/1875406
- || (
- // Noobs (like me): "xor" means "true if one or the other are
- // true, but not both."
- (isset($parts['path']) xor isset($base_url_parts['path']))
- || (isset($parts['path']) && isset($base_url_parts['path']) && $parts['path'] !== $base_url_parts['path'])
- )
- )
- ) ||
- // Or…
- (
- // The URI doesn't have a host…
- !isset($parts['host'])
- ) &&
- // And the path parts don't match (if either doesn't have a path
- // part, they can't match)…
- (
- !isset($parts['path']) ||
- !isset($base_url_parts['path']) ||
- $parts['path'] !== $base_url_parts['path']
- )
- ) {
- // Add it to the list.
- $settings['local_paths_exploded'][] = $parts;
- }
- }
- }
- // Now add local paths based on "this" server URL.
- $settings['local_paths_exploded'][] = ['path' => $base_url_parts['path']];
- $settings['local_paths_exploded'][] = ['path' => $base_url_parts['path'], 'host' => $base_url_parts['host']];
- // We'll also just store the host part separately for easy access.
- $settings['base_url_host'] = $base_url_parts['host'];
-
- $cached_settings[$hash] = $settings;
- }
- // Take note of which settings in the settings array should apply.
- $cached_settings['current_settings'] = &$cached_settings[$hash];
-
- // Now that we have all of our settings prepared, attempt to process all
- // paths in href, src, action or longdesc HTML attributes. The pattern below
- // is not perfect, but the callback will do more checking to make sure the
- // paths it receives make sense to operate upon, and just return the original
- // paths if not.
- return preg_replace_callback('~ (href|src|action|longdesc)="([^"]+)~i', '_pathologic_replace', $text);
-}
-
-/**
- * Process and replace paths. preg_replace_callback() callback.
- */
-function _pathologic_replace($matches) {
- // Get the base path.
- global $base_path;
-
- // Get the settings for the filter. Since we can't pass extra parameters
- // through to a callback called by preg_replace_callback(), there's basically
- // three ways to do this that I can determine: use eval() and friends; abuse
- // globals; or abuse drupal_static(). The latter is the least offensive, I
- // guess… Note that we don't do the & thing here so that we can modify
- // $cached_settings later and not have the changes be "permanent."
- $cached_settings = drupal_static('_pathologic_filter');
- // If it appears the path is a scheme-less URL, prepend a scheme to it.
- // parse_url() cannot properly parse scheme-less URLs. Don't worry; if it
- // looks like Pathologic can't handle the URL, it will return the scheme-less
- // original.
- // @see https://drupal.org/node/1617944
- // @see https://drupal.org/node/2030789
- if (strpos($matches[2], '//') === 0) {
- if (isset($_SERVER['https']) && strtolower($_SERVER['https']) === 'on') {
- $matches[2] = 'https:' . $matches[2];
- }
- else {
- $matches[2] = 'http:' . $matches[2];
- }
- }
- // Now parse the URL after reverting HTML character encoding.
- // @see http://drupal.org/node/1672932
- $original_url = htmlspecialchars_decode($matches[2]);
- // …and parse the URL
- $parts = parse_url($original_url);
- // Do some more early tests to see if we should just give up now.
- if (
- // If parse_url() failed, $parts = FALSE. If the href was just "#", $parts
- // is an empty array. Give up in both cases.
- empty($parts)
- || (
- // If there's a scheme part and it doesn't look useful, bail out.
- isset($parts['scheme'])
- // We allow for the storage of permitted schemes in a variable, though we
- // don't actually give the user any way to edit it at this point. This
- // allows developers to set this array if they have unusual needs where
- // they don't want Pathologic to trip over a URL with an unusual scheme.
- // @see http://drupal.org/node/1834308
- // Default value is ['http', 'https', 'files', 'internal']
- // "files" and "internal" are for Path Filter compatibility.
- && !in_array($parts['scheme'], \Drupal::config('pathologic.settings')->get('scheme_whitelist'))
- )
- // Bail out if it looks like there's only a fragment part.
- || (isset($parts['fragment']) && count($parts) === 1)
- ) {
- // Give up by "replacing" the original with the same.
- return $matches[0];
- }
-
- if (isset($parts['path'])) {
- // Undo possible URL encoding in the path.
- // @see http://drupal.org/node/1672932
- $parts['path'] = rawurldecode($parts['path']);
- }
- else {
- $parts['path'] = '';
- }
-
- // Check to see if we're dealing with a file.
- // @todo Should we still try to do path correction on these files too?
- if (isset($parts['scheme']) && $parts['scheme'] === 'files') {
- // Path Filter "files:" support. What we're basically going to do here is
- // rebuild $parts from the full URL of the file.
- $new_parts = parse_url(file_create_url(file_default_scheme() . '://' . $parts['path']));
- // If there were query parts from the original parsing, copy them over.
- if (!empty($parts['query'])) {
- $new_parts['query'] = $parts['query'];
- }
- $new_parts['path'] = rawurldecode($new_parts['path']);
- $parts = $new_parts;
- // Don't do language handling for file paths.
- $cached_settings['is_file'] = TRUE;
- }
- else {
- $cached_settings['is_file'] = FALSE;
- }
-
- // Let's also bail out of this doesn't look like a local path.
- $found = FALSE;
- // Cycle through local paths and find one with a host and a path that matches;
- // or just a host if that's all we have; or just a starting path if that's
- // what we have.
- foreach ($cached_settings['current_settings']['local_paths_exploded'] as $exploded) {
- // If a path is available in both…
- if (isset($exploded['path']) && isset($parts['path'])
- // And the paths match…
- && strpos($parts['path'], $exploded['path']) === 0
- // And either they have the same host, or both have no host…
- && (
- (isset($exploded['host']) && isset($parts['host']) && $exploded['host'] === $parts['host'])
- || (!isset($exploded['host']) && !isset($parts['host']))
- )
- ) {
- // Remove the shared path from the path. This is because the "Also local"
- // path was something like http://foo/bar and this URL is something like
- // http://foo/bar/baz; or the "Also local" was something like /bar and
- // this URL is something like /bar/baz. And we only care about the /baz
- // part.
- $parts['path'] = Unicode::substr($parts['path'], Unicode::strlen($exploded['path']));
- $found = TRUE;
- // Break out of the foreach loop
- break;
- }
- // Okay, we didn't match on path alone, or host and path together. Can we
- // match on just host? Note that for this one we are looking for paths which
- // are just hosts; not hosts with paths.
- elseif ((isset($parts['host']) && !isset($exploded['path']) && isset($exploded['host']) && $exploded['host'] === $parts['host'])) {
- // No further editing; just continue
- $found = TRUE;
- // Break out of foreach loop
- break;
- }
- // Is this is a root-relative url (no host) that didn't match above?
- // Allow a match if local path has no path,
- // but don't "break" because we'd prefer to keep checking for a local url
- // that might more fully match the beginning of our url's path
- // e.g.: if our url is /foo/bar we'll mark this as a match for
- // http://example.com but want to keep searching and would prefer a match
- // to http://example.com/foo if that's configured as a local path
- elseif (!isset($parts['host']) && (!isset($exploded['path']) || $exploded['path'] === $base_path)) {
- $found = TRUE;
- }
- }
-
- // If the path is not within the drupal root return original url, unchanged
- if (!$found) {
- return $matches[0];
- }
-
- // Okay, format the URL.
- // If there's still a slash lingering at the start of the path, chop it off.
- $parts['path'] = ltrim($parts['path'], '/');
-
- // Examine the query part of the URL. Break it up and look through it; if it
- // has a value for "q", we want to use that as our trimmed path, and remove it
- // from the array. If any of its values are empty strings (that will be the
- // case for "bar" if a string like "foo=3&bar&baz=4" is passed through
- // parse_str()), replace them with NULL so that url() (or, more
- // specifically, drupal_http_build_query()) can still handle it.
- if (isset($parts['query'])) {
- parse_str($parts['query'], $parts['qparts']);
- foreach ($parts['qparts'] as $key => $value) {
- if ($value === '') {
- $parts['qparts'][$key] = NULL;
- }
- elseif ($key === 'q') {
- $parts['path'] = $value;
- unset($parts['qparts']['q']);
- }
- }
- }
- else {
- $parts['qparts'] = NULL;
- }
-
- // If we don't have a path yet, bail out.
- if (!isset($parts['path'])) {
- return $matches[0];
- }
-
- // If this looks like a D8-style unclean URL, crop off the "index.php/" from
- // the beginning.
- if (strpos($parts['path'], 'index.php/') === 0) {
- $parts['path'] = substr($parts['path'], 10);
- }
-
- // If we didn't previously identify this as a file, check to see if the file
- // exists now that we have the correct path relative to DRUPAL_ROOT
- if (!$cached_settings['is_file']) {
- $cached_settings['is_file'] = !empty($parts['path']) && is_file(DRUPAL_ROOT . '/' . $parts['path']);
- }
-
- // Okay, deal with language stuff.
- // Let's see if we can split off a language prefix from the path.
- if (\Drupal::moduleHandler()->moduleExists('language')) {
- // This logic is based on
- // \Drupal\language\Plugin\LanguageNegotiation\LanguageNegotiationUrl::getLangcode().
- $languages = \Drupal::languageManager()->getLanguages();
- $config = \Drupal::config('language.negotiation')->get('url');
-
- $request_path = urldecode(trim($parts['path'], '/'));
- $path_args = explode('/', $request_path);
- $prefix = array_shift($path_args);
-
- // Search for prefix within added languages.
- foreach ($languages as $language) {
- if (isset($config['prefixes'][$language->getId()]) && $config['prefixes'][$language->getId()] == $prefix) {
- $parts['path'] = implode('/', $path_args);
- $parts['language_obj'] = $language;
- break;
- }
- }
- }
-
- // If we get to this point and $parts['path'] is now an empty string (which
- // will be the case if the path was originally just "/"), then we
- // want to link to <front>.
- if ($parts['path'] === '') {
- $parts['path'] = '<front>';
- }
- // Build the parameters we will send to url()
- $url_params = [
- 'path' => $parts['path'],
- 'options' => [
- 'query' => $parts['qparts'],
- 'fragment' => isset($parts['fragment']) ? $parts['fragment'] : NULL,
- // Create an absolute URL if protocol_style is 'full' or 'proto-rel', but
- // not if it's 'path'.
- 'absolute' => $cached_settings['current_settings']['protocol_style'] !== 'path',
- // If we seem to have found a language for the path, pass it along to
- // url(). Otherwise, ignore the 'language' parameter.
- 'language' => isset($parts['language_obj']) ? $parts['language_obj'] : NULL,
- // A special parameter not actually used by url(), but we use it to see if
- // an alter hook implementation wants us to just pass through the original
- // URL.
- 'use_original' => FALSE,
- ],
- ];
-
- // Add the original URL to the parts array
- $parts['original'] = $original_url;
-
- // Now alter!
- // @see http://drupal.org/node/1762022
- \Drupal::moduleHandler()->alter('pathologic', $url_params, $parts, $cached_settings);
-
- // If any of the alter hooks asked us to just pass along the original URL,
- // then do so.
- if ($url_params['options']['use_original']) {
- return $matches[0];
- }
-
- // Now to build the URL. Drumroll, please…
- if ($parts['path'] == '<front>') {
- $url = Url::fromRoute('<front>', [], $url_params['options'])->toString();
- }
- else {
- try {
- $url = Url::fromUri('base://' . $url_params['path'], $url_params['options'])
- ->toString();
- }
- catch (\Exception $e) {
- // In case of an error, e.g. completely invalid URL, return it unchanged.
- return $matches[0];
- }
- }
-
- // If we need to create a protocol-relative URL, then convert the absolute
- // URL we have now.
- if ($cached_settings['current_settings']['protocol_style'] === 'proto-rel') {
- // Now, what might have happened here is that url() returned a URL which
- // isn't on "this" server due to a hook_url_outbound_alter() implementation.
- // We don't want to convert the URL in that case. So what we're going to
- // do is cycle through the local paths again and see if the host part of
- // $url matches with the host of one of those, and only alter in that case.
- $url_parts = parse_url($url);
- if (!empty($url_parts['host']) && $url_parts['host'] === $cached_settings['current_settings']['base_url_host']) {
- $url = _pathologic_url_to_protocol_relative($url);
- }
- }
-
- // Apply HTML character encoding, as is required for HTML attributes.
- // @see http://drupal.org/node/1672932
- $url = SafeMarkup::checkPlain($url);
- // $matches[1] will be the tag attribute; src, href, etc.
- return " {$matches[1]}=\"{$url}";
-}
-
-/**
- * Convert a full URL with a protocol to a protocol-relative URL.
- *
- * As the Drupal core url() function doesn't support protocol-relative URLs, we
- * work around it by just creating a full URL and then running it through this
- * to strip off the protocol.
- *
- * Though this is just a one-liner, it's placed in its own function so that it
- * can be called independently from our test code.
- */
-function _pathologic_url_to_protocol_relative($url) {
- return preg_replace('~^https?://~', '//', $url);
-}