/**
 * @file
 * Pathologic text filter for Drupal.
 *
 * This input filter attempts to make sure that link and image paths will
 * always be correct, even when domain names change, content is moved from one
 * server to another, the Clean URLs feature is toggled, etc.
 *
 * @todo for Pathologic 8.x-2.x
 * - Account for new way dirty URLs are done (no more clean_url variable)
 * - - We can now tell url() to create clean or dirty URLs regardless of current
 *     clean URL state.
 * - - (lol jk no more url() - but maybe the equivalent code is still there)
 * - - Our path parsing code needs to account for both new and old dirty URL
 *     styles.
 * - - Option to force output of clean or dirty URLs?
 * - Move _pathologic_filter() code into the actual filter class
 * - Do DOM object manipulation instead of preg_replace(). It's what core
 *   filters are doing now.
 * - UrlHelper::parse() instead of parse_url()?
 * - Alter hook alters Drupal\Core\Url object instead of method parameters.
 */
26 use Drupal\Component\Utility\Html;
27 use Drupal\Component\Utility\Unicode;
/**
 * Pathologic filter callback.
 *
 * Prepares the settings for the given settings hash (statically cached so the
 * work is done once per request per hash), records which prepared settings
 * are current, and then hands every href, src, action and longdesc attribute
 * value found in the text to _pathologic_replace().
 *
 * @param string $text
 *   The HTML text to filter.
 * @param array $settings
 *   The filter settings. 'local_paths' (a newline-separated list of URLs or
 *   paths which should also be considered local) is read here; the prepared
 *   array additionally gets 'local_paths_exploded' and 'base_url_host'.
 * @param string $hash
 *   Key under which the prepared settings are stored in the static cache.
 *
 * @return string
 *   The filtered text, or the unmodified text if the regular expression
 *   engine reported an error.
 *
 * @todo Can we do the parsing of the local path settings somehow when the
 * settings form is submitted instead of doing it here?
 */
function _pathologic_filter($text, $settings, $hash) {
  // Get the base URL and explode it into component parts. We add these parts
  // to the exploded local paths settings later. The trailing slash guarantees
  // that parse_url() reports a 'path' part.
  global $base_url;
  $base_url_parts = parse_url($base_url . '/');
  // Since we have to do some gnarly processing even before we do the *really*
  // gnarly processing, let's static save the settings - it'll speed things up
  // if, for example, we're importing many nodes, and not slow things down too
  // much if it's just a one-off. But since different input formats will have
  // different settings, we build an array of settings, keyed by hash.
  $cached_settings = &drupal_static(__FUNCTION__, []);
  if (!isset($cached_settings[$hash])) {
    $settings['local_paths_exploded'] = [];
    if (isset($settings['local_paths']) && $settings['local_paths'] !== '') {
      // Build an array of the exploded local paths for this format's settings.
      // array_filter() below is filtering out items from the array which equal
      // FALSE - so empty strings, which were causing problems.
      // @see http://drupal.org/node/1727492
      $local_paths = array_filter(array_map('trim', explode("\n", $settings['local_paths'])));
      foreach ($local_paths as $local) {
        $parts = parse_url($local);
        if ($parts === FALSE) {
          // Seriously malformed entry. Storing FALSE would create a "local
          // path" with neither host nor path, so skip it instead.
          continue;
        }

        // We must not add a path which is the same as the current "local"
        // path, because that one is added unconditionally further below.
        $has_path = isset($parts['path']);
        $base_has_path = isset($base_url_parts['path']);
        // TRUE only if both have a path part and the parts are identical.
        $same_path = $has_path && $base_has_path && $parts['path'] === $base_url_parts['path'];

        if (isset($parts['host'])) {
          // With a host: it differs from "this" site if the host differs, or
          // if the hosts are the same but the paths are different (at least
          // one of the two has a path, and they are not identical).
          // @see http://drupal.org/node/1875406
          $differs = $parts['host'] !== $base_url_parts['host']
            || (($has_path || $base_has_path) && !$same_path);
        }
        else {
          // Without a host: it differs if the path parts don't match (if
          // either doesn't have a path part, they can't match).
          $differs = !$same_path;
        }

        if ($differs) {
          // Add it to the list.
          $settings['local_paths_exploded'][] = $parts;
        }
      }
    }
    // Now add local paths based on "this" server URL.
    $settings['local_paths_exploded'][] = ['path' => $base_url_parts['path']];
    $settings['local_paths_exploded'][] = ['path' => $base_url_parts['path'], 'host' => $base_url_parts['host']];
    // We'll also just store the host part separately for easy access.
    $settings['base_url_host'] = $base_url_parts['host'];

    $cached_settings[$hash] = $settings;
  }
  // Take note of which settings in the settings array should apply.
  $cached_settings['current_settings'] = &$cached_settings[$hash];

  // Now that we have all of our settings prepared, attempt to process all
  // paths in href, src, action or longdesc HTML attributes. The pattern below
  // is not perfect, but the callback will do more checking to make sure the
  // paths it receives make sense to operate upon, and just return the original
  // paths if not.
  $filtered = preg_replace_callback('~ (href|src|action|longdesc)="([^"]+)~i', '_pathologic_replace', $text);
  // preg_replace_callback() returns NULL on a PCRE error; never replace the
  // whole text with nothing because of that.
  return $filtered === NULL ? $text : $filtered;
}
/**
 * Process and replace paths. preg_replace_callback() callback.
 *
 * Reads the settings which _pathologic_filter() stored under
 * 'current_settings' in its static cache.
 *
 * @param array $matches
 *   Match data for the pattern used in _pathologic_filter(): $matches[0] is
 *   the whole match (a space, the attribute name, '="' and the value, without
 *   the closing quote), $matches[1] is the attribute name and $matches[2] is
 *   the attribute value.
 *
 * @return string
 *   The replacement for $matches[0]: the original match if the path should
 *   not or cannot be altered, otherwise the attribute with the rebuilt URL.
 */
function _pathologic_replace($matches) {
  // Get the base path.
  global $base_path;

  // Get the settings for the filter. Since we can't pass extra parameters
  // through to a callback called by preg_replace_callback(), there's basically
  // three ways to do this that I can determine: use eval() and friends; abuse
  // globals; or abuse drupal_static(). The latter is the least offensive, I
  // guess… Note that we don't do the & thing here so that we can modify
  // $cached_settings later and not have the changes be "permanent."
  $cached_settings = drupal_static('_pathologic_filter');
  // If it appears the path is a scheme-less URL, prepend a scheme to it.
  // parse_url() cannot properly parse scheme-less URLs. Don't worry; if it
  // looks like Pathologic can't handle the URL, it will return the scheme-less
  // URL.
  // @see https://drupal.org/node/1617944
  // @see https://drupal.org/node/2030789
  if (strpos($matches[2], '//') === 0) {
    // Ask the request object rather than reading $_SERVER['https']: $_SERVER
    // keys are case-sensitive and web servers set 'HTTPS', so the lower-case
    // key was never present.
    $matches[2] = (\Drupal::request()->isSecure() ? 'https:' : 'http:') . $matches[2];
  }
  // Now parse the URL after reverting HTML character encoding.
  // @see http://drupal.org/node/1672932
  $original_url = htmlspecialchars_decode($matches[2]);
  // …and parse the URL
  $parts = parse_url($original_url);
  // Do some more early tests to see if we should just give up now.
  if (
    // If parse_url() failed, $parts = FALSE. If the href was just "#", $parts
    // is an empty array. Give up in both cases.
    empty($parts)
    || (
      // If there's a scheme part and it doesn't look useful, bail out.
      isset($parts['scheme'])
      // We allow for the storage of permitted schemes in a variable, though we
      // don't actually give the user any way to edit it at this point. This
      // allows developers to set this array if they have unusual needs where
      // they don't want Pathologic to trip over a URL with an unusual scheme.
      // @see http://drupal.org/node/1834308
      // Default value is ['http', 'https', 'files', 'internal']
      // "files" and "internal" are for Path Filter compatibility.
      // The (array) cast keeps a missing config value from breaking
      // in_array(); an empty whitelist rejects every scheme.
      && !in_array($parts['scheme'], (array) \Drupal::config('pathologic.settings')->get('scheme_whitelist'), TRUE)
    )
    // Bail out if it looks like there's only a fragment part.
    || (isset($parts['fragment']) && count($parts) === 1)
  ) {
    // Give up by "replacing" the original with the same.
    return $matches[0];
  }

  if (isset($parts['path'])) {
    // Undo possible URL encoding in the path.
    // @see http://drupal.org/node/1672932
    $parts['path'] = rawurldecode($parts['path']);
  }
  else {
    $parts['path'] = '';
  }

  // Check to see if we're dealing with a file.
  // @todo Should we still try to do path correction on these files too?
  if (isset($parts['scheme']) && $parts['scheme'] === 'files') {
    // Path Filter "files:" support. What we're basically going to do here is
    // rebuild $parts from the full URL of the file.
    $file_uri = \Drupal::config('system.file')->get('default_scheme') . '://' . $parts['path'];
    // file_create_url() was removed from newer Drupal core versions; use the
    // file URL generator service where it exists and fall back otherwise.
    if (\Drupal::hasService('file_url_generator')) {
      $file_url = \Drupal::service('file_url_generator')->generateAbsoluteString($file_uri);
    }
    else {
      $file_url = file_create_url($file_uri);
    }
    $new_parts = parse_url($file_url);
    if (!is_array($new_parts)) {
      // The generated file URL could not be parsed; leave the original alone.
      return $matches[0];
    }
    // If there were query parts from the original parsing, copy them over.
    if (!empty($parts['query'])) {
      $new_parts['query'] = $parts['query'];
    }
    $new_parts['path'] = isset($new_parts['path']) ? rawurldecode($new_parts['path']) : '';
    $parts = $new_parts;
    // Don't do language handling for file paths.
    $cached_settings['is_file'] = TRUE;
  }
  else {
    $cached_settings['is_file'] = FALSE;
  }

  // Let's also bail out of this doesn't look like a local path.
  $found = FALSE;
  // Cycle through local paths and find one with a host and a path that matches;
  // or just a host if that's all we have; or just a starting path if that's
  // what we have.
  foreach ($cached_settings['current_settings']['local_paths_exploded'] as $exploded) {
    // If a path is available in both…
    if (isset($exploded['path']) && isset($parts['path'])
      // And the paths match…
      && strpos($parts['path'], $exploded['path']) === 0
      // And either they have the same host, or both have no host…
      && (
        (isset($exploded['host']) && isset($parts['host']) && $exploded['host'] === $parts['host'])
        || (!isset($exploded['host']) && !isset($parts['host']))
      )
    ) {
      // Remove the shared path from the path. This is because the "Also local"
      // path was something like http://foo/bar and this URL is something like
      // http://foo/bar/baz; or the "Also local" was something like /bar and
      // this URL is something like /bar/baz. And we only care about the /baz
      // part.
      // strpos() above matched the prefix byte-for-byte at offset 0, so
      // cutting off strlen() bytes removes exactly that prefix. This also
      // avoids the Unicode::substr()/Unicode::strlen() helpers, which are no
      // longer available in newer Drupal core versions.
      $parts['path'] = (string) substr($parts['path'], strlen($exploded['path']));
      $found = TRUE;
      // Break out of the foreach loop
      break;
    }
    // Okay, we didn't match on path alone, or host and path together. Can we
    // match on just host? Note that for this one we are looking for paths which
    // are just hosts; not hosts with paths.
    elseif ((isset($parts['host']) && !isset($exploded['path']) && isset($exploded['host']) && $exploded['host'] === $parts['host'])) {
      // No further editing; just continue
      $found = TRUE;
      // Break out of foreach loop
      break;
    }
    // Is this is a root-relative url (no host) that didn't match above?
    // Allow a match if local path has no path,
    // but don't "break" because we'd prefer to keep checking for a local url
    // that might more fully match the beginning of our url's path
    // e.g.: if our url is /foo/bar we'll mark this as a match for
    // http://example.com but want to keep searching and would prefer a match
    // to http://example.com/foo if that's configured as a local path
    elseif (!isset($parts['host']) && (!isset($exploded['path']) || $exploded['path'] === $base_path)) {
      $found = TRUE;
    }
  }

  // If the path is not within the drupal root return original url, unchanged
  if (!$found) {
    return $matches[0];
  }

  // Okay, format the URL.
  // If there's still a slash lingering at the start of the path, chop it off.
  $parts['path'] = ltrim($parts['path'], '/');

  // Examine the query part of the URL. Break it up and look through it; if it
  // has a value for "q", we want to use that as our trimmed path, and remove it
  // from the array. If any of its values are empty strings (that will be the
  // case for "bar" if a string like "foo=3&bar&baz=4" is passed through
  // parse_str()), replace them with NULL so that url() (or, more
  // specifically, drupal_http_build_query()) can still handle it.
  if (isset($parts['query'])) {
    parse_str($parts['query'], $parts['qparts']);
    foreach ($parts['qparts'] as $key => $value) {
      if ($value === '') {
        $parts['qparts'][$key] = NULL;
      }
      // parse_str() yields an array for "q[]=…"; only a plain string can be
      // used as a path.
      elseif ($key === 'q' && is_string($value)) {
        $parts['path'] = $value;
        unset($parts['qparts']['q']);
      }
    }
  }
  else {
    $parts['qparts'] = NULL;
  }

  // If we don't have a path yet, bail out.
  if (!isset($parts['path'])) {
    return $matches[0];
  }

  // If this looks like a D8-style unclean URL, crop off the "index.php/" from
  // the beginning. 10 is the length of "index.php/".
  if (strpos($parts['path'], 'index.php/') === 0) {
    $parts['path'] = substr($parts['path'], 10);
  }

  // If we didn't previously identify this as a file, check to see if the file
  // exists now that we have the correct path relative to DRUPAL_ROOT
  if (!$cached_settings['is_file']) {
    $cached_settings['is_file'] = !empty($parts['path']) && is_file(DRUPAL_ROOT . '/' . $parts['path']);
  }

  // Okay, deal with language stuff.
  // Let's see if we can split off a language prefix from the path.
  if (\Drupal::moduleHandler()->moduleExists('language')) {
    // This logic is based on
    // \Drupal\language\Plugin\LanguageNegotiation\LanguageNegotiationUrl::getLangcode().
    $languages = \Drupal::languageManager()->getLanguages();
    $config = \Drupal::config('language.negotiation')->get('url');

    $request_path = urldecode(trim($parts['path'], '/'));
    $path_args = explode('/', $request_path);
    $prefix = array_shift($path_args);

    // Search for prefix within added languages.
    foreach ($languages as $language) {
      if (isset($config['prefixes'][$language->getId()]) && $config['prefixes'][$language->getId()] == $prefix) {
        $parts['path'] = implode('/', $path_args);
        $parts['language_obj'] = $language;
        break;
      }
    }
  }

  // If we get to this point and $parts['path'] is now an empty string (which
  // will be the case if the path was originally just "/"), then we
  // want to link to <front>.
  if ($parts['path'] === '') {
    $parts['path'] = '<front>';
  }
  // Build the parameters we will send to url()
  $url_params = [
    'path' => $parts['path'],
    'options' => [
      'query' => $parts['qparts'],
      'fragment' => isset($parts['fragment']) ? $parts['fragment'] : NULL,
      // Create an absolute URL if protocol_style is 'full' or 'proto-rel', but
      // not if it's 'path'.
      'absolute' => $cached_settings['current_settings']['protocol_style'] !== 'path',
      // If we seem to have found a language for the path, pass it along to
      // url(). Otherwise, ignore the 'language' parameter.
      'language' => isset($parts['language_obj']) ? $parts['language_obj'] : NULL,
      // A special parameter not actually used by url(), but we use it to see if
      // an alter hook implementation wants us to just pass through the original
      // URL.
      'use_original' => FALSE,
    ],
  ];

  // Add the original URL to the parts array
  $parts['original'] = $original_url;

  // Now alter!
  // @see http://drupal.org/node/1762022
  \Drupal::moduleHandler()->alter('pathologic', $url_params, $parts, $cached_settings);

  // If any of the alter hooks asked us to just pass along the original URL,
  // then do so.
  if ($url_params['options']['use_original']) {
    return $matches[0];
  }

  // Now to build the URL. Drumroll, please… The Url class is referenced by its
  // fully-qualified name so this does not depend on a file-level import.
  if ($parts['path'] === '<front>') {
    $url = \Drupal\Core\Url::fromRoute('<front>', [], $url_params['options'])->toString();
  }
  else {
    try {
      $url = \Drupal\Core\Url::fromUri('base://' . $url_params['path'], $url_params['options'])
        ->toString();
    }
    catch (\Exception $e) {
      // In case of an error, e.g. completely invalid URL, return it unchanged.
      return $matches[0];
    }
  }

  // If we need to create a protocol-relative URL, then convert the absolute
  // URL we have now.
  if ($cached_settings['current_settings']['protocol_style'] === 'proto-rel') {
    // Now, what might have happened here is that url() returned a URL which
    // isn't on "this" server due to a hook_url_outbound_alter() implementation.
    // We don't want to convert the URL in that case. So we only alter the URL
    // if its host part matches the host of "this" server.
    $url_parts = parse_url($url);
    if (!empty($url_parts['host']) && $url_parts['host'] === $cached_settings['current_settings']['base_url_host']) {
      $url = _pathologic_url_to_protocol_relative($url);
    }
  }

  // Apply HTML character encoding, as is required for HTML attributes.
  // @see http://drupal.org/node/1672932
  $url = Html::escape($url);
  // $matches[1] will be the tag attribute; src, href, etc.
  return " {$matches[1]}=\"{$url}";
}
/**
 * Turn an absolute http:// or https:// URL into a protocol-relative one.
 *
 * Drupal core's URL generation does not produce protocol-relative URLs, so the
 * caller builds a full URL first and passes it through this function to strip
 * the protocol off again.
 *
 * This is tiny, but it lives in its own function so that test code can call
 * it on its own.
 *
 * @param string $url
 *   The URL to convert.
 *
 * @return string
 *   The URL with a leading "http://" or "https://" replaced by "//". URLs
 *   which do not start with either of those are returned unchanged.
 */
function _pathologic_url_to_protocol_relative($url) {
  // Anchored at the start of the string; the "s" is optional so that both
  // protocols are covered. The match is case-sensitive.
  $leading_protocol = '~^https?://~';
  $protocol_relative = preg_replace($leading_protocol, '//', $url);
  return $protocol_relative;
}