5 * This module periodically checks links in given node types, blocks, etc.
7 * Developed by Alexander Hass, http://www.yaml-for-drupal.com/.
10 use Drupal\Core\Form\FormStateInterface;
11 use Drupal\Core\Routing\RouteMatchInterface;
12 use Drupal\Core\Session\UserSession;
13 use Drupal\node\NodeTypeInterface;
14 use GuzzleHttp\Exception\RequestException;
18 * Defines the maximum limit of links collected in one chunk if content is
19 * scanned for links. A value that is too high may overload the database server.
// NOTE(review): the limit is defined as the string '100', not the integer 100;
// consumers presumably rely on numeric coercion — confirm before using it in
// strict comparisons.
21 define('LINKCHECKER_SCAN_MAX_LINKS_PER_RUN', '100');
24 * A list of domain names reserved for use in documentation and not available
25 * for registration. See RFC 2606, Section 3 for more information.
// The value is a single string holding one domain per line (separated by "\n").
27 define('LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS', "example.com\nexample.net\nexample.org");
30 * A list of blacklisted filters the modules do not need to run for the link
31 * extraction process. These filters only eat processing time or hold references
34 * - Align images, http://drupal.org/project/drupal
36 * - Line break converter, http://drupal.org/project/drupal
38 * - Caption images, http://drupal.org/project/drupal
39 * name: filter_caption
40 * - Insert block, http://drupal.org/project/insert_block
42 * tags: [block:name of module=delta of block]
43 * - Insert view filter, http://drupal.org/project/insert_view
45 * tags: [view:my_view]
46 * - Smiley filter, http://drupal.org/project/smiley
48 * tags: Depends on icon set, for e.g: ":) :-) :smile:"
49 * - Web Links Embed, http://drupal.org/project/weblinks
50 * name: weblinks_embed
51 * tags: [links-embed: id], [links-embed: name]
52 * - Web Links Filter, http://drupal.org/project/weblinks
53 * name: weblinks_filter
57 * - Smileys Filter, http://drupal.org/project/smileys
59 * tags: Depends on icon set, for e.g: ":) :-) :smile:"
60 * - Insert node, http://drupal.org/project/InsertNode
62 * tags: [node:<name of node> <parameters>]
63 * - Weblink filter, http://drupal.org/project/links
64 * name: links_weblink/0
65 * tags: [weblink:node_id|text], [weblink:node_id/link_id], [weblink:http://weblink.example.com/]
// The value is a single string of filter names separated by "|".
// NOTE(review): the "Insert node" and "Weblink filter" entries listed in the
// comment above have no counterpart in this default value — confirm whether
// that is intentional.
67 define('LINKCHECKER_DEFAULT_FILTER_BLACKLIST', 'filter_align|filter_autop|filter_caption|insert_block|insert_view|smiley|smileys|weblinks_embed|weblinks_filter');
/**
 * Implements hook_help().
 */
function linkchecker_help($route_name, RouteMatchInterface $route_match) {
  // Only the module's own help page is handled here; any other route yields
  // no help text.
  if ($route_name != 'help.page.linkchecker') {
    return NULL;
  }

  $help_text = t('This module provides an aid to finding broken links on your site. It periodically checks contents of all public nodes, tries to find any html links and check for their validity. It reports broken links through the admin interface. For more information about status codes see <a href="@rfc">Status Code Definitions</a>.', ['@rfc' => 'http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html']);

  return '<p>' . $help_text . '</p>';
}
/**
 * Conditionally logs a system message.
 *
 * The message is handed to watchdog() only when its severity value is less
 * than or equal to the threshold stored in the 'linkchecker_log_level'
 * variable (WATCHDOG_INFO when the variable is not set).
 *
 * @param string $type
 *   The category to which this message belongs. Can be any string, but the
 *   general practice is to use the name of the module calling watchdog().
 * @param string $message
 *   The message to store in the log. Keep $message translatable by not
 *   concatenating dynamic values into it; pass dynamic values through
 *   $variables as placeholders instead. See t() for how $message and
 *   $variables interact.
 * @param array $variables
 *   Array of variables to replace in the message on display, or NULL if the
 *   message is already translated or cannot be translated.
 * @param int $severity
 *   The severity of the message, one of the WATCHDOG_* constants as defined in
 *   @link http://www.faqs.org/rfcs/rfc3164.html RFC 3164: @endlink
 *   WATCHDOG_EMERGENCY, WATCHDOG_ALERT, WATCHDOG_CRITICAL, WATCHDOG_ERROR,
 *   WATCHDOG_WARNING, WATCHDOG_NOTICE (default), WATCHDOG_INFO or
 *   WATCHDOG_DEBUG.
 * @param string $link
 *   A link to associate with the message.
 *
 * @see watchdog_severity_levels()
 */
function linkchecker_watchdog_log($type, $message, $variables = array(), $severity = WATCHDOG_NOTICE, $link = NULL) {
  $log_level_threshold = variable_get('linkchecker_log_level', WATCHDOG_INFO);
  $is_loggable = $severity <= $log_level_threshold;

  if ($is_loggable) {
    watchdog($type, $message, $variables, $severity, $link);
  }
}
/**
 * Access callback for user/%user/linkchecker.
 *
 * @param object $account
 *   The account whose broken links report is requested; its uid property is
 *   compared against the current user.
 *
 * @return bool
 *   TRUE if the current user may view the broken links report of $account.
 */
function _linkchecker_user_access_account_broken_links_report($account) {
  global $user;

  // Reports only exist for authenticated accounts (non-zero uid).
  if (!$account->uid) {
    return FALSE;
  }

  // Users with 'access own broken links report' permission can only view their
  // own report.
  $may_view_own_report = $user->uid == $account->uid && user_access('access own broken links report');
  if ($may_view_own_report) {
    return TRUE;
  }

  // Users with the 'access broken links report' permission can view the report
  // for any authenticated user.
  return (bool) user_access('access broken links report');
}
/**
 * Access callback for linkchecker/%linkchecker_link/edit.
 *
 * @param object $link
 *   An object representing the link to check.
 *
 * @return bool
 *   TRUE if the current user has the 'edit link settings' permission and is
 *   allowed to view the link.
 */
function _linkchecker_user_access_edit_link_settings($link) {
  // The permission is checked first; link visibility is only evaluated for
  // users that hold it.
  if (!user_access('edit link settings')) {
    return FALSE;
  }

  return (bool) _linkchecker_link_access($link);
}
/**
 * Determines if the current user has access to view a link.
 *
 * Link URLs can contain private information (for example, usernames and
 * passwords). So this module should only display links to a user if the link
 * already appears in at least one place on the site where the user would
 * otherwise have access to see it.
 *
 * @param object $link
 *   An object representing the link to check. Arrays are cast to objects.
 *
 * @return bool
 *   TRUE if at least one node, comment or custom block lookup returns a
 *   non-empty result for the link, FALSE otherwise.
 */
function _linkchecker_link_access($link) {
  $link = (object) $link;

  // Lookups run in this order and stop at the first one that finds the link.
  $id_lookups = array(
    '_linkchecker_link_node_ids',
    '_linkchecker_link_comment_ids',
    '_linkchecker_link_block_ids',
  );
  foreach ($id_lookups as $id_lookup) {
    if ($id_lookup($link)) {
      return TRUE;
    }
  }

  return FALSE;
}
167 * Returns IDs of nodes that contain a link which the current user may be allowed to view.
169 * Important note: For performance reasons, this function is not always
170 * guaranteed to return the exact list of node IDs that the current user is
171 * allowed to view. It will, however, always return an empty array if the user
172 * does not have access to view *any* such nodes, thereby meeting the security
173 * goals of _linkchecker_link_access() and other places that call it.
175 * In the case where a user has access to some of the nodes that contain the
176 * link, this function may return some node IDs that the user does not have
177 * access to. Therefore, use caution with its results.
179 * @param object $link
180 * An object representing the link to check.
181 * @param object $node_author_account
182 * (optional) If a user account object is provided, the returned nodes will
183 * additionally be restricted to only those owned by this account. Otherwise,
184 * nodes owned by any user account may be returned.
187 * An array of node IDs that contain the provided link and that the current
188 * user may be allowed to view.
// Collects the IDs of nodes that reference $link and that the current user may
// be allowed to view. Link extraction results are memoized per node ID in the
// static $fields_with_node_links array.
// NOTE(review): several short lines of this function (closing braces and brief
// statements such as early returns) are not visible in this extract; the
// comments below describe only the statements that are shown.
190 function _linkchecker_link_node_ids($link, $node_author_account = NULL) {
191 static $fields_with_node_links = array();
193 // Exit if all node types are disabled or if the user cannot access content,
194 // there is no need to check further.
195 $linkchecker_scan_nodetypes = linkchecker_scan_node_types();
196 if (empty($linkchecker_scan_nodetypes) || !user_access('access content')) {
200 // Get a list of nodes containing the link, using addTag('node_access') to
201 // allow node access modules to exclude nodes that the current user does not
202 // have access to view.
203 if (!empty($node_author_account)) {
// Author-restricted query: a node matches when either the node row or the
// revision joined on n.vid belongs to the given account.
204 $query = db_select('node', 'n');
205 $query->addTag('node_access');
206 $query->innerJoin('linkchecker_node', 'ln', 'ln.nid = n.nid');
207 $query->innerJoin('node_revision', 'r', 'r.vid = n.vid');
208 $query->condition('ln.lid', $link->lid);
209 $query->condition(db_or()
210 ->condition('n.uid', $node_author_account->uid)
211 ->condition('r.uid', $node_author_account->uid)
213 $query->fields('n', array('nid'));
// Same query without the revision join and author conditions; presumably the
// else branch of the check above.
216 $query = db_select('node', 'n');
217 $query->addTag('node_access');
218 $query->innerJoin('linkchecker_node', 'ln', 'ln.nid = n.nid');
219 $query->condition('ln.lid', $link->lid);
220 $query->fields('n', array('nid'));
222 $nodes = $query->execute();
224 // Check if the current user has access to view the link in each node.
225 // However, for performance reasons, as soon as we find one node where that
226 // is the case, stop checking and return the remainder of the list.
228 $access_allowed = FALSE;
229 foreach ($nodes as $node) {
// Once $access_allowed has been set, node IDs are appended without running
// the per-node checks below.
230 if ($access_allowed) {
231 $nids[] = $node->nid;
234 $node = node_load($node->nid);
236 // We must check whether the link is currently part of the node; if not, we
237 // do not want to return it (and it is not safe to, since we cannot know if
238 // it contained access restrictions for the current user at the point which
239 // it was originally extracted by the Link checker module).
// Links are extracted at most once per node ID and kept in the static array.
240 if (!isset($fields_with_node_links[$node->nid])) {
241 $fields_with_node_links[$node->nid] = _linkchecker_extract_node_links($node, TRUE);
// NOTE(review): the statement executed when the link is absent from the node
// is not visible here — presumably the node is skipped.
243 if (empty($fields_with_node_links[$node->nid][$link->url])) {
246 // If the link appears in fields and a field access module is being used,
247 // we must check that the current user has access to view at least one field
248 // that contains the link; if they don't, we should not return the node.
249 $fields = $fields_with_node_links[$node->nid][$link->url];
// The per-field check runs only when module_implements('field_access')
// returns a non-empty result.
250 if (module_implements('field_access')) {
251 $fields_with_access = array();
253 $bundle_instances = field_info_instances('node', $node->type);
254 foreach ($bundle_instances as $field_name => $field_instance) {
255 $field = field_info_field($field_name);
257 // Field types supported by linkchecker.
// NOTE(review): the entries of $fields_supported are not visible in this
// extract.
258 $fields_supported = array(
265 // Only check link and text fields, since those are the only types we
266 // extract links from.
267 if (in_array($field['type'], $fields_supported) && field_access('view', $field, 'node', $node)) {
268 $fields_with_access[] = $field['field_name'];
// No overlap between the fields containing the link and the viewable fields.
271 if (!array_intersect($fields, $fields_with_access)) {
// The node passed the visible checks: record it and switch to the
// append-only path for the remaining nodes.
275 $nids[] = $node->nid;
276 $access_allowed = TRUE;
/**
 * Returns IDs of comments that contain a link which the current user is allowed to view.
 *
 * @param object $link
 *   An object representing the link to check.
 * @param object $comment_author_account
 *   (optional) If a user account object is provided, the returned comments
 *   will additionally be restricted to only those owned by this account.
 *   Otherwise, comments owned by any user account may be returned.
 *
 * @return array
 *   An array of comment IDs that contain the provided link and that the
 *   current user is allowed to view.
 */
function _linkchecker_link_comment_ids($link, $comment_author_account = NULL) {
  // Nothing to look up when no comment types are scanned or the user cannot
  // access comments at all.
  $scanned_comment_types = linkchecker_scan_comment_types();
  if (empty($scanned_comment_types) || !user_access('access comments')) {
    return array();
  }

  // Select the comments containing the link. The 'node_access' tag allows
  // comment access modules to exclude comments that the current user does not
  // have access to view.
  $select = db_select('comment', 'c');
  $select->addMetaData('base_table', 'comment');
  $select->addTag('node_access');
  $select->innerJoin('linkchecker_comment', 'lc', 'lc.cid = c.cid');
  $select->condition('lc.lid', $link->lid);

  // Optionally narrow the result to comments owned by the given account.
  if (!empty($comment_author_account)) {
    $select->condition('c.uid', $comment_author_account->uid);
  }

  $select->fields('c', array('cid'));

  return $select->execute()->fetchCol();
}
331 * Returns IDs of blocks that contain a link which the current user is allowed to view.
333 * @param object $link
334 * An object representing the link to check.
337 * An array of custom block IDs that contain the provided link and that the
338 * current user is allowed to view.
// Looks up the custom block IDs that contain $link and, unless the current
// user can administer blocks, intersects them with the block deltas selected
// by the role-based query below.
// NOTE(review): several short lines of this function (closing braces, early
// returns, part of the OR condition group) are not visible in this extract;
// the comments below describe only the statements that are shown.
340 function _linkchecker_link_block_ids($link) {
341 // Exit if blocks are disabled.
342 if (!variable_get('linkchecker_scan_blocks', 0)) {
346 // Get the initial list of block IDs.
347 $bids = db_query('SELECT bid FROM {linkchecker_block_custom} WHERE lid = :lid', array(':lid' => $link->lid))->fetchCol();
349 // If the user can administer blocks, they're able to see all block content.
350 if (user_access('administer blocks')) {
354 // Otherwise, only return blocks that this user (or anonymous users) have
// Role IDs to match: every role of the current user plus the anonymous role.
// NOTE(review): $user has no visible declaration here — presumably brought
// into scope via `global $user;` on a line not shown.
357 $rids = array_keys($user->roles);
358 $rids[] = DRUPAL_ANONYMOUS_RID;
// Rows of {block} owned by the 'block' module are LEFT JOINed to {block_role}
// on module and delta.
360 $query = db_select('block', 'b');
361 $query->leftJoin('block_role', 'r', 'b.module = r.module AND b.delta = r.delta');
362 $query->condition('b.module', 'block');
363 $query->condition(db_or()
364 ->condition('r.rid', $rids, 'IN')
// NOTE(review): any further members of this OR group are not visible in this
// extract.
367 $query->fields('b', array('delta'));
369 $allowed_bids = $query->execute()->fetchCol();
// Keep only the link's block IDs that also appear among the selected deltas.
371 return array_intersect($bids, $allowed_bids);
/**
 * Implements hook_cron().
 *
 * Cleans up unused links at most once per day and then triggers the link
 * checks, either as an HTTPRL background callback or directly in this process.
 */
function linkchecker_cron() {
  // Remove outdated links no longer in use once per day (86400 seconds).
  $state = \Drupal::state();
  $seconds_since_cleanup = REQUEST_TIME - $state->get('linkchecker.cleanup_links_last');
  if ($seconds_since_cleanup >= 86400) {
    _linkchecker_cleanup_links();
    $state->set('linkchecker.cleanup_links_last', REQUEST_TIME);
  }

  $use_httprl = \Drupal::moduleHandler()->moduleExists('httprl') && \Drupal::config('linkchecker.settings')->get('check.library') == 'httprl';

  if (!$use_httprl) {
    // Run the link checks the normal way.
    _linkchecker_check_links();
    return;
  }

  // Run link checker in a new process, independent of cron: queue
  // _linkchecker_check_links() as a background callback and send the request.
  // _linkchecker_check_links() is deliberately not called in this process.
  $background_callback = array(array('function' => '_linkchecker_check_links'));
  httprl_queue_background_callback($background_callback);
  httprl_send_request();
}
/**
 * Runs the link checks for links that are due for a (re-)check.
 *
 * Only one process may run this function at a time; a lock named after the
 * function guards the run. Links are requested either through HTTPRL (queued
 * and sent in bulk, with _linkchecker_status_handling() as callback) or one by
 * one through the core HTTP client.
 *
 * @return bool
 *   FALSE if the lock could not be acquired because a check is already
 *   running, TRUE once this run has finished and the lock has been released.
 */
function _linkchecker_check_links() {
  $config = \Drupal::config('linkchecker.settings');

  // Get max_execution_time from configuration, override 0 with 240 seconds.
  $max_execution_time = ini_get('max_execution_time') == 0 ? 240 : ini_get('max_execution_time');
  // Make sure we have enough time to validate all of the links.
  drupal_set_time_limit($max_execution_time);

  // Make sure this is the only process trying to run this function.
  // acquire() returns TRUE when the lock was obtained, so only bail out when
  // it could NOT be acquired.
  $lock = \Drupal::lock();
  if (!$lock->acquire(__FUNCTION__, $max_execution_time)) {
    linkchecker_watchdog_log('linkchecker', 'Attempted to re-run link checks while they are already running.', array(), WATCHDOG_WARNING);
    return FALSE;
  }

  $has_httprl = (\Drupal::moduleHandler()->moduleExists('httprl') && $config->get('check.library') == 'httprl');

  // Do not confuse admins with a setting of maximum checkable links per cron
  // run and guess that 2 links can be checked per second with 1 thread, what is
  // nevertheless uncommon. The max_execution_time can be used to calculate
  // a useful value that is higher, but not totally out of scope and limits the
  // query result set to a reasonable size.
  $linkchecker_check_connections_max = $config->get('check.connections_max');
  $check_links_max_per_cron_run = ($has_httprl) ? ($linkchecker_check_connections_max * $max_execution_time) : $max_execution_time;

  $linkchecker_check_links_interval = $config->get('check.interval');
  $linkchecker_check_useragent = $config->get('check.useragent');

  // Connection limit can be overridden via settings.php. Two connections is the
  // limit defined in RFC http://www.ietf.org/rfc/rfc2616.txt. Modern browsers
  // are typically using 6-8 connections and no more. Never use more and keep
  // in mind that you can overload other people servers.
  $linkchecker_check_domain_connections = $config->get('check.connections_max_per_domain');

  // Get URLs for checking.
  $links = db_query_range('SELECT * FROM {linkchecker_link} WHERE last_checked < :last_checked AND status = :status ORDER BY last_checked, lid ASC', 0, $check_links_max_per_cron_run, [':last_checked' => REQUEST_TIME - $linkchecker_check_links_interval, ':status' => 1]);
  $links_remaining = $links->rowCount();

  foreach ($links as $link) {
    $headers = array();
    $headers['User-Agent'] = $linkchecker_check_useragent;

    $uri = @parse_url($link->url);

    // URL contains a fragment.
    if (in_array($link->method, ['HEAD', 'GET']) && !empty($uri['fragment'])) {
      // We need the full content and not only the HEAD.
      $link->method = 'GET';
      // Request text content only (like Firefox/Chrome).
      $headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
    }
    elseif ($link->method == 'GET') {
      // Range: Only request the first 1024 bytes from remote server. This is
      // required to prevent timeouts on URLs that are large downloads.
      $headers['Range'] = 'bytes=0-1024';
    }

    // Add in the headers.
    $options = array(
      'headers' => $headers,
      'method' => $link->method,
      'max_redirects' => 0,
    );

    if ($has_httprl) {
      // Define the callback and add the $link object to it.
      // Notes:
      // - 'global_timeout' does not require a timer_read('page'), as this job
      //   runs in a new process, independent of cron.
      $options += array(
        'global_connections' => $linkchecker_check_connections_max,
        'global_timeout' => $max_execution_time - 30,
        'domain_connections' => $linkchecker_check_domain_connections,
        'callback' => array(
          array(
            'function' => '_linkchecker_status_handling',
          ),
          // $link needs to be passed here, or it is not sent back to
          // _linkchecker_status_handling().
          $link,
        ),
      );
      // Queue up the requests.
      httprl_request($link->url, $options);
      $links_remaining--;

      // After all links are queued, run the url checks.
      if ($links_remaining == 0) {
        httprl_send_request();
      }
    }
    else {
      // Drupal core.
      try {
        // @fixme: Object is totally different in D8.
        $response = \Drupal::httpClient()->request($link->method, $link->url, $options);

        _linkchecker_status_handling($response, $link);

        if ((timer_read('page') / 1000) > ($max_execution_time / 2)) {
          // Stop once we have used over half of the maximum execution time.
          break;
        }
      }
      catch (RequestException $exception) {
        watchdog_exception('linkchecker', $exception);
      }
    }
  }

  // Release the lock.
  $lock->release(__FUNCTION__);
  linkchecker_watchdog_log('linkchecker', 'Link checks completed.', array(), WATCHDOG_INFO);
  linkchecker_watchdog_log('linkchecker', 'Memory usage: @memory_get_usage, Peak memory usage: @memory_get_peak_usage.', array('@memory_get_peak_usage' => format_size(memory_get_peak_usage()), '@memory_get_usage' => format_size(memory_get_usage())), WATCHDOG_DEBUG);

  return TRUE;
}
/**
 * Status code handling.
 *
 * Stores the outcome of a link check in {linkchecker_link}, logs it, and, when
 * configured, auto-repairs permanently moved links (301) in nodes, comments
 * and custom blocks or unpublishes nodes with broken links (404).
 *
 * @param object $response
 *   An object containing the HTTP request headers, response code, headers,
 *   data and redirect status. It is taken by reference and replaced with an
 *   empty object at the end to free memory.
 * @param object $link
 *   An object containing the url, lid and fail_count.
 */
function _linkchecker_status_handling(&$response, $link) {
  $config = \Drupal::config('linkchecker.settings');
  $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));

  // - Prevent E_ALL warnings in DB updates for non-existing $response->error.
  // - @todo drupal_http_request() may not provide an UTF8 encoded error message
  //   what results in a database UPDATE failure. For more information, see
  //   http://drupal.org/node/371495.
  //   Workaround: ISO-8859-1 as source encoding may be wrong, but WFM.
  if (!isset($response->error)) {
    $response->error = '';
  }
  if (!isset($response->status_message)) {
    $response->status_message = '';
  }
  $response->error = trim(drupal_convert_to_utf8($response->error, 'ISO-8859-1'));
  $response->status_message = trim(drupal_convert_to_utf8($response->status_message, 'ISO-8859-1'));

  // Destination anchors in HTML documents may be specified either by:
  // - the A element (naming it with the name attribute)
  // - or by any other element (naming with the id attribute)
  // - and must not contain a key/value pair as these type of hash fragments are
  //   typically used by AJAX applications to prevent additionally HTTP requests
  //   e.g. http://www.example.com/ajax.html#key1=value1&key2=value2
  // - and must not contain '/' or ',' as this are not normal anchors.
  // - and '#top' is a reserved fragment that must not exist in a page.
  // See http://www.w3.org/TR/html401/struct/links.html
  if ($response->code == 200
    && !empty($response->data)
    && !empty($response->headers['content-type'])
    && !empty($response->uri['fragment'])
    && preg_match('/=|\/|,/', $response->uri['fragment']) == FALSE
    && !in_array($response->uri['fragment'], array('#top'))
    && in_array($response->headers['content-type'], array('text/html', 'application/xhtml+xml', 'application/xml'))
    && !preg_match('/(\s[^>]*(name|id)(\s+)?=(\s+)?["\'])(' . preg_quote($response->uri['fragment'], '/') . ')(["\'][^>]*>)/i', $response->data)
    ) {
    // Override status code 200 with status code 404 so it can be handled with
    // default status code 404 logic and custom error text.
    $response->code = 404;
    $response->status_message = $response->error = 'URL fragment identifier not found in content';
  }

  switch ($response->code) {
    case -4: // HTTPRL: httprl_send_request timed out.
      // Skip these and try them again next cron run.
      break;

    case -2: // HTTPRL: maximum allowed redirects exhausted.
    case 301:
      // Remote site sent status code 301 and the link needs an update.
      db_update('linkchecker_link')
        ->condition('lid', $link->lid)
        ->fields(array(
          'code' => $response->redirect_code,
          'error' => $response->status_message,
          'fail_count' => 0,
          'last_checked' => time(),
        ))
        ->expression('fail_count', 'fail_count + 1')
        ->execute();

      // A HTTP status code of 301 tells us an existing link have changed to
      // a new link. The remote site owner was so kind to provide us the new
      // link and if we trust this change we are able to replace the old link
      // with the new one without any hand work.
      $auto_repair_301 = variable_get('linkchecker_action_status_code_301', 0);
      if ($auto_repair_301 && $auto_repair_301 <= ($link->fail_count + 1) && valid_url($response->redirect_url, TRUE)) {
        // Switch anonymous user to an admin. The session needs the numeric
        // user ID, not the loaded account; fall back to 0 when the configured
        // account cannot be loaded.
        $impersonate_account = user_load_by_name($config->get('error.impersonate_account'));
        $accountSwitcher = Drupal::service('account_switcher');
        $accountSwitcher->switchTo(new UserSession(array('uid' => $impersonate_account ? $impersonate_account->id() : 0)));

        // NODES: Autorepair all nodes having this outdated link.
        $result = db_query('SELECT nid FROM {linkchecker_node} WHERE lid = :lid', array(':lid' => $link->lid));
        foreach ($result as $row) {
          // Explicitly don't use node_load_multiple() or the module may run
          // into issues like http://drupal.org/node/1210606. With this logic
          // nodes can be updated until an out of memory occurs and further
          // updates will be made on the remaining nodes only.
          $node = node_load($row->nid);

          // Has the node object loaded successfully?
          if (is_object($node)) {
            $node_original = clone $node;
            $node = _linkchecker_replace_fields('node', $node->type, $node, $link->url, $response->redirect_url);

            if ($node_original != $node) {
              // Always use the default revision setting. For more information,
              // see node_object_prepare().
              $node_options = variable_get('node_options_' . $node->type, array('status', 'promote'));
              $node->revision = in_array('revision', $node_options);

              // Generate a log message for the node_revisions table, visible on
              // the node's revisions tab.
              $node->log = t('Changed permanently moved link in %node from %src to %dst.', array('%node' => url('node/' . $node->nid), '%src' => $link->url, '%dst' => $response->redirect_url));

              // Save changed node and update the node link list.
              node_save($node);
              linkchecker_watchdog_log('linkchecker', 'Changed permanently moved link in %node from %src to %dst.', array('%node' => url('node/' . $node->nid), '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
            }
            else {
              linkchecker_watchdog_log('linkchecker', 'Link update in node failed. Permanently moved link %src not found in node %node. Manual fix required.', array('%node' => url('node/' . $row->nid), '%src' => $link->url), WATCHDOG_WARNING);
            }
          }
          else {
            linkchecker_watchdog_log('linkchecker', 'Loading node %node for update failed. Manual fix required.', array('%node' => $row->nid), WATCHDOG_ERROR);
          }
        }

        // COMMENTS: Autorepair all comments having this outdated link.
        $result = db_query('SELECT cid FROM {linkchecker_comment} WHERE lid = :lid', array(':lid' => $link->lid));
        foreach ($result as $row) {
          // Explicitly don't use comment_load_multiple() or the module may run
          // into issues like http://drupal.org/node/1210606. With this logic
          // comment can be updated until an out of memory occurs and further
          // updates will be made on the remaining comments only.
          $comment = comment_load($row->cid);

          // Has the comment object loaded successfully?
          if (is_object($comment)) {
            $comment_original = clone $comment;

            // Replace links in subject.
            _linkchecker_link_replace($comment->subject, $link->url, $response->redirect_url);

            // Replace links in fields.
            $comment = _linkchecker_replace_fields('comment', $comment->node_type, $comment, $link->url, $response->redirect_url);

            // Save changed comment and update the comment link list.
            if ($comment_original != $comment) {
              comment_save($comment);
              linkchecker_watchdog_log('linkchecker', 'Changed permanently moved link in comment %comment from %src to %dst.', array('%comment' => $comment->cid, '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
            }
            else {
              linkchecker_watchdog_log('linkchecker', 'Link update in comment failed. Permanently moved link %src not found in comment %comment. Manual fix required.', array('%comment' => $comment->cid, '%src' => $link->url), WATCHDOG_WARNING);
            }
          }
          else {
            // $comment is not an object here, so report the ID from the query
            // row instead of dereferencing the failed load result.
            linkchecker_watchdog_log('linkchecker', 'Loading comment %comment for update failed. Manual fix required.', array('%comment' => $row->cid), WATCHDOG_ERROR);
          }
        }

        // CUSTOM BLOCKS: Autorepair all custom blocks having this outdated
        // link.
        $result = db_query('SELECT bid FROM {linkchecker_block_custom} WHERE lid = :lid', array(':lid' => $link->lid));
        foreach ($result as $row) {
          $block_custom = linkchecker_block_custom_block_get($row->bid);

          // Has the custom block object loaded successfully?
          if (is_object($block_custom)) {
            $block_custom_original = clone $block_custom;

            // Now replace the outdated link with the permanently moved one in
            // all custom block fields.
            _linkchecker_link_replace($block_custom->info, $link->url, $response->redirect_url);
            _linkchecker_link_replace($block_custom->body['value'], $link->url, $response->redirect_url);

            if ($block_custom_original != $block_custom) {
              // Save changed block and update the block link list.
              block_custom_block_save((array) $block_custom, $block_custom->delta);
              // There is no hook that fires on block_custom_block_save(),
              // therefore do link extraction programmatically.
              _linkchecker_add_block_custom_links($block_custom, $block_custom->delta);
              linkchecker_watchdog_log('linkchecker', 'Changed permanently moved link in custom block %bid from %src to %dst.', array('%bid' => $block_custom->delta, '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
            }
            else {
              linkchecker_watchdog_log('linkchecker', 'Link update in block failed. Permanently moved link %src not found in block %bid. Manual fix required.', array('%bid' => $block_custom->delta, '%src' => $link->url), WATCHDOG_WARNING);
            }
          }
          else {
            // $block_custom is not an object here, so report the ID from the
            // query row instead of dereferencing the failed load result.
            linkchecker_watchdog_log('linkchecker', 'Loading block %bid for update failed. Manual fix required.', array('%bid' => $row->bid), WATCHDOG_ERROR);
          }
        }

        // Revert user back to anonymous.
        $accountSwitcher->switchBack();
      }
      else {
        linkchecker_watchdog_log('linkchecker', 'Link %link has changed and needs to be updated.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));
      }
      break;

    case 404:
      db_update('linkchecker_link')
        ->condition('lid', $link->lid)
        ->fields(array(
          'code' => $response->code,
          'error' => $response->error,
          'fail_count' => 0,
          'last_checked' => time(),
        ))
        ->expression('fail_count', 'fail_count + 1')
        ->execute();
      linkchecker_watchdog_log('linkchecker', 'Broken link %link has been found.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));

      // If unpublishing limit is reached, unpublish all nodes having this link.
      $linkchecker_action_status_code_404 = variable_get('linkchecker_action_status_code_404', 0);
      if ($linkchecker_action_status_code_404 && $linkchecker_action_status_code_404 <= ($link->fail_count + 1)) {
        // Switch anonymous user to an admin; see the 301 case for why the
        // account is resolved to its numeric ID.
        $impersonate_account = user_load_by_name($config->get('error.impersonate_account'));
        $accountSwitcher = Drupal::service('account_switcher');
        $accountSwitcher->switchTo(new UserSession(array('uid' => $impersonate_account ? $impersonate_account->id() : 0)));
        _linkchecker_unpublish_nodes($link->lid);
        $accountSwitcher->switchBack();
      }
      break;

    case 405:
      // - 405: Special error handling if method is not allowed. Switch link
      //   checking to GET method and try again.
      db_update('linkchecker_link')
        ->condition('lid', $link->lid)
        ->fields(array(
          'method' => 'GET',
          'code' => $response->code,
          'error' => $response->error,
          'fail_count' => 0,
          'last_checked' => time(),
        ))
        ->expression('fail_count', 'fail_count + 1')
        ->execute();

      linkchecker_watchdog_log('linkchecker', 'Method HEAD is not allowed for link %link. Method has been changed to GET.', array('%link' => $link->url), WATCHDOG_INFO, l(t('Broken links'), 'admin/reports/linkchecker'));
      break;

    case 500:
      // - 500: Like WGET, try with GET on "500 Internal server error".
      // - If GET also fails with status code 500, than the link is broken.
      if ($link->method == 'GET' && $response->code == 500) {
        db_update('linkchecker_link')
          ->condition('lid', $link->lid)
          ->fields(array(
            'code' => $response->code,
            'error' => $response->error,
            'fail_count' => 0,
            'last_checked' => time(),
          ))
          ->expression('fail_count', 'fail_count + 1')
          ->execute();

        linkchecker_watchdog_log('linkchecker', 'Broken link %link has been found.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));
      }
      else {
        db_update('linkchecker_link')
          ->condition('lid', $link->lid)
          ->fields(array(
            'method' => 'GET',
            'code' => $response->code,
            'error' => $response->error,
            'fail_count' => 0,
            'last_checked' => time(),
          ))
          ->expression('fail_count', 'fail_count + 1')
          ->execute();

        linkchecker_watchdog_log('linkchecker', 'Internal server error for link %link. Method has been changed to GET.', array('%link' => $link->url), WATCHDOG_INFO, l(t('Broken links'), 'admin/reports/linkchecker'));
      }
      break;

    default:
      // Don't treat ignored response codes as errors.
      if (in_array($response->code, $ignore_response_codes)) {
        // No fail_count expression here: an ignored code resets the counter.
        db_update('linkchecker_link')
          ->condition('lid', $link->lid)
          ->fields(array(
            'code' => $response->code,
            'error' => $response->error,
            'fail_count' => 0,
            'last_checked' => time(),
          ))
          ->execute();
      }
      else {
        db_update('linkchecker_link')
          ->condition('lid', $link->lid)
          ->fields(array(
            'code' => $response->code,
            'error' => $response->error,
            'fail_count' => 0,
            'last_checked' => time(),
          ))
          ->expression('fail_count', 'fail_count + 1')
          ->execute();
      }
  }

  // Free Memory.
  $response = new stdClass();
}
/**
 * @fixme: remove after migration
 * Implements hook_node_type_delete().
 *
 * Removes the per-content-type scan variables of the deleted node type.
 */
function linkchecker_node_type_delete($info) {
  $variable_prefixes = array(
    'linkchecker_scan_node_',
    'linkchecker_scan_comment_',
  );
  foreach ($variable_prefixes as $prefix) {
    variable_del($prefix . $info->type);
  }
}
/**
 * Implements hook_node_prepare().
 *
 * On the edit tab of an existing node, shows one warning per link of that
 * node that failed at least once, has status 1 and whose response code is
 * not in the list of ignored response codes.
 */
function linkchecker_node_prepare($node) {
  // Only act when the node edit tab of an existing node is viewed.
  $is_node_edit_page = arg(0) == 'node' && is_numeric(arg(1)) && arg(2) == 'edit' && isset($node->nid);
  if (!$is_node_edit_page) {
    return;
  }

  // One ignored response code per line in the stored setting.
  $ignored_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));

  $arguments = array(
    ':nid' => $node->nid,
    ':fail_count' => 0,
    ':status' => 1,
    ':codes' => $ignored_codes,
  );
  $failed_links = db_query('SELECT ll.* FROM {linkchecker_node} ln INNER JOIN {linkchecker_link} ll ON ln.lid = ll.lid WHERE ln.nid = :nid AND ll.fail_count > :fail_count AND ll.status = :status AND ll.code NOT IN (:codes)', $arguments);

  foreach ($failed_links as $failed_link) {
    // Links the current user may not see are silently skipped.
    if (!_linkchecker_link_access($failed_link)) {
      continue;
    }
    $message = format_plural($failed_link->fail_count, 'Link check of <a href="@url">@url</a> failed once (status code: @code).', 'Link check of <a href="@url">@url</a> failed @count times (status code: @code).', array('@url' => $failed_link->url, '@code' => $failed_link->code));
    drupal_set_message($message, 'warning', FALSE);
  }
}
/**
 * Implements hook_node_delete().
 *
 * Drops every link reference stored for the deleted node.
 */
function linkchecker_node_delete($node) {
  $nid = $node->nid;
  _linkchecker_delete_node_links($nid);
}
/**
 * Implements hook_node_insert().
 *
 * Registers the links of a newly created node when scanning is enabled for
 * its content type and the node is published.
 */
function linkchecker_node_insert($node) {
  // Moderation modules saving a forward revision must not trigger a scan;
  // see _linkchecker_isdefaultrevision() for details.
  // @todo: Refactor this workaround under D8.
  if (_linkchecker_isdefaultrevision($node)) {
    $scan_enabled = variable_get('linkchecker_scan_node_' . $node->type, FALSE);

    // The node is going to be published.
    if ($scan_enabled && $node->status == NODE_PUBLISHED) {
      _linkchecker_add_node_links($node);
    }
  }
}
/**
 * Implements hook_node_update().
 *
 * Re-registers the links of an updated node when scanning is enabled for its
 * content type and the node is published; otherwise removes the node's link
 * references.
 */
function linkchecker_node_update($node) {
  // Moderation modules saving a forward revision must not trigger a scan;
  // see _linkchecker_isdefaultrevision() for details.
  // @todo: Refactor this workaround under D8.
  if (_linkchecker_isdefaultrevision($node)) {
    $scan_enabled = variable_get('linkchecker_scan_node_' . $node->type, FALSE);

    if ($scan_enabled && $node->status == NODE_PUBLISHED) {
      // The node is going to be published.
      _linkchecker_add_node_links($node);
    }
    else {
      // The node is unpublished or its type is not scanned.
      linkchecker_node_delete($node);
    }
  }
}
/**
 * Implements hook_comment_delete().
 *
 * Drops every link reference stored for the deleted comment.
 */
function linkchecker_comment_delete($comment) {
  $cid = $comment->cid;
  _linkchecker_delete_comment_links($cid);
}
/**
 * Implements hook_comment_insert().
 *
 * Registers the links of a new comment when comment scanning is enabled for
 * the content type of the commented node and the comment is published.
 */
function linkchecker_comment_insert($comment) {
  // The scan setting is stored per content type of the commented node.
  $bundle = db_query('SELECT type FROM {node} WHERE nid = :nid', array(':nid' => $comment->nid))->fetchField();
  $scan_enabled = variable_get('linkchecker_scan_comment_' . $bundle, FALSE);

  // The comment is going to be published.
  if ($scan_enabled && $comment->status == COMMENT_PUBLISHED) {
    _linkchecker_add_comment_links($comment);
  }
}
/**
 * Implements hook_comment_update().
 *
 * Re-registers the links of an updated comment when comment scanning is
 * enabled for the content type of the commented node and the comment is
 * published; otherwise removes the comment's link references.
 */
function linkchecker_comment_update($comment) {
  // The scan setting is stored per content type of the commented node.
  $bundle = db_query('SELECT type FROM {node} WHERE nid = :nid', array(':nid' => $comment->nid))->fetchField();
  $scan_enabled = variable_get('linkchecker_scan_comment_' . $bundle, FALSE);

  if ($scan_enabled && $comment->status == COMMENT_PUBLISHED) {
    // The comment is going to be published.
    _linkchecker_add_comment_links($comment);
  }
  else {
    // The comment is unpublished or comments of this type are not scanned.
    linkchecker_comment_delete($comment);
  }
}
/**
 * Implements hook_form_alter().
 *
 * Attaches the linkchecker submit handlers to the custom block add,
 * configure and delete forms. When the configure form is initially displayed
 * it also shows a warning for every link of the block that failed its check.
 */
function linkchecker_form_alter(&$form, FormStateInterface $form_state, $form_id) {
  switch ($form_id) {
    // Catch the custom block add/configure form and add custom submit handler.
    case 'block_add_block_form':
      // Add custom submit handler to custom block add form.
      $form['#submit'][] = 'linkchecker_block_custom_add_form_submit';
      break;

    case 'block_admin_configure':
      // The form state is an object and cannot be read with array syntax;
      // the submitted input has to be fetched through its accessor.
      $user_input = $form_state->getUserInput();

      // When displaying the form, show the broken links warning.
      if (empty($user_input) && is_numeric(arg(5))) {
        // Show a message on custom block edit page if a link check failed once
        // or more.
        $ignore_response_codes = preg_split('/(\r\n?|\n)/', \Drupal::config('linkchecker.settings')->get('error.ignore_response_codes'));
        $links = db_query('SELECT ll.* FROM {linkchecker_block_custom} lb INNER JOIN {linkchecker_link} ll ON lb.lid = ll.lid WHERE lb.bid = :bid AND ll.fail_count > :fail_count AND ll.status = :status AND ll.code NOT IN (:codes)', array(':bid' => arg(5), ':fail_count' => 0, ':status' => 1, ':codes' => $ignore_response_codes));
        foreach ($links as $link) {
          // Only report links the current user is allowed to see.
          if (_linkchecker_link_access($link)) {
            drupal_set_message(format_plural($link->fail_count, 'Link check of <a href=":url">:url</a> failed once (status code: @code).', 'Link check of <a href=":url">:url</a> failed @count times (status code: @code).', array(':url' => $link->url, '@code' => $link->code)), 'warning', FALSE);
          }
        }
      }

      // Add custom submit handler to custom block configuration form.
      $form['#submit'][] = 'linkchecker_block_custom_configure_form_submit';
      break;

    case 'block_custom_block_delete':
      // Add custom submit handler to custom block delete form.
      $form['#submit'][] = 'linkchecker_block_custom_delete_form_submit';
      break;
  }
}
/**
 * Implements hook_form_FORM_ID_alter() for \Drupal\node\NodeTypeForm.
 *
 * Adds a "Link checker" details element with the scan options to the node
 * type form and registers the submit handler and entity builder that process
 * them.
 *
 * @see NodeTypeForm::form()
 * @see linkchecker_form_node_type_form_submit()
 */
function linkchecker_form_node_type_form_alter(&$form, FormStateInterface $form_state) {
  /** @var \Drupal\node\NodeTypeInterface $node_type */
  $node_type = $form_state->getFormObject()->getEntity();

  // Container shown in the vertical tabs of the node type form, holding the
  // node scan option.
  $form['linkchecker'] = [
    '#type' => 'details',
    '#title' => t('Link checker'),
    '#attached' => [
      'library' => ['linkchecker/linkchecker.content_types'],
    ],
    '#group' => 'additional_settings',
    'linkchecker_scan_node' => [
      '#type' => 'checkbox',
      '#title' => t('Scan content'),
      '#description' => t('Enables link checking for this content type.'),
      '#default_value' => $node_type->getThirdPartySetting('linkchecker', 'scan_node', FALSE),
    ],
  ];

  // The comment option is only offered when the comment module is enabled.
  $comment_module_enabled = \Drupal::moduleHandler()->moduleExists('comment');
  if ($comment_module_enabled) {
    $form['linkchecker']['linkchecker_scan_comment'] = [
      '#type' => 'checkbox',
      '#title' => t('Scan comments'),
      '#description' => t('Enables link checking for comments.'),
      '#default_value' => $node_type->getThirdPartySetting('linkchecker', 'scan_comment', FALSE),
    ];
  }

  $form['#submit'][] = 'linkchecker_form_node_type_form_submit';
  $form['#entity_builders'][] = 'linkchecker_form_node_type_form_builder';
}
/**
 * Submit handler for forms with linkchecker options.
 *
 * Schedules a batch import of nodes and/or comments of the node type when the
 * corresponding scan option has just been switched on.
 *
 * @see linkchecker_form_node_type_form_alter()
 */
function linkchecker_form_node_type_form_submit(&$form, FormStateInterface $form_state) {
  $bundle = $form_state->getValue('type');

  // The '#default_value' still holds the setting as it was before submission.
  $node_scan_was_enabled = $form['linkchecker']['linkchecker_scan_node']['#default_value'];
  if (!$node_scan_was_enabled && $form_state->getValue('linkchecker_scan_node')) {
    // We need to scan this node-type now.
    module_load_include('inc', 'linkchecker', 'linkchecker.batch');
    batch_set(_linkchecker_batch_import_nodes(array($bundle)));
  }

  // Without the comment module there is no comment option in the form; the
  // previous state is then treated as TRUE so no comment import is scheduled.
  $comment_scan_was_enabled = \Drupal::moduleHandler()->moduleExists('comment')
    ? $form['linkchecker']['linkchecker_scan_comment']['#default_value']
    : TRUE;

  // !empty() covers the case where no comment value was submitted at all.
  if (!$comment_scan_was_enabled && !empty($form_state->getValue('linkchecker_scan_comment'))) {
    // We need to scan comments for this node-type now.
    module_load_include('inc', 'linkchecker', 'linkchecker.batch');
    batch_set(_linkchecker_batch_import_comments(array($bundle)));
  }
}
/**
 * Entity builder for the node type form with linkchecker options.
 *
 * Copies the submitted scan options into the third-party settings of the
 * node type.
 *
 * @see linkchecker_form_node_type_form_alter()
 */
function linkchecker_form_node_type_form_builder($entity_type, NodeTypeInterface $type, &$form, FormStateInterface $form_state) {
  // Setting name => name of the form value carrying it.
  $settings = array(
    'scan_node' => 'linkchecker_scan_node',
    'scan_comment' => 'linkchecker_scan_comment',
  );
  foreach ($settings as $setting => $form_key) {
    $type->setThirdPartySetting('linkchecker', $setting, $form_state->getValue($form_key));
  }
}
/**
 * Implements hook_form_BASE_FORM_ID_alter().
 *
 * On the comment edit page, when the form is initially displayed or
 * previewed, shows one warning per link of the comment that failed at least
 * once, has status 1 and whose response code is not ignored.
 */
function linkchecker_form_comment_form_alter(&$form, &$form_state, $form_id) {
  // The warning is shown on initial display ('view') and on 'preview' only.
  $is_initial_display = empty($form_state['input']);
  $is_preview = isset($form_state['input']['op']) && $form_state['input']['op'] == t('Preview');
  $is_comment_edit_page = arg(0) == 'comment' && is_numeric(arg(1)) && arg(2) == 'edit';

  if (!(($is_initial_display || $is_preview) && $is_comment_edit_page)) {
    return;
  }

  // One ignored response code per line in the stored setting.
  $ignored_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));

  $arguments = array(
    ':cid' => arg(1),
    ':fail_count' => 0,
    ':status' => 1,
    ':codes' => $ignored_codes,
  );
  $failed_links = db_query('SELECT ll.* FROM {linkchecker_comment} lc INNER JOIN {linkchecker_link} ll ON lc.lid = ll.lid WHERE lc.cid = :cid AND ll.fail_count > :fail_count AND ll.status = :status AND ll.code NOT IN (:codes)', $arguments);

  foreach ($failed_links as $failed_link) {
    // Links the current user may not see are silently skipped.
    if (!_linkchecker_link_access($failed_link)) {
      continue;
    }
    $message = format_plural($failed_link->fail_count, 'Link check of <a href="@url">@url</a> failed once (status code: @code).', 'Link check of <a href="@url">@url</a> failed @count times (status code: @code).', array('@url' => $failed_link->url, '@code' => $failed_link->code));
    drupal_set_message($message, 'warning', FALSE);
  }
}
/**
 * Custom submit handler for block add page.
 *
 * Registers the links of the submitted block values when block scanning is
 * enabled, using the highest bid found in {block_custom} as block ID.
 */
function linkchecker_block_custom_add_form_submit($form, &$form_state) {
  if (!variable_get('linkchecker_scan_blocks', 0)) {
    return;
  }

  $newest_bid = db_query('SELECT MAX(bid) FROM {block_custom}')->fetchField();
  _linkchecker_add_block_custom_links($form_state['values'], $newest_bid);
}
/**
 * Custom submit handler for block configure page.
 *
 * Re-registers the links of the submitted block values when block scanning
 * is enabled; the submitted 'delta' value is passed as block ID.
 */
function linkchecker_block_custom_configure_form_submit($form, &$form_state) {
  if (!variable_get('linkchecker_scan_blocks', 0)) {
    return;
  }

  $values = $form_state['values'];
  _linkchecker_add_block_custom_links($values, $values['delta']);
}
/**
 * Custom submit handler for block delete page.
 *
 * Drops every link reference stored for the deleted custom block.
 */
function linkchecker_block_custom_delete_form_submit($form, &$form_state) {
  $bid = $form_state['values']['bid'];
  _linkchecker_delete_block_custom_links($bid);
}
/**
 * Returns information from database about a user-created (custom) block.
 *
 * @param int $bid
 *   ID of the block to get information for.
 *
 * @return object|false
 *   Object built from the data block_custom_block_get() returns for this
 *   block, or FALSE when that call returns nothing. The object contains:
 *   - module: 'block' as the source of the custom blocks data.
 *   - delta: Block ID.
 *   - info: Block description.
 *   - body['value']: Block contents.
 *   - body['format']: Filter ID of the filter format for the body.
 */
function linkchecker_block_custom_block_get($bid) {
  $record = block_custom_block_get($bid);
  if (!$record) {
    return FALSE;
  }

  // Re-shape the flat record into the structure the link scanner expects.
  $block = new stdClass();
  $block->module = 'block';
  $block->delta = $record['bid'];
  $block->info = $record['info'];
  $block->body = array(
    'value' => $record['body'],
    'format' => $record['format'],
  );

  return $block;
}
/**
 * Extracts links from a node.
 *
 * @param object $node
 *   The fully populated node object.
 * @param bool $return_field_names
 *   If set to TRUE, the returned array will contain the link URLs as keys, and
 *   each element will be an array containing all field names in which the URL
 *   is found. Otherwise, a simple array of URLs will be returned.
 *
 * @return array
 *   An array whose keys are fully qualified and unique URLs found in the node
 *   (as returned by _linkchecker_extract_links()), or a more complex
 *   structured array (see above) if $return_field_names is TRUE.
 */
function _linkchecker_extract_node_links($node, $return_field_names = FALSE) {
  // Create settings for _filter_url() function.
  $filter = new stdClass();
  $filter->settings['filter_url_length'] = 72;

  // Create array of node fields to scan. Fields typically not used for urls
  // (the title) are added to the bottom, so a link may be found earlier while
  // looping over $text_items_by_field below.
  $text_items_by_field = array();
  $text_items_by_field = array_merge($text_items_by_field, _linkchecker_parse_fields('node', $node->type, $node, TRUE));
  $text_items_by_field['title'][] = _filter_url($node->title, $filter);
  $text_items = _linkchecker_array_values_recursive($text_items_by_field);

  // Get the absolute node path for extraction of relative links.
  $languages = language_list();
  // Note: An "undefined language" (value: 'und') isn't listed in the available
  // languages variable $languages.
  $url_options = (empty($node->language) || empty($languages[$node->language])) ? array('absolute' => TRUE) : array('language' => $languages[$node->language], 'absolute' => TRUE);
  $path = url('node/' . $node->nid, $url_options);

  // Extract all links in a node.
  $links = _linkchecker_extract_links(implode(' ', $text_items), $path);

  // Return either the array of links, or an array of field names containing
  // each link, depending on what was requested.
  if (!$return_field_names) {
    return $links;
  }

  $field_names = array();
  foreach ($text_items_by_field as $field_name => $items) {
    foreach ($items as $item) {
      foreach ($links as $uri => $link) {
        // We only need to do a quick check here to see if the URL appears
        // anywhere in the text; if so, that means users with access to this
        // field will be able to see the URL (and any private data such as
        // passwords contained in it). This is sufficient for the purposes of
        // _linkchecker_link_node_ids(), where this information is used.
        foreach ($link as $original_link) {
          // URLs in $links have been auto-decoded by DOMDocument->loadHTML,
          // so the raw text may contain the URL with its ampersands still
          // HTML-encoded. Check the decoded form first, then the encoded one.
          // NOTE: htmlspecialchars() is 30% slower than str_replace().
          if (strpos($item, $original_link) !== FALSE || strpos($item, str_replace('&', '&amp;', $original_link)) !== FALSE) {
            $field_names[$uri][$field_name] = $field_name;
          }
        }
      }
    }
  }

  return $field_names;
}
/**
 * Add node links to database.
 *
 * Stores up to LINKCHECKER_SCAN_MAX_LINKS_PER_RUN not yet referenced links of
 * the node, schedules a batch re-scan when more links remain, and finally
 * removes references to links that are no longer part of the node.
 *
 * @param object $node
 *   The fully populated node object.
 * @param bool $skip_missing_links_detection
 *   To prevent endless batch loops the value need to be TRUE. With FALSE
 *   the need for content re-scans is detected by the number of missing links.
 */
function _linkchecker_add_node_links($node, $skip_missing_links_detection = FALSE) {
  $links = array_keys(_linkchecker_extract_node_links($node));

  // Node has links.
  if ($links) {
    // Only links without a reference for this node need to be stored.
    $missing_links = _linkchecker_node_links_missing($node->nid, $links);

    $processed = 0;
    foreach ($missing_links as $url) {
      $urlhash = drupal_hash_base64($url);
      $link = db_query('SELECT lid FROM {linkchecker_link} WHERE urlhash = :urlhash', array(':urlhash' => $urlhash))->fetchObject();

      // Create the link record first when the URL is not known yet.
      if (!$link) {
        $link = new stdClass();
        $link->urlhash = $urlhash;
        $link->url = $url;
        $link->status = _linkchecker_link_check_status_filter($url);
        drupal_write_record('linkchecker_link', $link);
      }

      $reference = array(
        'nid' => $node->nid,
        'lid' => $link->lid,
      );
      db_insert('linkchecker_node')
        ->fields($reference)
        ->execute();

      // Break processing if max links limit per run has been reached.
      if (++$processed >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) {
        break;
      }
    }

    // The loop above handled at most LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // links. Whatever exceeds that limit has to be collected by re-scanning
    // the content in a batch. A calling batch process passes TRUE for
    // $skip_missing_links_detection to prevent endless loops, as it already
    // knows the number of required re-scan rounds.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (!$skip_missing_links_detection && $missing_links_count > 0) {
      module_load_include('inc', 'linkchecker', 'linkchecker.batch');
      batch_set(_linkchecker_batch_import_single_node($node->nid, $missing_links_count));

      // Process the batch now, possibly ending execution, unless a batch is
      // already being processed.
      $batch = &batch_get();
      if ($batch && !isset($batch['current_set'])) {
        batch_process('node/' . $node->nid);
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_node_references($node->nid, $links);
}
/**
 * Add comment links to database.
 *
 * Stores up to LINKCHECKER_SCAN_MAX_LINKS_PER_RUN not yet referenced links of
 * the comment, schedules a batch re-scan when more links remain, and finally
 * removes references to links that are no longer part of the comment.
 *
 * @param object $comment
 *   The fully populated comment object.
 * @param bool $skip_missing_links_detection
 *   To prevent endless batch loops the value need to be TRUE. With FALSE
 *   the need for content re-scans is detected by the number of missing links.
 */
function _linkchecker_add_comment_links($comment, $skip_missing_links_detection = FALSE) {
  // Create settings for _filter_url() function.
  $filter = new stdClass();
  $filter->settings['filter_url_length'] = 72;

  // Texts to scan: the subject first, followed by all parsed fields.
  $text_items = array(_filter_url($comment->subject, $filter));
  $text_items = array_merge($text_items, _linkchecker_parse_fields('comment', $comment->node_type, $comment));

  // Get the absolute node path for extraction of relative links.
  $languages = language_list();
  $node = node_load($comment->nid);
  $url_options = (empty($node->language) || empty($languages[$node->language])) ? array('absolute' => TRUE) : array('language' => $languages[$node->language], 'absolute' => TRUE);
  $path = url('node/' . $comment->nid, $url_options);

  // Extract all links in a comment.
  $links = array_keys(_linkchecker_extract_links(implode(' ', $text_items), $path));

  // Comment has links.
  if ($links) {
    // Only links without a reference for this comment need to be stored.
    $missing_links = _linkchecker_comment_links_missing($comment->cid, $links);

    $processed = 0;
    foreach ($missing_links as $url) {
      $urlhash = drupal_hash_base64($url);
      $link = db_query('SELECT lid FROM {linkchecker_link} WHERE urlhash = :urlhash', array(':urlhash' => $urlhash))->fetchObject();

      // Create the link record first when the URL is not known yet.
      if (!$link) {
        $link = new stdClass();
        $link->urlhash = $urlhash;
        $link->url = $url;
        $link->status = _linkchecker_link_check_status_filter($url);
        drupal_write_record('linkchecker_link', $link);
      }

      $reference = array(
        'cid' => $comment->cid,
        'lid' => $link->lid,
      );
      db_insert('linkchecker_comment')
        ->fields($reference)
        ->execute();

      // Break processing if max links limit per run has been reached.
      if (++$processed >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) {
        break;
      }
    }

    // The loop above handled at most LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // links. Whatever exceeds that limit has to be collected by re-scanning
    // the content in a batch. A calling batch process passes TRUE for
    // $skip_missing_links_detection to prevent endless loops, as it already
    // knows the number of required re-scan rounds.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (!$skip_missing_links_detection && $missing_links_count > 0) {
      module_load_include('inc', 'linkchecker', 'linkchecker.batch');
      batch_set(_linkchecker_batch_import_single_comment($comment->cid, $missing_links_count));

      // Process the batch now, possibly ending execution, unless a batch is
      // already being processed.
      $batch = &batch_get();
      if ($batch && !isset($batch['current_set'])) {
        batch_process('node/' . $comment->nid);
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_comment_references($comment->cid, $links);
}
/**
 * Add custom block links to database.
 *
 * Stores up to LINKCHECKER_SCAN_MAX_LINKS_PER_RUN not yet referenced links of
 * the custom block, schedules a batch re-scan when more links remain, and
 * finally removes references to links that are no longer part of the block.
 *
 * @param array|object $block_custom
 *   The fully populated custom block object.
 * @param int $bid
 *   Block id from table {block}.bid.
 * @param bool $skip_missing_links_detection
 *   To prevent endless batch loops the value need to be TRUE. With FALSE
 *   the need for content re-scans is detected by the number of missing links.
 */
function _linkchecker_add_block_custom_links($block_custom, $bid, $skip_missing_links_detection = FALSE) {
  // Convert custom block array to object.
  // @todo: Are we able to remove this global conversion?
  $block_custom = (object) $block_custom;

  // Custom block values arrive inconsistently (integers or strings) and there
  // are no usable hooks, hence the defensive checks.
  // NOTE: Only custom blocks from block.module are supported. Skip all others.
  $is_supported_block = $block_custom->module == 'block' && is_numeric($block_custom->delta) && is_numeric($bid) && $block_custom->delta == $bid;
  if (!$is_supported_block) {
    return;
  }

  // Create settings for _filter_url() function.
  $filter = new stdClass();
  $filter->settings['filter_url_length'] = 72;

  // Create array of custom block fields to scan. Not every field has to exist.
  $text_items = array();
  if (!empty($block_custom->info)) {
    $text_items[] = _filter_url($block_custom->info, $filter);
  }
  // $block_custom from editing/scanning a block. See block_custom_block_save().
  $has_body = !empty($block_custom->body) && is_array($block_custom->body) && array_key_exists('value', $block_custom->body) && array_key_exists('format', $block_custom->body);
  if ($has_body) {
    $text_items[] = _linkchecker_check_markup($block_custom->body['value'], $block_custom->body['format']);
  }

  // Extract all links in a custom block.
  $links = array_keys(_linkchecker_extract_links(implode(' ', $text_items)));

  // Custom block has links.
  if ($links) {
    // Only links without a reference for this block need to be stored.
    $missing_links = _linkchecker_block_custom_links_missing($bid, $links);

    $processed = 0;
    foreach ($missing_links as $url) {
      $urlhash = drupal_hash_base64($url);
      $link = db_query('SELECT lid FROM {linkchecker_link} WHERE urlhash = :urlhash', array(':urlhash' => $urlhash))->fetchObject();

      // Create the link record first when the URL is not known yet.
      if (!$link) {
        $link = new stdClass();
        $link->urlhash = $urlhash;
        $link->url = $url;
        $link->status = _linkchecker_link_check_status_filter($url);
        drupal_write_record('linkchecker_link', $link);
      }

      $reference = array(
        'bid' => $bid,
        'lid' => $link->lid,
      );
      db_insert('linkchecker_block_custom')
        ->fields($reference)
        ->execute();

      // Break processing if max links limit per run has been reached.
      if (++$processed >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) {
        break;
      }
    }

    // The loop above handled at most LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // links. Whatever exceeds that limit has to be collected by re-scanning
    // the content in a batch. A calling batch process passes TRUE for
    // $skip_missing_links_detection to prevent endless loops, as it already
    // knows the number of required re-scan rounds.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (!$skip_missing_links_detection && $missing_links_count > 0) {
      module_load_include('inc', 'linkchecker', 'linkchecker.batch');
      batch_set(_linkchecker_batch_import_single_block_custom($bid, $missing_links_count));

      // Process the batch now, possibly ending execution, unless a batch is
      // already being processed.
      $batch = &batch_get();
      if ($batch && !isset($batch['current_set'])) {
        batch_process('admin/structure/block');
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_block_custom_references($bid, $links);
}
/**
 * Remove all node references to links in the linkchecker_node table.
 *
 * @param int $nid
 *   The node ID whose rows are deleted.
 */
function _linkchecker_delete_node_links($nid) {
  $delete = db_delete('linkchecker_node');
  $delete->condition('nid', $nid);
  $delete->execute();
}
/**
 * Remove all comment references to links in the linkchecker_comment table.
 *
 * @param int $cid
 *   The comment ID whose rows are deleted.
 */
function _linkchecker_delete_comment_links($cid) {
  $delete = db_delete('linkchecker_comment');
  $delete->condition('cid', $cid);
  $delete->execute();
}
/**
 * Remove all block references to links in the linkchecker_block_custom table.
 *
 * @param int $bid
 *   The custom block ID whose rows are deleted.
 */
function _linkchecker_delete_block_custom_links($bid) {
  $delete = db_delete('linkchecker_block_custom');
  $delete->condition('bid', $bid);
  $delete->execute();
}
/**
 * Cleanup no longer used node references to links in the linkchecker_node table.
 *
 * @param int $nid
 *   The node ID.
 * @param array $links
 *   The URLs currently found in the node. When empty, all references of the
 *   node are deleted.
 */
function _linkchecker_cleanup_node_references($nid = 0, $links = array()) {
  $delete = db_delete('linkchecker_node')
    ->condition('nid', $nid);

  if (!empty($links)) {
    // The node still has links, but others may have been removed from the
    // content. Keep only references to links whose URL hash is still present.
    $url_hashes = array_map('drupal_hash_base64', $links);
    $current_lids = db_select('linkchecker_link')
      ->fields('linkchecker_link', array('lid'))
      ->condition('urlhash', $url_hashes, 'IN');

    $delete->condition('lid', $current_lids, 'NOT IN');
  }

  $delete->execute();
}
/**
 * Cleanup no longer used comment references to links in the linkchecker_comment table.
 *
 * @param int $cid
 *   The comment ID.
 * @param array $links
 *   The URLs currently found in the comment. When empty, all references of
 *   the comment are deleted.
 */
function _linkchecker_cleanup_comment_references($cid = 0, $links = array()) {
  $delete = db_delete('linkchecker_comment')
    ->condition('cid', $cid);

  if (!empty($links)) {
    // The comment still has links, but others may have been removed from the
    // content. Keep only references to links whose URL hash is still present.
    $url_hashes = array_map('drupal_hash_base64', $links);
    $current_lids = db_select('linkchecker_link', 'll')
      ->fields('ll', array('lid'))
      ->condition('ll.urlhash', $url_hashes, 'IN');

    $delete->condition('lid', $current_lids, 'NOT IN');
  }

  $delete->execute();
}
/**
 * Cleanup no longer used custom block references to links in the linkchecker_block_custom table.
 *
 * @param int $bid
 *   The custom block ID.
 * @param array $links
 *   The URLs currently found in the block. When empty, all references of the
 *   block are deleted.
 */
function _linkchecker_cleanup_block_custom_references($bid = 0, $links = array()) {
  $delete = db_delete('linkchecker_block_custom')
    ->condition('bid', $bid);

  if (!empty($links)) {
    // The block still has links, but others may have been removed from the
    // content. Keep only references to links whose URL hash is still present.
    $url_hashes = array_map('drupal_hash_base64', $links);
    $current_lids = db_select('linkchecker_link')
      ->fields('linkchecker_link', array('lid'))
      ->condition('urlhash', $url_hashes, 'IN');

    $delete->condition('lid', $current_lids, 'NOT IN');
  }

  $delete->execute();
}
/**
 * Returns an array of node references missing in the linkchecker_node table.
 *
 * @param int $nid
 *   The node ID.
 * @param array $links
 *   An array of links.
 *
 * @return array
 *   An array of node references missing in the linkchecker_node table. The
 *   keys of $links are preserved (array_diff() semantics).
 */
function _linkchecker_node_links_missing($nid, $links) {
  // Nothing can be missing from an empty list; returning early also avoids
  // running a query with an empty IN () list.
  if (empty($links)) {
    return array();
  }

  $result = db_query('SELECT ll.url FROM {linkchecker_link} ll INNER JOIN {linkchecker_node} ln ON ln.lid = ll.lid WHERE ln.nid = :nid AND ll.urlhash IN (:urlhashes)', array(':nid' => $nid, ':urlhashes' => array_map('drupal_hash_base64', $links)));

  // URLs that already have a reference for this node.
  $links_in_database = array();
  foreach ($result as $row) {
    $links_in_database[] = $row->url;
  }

  return array_diff($links, $links_in_database);
}
/**
 * Returns an array of comment references missing in the linkchecker_comment table.
 *
 * @param int $cid
 *   The comment ID.
 * @param array $links
 *   An array of links.
 *
 * @return array
 *   An array of comment references missing in the linkchecker_comment table.
 *   The keys of $links are preserved (array_diff() semantics).
 */
function _linkchecker_comment_links_missing($cid, $links) {
  // Nothing can be missing from an empty list; returning early also avoids
  // running a query with an empty IN () list.
  if (empty($links)) {
    return array();
  }

  $result = db_query('SELECT ll.url FROM {linkchecker_link} ll INNER JOIN {linkchecker_comment} lc ON lc.lid = ll.lid WHERE lc.cid = :cid AND ll.urlhash IN (:urlhashes)', array(':cid' => $cid, ':urlhashes' => array_map('drupal_hash_base64', $links)));

  // URLs that already have a reference for this comment.
  $links_in_database = array();
  foreach ($result as $row) {
    $links_in_database[] = $row->url;
  }

  return array_diff($links, $links_in_database);
}
/**
 * Returns an array of custom block references missing in the linkchecker_block_custom table.
 *
 * @param int $bid
 *   The custom block ID.
 * @param array $links
 *   An array of links.
 *
 * @return array
 *   An array of custom block references missing in the linkchecker_block_custom
 *   table. The keys of $links are preserved (array_diff() semantics).
 */
function _linkchecker_block_custom_links_missing($bid, $links) {
  // Nothing can be missing from an empty list; returning early also avoids
  // running a query with an empty IN () list.
  if (empty($links)) {
    return array();
  }

  $result = db_query('SELECT ll.url FROM {linkchecker_link} ll INNER JOIN {linkchecker_block_custom} lb ON lb.lid = ll.lid WHERE lb.bid = :bid AND ll.urlhash IN (:urlhashes)', array(':bid' => $bid, ':urlhashes' => array_map('drupal_hash_base64', $links)));

  // URLs that already have a reference for this custom block.
  $links_in_database = array();
  foreach ($result as $row) {
    $links_in_database[] = $row->url;
  }

  return array_diff($links, $links_in_database);
}
/**
 * Parse the urls from entity.
 *
 * This function parse all fields from the entity and returns an array of
 * filtered field items.
 *
 * @param string $entity_type
 *   The type of entity; e.g., 'node', 'comment'.
 * @param string $bundle_name
 *   The name of the bundle aka node type, e.g., 'article', 'page'.
 * @param object $entity
 *   The entity to parse, a $node or a $comment object.
 * @param bool $return_field_names
 *   If set to TRUE, the filtered items are returned grouped by the name of the
 *   field they come from. Otherwise, a flat array with the filtered items is
 *   returned.
 *
 * @return array
 *   Array of field items with filters applied.
 */
function _linkchecker_parse_fields($entity_type, $bundle_name, $entity, $return_field_names = FALSE) {
  $text_items = array();
  $text_items_by_field = array();

  // Create settings for _filter_url() function.
  $filter = new stdClass();
  $filter->settings['filter_url_length'] = 72;

  // Collect the fields from this entity_type and bundle.
  foreach (field_info_instances($entity_type, $bundle_name) as $field_name => $instance) {
    $field = field_info_field($field_name);
    $name = $field['field_name'];
    // #1923328: field_name array may be missing in $entity.
    $values_by_language = isset($entity->{$name}) ? $entity->{$name} : array();

    switch ($field['type']) {
      // Core fields.
      case 'text_with_summary':
        foreach ($values_by_language as $items) {
          foreach ($items as $item) {
            // Fill in defaults so absent keys do not raise notices.
            $item += array(
              'format' => NULL,
              'summary' => '',
              'value' => '',
            );
            $value_markup = _linkchecker_check_markup($item['value'], $item['format'], linkchecker_entity_language($entity_type, $entity), TRUE);
            $text_items_by_field[$name][] = $value_markup;
            $text_items[] = $value_markup;

            $summary_markup = _linkchecker_check_markup($item['summary'], $item['format'], linkchecker_entity_language($entity_type, $entity), TRUE);
            $text_items_by_field[$name][] = $summary_markup;
            $text_items[] = $summary_markup;
          }
        }
        break;

      // Core fields.
      case 'text_long':
      case 'text':
        foreach ($values_by_language as $items) {
          foreach ($items as $item) {
            // Fill in defaults so absent keys do not raise notices.
            $item += array(
              'format' => NULL,
              'value' => '',
            );
            $value_markup = _linkchecker_check_markup($item['value'], $item['format'], linkchecker_entity_language($entity_type, $entity), TRUE);
            $text_items_by_field[$name][] = $value_markup;
            $text_items[] = $value_markup;
          }
        }
        break;

      // Link module field, http://drupal.org/project/link.
      case 'link_field':
        foreach ($values_by_language as $items) {
          foreach ($items as $item) {
            // Fill in defaults so absent keys do not raise notices.
            $item += array(
              'title' => '',
            );
            // The URL itself is rendered as an anchor so it can be extracted;
            // the title is scanned as well since it may contain URLs.
            $options = drupal_parse_url(link_cleanup_url($item['url']));
            $anchor = l($item['title'], $options['path'], $options);
            $text_items_by_field[$name][] = $anchor;
            $text_items[] = $anchor;

            $title_markup = _linkchecker_check_markup($item['title'], NULL, linkchecker_entity_language($entity_type, $entity), TRUE);
            $text_items_by_field[$name][] = $title_markup;
            $text_items[] = $title_markup;
          }
        }
        break;
    }
  }

  return ($return_field_names) ? $text_items_by_field : $text_items;
}
/**
 * Replace the old url by a new url on 301 status codes.
 *
 * @param string $entity_type
 *   The type of entity; e.g., 'node', 'comment'.
 * @param string $bundle_name
 *   The name of the bundle aka node type, e.g., 'article', 'page'.
 * @param object $entity
 *   The entity to parse, a $node or a $comment object.
 * @param string $old_url
 *   The previous url.
 * @param string $new_url
 *   The new url to replace the old.
 *
 * @return object
 *   The passed entity with the url replaced in all supported fields.
 */
function _linkchecker_replace_fields($entity_type, $bundle_name, $entity, $old_url, $new_url) {
  // Item keys that may contain the url, per supported field type. The last
  // entry is the Link module field, http://drupal.org/project/link.
  $keys_by_type = array(
    'text_with_summary' => array('value', 'summary'),
    'text_long' => array('value'),
    'text' => array('value'),
    'link_field' => array('url', 'title'),
  );

  // Collect the fields from this entity_type and bundle.
  foreach (field_info_instances($entity_type, $bundle_name) as $field_name => $instance) {
    $field = field_info_field($field_name);
    if (!isset($keys_by_type[$field['type']])) {
      continue;
    }

    // The field may be missing in $entity (see #1923328). Binding a reference
    // to a missing property would create it as NULL and break the loop below.
    $name = $field['field_name'];
    if (empty($entity->{$name}) || !is_array($entity->{$name})) {
      continue;
    }

    // Work on the entity's own data so the replacement is written back.
    $entity_field =& $entity->{$name};
    foreach ($entity_field as $language_name => $language_value) {
      foreach ($language_value as $item_name => $item_value) {
        foreach ($keys_by_type[$field['type']] as $key) {
          _linkchecker_link_replace($entity_field[$language_name][$item_name][$key], $old_url, $new_url);
        }
      }
    }
    // Drop the reference before it is bound to the next field.
    unset($entity_field);
  }

  return $entity;
}
/**
 * Run periodically via cron and delete all links without a reference.
 *
 * For speed reasons and check results we keep the links for some time
 * as they may be reused by other new content.
 */
function _linkchecker_cleanup_links() {
  // Remove disabled node types no longer in use.
  $enabled_node_types = linkchecker_scan_node_types();
  if (empty($enabled_node_types)) {
    // No active node_type. Remove all items from table.
    db_truncate('linkchecker_node')->execute();
    // @todo Remove comments link references from table.
  }
  else {
    // Nodes whose type is not enabled for link checking.
    $nodes_of_disabled_types = db_select('node', 'n');
    $nodes_of_disabled_types->fields('n', array('nid'));
    $nodes_of_disabled_types->condition('n.type', $enabled_node_types, 'NOT IN');

    $delete_node_references = db_delete('linkchecker_node');
    $delete_node_references->condition('nid', $nodes_of_disabled_types, 'IN');
    $delete_node_references->execute();

    // @todo Remove comments link references from table.
  }

  // Remove comment link references if comment scanning is disabled.
  // @todo Remove comments of unpublished nodes.
  $enabled_comment_types = linkchecker_scan_comment_types();
  if (empty($enabled_comment_types)) {
    db_truncate('linkchecker_comment')->execute();
  }

  // Remove block link references if block scanning is disabled.
  if (variable_get('linkchecker_scan_blocks', 0) == 0) {
    db_truncate('linkchecker_block_custom')->execute();
  }

  // Remove dead links without references. First select the link ids still
  // referenced by each of the reference tables.
  $node_lids = db_select('linkchecker_node', 'ln');
  $node_lids->distinct();
  $node_lids->fields('ln', array('lid'));

  $comment_lids = db_select('linkchecker_comment', 'lc');
  $comment_lids->distinct();
  $comment_lids->fields('lc', array('lid'));

  $block_lids = db_select('linkchecker_block_custom', 'lb');
  $block_lids->distinct();
  $block_lids->fields('lb', array('lid'));

  // UNION all linkchecker type tables.
  $block_lids->union($comment_lids);
  $block_lids->union($node_lids);
  $referenced_lids = db_select($block_lids, 'q1');
  $referenced_lids->distinct();
  $referenced_lids->fields('q1', array('lid'));

  // Every link that is not referenced anywhere can go.
  $delete_links = db_delete('linkchecker_link');
  $delete_links->condition('lid', $referenced_lids, 'NOT IN');
  $delete_links->execute();
}
/**
 * Extract links from content.
 *
 * @param string $text
 *   The text to be scanned for links.
 * @param string $content_path
 *   Path to the content that is currently scanned for links. This value is
 *   required to build full qualified links from relative links. Relative links
 *   are not extracted from content, if path is not provided.
 *
 * @return array
 *   Array whose keys are fully qualified and unique URLs found in the
 *   content, and whose values are arrays of actual text (raw URLs or paths)
 *   corresponding to each fully qualified URL.
 */
function _linkchecker_extract_links($text = '', $content_path = NULL) {
  global $base_root, $is_https;

  $html_dom = filter_dom_load($text);
  $urls = array();

  // Appends the values of the given attributes of every element to $urls,
  // element by element and in the order the attributes are listed.
  $collect = function ($elements, array $attributes) use (&$urls) {
    foreach ($elements as $element) {
      foreach ($attributes as $attribute) {
        $urls[] = $element->getAttribute($attribute);
      }
    }
  };

  // Finds all hyperlinks in the content.
  if (variable_get('linkchecker_extract_from_a', 1) == 1) {
    $collect($html_dom->getElementsByTagName('a'), array('href'));
    $collect($html_dom->getElementsByTagName('area'), array('href'));
  }

  // Finds all audio links in the content.
  if (variable_get('linkchecker_extract_from_audio', 0) == 1) {
    foreach ($html_dom->getElementsByTagName('audio') as $audio) {
      $urls[] = $audio->getAttribute('src');
      // Finds source and track tags with links in the audio tag.
      $collect($audio->getElementsByTagName('source'), array('src'));
      $collect($audio->getElementsByTagName('track'), array('src'));
    }
  }

  // Finds embed tags with links in the content.
  if (variable_get('linkchecker_extract_from_embed', 0) == 1) {
    $collect($html_dom->getElementsByTagName('embed'), array('src', 'pluginurl', 'pluginspage'));
  }

  // Finds iframe tags with links in the content.
  if (variable_get('linkchecker_extract_from_iframe', 0) == 1) {
    $collect($html_dom->getElementsByTagName('iframe'), array('src'));
  }

  // Finds img tags with links in the content.
  if (variable_get('linkchecker_extract_from_img', 0) == 1) {
    $collect($html_dom->getElementsByTagName('img'), array('src', 'longdesc'));
  }

  // Finds object/param tags with links in the content.
  if (variable_get('linkchecker_extract_from_object', 0) == 1) {
    // @todo
    // - Try to extract links in unknown "flashvars" values
    //   (e.g., file=http://, data=http://).
    $names = array('archive', 'filename', 'href', 'movie', 'src', 'url');
    $srcs = array('movie');

    foreach ($html_dom->getElementsByTagName('object') as $object) {
      $urls[] = $object->getAttribute('data');
      $urls[] = $object->getAttribute('codebase');

      // Finds param tags with links in the object tag.
      foreach ($object->getElementsByTagName('param') as $param) {
        if ($param->hasAttribute('name') && in_array($param->getAttribute('name'), $names)) {
          $urls[] = $param->getAttribute('value');
        }
        if ($param->hasAttribute('src') && in_array($param->getAttribute('src'), $srcs)) {
          $urls[] = $param->getAttribute('value');
        }
      }
    }
  }

  // Finds video tags with links in the content.
  if (variable_get('linkchecker_extract_from_video', 0) == 1) {
    foreach ($html_dom->getElementsByTagName('video') as $video) {
      $urls[] = $video->getAttribute('poster');
      $urls[] = $video->getAttribute('src');
      // Finds source and track tags with links in the video tag.
      $collect($video->getElementsByTagName('source'), array('src'));
      $collect($video->getElementsByTagName('track'), array('src'));
    }
  }

  // Remove empty values.
  $urls = array_filter($urls);
  // Remove duplicate urls.
  $urls = array_unique($urls);

  // What type of links should be checked?
  $linkchecker_check_links_types = variable_get('linkchecker_check_links_types', 1);

  // Full qualified url with base path of the content. It only depends on
  // $content_path, so it is computed once instead of once per local URL.
  $absolute_content_path = _linkchecker_absolute_content_path($content_path);

  $links = array();
  foreach ($urls as $url) {
    // Decode HTML links into plain text links.
    // DOMDocument->loadHTML does not provide the RAW url from code. All html
    // entities are already decoded.
    // @todo: Try to find a way to get the raw value.
    $url_decoded = $url;

    // Prefix protocol relative urls with a protocol to allow link checking.
    if (preg_match('!^//!', $url_decoded)) {
      $http_protocol = $is_https ? 'https' : 'http';
      $url_decoded = $http_protocol . ':' . $url_decoded;
    }

    // FIXME: #1149596 HACK - Encode spaces in URLs, so validation equals TRUE
    // and link gets added.
    $url_encoded = str_replace(' ', '%20', $url_decoded);

    // Full qualified URLs.
    if ($linkchecker_check_links_types != 2 && valid_url($url_encoded, TRUE)) {
      // Add to Array and change HTML links into plain text links.
      $links[$url_decoded][] = $url;
    }
    // Skip mailto:, javascript:, etc.
    elseif (preg_match('/^\w[\w.+]*:/', $url_decoded)) {
      continue;
    }
    // Local URLs. $linkchecker_check_links_types = 0 or 2
    elseif ($linkchecker_check_links_types != 1 && valid_url($url_encoded, FALSE)) {
      // Absolute local URLs need to start with [/].
      if (preg_match('!^/!', $url_decoded)) {
        // Add to Array and change HTML encoded links into plain text links.
        $links[$base_root . $url_decoded][] = $url;
      }
      // Anchors and URL parameters like "#foo" and "?foo=bar".
      elseif (!empty($content_path) && preg_match('!^[?#]!', $url_decoded)) {
        // Add to Array and change HTML encoded links into plain text links.
        $links[$content_path . $url_decoded][] = $url;
      }
      // Relative URLs like "./foo/bar" and "../foo/bar".
      elseif (!empty($absolute_content_path) && preg_match('!^\.{1,2}/!', $url_decoded)) {
        // Build the URI without hostname before the URI is normalized and
        // dot-segments will be removed. The hostname is added back after the
        // normalization has completed to prevent hostname removal by the regex.
        // This logic intentionally does not implement all the rules defined in
        // RFC 3986, section 5.2.4 to show broken links and over-dot-segmented
        // URIs; e.g., http://example.com/../../foo/bar.
        // For more information, see http://drupal.org/node/832388.
        $path = substr_replace($absolute_content_path . $url_decoded, '', 0, strlen($base_root));

        // Remove './' segments where possible.
        $path = str_replace('/./', '/', $path);

        // Remove '../' segments where possible. Loop until all segments are
        // removed. Taken over from _drupal_build_css_path() in common.inc.
        $last = '';
        while ($path != $last) {
          $last = $path;
          $path = preg_replace('`(^|/)(?!\.\./)([^/]+)/\.\./`', '$1', $path);
        }

        // Glue the hostname and path to full-qualified URI.
        $links[$base_root . $path][] = $url;
      }
      // Relative URLs like "test.png".
      elseif (!empty($absolute_content_path) && preg_match('!^[^/]!', $url_decoded)) {
        $links[$absolute_content_path . $url_decoded][] = $url;
      }
      else {
        // @todo Are there more special cases the module need to handle?
      }
    }
  }

  return $links;
}
/**
 * Replaces old link with new link in text.
 *
 * If $text is nothing but the old link (link fields), a plain string
 * replacement is made. Otherwise $text is loaded as HTML and the old link is
 * replaced in the link attributes of all tags enabled for link extraction.
 *
 * @param string $text
 *   The text a link is inside. Passed in as a reference.
 * @param string $old_link_fqdn
 *   The old link to search for in strings.
 * @param string $new_link_fqdn
 *   The old link should be overwritten with this new link.
 */
function _linkchecker_link_replace(&$text, $old_link_fqdn = '', $new_link_fqdn = '') {
  // Don't do any string replacement if one of the values is empty.
  if (empty($text) || empty($old_link_fqdn) || empty($new_link_fqdn)) {
    return;
  }

  // Remove protocols and hostname from local URLs. Without a known hostname
  // the base roots would be just "http://" and "https://", which would strip
  // the protocol from every URL; nothing is stripped in that case.
  $host = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';
  $base_roots = array();
  if ($host !== '') {
    $base_roots = array(
      drupal_strtolower('http://' . $host),
      drupal_strtolower('https://' . $host),
    );
  }
  $old_link = str_replace($base_roots, '', $old_link_fqdn);
  $new_link = str_replace($base_roots, '', $new_link_fqdn);

  // Build variables with all URLs and run check_url() only once.
  $old_html_link_fqdn = check_url($old_link_fqdn);
  $new_html_link_fqdn = check_url($new_link_fqdn);
  $old_html_link = check_url($old_link);
  $new_html_link = check_url($new_link);

  // Replace links in link fields and text and Links weblink fields.
  if (in_array($text, array($old_html_link_fqdn, $old_html_link, $old_link_fqdn, $old_link))) {
    // Keep old and new links in the same encoding and format and short or
    // fully qualified.
    $text = str_replace($old_html_link_fqdn, $new_html_link_fqdn, $text);
    $text = str_replace($old_html_link, $new_html_link, $text);
    $text = str_replace($old_link_fqdn, $new_link_fqdn, $text);
    $text = str_replace($old_link, $new_link, $text);
    return;
  }

  // Create an array of links with HTML decoded and encoded URLs. Attribute
  // values read from the DOM are already entity-decoded, so the raw forms are
  // compared as well as the HTML encoded ones.
  $old_links = array(
    $old_html_link_fqdn,
    $old_html_link,
    $old_link_fqdn,
    $old_link,
  );

  // Remove duplicate URLs from array if URLs do not have URL parameters.
  // If more than one URL parameter exists - one URL in the array will have
  // an unencoded ampersand "&" and a second URL will have an HTML encoded
  // ampersand "&amp;".
  $old_links = array_unique($old_links);

  // Load HTML code into DOM.
  $html_dom = filter_dom_load($text);

  // Sets every listed attribute that currently holds the old link to the new
  // link, for all elements of the given node list.
  $replace = function ($elements, array $attributes) use ($old_links, $new_html_link) {
    foreach ($elements as $element) {
      foreach ($attributes as $attribute) {
        if (in_array($element->getAttribute($attribute), $old_links, TRUE)) {
          $element->setAttribute($attribute, $new_html_link);
        }
      }
    }
  };

  // Finds all hyperlinks in the content.
  if (variable_get('linkchecker_extract_from_a', 1) == 1) {
    $anchors = $html_dom->getElementsByTagName('a');
    $replace($anchors, array('href'));
    foreach ($anchors as $anchor) {
      // Replace link text, if same like the URL. If a link text contains
      // other child tags like <img> it will be skipped.
      if (in_array($anchor->nodeValue, $old_links, TRUE)) {
        $anchor->nodeValue = $new_html_link;
      }
    }

    $replace($html_dom->getElementsByTagName('area'), array('href'));
  }

  // Finds all audio links in the content.
  if (variable_get('linkchecker_extract_from_audio', 0) == 1) {
    $audios = $html_dom->getElementsByTagName('audio');
    $replace($audios, array('src'));
    foreach ($audios as $audio) {
      // Finds source and track tags with links in the audio tag.
      $replace($audio->getElementsByTagName('source'), array('src'));
      $replace($audio->getElementsByTagName('track'), array('src'));
    }
  }

  // Finds embed tags with links in the content.
  if (variable_get('linkchecker_extract_from_embed', 0) == 1) {
    $replace($html_dom->getElementsByTagName('embed'), array('src', 'pluginurl', 'pluginspage'));
  }

  // Finds iframe tags with links in the content.
  if (variable_get('linkchecker_extract_from_iframe', 0) == 1) {
    $replace($html_dom->getElementsByTagName('iframe'), array('src'));
  }

  // Finds img tags with links in the content.
  if (variable_get('linkchecker_extract_from_img', 0) == 1) {
    $replace($html_dom->getElementsByTagName('img'), array('src', 'longdesc'));
  }

  // Finds object/param tags with links in the content.
  if (variable_get('linkchecker_extract_from_object', 0) == 1) {
    // @todo
    // - Try to replace links in unknown "flashvars" values
    //   (e.g., file=http://, data=http://).
    $names = array('archive', 'filename', 'href', 'movie', 'src', 'url');
    $srcs = array('movie');

    $objects = $html_dom->getElementsByTagName('object');
    $replace($objects, array('data', 'codebase'));
    foreach ($objects as $object) {
      // Finds param tags with links in the object tag.
      foreach ($object->getElementsByTagName('param') as $param) {
        $matches_name = $param->hasAttribute('name') && in_array($param->getAttribute('name'), $names);
        $matches_src = $param->hasAttribute('src') && in_array($param->getAttribute('src'), $srcs);
        if (($matches_name || $matches_src) && in_array($param->getAttribute('value'), $old_links, TRUE)) {
          $param->setAttribute('value', $new_html_link);
        }
      }
    }
  }

  // Finds video tags with links in the content.
  if (variable_get('linkchecker_extract_from_video', 0) == 1) {
    $videos = $html_dom->getElementsByTagName('video');
    $replace($videos, array('poster', 'src'));
    foreach ($videos as $video) {
      // Finds source and track tags with links in the video tag.
      $replace($video->getElementsByTagName('source'), array('src'));
      $replace($video->getElementsByTagName('track'), array('src'));
    }
  }

  // Set the updated $text for the calling function.
  $text = filter_dom_serialize($html_dom);
}
/**
 * Customized clone of core check_markup() with additional filter blacklist.
 *
 * See http://api.drupal.org/api/function/check_markup/7 for API documentation.
 *
 * @param string $text
 *   The text to be filtered.
 * @param string $format_id
 *   The id of the text format to use; the fallback format if omitted.
 * @param string $langcode
 *   The language code of the text; passed on to the filter callbacks.
 * @param bool $cache
 *   Whether the filtered text may be read from and written to the filter
 *   cache. Only honoured if the text format itself is cacheable.
 *
 * @return string
 *   The filtered text, or an empty string if there is no text or the text
 *   format does not exist.
 */
function _linkchecker_check_markup($text, $format_id = NULL, $langcode = '', $cache = FALSE) {
  if (!isset($text)) {
    return '';
  }

  if (!isset($format_id)) {
    $format_id = filter_fallback_format();
  }
  // If the requested text format does not exist, the text cannot be filtered.
  if (!$format = filter_format_load($format_id)) {
    linkchecker_watchdog_log('filter', 'Missing text format: %format.', array('%format' => $format_id), WATCHDOG_ALERT);
    return '';
  }

  // Check for a cached version of this piece of text.
  $cache = $cache && !empty($format->cache);
  $cache_id = '';
  if ($cache) {
    $cache_id = 'linkchecker:' . $format->format . ':' . $langcode . ':' . hash('sha256', $text);
    if ($cached = cache_get($cache_id, 'cache_filter')) {
      return $cached->data;
    }
  }

  // Convert all Windows and Mac newlines to a single newline, so filters only
  // need to deal with one possibility.
  $text = str_replace(array("\r\n", "\r"), "\n", $text);

  // Get a complete list of filters, ordered properly.
  $filters = filter_list_format($format->format);
  $filter_info = filter_get_filters();

  // Do not run placeholder or special tag filters used as references to nodes
  // like 'weblink' or 'weblinks' node types. If the original link node is
  // updated, all links are automatically up-to-date and there is no need to
  // notify about the broken link on all nodes having a link reference in
  // content. This would only confuse the authors as they may also not be able
  // to fix the source node of the reference.
  //
  // A list with string keys is read as "filter name => enabled value" and a
  // list with numeric keys (such as the built-in default) as plain filter
  // names. Taking the keys of the default list would yield numbers instead
  // of filter names.
  $blacklist_setting = variable_get('linkchecker_filter_blacklist', explode('|', LINKCHECKER_DEFAULT_FILTER_BLACKLIST));
  $filters_blacklist = array();
  foreach ((array) $blacklist_setting as $key => $value) {
    if (!empty($value)) {
      $filters_blacklist[] = is_int($key) ? (string) $value : $key;
    }
  }

  // Only enabled filters that are not blacklisted take part in filtering.
  $active_filters = array();
  foreach ($filters as $name => $filter) {
    if ($filter->status && !in_array((string) $name, $filters_blacklist, TRUE)) {
      $active_filters[$name] = $filter;
    }
  }

  // Give filters the chance to escape HTML-like data such as code or formulas.
  foreach ($active_filters as $name => $filter) {
    if (isset($filter_info[$name]['prepare callback']) && function_exists($filter_info[$name]['prepare callback'])) {
      $function = $filter_info[$name]['prepare callback'];
      $text = $function($text, $filter, $format, $langcode, $cache, $cache_id);
    }
  }

  // Perform filtering.
  foreach ($active_filters as $name => $filter) {
    if (isset($filter_info[$name]['process callback']) && function_exists($filter_info[$name]['process callback'])) {
      $function = $filter_info[$name]['process callback'];
      $text = $function($text, $filter, $format, $langcode, $cache, $cache_id);
    }
  }

  // Store in cache with a minimum expiration time of 1 day.
  if ($cache) {
    cache_set($cache_id, $text, 'cache_filter', REQUEST_TIME + (60 * 60 * 24));
  }

  return $text;
}
/**
 * Get the path of an URL.
 *
 * Everything after the last slash of the path (file name, query string and
 * fragment) is removed; e.g., "http://example.com/foo/bar.html?x=1" becomes
 * "http://example.com/foo/".
 *
 * @param string $url
 *   The http/https URL to parse.
 *
 * @return string|null
 *   Full qualified URL with absolute path of the URL, or NULL if the URL
 *   cannot be parsed, has no host or is not an http/https URL.
 */
function _linkchecker_absolute_content_path($url) {
  // Parse the URL and make sure we can handle the schema.
  $uri = @parse_url($url);
  if (empty($uri) || !isset($uri['scheme']) || !isset($uri['host'])) {
    return NULL;
  }

  // Break if the schema is not supported.
  if (!in_array($uri['scheme'], array('http', 'https'), TRUE)) {
    return NULL;
  }

  $credentials = '';
  if (isset($uri['user'])) {
    $credentials = $uri['user'] . (!empty($uri['pass']) ? ':' . $uri['pass'] : '') . '@';
  }

  // Keep an explicitly given port. Only the redundant default port of plain
  // http is left out; port 80 is not the default for https.
  $port = '';
  if (isset($uri['port']) && !($uri['scheme'] === 'http' && $uri['port'] == 80)) {
    $port = ':' . $uri['port'];
  }

  $path = isset($uri['path']) ? $uri['path'] : '/';

  // Glue the URL variables.
  $absolute_url = $uri['scheme'] . '://' . $credentials . $uri['host'] . $port . $path;

  // Find the last slash and remove all after the last slash to get the path.
  // strrpos() returns a byte offset, so the byte based substr() is used to
  // cut at the right position for paths with multi-byte characters, too.
  return substr($absolute_url, 0, strrpos($absolute_url, '/') + 1);
}
/**
 * Verifies against blacklists, if the link status should be checked or not.
 *
 * @param string $url
 *   The URL to verify.
 *
 * @return bool
 *   FALSE if the URL is not an http/https URL or contains one of the
 *   configured blacklist entries, otherwise TRUE.
 */
function _linkchecker_link_check_status_filter($url) {
  // Protocol whitelist check (without curl, only http/https is supported).
  if (!preg_match('/^(https?):\/\//i', $url)) {
    return FALSE;
  }

  // Is url in domain blacklist? The setting holds one entry per line.
  $urls = variable_get('linkchecker_disable_link_check_for_urls', LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS);
  if (!empty($urls)) {
    $patterns = array();
    foreach (preg_split('/(\r\n?|\n)/', $urls, -1, PREG_SPLIT_NO_EMPTY) as $entry) {
      $entry = trim($entry);
      // Skip blank lines: an empty alternative in the pattern below would
      // match every URL.
      if ($entry !== '') {
        $patterns[] = preg_quote($entry, '/');
      }
    }

    if (!empty($patterns) && preg_match('/' . implode('|', $patterns) . '/', $url)) {
      return FALSE;
    }
  }

  return TRUE;
}
/**
 * Defines the list of allowed response codes for form input validation.
 *
 * @param int $code
 *   A numeric response code.
 *
 * @return bool
 *   TRUE if the status code is valid, otherwise FALSE.
 */
function _linkchecker_isvalid_response_code($code) {
  // HTTP status codes accepted as input, keyed by code.
  $known_status_codes = array(
    // Informational.
    100 => 'Continue',
    101 => 'Switching Protocols',
    // Success.
    200 => 'OK',
    201 => 'Created',
    202 => 'Accepted',
    203 => 'Non-Authoritative Information',
    204 => 'No Content',
    205 => 'Reset Content',
    206 => 'Partial Content',
    // Redirection.
    300 => 'Multiple Choices',
    301 => 'Moved Permanently',
    302 => 'Found',
    303 => 'See Other',
    304 => 'Not Modified',
    305 => 'Use Proxy',
    307 => 'Temporary Redirect',
    // Client errors.
    400 => 'Bad Request',
    401 => 'Unauthorized',
    402 => 'Payment Required',
    403 => 'Forbidden',
    404 => 'Not Found',
    405 => 'Method Not Allowed',
    406 => 'Not Acceptable',
    407 => 'Proxy Authentication Required',
    408 => 'Request Time-out',
    409 => 'Conflict',
    410 => 'Gone',
    411 => 'Length Required',
    412 => 'Precondition Failed',
    413 => 'Request Entity Too Large',
    414 => 'Request-URI Too Large',
    415 => 'Unsupported Media Type',
    416 => 'Requested range not satisfiable',
    417 => 'Expectation Failed',
    // Server errors.
    500 => 'Internal Server Error',
    501 => 'Not Implemented',
    502 => 'Bad Gateway',
    503 => 'Service Unavailable',
    504 => 'Gateway Time-out',
    505 => 'HTTP Version not supported',
  );

  $is_known = array_key_exists($code, $known_status_codes);
  return $is_known;
}
/**
 * Return all content types enabled for link checking.
 *
 * @return array
 *   An array of node type names, keyed by the type.
 */
function linkchecker_scan_node_types() {
  $enabled_types = array();

  foreach (array_keys(node_type_get_names()) as $machine_name) {
    // Every node type has its own on/off setting.
    if (!variable_get('linkchecker_scan_node_' . $machine_name, FALSE)) {
      continue;
    }
    $enabled_types[$machine_name] = $machine_name;
  }

  return $enabled_types;
}
/**
 * Return all content types enabled for comment link checking.
 *
 * @return array
 *   An array of node type names, keyed by the type.
 */
function linkchecker_scan_comment_types() {
  $enabled_types = array();

  foreach (array_keys(node_type_get_names()) as $machine_name) {
    // Comment scanning is switched on or off per node type.
    if (!variable_get('linkchecker_scan_comment_' . $machine_name, FALSE)) {
      continue;
    }
    $enabled_types[$machine_name] = $machine_name;
  }

  return $enabled_types;
}
/**
 * Unpublishes all nodes having the specified link id.
 *
 * @param int $lid
 *   A link ID that have reached a defined failcount.
 */
function _linkchecker_unpublish_nodes($lid) {
  $result = db_query('SELECT nid FROM {linkchecker_node} WHERE lid = :lid', array(':lid' => $lid));
  foreach ($result as $row) {
    // Explicitly don't use node_load_multiple() or the module may run
    // into issues like http://drupal.org/node/1210606. With this logic
    // nodes can be updated until an out of memory occurs and further
    // updates will be made on the remaining nodes only.
    $node = node_load($row->nid);

    // The reference may point to a node that cannot be loaded (any more);
    // there is nothing to unpublish then.
    if (empty($node)) {
      continue;
    }

    $node->status = NODE_NOT_PUBLISHED;
    node_save($node);
    linkchecker_watchdog_log('linkchecker', 'Set @type %title to unpublished.', array('@type' => $node->type, '%title' => $node->title));
  }
}
/**
 * Load link as object.
 *
 * @param int $lid
 *   The link id.
 *
 * @return object
 *   The record of the link from the linkchecker_link table, as returned by
 *   fetchObject().
 */
function linkchecker_link_load($lid) {
  $arguments = array(':lid' => $lid);
  $statement = db_query('SELECT * FROM {linkchecker_link} WHERE lid = :lid', $arguments);

  return $statement->fetchObject();
}
/**
 * Checks if this entity is the default revision (published).
 *
 * @param object $entity
 *   The entity object, e.g., $node.
 *
 * @return bool
 *   TRUE if the entity is the default revision, FALSE otherwise.
 */
function _linkchecker_isdefaultrevision($entity) {
  // D7 "Forward revisioning" is complex and causes a node_save() with the
  // future node in node table. This fires hook_node_update() twice and cause
  // abnormal behaviour in linkchecker.
  //
  // The steps taken by Workbench Moderation is to save the forward revision
  // first and overwrite this with the live version in a shutdown function in
  // a second step. This will confuse linkchecker. D7 has no generic property
  // in the node object, if the node that is updated is the 'published' version
  // or only a draft of a future version.
  //
  // This behaviour will change in D8 where $node->isDefaultRevision has been
  // introduced. See below links for more details.
  // - http://drupal.org/node/1879482
  // - http://drupal.org/node/218755
  // - http://drupal.org/node/1522154
  //
  // Every moderation module saving a forward revision needs to return FALSE.
  // @todo: Refactor this workaround under D8.

  // Workbench Moderation module: without it, or for node types it does not
  // moderate, every save is treated as the default revision.
  if (!module_exists('workbench_moderation')) {
    return TRUE;
  }
  if (workbench_moderation_node_type_moderated($entity->type) !== TRUE) {
    return TRUE;
  }

  // For moderated types only the save flagged as updating the live revision
  // counts as the default revision.
  return !empty($entity->workbench_moderation['updating_live_revision']);
}
/**
 * Returns the language code of the given entity.
 *
 * Backward compatibility layer to ensure that installations running an older
 * version of core where entity_language() is not available do not break.
 *
 * @param string $entity_type
 *   An entity type.
 * @param object $entity
 *   An entity object.
 *
 * @return string
 *   The entity language code.
 */
function linkchecker_entity_language($entity_type, $entity) {
  // Prefer the core API where it exists.
  if (function_exists('entity_language')) {
    return entity_language($entity_type, $entity);
  }

  // Otherwise fall back to the language property of the entity, if any.
  return !empty($entity->language) ? $entity->language : NULL;
}
/**
 * Return all the values of one-dimensional and multidimensional arrays.
 *
 * @param array $array
 *   The array to flatten; nested arrays are descended into.
 *
 * @return array
 *   Returns all the values from the input array and indexes the array numerically.
 */
function _linkchecker_array_values_recursive(array $array) {
  $flattened = array();

  foreach ($array as $element) {
    if (!is_array($element)) {
      $flattened[] = $element;
      continue;
    }

    // Append the values of the nested array in their original order.
    foreach (_linkchecker_array_values_recursive($element) as $nested_value) {
      $flattened[] = $nested_value;
    }
  }

  return $flattened;
}