3 * Zend Framework (http://framework.zend.com/)
5 * @link http://github.com/zendframework/zf2 for the canonical source repository
6 * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
7 * @license http://framework.zend.com/license/new-bsd New BSD License
10 namespace Zend\Feed\Reader;
14 use Zend\Cache\Storage\StorageInterface as CacheStorage;
15 use Zend\Feed\Reader\Exception\InvalidHttpClientException;
16 use Zend\Http as ZendHttp;
17 use Zend\Stdlib\ErrorHandler;
21 class Reader implements ReaderImportInterface
26 const NAMESPACE_ATOM_03 = 'http://purl.org/atom/ns#';
27 const NAMESPACE_ATOM_10 = 'http://www.w3.org/2005/Atom';
28 const NAMESPACE_RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
29 const NAMESPACE_RSS_090 = 'http://my.netscape.com/rdf/simple/0.9/';
30 const NAMESPACE_RSS_10 = 'http://purl.org/rss/1.0/';
35 const TYPE_ANY = 'any';
36 const TYPE_ATOM_03 = 'atom-03';
37 const TYPE_ATOM_10 = 'atom-10';
38 const TYPE_ATOM_10_ENTRY = 'atom-10-entry';
39 const TYPE_ATOM_ANY = 'atom';
40 const TYPE_RSS_090 = 'rss-090';
41 const TYPE_RSS_091 = 'rss-091';
42 const TYPE_RSS_091_NETSCAPE = 'rss-091n';
43 const TYPE_RSS_091_USERLAND = 'rss-091u';
44 const TYPE_RSS_092 = 'rss-092';
45 const TYPE_RSS_093 = 'rss-093';
46 const TYPE_RSS_094 = 'rss-094';
47 const TYPE_RSS_10 = 'rss-10';
48 const TYPE_RSS_20 = 'rss-20';
49 const TYPE_RSS_ANY = 'rss';
56 protected static $cache = null;
59 * HTTP client object to use for retrieving feeds
61 * @var Http\ClientInterface
63 protected static $httpClient = null;
66 * Override HTTP PUT and DELETE request methods?
70 protected static $httpMethodOverride = false;
72 protected static $httpConditionalGet = false;
74 protected static $extensionManager = null;
76 protected static $extensions = [
/**
 * Retrieve the cache storage shared by the static Reader API.
 *
 * @return CacheStorage|null The storage registered through setCache(), or
 *                           null while none has been configured
 */
public static function getCache()
{
    $configuredCache = static::$cache;

    return $configuredCache;
}
/**
 * Register the cache storage that import() consults for feed bodies and
 * for the stored ETag / Last-Modified values.
 *
 * @param  CacheStorage $cache
 * @return void
 */
public static function setCache(CacheStorage $cache)
{
    static::$cache = $cache;
}
/**
 * Set the HTTP client instance
 *
 * Sets the HTTP client object to use for retrieving the feeds. A plain
 * ZendHttp\Client is wrapped in Http\ZendHttpClientDecorator so that the
 * stored client always implements Http\ClientInterface.
 *
 * @param  ZendHttp\Client|Http\ClientInterface $httpClient
 * @return void
 * @throws InvalidHttpClientException if the argument is neither of the accepted types
 */
public static function setHttpClient($httpClient)
{
    // Adapt raw Zend HTTP clients; anything else is passed through for validation.
    $candidate = $httpClient instanceof ZendHttp\Client
        ? new Http\ZendHttpClientDecorator($httpClient)
        : $httpClient;

    if ($candidate instanceof Http\ClientInterface) {
        static::$httpClient = $candidate;
        return;
    }

    throw new InvalidHttpClientException();
}
/**
 * Gets the HTTP client object.
 *
 * When no client has been set yet, a decorated ZendHttp\Client is created,
 * stored for subsequent calls, and returned.
 *
 * @return Http\ClientInterface
 */
public static function getHttpClient()
{
    if (static::$httpClient) {
        return static::$httpClient;
    }

    // Lazily build the default client on first use.
    $defaultClient       = new Http\ZendHttpClientDecorator(new ZendHttp\Client());
    static::$httpClient  = $defaultClient;

    return static::$httpClient;
}
/**
 * Toggle using POST instead of PUT and DELETE HTTP methods
 *
 * Some feed implementations do not accept PUT and DELETE HTTP methods, or
 * they can't be used because of proxies or other measures. Enabling the
 * override asks for POST to be used where PUT and DELETE would normally be
 * used, together with an X-Method-Override header carrying the intended
 * method. This method only records the flag; the value is stored as given.
 *
 * @param  bool $override Whether to override PUT and DELETE.
 * @return void
 */
public static function setHttpMethodOverride($override = true)
{
    static::$httpMethodOverride = $override;
}
/**
 * Get the HTTP override state
 *
 * @return bool The value last passed to setHttpMethodOverride(); false by default
 */
public static function getHttpMethodOverride()
{
    $overrideState = static::$httpMethodOverride;

    return $overrideState;
}
/**
 * Set the flag indicating whether or not to use HTTP conditional GET
 *
 * import() only takes the conditional-GET path when this flag is truthy
 * and a cache has been configured.
 *
 * @param  bool $bool
 * @return void
 */
public static function useHttpConditionalGet($bool = true)
{
    static::$httpConditionalGet = $bool;
}
/**
 * Import a feed by providing a URI
 *
 * Depending on configuration one of three paths is taken:
 *  - conditional GET enabled and a cache configured: cached ETag /
 *    Last-Modified values are sent (when the client can emit headers), a 304
 *    response is answered from the cache, and a 200 response refreshes it;
 *  - cache configured only: a cached body is used when present, otherwise
 *    the feed is fetched and stored;
 *  - no cache: the feed is fetched and the URI is recorded on the returned
 *    reader via setOriginalSourceUri().
 *
 * @param  string $uri The URI to the feed
 * @param  string $etag OPTIONAL Last received ETag for this resource
 * @param  string $lastModified OPTIONAL Last-Modified value for this resource
 * @return Feed\FeedInterface
 * @throws Exception\RuntimeException if the response status code is not acceptable
 */
public static function import($uri, $etag = null, $lastModified = null)
{
    // Late static binding, consistent with the other static accessors of this class.
    $cache   = static::getCache();
    $client  = static::getHttpClient();
    $cacheId = 'Zend_Feed_Reader_' . md5($uri);

    if (static::$httpConditionalGet && $cache) {
        $headers = [];
        $data    = $cache->getItem($cacheId);
        if ($data && $client instanceof Http\HeaderAwareClientInterface) {
            // Only check for ETag and last modified values in the cache
            // if we have a client capable of emitting headers in the first place.
            if ($etag === null) {
                $etag = $cache->getItem($cacheId . '_etag');
            }
            if ($lastModified === null) {
                $lastModified = $cache->getItem($cacheId . '_lastmodified');
            }
            if ($etag) {
                $headers['If-None-Match'] = [$etag];
            }
            if ($lastModified) {
                $headers['If-Modified-Since'] = [$lastModified];
            }
        }

        $response = $client->get($uri, $headers);

        // Normalise once: clients may report the status as a numeric string,
        // which the other branches already tolerate through an (int) cast.
        $statusCode = (int) $response->getStatusCode();
        if ($statusCode !== 200 && $statusCode !== 304) {
            throw new Exception\RuntimeException(
                'Feed failed to load, got response code ' . $response->getStatusCode()
            );
        }

        if ($statusCode === 304) {
            // Not modified: the cached copy is still current.
            $responseXml = $data;
        } else {
            $responseXml = $response->getBody();
            $cache->setItem($cacheId, $responseXml);

            // Remember the validators for the next conditional request.
            if ($response instanceof Http\HeaderAwareResponseInterface) {
                if ($response->getHeaderLine('ETag', false)) {
                    $cache->setItem($cacheId . '_etag', $response->getHeaderLine('ETag'));
                }
                if ($response->getHeaderLine('Last-Modified', false)) {
                    $cache->setItem($cacheId . '_lastmodified', $response->getHeaderLine('Last-Modified'));
                }
            }
        }

        return static::importString($responseXml);
    }

    if ($cache) {
        $data = $cache->getItem($cacheId);
        if ($data) {
            return static::importString($data);
        }

        $response = $client->get($uri);
        if ((int) $response->getStatusCode() !== 200) {
            throw new Exception\RuntimeException(
                'Feed failed to load, got response code ' . $response->getStatusCode()
            );
        }

        $responseXml = $response->getBody();
        $cache->setItem($cacheId, $responseXml);

        return static::importString($responseXml);
    }

    $response = $client->get($uri);
    if ((int) $response->getStatusCode() !== 200) {
        throw new Exception\RuntimeException(
            'Feed failed to load, got response code ' . $response->getStatusCode()
        );
    }

    $reader = static::importString($response->getBody());
    $reader->setOriginalSourceUri($uri);

    return $reader;
}
/**
 * Import a feed from a remote URI
 *
 * Performs similarly to import(), except it uses the HTTP client passed to
 * the method, and does not take into account cached data.
 *
 * Primary purpose is to make it possible to use the Reader with alternate
 * HTTP client implementations.
 *
 * @param  string $uri
 * @param  Http\ClientInterface $client
 * @return Feed\FeedInterface
 * @throws Exception\RuntimeException if response is not an Http\ResponseInterface
 *                                    or its status code is not 200
 */
public static function importRemoteFeed($uri, Http\ClientInterface $client)
{
    $response = $client->get($uri);

    if (! $response instanceof Http\ResponseInterface) {
        // Describe what was actually received to ease debugging of custom clients.
        $receivedType = is_object($response) ? get_class($response) : gettype($response);
        $message      = sprintf(
            'Did not receive a %s\Http\ResponseInterface from the provided HTTP client; received "%s"',
            __NAMESPACE__,
            $receivedType
        );

        throw new Exception\RuntimeException($message);
    }

    $succeeded = (int) $response->getStatusCode() === 200;
    if (! $succeeded) {
        throw new Exception\RuntimeException(
            'Feed failed to load, got response code ' . $response->getStatusCode()
        );
    }

    $feedReader = static::importString($response->getBody());
    $feedReader->setOriginalSourceUri($uri);

    return $feedReader;
}
/**
 * Import a feed from a string
 *
 * The string is parsed with DOMDocument, documents carrying a DOCTYPE are
 * rejected, the feed type is detected, the core extensions are registered
 * and a reader matching the detected type is returned.
 *
 * @param  string $string
 * @return Feed\FeedInterface
 * @throws Exception\InvalidArgumentException if the input is not a non-empty
 *                                            string or contains a DOCTYPE
 * @throws Exception\RuntimeException if the XML cannot be parsed or is not a
 *                                    supported feed type
 */
public static function importString($string)
{
    // Check the type before trimming so that non-string input yields the
    // documented exception instead of an error raised by trim().
    $trimmed = is_string($string) ? trim($string) : '';
    if (empty($trimmed)) {
        throw new Exception\InvalidArgumentException('Only non empty strings are allowed as input');
    }

    $libxmlErrflag = libxml_use_internal_errors(true);

    // libxml >= 2.9.0 no longer loads external entities by default, and
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so the
    // loader is only toggled where it is still needed.
    $toggleEntityLoader = LIBXML_VERSION < 20900;
    $oldValue           = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

    $dom    = new DOMDocument;
    $status = $dom->loadXML($trimmed);

    // Restore the global libxml state before any exception can be thrown,
    // so a rejected document does not leave it altered for later callers.
    if ($toggleEntityLoader) {
        libxml_disable_entity_loader($oldValue);
    }
    libxml_use_internal_errors($libxmlErrflag);

    foreach ($dom->childNodes as $child) {
        if ($child->nodeType === XML_DOCUMENT_TYPE_NODE) {
            throw new Exception\InvalidArgumentException(
                'Invalid XML: Detected use of illegal DOCTYPE'
            );
        }
    }

    if (! $status) {
        // Build error message
        $error = libxml_get_last_error();
        if ($error && $error->message) {
            $error->message = trim($error->message);
            $errormsg       = "DOMDocument cannot parse XML: {$error->message}";
        } else {
            $errormsg = "DOMDocument cannot parse XML: Please check the XML document's validity";
        }
        throw new Exception\RuntimeException($errormsg);
    }

    $type = static::detectType($dom);

    static::registerCoreExtensions();

    if (substr($type, 0, 3) == 'rss') {
        $reader = new Feed\Rss($dom, $type);
    } elseif (substr($type, 8, 5) == 'entry') {
        // 'atom-10-entry': a standalone Atom entry document.
        $reader = new Entry\Atom($dom->documentElement, 0, self::TYPE_ATOM_10);
    } elseif (substr($type, 0, 4) == 'atom') {
        $reader = new Feed\Atom($dom, $type);
    } else {
        throw new Exception\RuntimeException('The URI used does not point to a '
        . 'valid Atom, RSS or RDF feed that Zend\Feed\Reader can parse.');
    }

    return $reader;
}
/**
 * Imports a feed from a file located at $filename.
 *
 * The file is read while ErrorHandler is active; whatever ErrorHandler::stop()
 * returns is attached as the previous exception when reading fails.
 *
 * @param  string $filename
 * @throws Exception\RuntimeException if the file contents cannot be read
 * @return Feed\FeedInterface
 */
public static function importFile($filename)
{
    ErrorHandler::start();
    $contents  = file_get_contents($filename);
    $readError = ErrorHandler::stop();

    if (false !== $contents) {
        return static::importString($contents);
    }

    throw new Exception\RuntimeException("File '{$filename}' could not be loaded", 0, $readError);
}
/**
 * Find feed links
 *
 * Fetches the document at $uri with the configured HTTP client, parses it
 * as HTML and hands every <link> element to a FeedSet.
 *
 * @param  string $uri
 * @return FeedSet
 * @throws Exception\RuntimeException if the response status code is not 200
 *                                    or the HTML cannot be parsed
 */
public static function findFeedLinks($uri)
{
    $client   = static::getHttpClient();
    $response = $client->get($uri);

    // Cast like the other fetch paths do, so a numeric-string status is accepted.
    if ((int) $response->getStatusCode() !== 200) {
        throw new Exception\RuntimeException(
            "Failed to access $uri, got response code " . $response->getStatusCode()
        );
    }

    $responseHtml  = $response->getBody();
    $libxmlErrflag = libxml_use_internal_errors(true);

    // libxml >= 2.9.0 no longer loads external entities by default, and
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so the
    // loader is only toggled where it is still needed.
    $toggleEntityLoader = LIBXML_VERSION < 20900;
    $oldValue           = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

    $dom    = new DOMDocument;
    $status = $dom->loadHTML(trim($responseHtml));

    if ($toggleEntityLoader) {
        libxml_disable_entity_loader($oldValue);
    }
    libxml_use_internal_errors($libxmlErrflag);

    if (! $status) {
        // Build error message
        $error = libxml_get_last_error();
        if ($error && $error->message) {
            $error->message = trim($error->message);
            $errormsg       = "DOMDocument cannot parse HTML: {$error->message}";
        } else {
            $errormsg = "DOMDocument cannot parse HTML: Please check the XML document's validity";
        }
        throw new Exception\RuntimeException($errormsg);
    }

    $feedSet = new FeedSet;
    $links   = $dom->getElementsByTagName('link');
    $feedSet->addLinks($links, $uri);

    return $feedSet;
}
/**
 * Detect the feed type of the provided feed
 *
 * Checks are applied in this order: RSS (by the version attribute on
 * /rss), RDF-based RSS 1.0 then 0.90, Atom 1.0 feed, Atom 1.0 entry and
 * finally Atom 0.3.
 *
 * @param  Feed\AbstractFeed|DOMDocument|string $feed
 * @param  bool $specOnly When truthy, a document containing only Atom 1.0
 *                        entries is reported as TYPE_ATOM_10 instead of
 *                        TYPE_ATOM_10_ENTRY
 * @return string One of the TYPE_* constants; TYPE_ANY when nothing matched
 * @throws Exception\InvalidArgumentException if $feed is of an unsupported
 *                                            type or contains a DOCTYPE
 * @throws Exception\RuntimeException if a string feed cannot be parsed
 */
public static function detectType($feed, $specOnly = false)
{
    if ($feed instanceof Feed\AbstractFeed) {
        $dom = $feed->getDomDocument();
    } elseif ($feed instanceof DOMDocument) {
        $dom = $feed;
    } elseif (is_string($feed) && ! empty($feed)) {
        ErrorHandler::start(E_NOTICE | E_WARNING);

        // libxml >= 2.9.0 no longer loads external entities by default, and
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so the
        // loader is only toggled where it is still needed.
        $toggleEntityLoader = LIBXML_VERSION < 20900;
        $oldValue           = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

        $dom    = new DOMDocument;
        $status = $dom->loadXML($feed);

        // Undo the global changes before any exception is thrown; otherwise
        // a rejected document would leave the error handler started and the
        // entity loader disabled.
        if ($toggleEntityLoader) {
            libxml_disable_entity_loader($oldValue);
        }
        // The handler's result replaces the former track_errors lookup,
        // which never populated the message variable.
        $parseError = ErrorHandler::stop();

        foreach ($dom->childNodes as $child) {
            if ($child->nodeType === XML_DOCUMENT_TYPE_NODE) {
                throw new Exception\InvalidArgumentException(
                    'Invalid XML: Detected use of illegal DOCTYPE'
                );
            }
        }

        if (! $status) {
            if ($parseError) {
                $phpErrormsg = $parseError->getMessage();
            } elseif (function_exists('xdebug_is_enabled')) {
                $phpErrormsg = '(error message not available, when XDebug is running)';
            } else {
                $phpErrormsg = '(error message not available)';
            }
            throw new Exception\RuntimeException(
                "DOMDocument cannot parse XML: $phpErrormsg",
                0,
                $parseError
            );
        }
    } else {
        throw new Exception\InvalidArgumentException('Invalid object/scalar provided: must'
        . ' be of type Zend\Feed\Reader\Feed, DomDocument or string');
    }

    $xpath = new DOMXPath($dom);

    if ($xpath->query('/rss')->length) {
        $type    = self::TYPE_RSS_ANY;
        $version = $xpath->evaluate('string(/rss/@version)');

        if (strlen($version) > 0) {
            // switch keeps PHP's loose comparison of the version string.
            switch ($version) {
                case '2.0':
                    $type = self::TYPE_RSS_20;
                    break;

                case '0.94':
                    $type = self::TYPE_RSS_094;
                    break;

                case '0.93':
                    $type = self::TYPE_RSS_093;
                    break;

                case '0.92':
                    $type = self::TYPE_RSS_092;
                    break;

                case '0.91':
                    $type = self::TYPE_RSS_091;
                    break;
            }
        }

        return $type;
    }

    $xpath->registerNamespace('rdf', self::NAMESPACE_RDF);

    if ($xpath->query('/rdf:RDF')->length) {
        // The same 'rss' prefix is bound first to the RSS 1.0 namespace and
        // then to the RSS 0.90 namespace, so the queries below are identical.
        $xpath->registerNamespace('rss', self::NAMESPACE_RSS_10);

        if ($xpath->query('/rdf:RDF/rss:channel')->length
            || $xpath->query('/rdf:RDF/rss:image')->length
            || $xpath->query('/rdf:RDF/rss:item')->length
            || $xpath->query('/rdf:RDF/rss:textinput')->length
        ) {
            return self::TYPE_RSS_10;
        }

        $xpath->registerNamespace('rss', self::NAMESPACE_RSS_090);

        if ($xpath->query('/rdf:RDF/rss:channel')->length
            || $xpath->query('/rdf:RDF/rss:image')->length
            || $xpath->query('/rdf:RDF/rss:item')->length
            || $xpath->query('/rdf:RDF/rss:textinput')->length
        ) {
            return self::TYPE_RSS_090;
        }
    }

    $xpath->registerNamespace('atom', self::NAMESPACE_ATOM_10);

    if ($xpath->query('//atom:feed')->length) {
        return self::TYPE_ATOM_10;
    }

    if ($xpath->query('//atom:entry')->length) {
        if ($specOnly == true) {
            return self::TYPE_ATOM_10;
        } else {
            return self::TYPE_ATOM_10_ENTRY;
        }
    }

    $xpath->registerNamespace('atom', self::NAMESPACE_ATOM_03);

    if ($xpath->query('//atom:feed')->length) {
        return self::TYPE_ATOM_03;
    }

    return self::TYPE_ANY;
}
/**
 * Set plugin manager for use with Extensions
 *
 * @param  ExtensionManagerInterface $extensionManager
 * @return void
 */
public static function setExtensionManager(ExtensionManagerInterface $extensionManager)
{
    static::$extensionManager = $extensionManager;
}
/**
 * Get plugin manager for use with Extensions
 *
 * When none has been set, a StandaloneExtensionManager is installed through
 * setExtensionManager() and returned.
 *
 * @return ExtensionManagerInterface
 */
public static function getExtensionManager()
{
    if (isset(static::$extensionManager)) {
        return static::$extensionManager;
    }

    // First access without explicit configuration: fall back to the standalone manager.
    static::setExtensionManager(new StandaloneExtensionManager());

    return static::$extensionManager;
}
/**
 * Register an Extension by name
 *
 * The name is expanded to "<name>\Feed" and "<name>\Entry"; whichever of
 * the two the extension manager knows is appended to the matching list.
 * Nothing happens when the extension is already registered and still known
 * to the manager.
 *
 * @param  string $name
 * @return void
 * @throws Exception\RuntimeException if unable to resolve Extension class
 */
public static function registerExtension($name)
{
    $feedExtension  = $name . '\Feed';
    $entryExtension = $name . '\Entry';
    $plugins        = static::getExtensionManager();

    // Already registered and resolvable: nothing left to do.
    if (static::isRegistered($name)
        && ($plugins->has($feedExtension) || $plugins->has($entryExtension))
    ) {
        return;
    }

    if (! ($plugins->has($feedExtension) || $plugins->has($entryExtension))) {
        throw new Exception\RuntimeException('Could not load extension: ' . $name
            . ' using Plugin Loader. Check prefix paths are configured and extension exists.');
    }

    if ($plugins->has($feedExtension)) {
        static::$extensions['feed'][] = $feedExtension;
    }

    if ($plugins->has($entryExtension)) {
        static::$extensions['entry'][] = $entryExtension;
    }
}
/**
 * Is a given named Extension registered?
 *
 * An extension counts as registered when either its "<name>\Feed" or its
 * "<name>\Entry" class name is present in the corresponding list.
 *
 * @param  string $extensionName
 * @return bool
 */
public static function isRegistered($extensionName)
{
    $feedName  = $extensionName . '\Feed';
    $entryName = $extensionName . '\Entry';

    // Both needles are strings, so strict comparison is the exact match intended.
    return in_array($feedName, static::$extensions['feed'], true)
        || in_array($entryName, static::$extensions['entry'], true);
}
/**
 * Get a list of extensions
 *
 * @return array The extension class-name lists, grouped by key, as
 *               currently held by the Reader
 */
public static function getExtensions()
{
    $registered = static::$extensions;

    return $registered;
}
630 * Reset class state to defaults
634 public static function reset()
636 static::$cache = null;
637 static::$httpClient = null;
638 static::$httpMethodOverride = false;
639 static::$httpConditionalGet = false;
640 static::$extensionManager = null;
641 static::$extensions = [
/**
 * Register core (default) extensions
 *
 * Each name is passed to registerExtension() in the order listed; an
 * exception from any registration stops the sequence.
 *
 * @return void
 */
protected static function registerCoreExtensions()
{
    $coreExtensionNames = [
        'DublinCore',
        'Content',
        'Atom',
        'Slash',
        'WellFormedWeb',
        'Thread',
        'Podcast',
    ];

    foreach ($coreExtensionNames as $coreExtensionName) {
        static::registerExtension($coreExtensionName);
    }
}
678 * Utility method to apply array_unique operation to a multidimensional
684 public static function arrayUnique(array $array)
686 foreach ($array as &$value) {
687 $value = serialize($value);
689 $array = array_unique($array);
690 foreach ($array as &$value) {
691 $value = unserialize($value);