rss-bridge/lib/contents.php

<?php

final class HttpException extends \Exception {}

// todo: move this somewhere useful, possibly into a function
const RSSBRIDGE_HTTP_STATUS_CODES = [
	'100' => 'Continue',
	'101' => 'Switching Protocols',
	'200' => 'OK',
	'201' => 'Created',
	'202' => 'Accepted',
	'203' => 'Non-Authoritative Information',
	'204' => 'No Content',
	'205' => 'Reset Content',
	'206' => 'Partial Content',
	'300' => 'Multiple Choices',
	'302' => 'Found',
	'303' => 'See Other',
	'304' => 'Not Modified',
	'305' => 'Use Proxy',
	'400' => 'Bad Request',
	'401' => 'Unauthorized',
	'402' => 'Payment Required',
	'403' => 'Forbidden',
	'404' => 'Not Found',
	'405' => 'Method Not Allowed',
	'406' => 'Not Acceptable',
	'407' => 'Proxy Authentication Required',
	'408' => 'Request Timeout',
	'409' => 'Conflict',
	'410' => 'Gone',
	'411' => 'Length Required',
	'412' => 'Precondition Failed',
	'413' => 'Request Entity Too Large',
	'414' => 'Request-URI Too Long',
	'415' => 'Unsupported Media Type',
	'416' => 'Requested Range Not Satisfiable',
	'417' => 'Expectation Failed',
	'429' => 'Too Many Requests',
	'500' => 'Internal Server Error',
	'501' => 'Not Implemented',
	'502' => 'Bad Gateway',
	'503' => 'Service Unavailable',
	'504' => 'Gateway Timeout',
	'505' => 'HTTP Version Not Supported'
];

/**
 * Fetch data from an http url
 *
 * @param array $httpHeaders E.g. ['Content-type: text/plain']
 * @param array $curlOptions Associative array e.g. [CURLOPT_MAXREDIRS => 3]
 * @param bool $returnFull Whether to return an array:
 *                         [
 *                              'code' => int,
 *                              'header' => array,
 *                              'content' => string,
 *                              'status_lines' => array,
 *                         ]

 * @return string|array
 */
function getContents(
	string $url,
	array $httpHeaders = [],
	array $curlOptions = [],
	bool $returnFull = false
) {
	$cacheFactory = new CacheFactory();
	$cacheFactory->setWorkingDir(PATH_LIB_CACHES);
	$cache = $cacheFactory->create(Configuration::getConfig('cache', 'type'));
	$cache->setScope('server');
	$cache->purgeCache(86400); // 24 hours (forced)
	$cache->setKey([$url]);

	$config = [
		'headers' => $httpHeaders,
		'curl_options' => $curlOptions,
	];
	if (defined('PROXY_URL') && !defined('NOPROXY')) {
		$config['proxy'] = PROXY_URL;
	}
	if(!Debug::isEnabled() && $cache->getTime()) {
		$config['if_not_modified_since'] = $cache->getTime();
	}

	$result = _http_request($url, $config);
	$response = [
		'code' => $result['code'],
		'status_lines' => $result['status_lines'],
		'header' => $result['headers'],
		'content' => $result['body'],
	];

	switch($result['code']) {
		case 200:
		case 201:
		case 202:
			if(isset($result['headers']['cache-control'])) {
				$cachecontrol = $result['headers']['cache-control'];
				$lastValue = array_pop($cachecontrol);
				$directives = explode(',', $lastValue);
				$directives = array_map('trim', $directives);
				if(in_array('no-cache', $directives) || in_array('no-store', $directives)) {
					// Don't cache as instructed by the server
					break;
				}
			}
			$cache->saveData($result['body']);
			break;
		case 304: // Not Modified
			$response['content'] = $cache->loadData();
			break;
		default:
			throw new HttpException(
				sprintf(
					'%s %s',
					$result['code'],
					RSSBRIDGE_HTTP_STATUS_CODES[$result['code']] ?? ''
				),
				$result['code']
			);
	}
	if ($returnFull === true) {
		return $response;
	}
	return $response['content'];
}

/**
 * Private function used internally
 *
 * Fetch content from url
 *
 * @throws HttpException
 */
function _http_request(string $url, array $config = []): array
{
	$defaults = [
		'useragent' => Configuration::getConfig('http', 'useragent'),
		'timeout' => Configuration::getConfig('http', 'timeout'),
		'headers' => [],
		'proxy' => null,
		'curl_options' => [],
		'if_not_modified_since' => null,
		'retries' => 3,
	];
	$config = array_merge($defaults, $config);

	$ch = curl_init($url);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
	curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
	curl_setopt($ch, CURLOPT_HEADER, false);
	curl_setopt($ch, CURLOPT_HTTPHEADER, $config['headers']);
	curl_setopt($ch, CURLOPT_USERAGENT, $config['useragent']);
	curl_setopt($ch, CURLOPT_TIMEOUT, $config['timeout']);
	curl_setopt($ch, CURLOPT_ENCODING, '');
	curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
	if($config['proxy']) {
		curl_setopt($ch, CURLOPT_PROXY, $config['proxy']);
	}
	if (curl_setopt_array($ch, $config['curl_options']) === false) {
		throw new \Exception('Tried to set an illegal curl option');
	}

	if ($config['if_not_modified_since']) {
		curl_setopt($ch, CURLOPT_TIMEVALUE, $config['if_not_modified_since']);
		curl_setopt($ch, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
	}

	$responseStatusLines = [];
	$responseHeaders = [];
	curl_setopt($ch, CURLOPT_HEADERFUNCTION, function ($ch, $rawHeader) use (&$responseHeaders, &$responseStatusLines) {
		$len = strlen($rawHeader);
		if ($rawHeader === "\r\n") {
			return $len;
		}
		if (preg_match('#^HTTP/(2|1.1|1.0)#', $rawHeader)) {
			$responseStatusLines[] = $rawHeader;
			return $len;
		}
		$header = explode(':', $rawHeader);
		if (count($header) === 1) {
			return $len;
		}
		$name = mb_strtolower(trim($header[0]));
		$value = trim(implode(':', array_slice($header, 1)));
		if (!isset($responseHeaders[$name])) {
			$responseHeaders[$name] = [];
		}
		$responseHeaders[$name][] = $value;
		return $len;
	});

	$attempts = 0;
	while(true) {
		$attempts++;
		$data = curl_exec($ch);
		if ($data !== false) {
			// The network call was successful, so break out of the loop
			break;
		}
		if ($attempts > $config['retries']) {
			// Finally give up
			throw new HttpException(sprintf('%s (%s)', curl_error($ch), curl_errno($ch)));
		}
	}

	$statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
	curl_close($ch);
	return [
		'code'      => $statusCode,
		'status_lines' => $responseStatusLines,
		'headers'   => $responseHeaders,
		'body'      => $data,
	];
}

/**
 * Gets contents from the Internet as simplhtmldom object.
 *
 * @param string $url The URL.
 * @param array $header (optional) A list of cURL header.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Specifies the replacement text for `<br>` tags
 * when returning plaintext.
 * @param string $defaultSpanText Specifies the replacement text for `<span />`
 * tags when returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOM($url,
	$header = array(),
	$opts = array(),
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT){

	$content = getContents(
		$url,
		$header ?? [],
		$opts ?? []
	);
	return str_get_html($content,
	$lowercase,
	$forceTagsClosed,
	$target_charset,
	$stripRN,
	$defaultBRText,
	$defaultSpanText);
}

/**
 * Gets contents from the Internet as simplhtmldom object. Contents are cached
 * and re-used for subsequent calls until the cache duration elapsed.
 *
 * _Notice_: Cached contents are forcefully removed after 24 hours (86400 seconds).
 *
 * @param string $url The URL.
 * @param int $duration Cache duration in seconds.
 * @param array $header (optional) A list of cURL header.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Specifies the replacement text for `<br>` tags
 * when returning plaintext.
 * @param string $defaultSpanText Specifies the replacement text for `<span />`
 * tags when returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOMCached($url,
	$duration = 86400,
	$header = array(),
	$opts = array(),
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT){

	Debug::log('Caching url ' . $url . ', duration ' . $duration);

	// Initialize cache
	$cacheFac = new CacheFactory();
	$cacheFac->setWorkingDir(PATH_LIB_CACHES);
	$cache = $cacheFac->create(Configuration::getConfig('cache', 'type'));
	$cache->setScope('pages');
	$cache->purgeCache(86400); // 24 hours (forced)

	$params = array($url);
	$cache->setKey($params);

	// Determine if cached file is within duration
	$time = $cache->getTime();
	if($time !== false
	&& (time() - $duration < $time)
	&& !Debug::isEnabled()) { // Contents within duration
		$content = $cache->loadData();
	} else { // Content not within duration
		$content = getContents(
			$url,
			$header ?? [],
			$opts ?? []
		);
		if($content !== false) {
			$cache->saveData($content);
		}
	}

	return str_get_html($content,
	$lowercase,
	$forceTagsClosed,
	$target_charset,
	$stripRN,
	$defaultBRText,
	$defaultSpanText);
}

/**
 * Determines the MIME type from a URL/Path file extension.
 *
 * _Remarks_:
 *
 * * The built-in functions `mime_content_type` and `fileinfo` require fetching
 * remote contents.
 * * A caller can hint for a MIME type by appending `#.ext` to the URL (i.e. `#.image`).
 *
 * Based on https://stackoverflow.com/a/1147952
 *
 * @param string $url The URL or path to the file.
 * @return string The MIME type of the file.
 */
function getMimeType($url) {
	static $mime = null;

	if (is_null($mime)) {
		// Default values, overriden by /etc/mime.types when present
		$mime = array(
			'jpg' => 'image/jpeg',
			'gif' => 'image/gif',
			'png' => 'image/png',
			'image' => 'image/*',
			'mp3' => 'audio/mpeg',
		);
		// '@' is used to mute open_basedir warning, see issue #818
		if (@is_readable('/etc/mime.types')) {
			$file = fopen('/etc/mime.types', 'r');
			while(($line = fgets($file)) !== false) {
				$line = trim(preg_replace('/#.*/', '', $line));
				if(!$line)
					continue;
				$parts = preg_split('/\s+/', $line);
				if(count($parts) == 1)
					continue;
				$type = array_shift($parts);
				foreach($parts as $part)
					$mime[$part] = $type;
			}
			fclose($file);
		}
	}

	if (strpos($url, '?') !== false) {
		$url_temp = substr($url, 0, strpos($url, '?'));
		if (strpos($url, '#') !== false) {
			$anchor = substr($url, strpos($url, '#'));
			$url_temp .= $anchor;
		}
		$url = $url_temp;
	}

	$ext = strtolower(pathinfo($url, PATHINFO_EXTENSION));
	if (!empty($mime[$ext])) {
		return $mime[$ext];
	}

	return 'application/octet-stream';
}