Files
FreshRSS/app/Utils/httpUtil.php
Alexandre Alapetite ae2ab45266 Handle fetch of text/plain as <pre> (#8340)
* Handle fetch of text/plain as <pre>
fix https://github.com/FreshRSS/FreshRSS/issues/8328

* class="text-plain"
2025-12-24 21:38:38 +01:00

524 lines
18 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
declare(strict_types=1);
final class FreshRSS_http_Util {
private const RETRY_AFTER_PATH = DATA_PATH . '/Retry-After/';
/**
 * Build the path of the marker file that remembers a Retry-After delay for a URL.
 *
 * The file name is the URL-encoded host (with `:port` appended when the URL has an explicit port),
 * followed by `_<sha256 of the full URL>` when `Minz_Request::serverIsPublic()` is falsy for the host,
 * followed by `_<URL-encoded proxy>` when a proxy is given, and the `.txt` extension.
 *
 * @param string $url the URL being fetched
 * @param string $proxy the proxy used for the request, or an empty string
 * @return string the marker file path, or an empty string when no host can be extracted from the URL
 */
private static function getRetryAfterFile(string $url, string $proxy): string {
	$host = parse_url($url, PHP_URL_HOST);
	if (!is_string($host) || $host === '') {
		return '';
	}
	// Asked for the bare host, before any port is appended
	$isPublic = Minz_Request::serverIsPublic($host);

	$port = parse_url($url, PHP_URL_PORT);
	$authority = is_int($port) ? $host . ':' . $port : $host;

	$name = urlencode($authority);
	if (!$isPublic) {
		$name .= '_' . hash('sha256', $url);
	}
	if (!empty($proxy)) {
		$name .= '_' . urlencode($proxy);
	}
	return self::RETRY_AFTER_PATH . $name . '.txt';
}
/**
 * Delete the Retry-After marker files whose modification time is in the past.
 * Does nothing when the Retry-After directory does not exist or cannot be listed.
 */
private static function cleanRetryAfters(): void {
	$directory = self::RETRY_AFTER_PATH;
	if (is_dir($directory)) {
		$candidates = glob($directory . '*.txt', GLOB_NOSORT);
		if (is_array($candidates)) {
			foreach ($candidates as $path) {
				// The modification time of a marker file is the moment its Retry-After expires
				$expired = @filemtime($path) < time();
				if ($expired) {
					@unlink($path);
				}
			}
		}
	}
}
/**
 * Tell whether requests to the URL must wait for a Retry-After period.
 * An expired marker file is deleted on the way; stale markers are also purged at random (about once every 31 calls).
 * @param string $url the URL about to be fetched
 * @param string $proxy the proxy used for the request, or an empty string
 * @return int The timestamp of when the Retry-After expires, or 0 if not set.
 */
public static function getRetryAfter(string $url, string $proxy): int {
	if (rand(0, 30) === 1) { // Occasional housekeeping of old files
		self::cleanRetryAfters();
	}
	$markerFile = self::getRetryAfterFile($url, $proxy);
	if ($markerFile === '') {
		return 0;
	}
	// The expiry is stored as the modification time of the marker file
	$expiry = @filemtime($markerFile);
	if ($expiry === false || $expiry <= 0) {
		return 0;
	}
	if ($expiry >= time()) {
		return $expiry;
	}
	@unlink($markerFile);
	return 0;
}
/**
 * Store the HTTP Retry-After header value of an HTTP `429 Too Many Requests` or `503 Service Unavailable` response.
 *
 * A value made only of digits is a number of seconds from now; anything else is parsed as a date,
 * with a fallback of now + max(600, `retry_after_default` limit) when parsing fails.
 * The result is capped to now + max(3600, `retry_after_max` limit) and saved as the modification time of the marker file.
 *
 * @param string $url the URL that was fetched
 * @param string $proxy the proxy used for the request, or an empty string
 * @param string $retryAfter the raw Retry-After header value (may be empty)
 * @return int the stored expiry timestamp, or 0 when nothing could be stored
 */
public static function setRetryAfter(string $url, string $proxy, string $retryAfter): int {
	$markerFile = self::getRetryAfterFile($url, $proxy);
	if ($markerFile === '') {
		return 0;
	}
	$limits = FreshRSS_Context::systemConf()->limits;
	if (!ctype_digit($retryAfter)) {
		// HTTP-date form, or missing / unparsable value
		$expiry = \SimplePie\Misc::parse_date($retryAfter) ?:
			(time() + max(600, $limits['retry_after_default'] ?? 0));
	} else {
		// Delay-seconds form
		$expiry = time() + (int)$retryAfter;
	}
	$ceiling = time() + max(3600, $limits['retry_after_max'] ?? 0);
	$expiry = min($expiry, $ceiling);

	@mkdir(self::RETRY_AFTER_PATH);
	if (touch($markerFile, $expiry)) {
		return $expiry;
	}
	Minz_Log::error('Failed to set Retry-After for ' . $url);
	return 0;
}
/**
 * Keep only the cURL options of an allow-list, dropping every other key.
 * The value of `CURLOPT_COOKIEFILE`, when present, is always replaced by an empty string,
 * which is only meant to enable the libcurl cookie engine.
 * @param array<mixed> $curl_params cURL options indexed by `CURLOPT_*` constant
 * @return array<mixed> the allowed options, in their original order
 */
public static function sanitizeCurlParams(array $curl_params): array {
	$allowedOptions = [
		CURLOPT_COOKIE,
		CURLOPT_COOKIEFILE,
		CURLOPT_FOLLOWLOCATION,
		CURLOPT_HTTPHEADER,
		CURLOPT_MAXREDIRS,
		CURLOPT_POST,
		CURLOPT_POSTFIELDS,
		CURLOPT_PROXY,
		CURLOPT_PROXYTYPE,
		CURLOPT_USERAGENT,
	];
	$kept = array_filter(
		$curl_params,
		static fn($option) => in_array($option, $allowedOptions, true),
		ARRAY_FILTER_USE_KEY
	);
	if (array_key_exists(CURLOPT_COOKIEFILE, $kept)) {
		// Never let a caller-provided path through
		$kept[CURLOPT_COOKIEFILE] = '';
	}
	return $kept;
}
/**
 * Replace the host of a URL by its ASCII (Punycode) form.
 * The URL is returned unchanged when `idn_to_ascii()` is not available, when no host can be extracted,
 * or when the conversion fails.
 * @param string $url the URL, possibly with an internationalized domain name
 * @return string the URL with the first occurrence of its host converted
 */
private static function idn_to_puny(string $url): string {
	if (!function_exists('idn_to_ascii')) {
		return $url;
	}
	$host = parse_url($url, PHP_URL_HOST);
	if (!is_string($host) || $host == '') {
		return $url;
	}
	$asciiHost = idn_to_ascii($host);
	$offset = strpos($url, $host);
	if ($asciiHost == false || $offset === false) {
		return $url;
	}
	return substr_replace($url, $asciiHost, $offset, strlen($host));
}
/**
 * Normalize and validate a URL.
 * The URL is trimmed, optionally prefixed with `https://` when it does not start with `http://` or `https://`,
 * and its host is converted to Punycode before validation.
 * @param string $url the URL to check
 * @param bool $fixScheme whether to prepend `https://` when no HTTP(S) scheme is present
 * @return string|false the normalized URL, an empty string when the input is blank, or false when the URL is invalid
 */
public static function checkUrl(string $url, bool $fixScheme = true): string|false {
	$candidate = trim($url);
	if ($candidate == '') {
		return '';
	}
	$hasHttpScheme = preg_match('#^https?://#i', $candidate) === 1;
	if ($fixScheme && !$hasHttpScheme) {
		$candidate = 'https://' . ltrim($candidate, '/');
	}
	// https://bugs.php.net/bug.php?id=53474
	$candidate = self::idn_to_puny($candidate);
	// PHP discussion #64948 Underscore: validate a copy in which underscores are replaced
	$relaxed = str_replace('_', 'z', $candidate);
	return is_string(filter_var($relaxed, FILTER_VALIDATE_URL)) ? $candidate : false;
}
/**
 * Remove the charset meta information of an HTML document, e.g.:
 * `<meta charset="..." />`
 * `<meta http-equiv="Content-Type" content="text/html; charset=...">`
 * Only the first matching `<meta>` element is removed.
 * @param string $html the HTML content
 * @return string the HTML without its first charset meta element,
 * or the unchanged input if the regular expression engine fails (e.g. backtrack limit reached)
 */
private static function stripHtmlMetaCharset(string $html): string {
	// preg_replace() returns null on a PCRE error: keep the document rather than dropping it entirely
	return preg_replace('/<meta\s[^>]*charset\s*=\s*[^>]+>/i', '', $html, 1) ?? $html;
}
/**
 * Set a Unicode BOM to enforce the HTML content type charset received by HTTP.
 *
 * The charset is read from the end of the HTTP Content-Type; when there is none, UTF-8 is assumed only if
 * a `<meta ... charset=UTF-8` is found within the first 2048 bytes. The content is returned untouched when
 * no charset is known, when the charset normalizes to windows-1252 or US-ASCII, or when it already starts
 * with a byte order mark or an XML declaration with an encoding. Otherwise the content is converted to UTF-8
 * if needed (dropping its charset meta element) and prefixed with a UTF-8 BOM.
 *
 * @param string $html the raw downloaded HTML content
 * @param string $contentType an HTTP Content-Type such as 'text/html; charset=utf-8'
 * @return string an HTML string with encoding information for DOMDocument::loadHTML()
 */
private static function enforceHttpEncoding(string $html, string $contentType = ''): string {
	$charset = '';
	if (preg_match('/\bcharset=([0-9a-z_-]{2,12})$/i', $contentType, $found) === 1) {
		$charset = $found[1];
	}
	if ($charset == '') {
		// No charset defined by HTTP:
		// detect UTF-8 even if declared too deep in HTML for DOMDocument
		$declaresUtf8 = preg_match('/<meta\s[^>]*charset\s*=[\s\'"]*UTF-?8\b/i', substr($html, 0, 2048));
		if (!$declaresUtf8) {
			return $html;
		}
		$charset = 'UTF-8';
	}

	$normalized = \SimplePie\Misc::encoding($charset);
	if (in_array($normalized, ['windows-1252', 'US-ASCII'], true)) {
		// Default charset for HTTP, do nothing
		return $html;
	}

	$byteOrderMarks = [
		"\xEF\xBB\xBF", // UTF-8
		"\xFF\xFE", // UTF-16 Little Endian
		"\xFE\xFF", // UTF-16 Big Endian
		"\xFF\xFE\x00\x00", // UTF-32 Little Endian
		"\x00\x00\xFE\xFF", // UTF-32 Big Endian
	];
	foreach ($byteOrderMarks as $bom) {
		if (str_starts_with($html, $bom)) {
			// Existing byte order mark, do nothing
			return $html;
		}
	}
	if (preg_match('/^<[?]xml[^>]+encoding\b/', substr($html, 0, 64))) {
		// Existing XML declaration, do nothing
		return $html;
	}

	if ($normalized !== 'UTF-8') {
		// Try to change encoding to UTF-8 using mbstring or iconv or intl
		$converted = \SimplePie\Misc::change_encoding($html, $normalized, 'UTF-8');
		if (!is_string($converted)) {
			// Give up
			return $html;
		}
		$html = self::stripHtmlMetaCharset($converted);
	}
	// Save encoding information as Unicode BOM
	return "\xEF\xBB\xBF" . $html;
}
/**
 * Set an HTML base URL to the HTML content if there is none.
 * The document is parsed and re-serialized by DOMDocument; whatever precedes `<html` in the input
 * (when within its first 512 bytes) is restored afterwards if the serialization altered it.
 * @param string $html the raw downloaded HTML content
 * @param string $href the HTML base URL
 * @return string an HTML string, or an empty string when the input is empty or yields no document element
 */
private static function enforceHtmlBase(string $html, string $href): string {
	if ($html === '') {
		// DOMDocument::loadHTML() throws a ValueError on an empty source
		return '';
	}
	$doc = new DOMDocument();
	$doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
	if ($doc->documentElement === null) {
		return '';
	}
	$xpath = new DOMXPath($doc);
	$bases = $xpath->evaluate('//base');
	if (!($bases instanceof DOMNodeList) || $bases->length === 0) {
		$base = $doc->createElement('base');
		if ($base === false) {
			return $html;
		}
		$base->setAttribute('href', $href);
		$head = null;
		$heads = $xpath->evaluate('//head');
		if ($heads instanceof DOMNodeList && $heads->length > 0) {
			$head = $heads->item(0);
		}
		if ($head instanceof DOMElement) {
			// Insert as first child of the first <head>
			$head->insertBefore($base, $head->firstChild);
		} else {
			// No <head>: insert as first child of the root element
			$doc->documentElement->insertBefore($base, $doc->documentElement->firstChild);
		}
	}
	// Save the start of HTML because libxml2 saveHTML() risks scrambling it
	$htmlPos = stripos($html, '<html');
	$htmlStart = $htmlPos === false || $htmlPos > 512 ? '' : substr($html, 0, $htmlPos);
	$html = $doc->saveHTML() ?: $html;
	if ($htmlStart !== '' && !str_starts_with($html, $htmlStart)) {
		// libxml2 saveHTML() risks removing Unicode BOM and XML declaration,
		// which affects future detection of charset encoding, so manually restore it
		$htmlPos = stripos($html, '<html');
		$html = $htmlPos === false || $htmlPos > 512 ? $html : $htmlStart . substr($html, $htmlPos);
	}
	return $html;
}
/**
 * Pick the HTTP Accept header value matching the type of document to fetch.
 * @param string $type {html,ico,json,opml,xml}; any other value is treated like html
 */
private static function httpGetAcceptHeader(string $type): string {
	return match ($type) {
		'json' => 'application/json,application/feed+json,application/javascript;q=0.9,text/javascript;q=0.8,*/*;q=0.7',
		'opml' => 'text/x-opml,text/xml;q=0.9,application/xml;q=0.9,*/*;q=0.8',
		'xml' => 'application/xml,application/xhtml+xml,text/xml;q=0.9,*/*;q=0.8',
		'ico' => 'image/x-icon,image/vnd.microsoft.icon,image/ico,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.1',
		default => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	};
}
/**
 * Fetch a document over HTTP with cURL, going through a file cache and honouring stored Retry-After delays.
 *
 * Steps: return the cached body if the cache file is younger than the `cache_duration` limit;
 * refuse (fail) while a Retry-After period is pending for the URL; otherwise perform the request,
 * record a Retry-After period on HTTP 429 / 503, post-process the body according to `$type`
 * (trimming, charset enforcement, `<base>` injection or `<pre>` wrapping of text/plain), and write the result to the cache file.
 *
 * cURL options are applied in this order, later ones overriding earlier ones: built-in defaults,
 * sanitized `$attributes['curl_params']`, system configuration `curl_options`, `$attributes['ssl_verify']`, then `$curl_options`.
 *
 * @param non-empty-string $url
 * @param string $cachePath path of the cache file to read from and write to
 * @param string $type {html,ico,json,opml,xml}
 * @param array<string,mixed> $attributes supports the keys `timeout`, `curl_params`, `ssl_verify`
 * @param array<int,mixed> $curl_options
 * @return array{body:string,effective_url:string,redirect_count:int,fail:bool}
 */
public static function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = [], array $curl_options = []): array {
	$limits = FreshRSS_Context::systemConf()->limits;
	$feed_timeout = empty($attributes['timeout']) || !is_numeric($attributes['timeout']) ? 0 : intval($attributes['timeout']);
	// Never write credentials embedded in the URL to the logs
	$safeUrl = \SimplePie\Misc::url_remove_credentials($url);

	$cacheMtime = @filemtime($cachePath);
	if ($cacheMtime !== false && $cacheMtime > time() - intval($limits['cache_duration'])) {
		$body = @file_get_contents($cachePath);
		if ($body != false) {
			syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . $safeUrl);
			return ['body' => $body, 'effective_url' => $url, 'redirect_count' => 0, 'fail' => false];
		}
	}

	if (rand(0, 30) === 1) { // Remove old cache once in a while
		cleanCache(CLEANCACHE_HOURS);
	}

	// Must be known before the custom headers are processed, so that a default Accept can be appended to them
	$accept = self::httpGetAcceptHeader($type);

	$options = [];
	$proxy = is_string(FreshRSS_Context::systemConf()->curl_options[CURLOPT_PROXY] ?? null) ? FreshRSS_Context::systemConf()->curl_options[CURLOPT_PROXY] : '';
	if (is_array($attributes['curl_params'] ?? null)) {
		$options = self::sanitizeCurlParams($attributes['curl_params']);
		// NOTE(review): the system curl_options are applied after these ones below, so a system-wide proxy
		// presumably still wins for the actual request — confirm that this is the intended Retry-After key
		$proxy = is_string($options[CURLOPT_PROXY] ?? null) ? $options[CURLOPT_PROXY] : '';
		if (is_array($options[CURLOPT_HTTPHEADER] ?? null)) {
			// Remove headers problematic for security
			$customHeaders = array_values(array_filter($options[CURLOPT_HTTPHEADER],
				static fn($header) => is_string($header) && !preg_match('/^(Remote-User|X-WebAuth-User)\\s*:/i', $header)));
			// Add Accept header if it is not set.
			// preg_grep() returns an empty array (not false) when nothing matches
			if (empty(preg_grep('/^Accept\\s*:/i', $customHeaders))) {
				$customHeaders[] = 'Accept: ' . $accept;
			}
			$options[CURLOPT_HTTPHEADER] = $customHeaders;
		}
	}

	if (($retryAfter = self::getRetryAfter($url, $proxy)) > 0) {
		Minz_Log::warning('For that domain, will first retry after ' . date('c', $retryAfter) . '. ' . $safeUrl);
		return ['body' => '', 'effective_url' => $url, 'redirect_count' => 0, 'fail' => true];
	}

	if (FreshRSS_Context::systemConf()->simplepie_syslog_enabled) {
		syslog(LOG_INFO, 'FreshRSS GET ' . $type . ' ' . $safeUrl);
	}

	// TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
	$ch = curl_init();
	if ($ch === false) {
		return ['body' => '', 'effective_url' => '', 'redirect_count' => 0, 'fail' => true];
	}
	curl_setopt_array($ch, [
		CURLOPT_URL => $url,
		CURLOPT_HTTPHEADER => ['Accept: ' . $accept],
		CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
		CURLOPT_CONNECTTIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
		CURLOPT_TIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
		CURLOPT_MAXREDIRS => 4,
		CURLOPT_RETURNTRANSFER => true,
		CURLOPT_FOLLOWLOCATION => true,
		CURLOPT_ENCODING => '', //Enable all encodings
		//CURLOPT_VERBOSE => 1, // To debug sent HTTP headers
	]);
	curl_setopt_array($ch, $options);
	curl_setopt_array($ch, FreshRSS_Context::systemConf()->curl_options);

	// Collect the raw response headers of all the hops
	$responseHeaders = '';
	curl_setopt($ch, CURLOPT_HEADERFUNCTION, static function (\CurlHandle $ch, string $header) use (&$responseHeaders) {
		if (trim($header) !== '') { // Skip e.g. separation with trailer headers
			$responseHeaders .= $header;
		}
		return strlen($header);
	});

	if (isset($attributes['ssl_verify'])) {
		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, empty($attributes['ssl_verify']) ? 0 : 2);
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (bool)$attributes['ssl_verify']);
		if (empty($attributes['ssl_verify'])) {
			curl_setopt($ch, CURLOPT_SSL_CIPHER_LIST, 'DEFAULT@SECLEVEL=1');
		}
	}

	curl_setopt_array($ch, $curl_options);

	$body = curl_exec($ch);
	$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
	$c_content_type = '' . curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
	$c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
	$c_redirect_count = curl_getinfo($ch, CURLINFO_REDIRECT_COUNT);
	$c_error = curl_error($ch);

	$headers = [];
	if ($body !== false) {
		assert($c_redirect_count >= 0);
		$responseHeaders = \SimplePie\HTTP\Parser::prepareHeaders($responseHeaders, $c_redirect_count + 1);
		$parser = new \SimplePie\HTTP\Parser($responseHeaders);
		if ($parser->parse()) {
			$headers = $parser->headers;
		}
	}

	$fail = $c_status != 200 || $c_error != '' || $body === false;
	if ($fail) {
		$body = '';
		Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $safeUrl);
		if (in_array($c_status, [429, 503], true)) {
			$retryAfterHeader = $headers['retry-after'] ?? '';
			$retryAfter = self::setRetryAfter($url, $proxy, is_string($retryAfterHeader) ? $retryAfterHeader : '');
			$errorMessage = ($c_status === 429 ? 'HTTP 429 Too Many Requests!' : 'HTTP 503 Service Unavailable!') .
				' [' . $safeUrl . ']';
			if ($retryAfter > 0) {
				$errorMessage .= ' We may retry after ' . date('c', $retryAfter);
			}
			Minz_Log::warning($errorMessage);
		}
		// TODO: Implement HTTP 410 Gone
	} elseif (!is_string($body) || strlen($body) === 0) {
		$body = '';
	} else {
		if (in_array($type, ['html', 'json', 'opml', 'xml'], true)) {
			$body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
		}
		if (in_array($type, ['html', 'xml', 'opml'], true)) {
			$body = self::enforceHttpEncoding($body, $c_content_type);
		}
		if ($type === 'html') {
			if (stripos($c_content_type, 'text/plain') !== false) {
				// Plain text to be displayed as preformatted text. Prefixed with UTF-8 BOM
				$body = "\xEF\xBB\xBF" . '<pre class="text-plain">' . htmlspecialchars($body, ENT_NOQUOTES, 'UTF-8') . '</pre>';
			} elseif ($body !== '') {
				// A whitespace-only response is empty after trimming, and DOMDocument refuses an empty source
				$body = self::enforceHtmlBase($body, $c_effective_url);
			}
		}
	}

	if (file_put_contents($cachePath, $body) === false) {
		Minz_Log::warning("Error saving cache $cachePath for $safeUrl");
	}

	return ['body' => $body, 'effective_url' => $c_effective_url, 'redirect_count' => $c_redirect_count, 'fail' => $fail];
}
/**
 * Converts an IP (v4 or v6) to a binary representation using inet_pton
 *
 * @param string $ip the IP to convert
 * @return string a string of '0' and '1' characters (32 for IPv4, 128 for IPv6),
 * or an empty string when the address cannot be parsed
 */
private static function ipToBits(string $ip): string {
	// inet_pton() raises a warning and returns false on an unparsable address
	$packed = @inet_pton($ip);
	if ($packed === false) {
		// Explicit, so that the result does not depend on how str_split('') behaves in the running PHP version
		return '';
	}
	$bits = '';
	foreach (str_split($packed) as $byte) {
		$bits .= sprintf('%08b', ord($byte));
	}
	return $bits;
}
/**
 * Check if an ip belongs to the provided range (in CIDR format)
 *
 * Both addresses must be valid and of the same family (IPv4 or IPv6), otherwise the result is false.
 * A range without `/prefix` requires an exact match of the address.
 * The prefix length must be made of digits only and not exceed the address size (32 or 128);
 * an explicit `/0` matches every address of the same family.
 *
 * @param string $ip the IP that we want to verify (ex: 192.168.16.1)
 * @param string $range the range to check against (ex: 192.168.16.0/24)
 * @return bool true if the IP is in the range, otherwise false
 */
private static function checkCIDR(string $ip, string $range): bool {
	$parts = explode('/', trim($range), 2);
	$subnet = $parts[0];
	// Validate first, so that unparsable inputs can never compare equal to each other
	if ($subnet === '' ||
		filter_var($ip, FILTER_VALIDATE_IP) === false ||
		filter_var($subnet, FILTER_VALIDATE_IP) === false) {
		return false;
	}

	$binary_ip = self::ipToBits($ip);
	$binary_subnet = self::ipToBits($subnet);
	$address_size = strlen($binary_ip);
	if (!in_array($address_size, [32, 128], true) || $address_size !== strlen($binary_subnet)) {
		// Different address families: comparing bit prefixes would give false positives,
		// e.g. 32.1.13.184 has the same first 32 bits as 2001:db8::
		return false;
	}

	if (!isset($parts[1])) {
		// No prefix length: exact match
		return $binary_ip === $binary_subnet;
	}
	$prefix = $parts[1];
	if (!ctype_digit($prefix) || strlen($prefix) > 3 || (int)$prefix > $address_size) {
		return false;
	}
	return strncmp($binary_ip, $binary_subnet, (int)$prefix) === 0;
}
/**
 * Check if the client (e.g. last proxy) is allowed to send unsafe headers.
 * The authorized ranges come from the whitespace-separated `TRUSTED_PROXY` environment variable when it is set
 * to something not equal to 0, otherwise from the `trusted_sources` configuration option.
 * The connection IP is the one returned by `Minz_Request::connectionRemoteAddress()`.
 * @return bool true if the senders IP is in one of the ranges defined in the configuration, else false
 * (including when there is no system configuration or no connection IP)
 */
public static function checkTrustedIP(): bool {
	if (!FreshRSS_Context::hasSystemConf()) {
		return false;
	}
	$clientIp = Minz_Request::connectionRemoteAddress();
	if ($clientIp === '') {
		return false;
	}

	$ranges = getenv('TRUSTED_PROXY');
	if (is_string($ranges) && $ranges != 0) {
		$ranges = preg_split('/\s+/', $ranges, -1, PREG_SPLIT_NO_EMPTY);
	}
	if (!is_array($ranges) || $ranges === []) {
		// Environment variable unset, disabled, or empty: use the configuration instead
		$ranges = FreshRSS_Context::systemConf()->trusted_sources;
	}

	$isTrusted = false;
	foreach ($ranges as $range) {
		if (self::checkCIDR($clientIp, $range)) {
			$isTrusted = true;
			break;
		}
	}
	return $isTrusted;
}
/**
 * Get the user name provided by HTTP authentication, read from `$_SERVER`.
 *
 * `REMOTE_USER` then `REDIRECT_REMOTE_USER` are always considered; `HTTP_REMOTE_USER` then `HTTP_X_WEBAUTH_USER`
 * are considered only when `$onlyTrusted` is false or when `checkTrustedIP()` succeeds.
 * When these four entries hold more than one distinct value, a warning is logged and nothing is returned.
 *
 * @param bool $onlyTrusted whether the `HTTP_*` entries require the client to be a trusted source
 * @return string the first non-empty user name found, or an empty string
 */
public static function httpAuthUser(bool $onlyTrusted = true): string {
	$alwaysAccepted = ['REMOTE_USER', 'REDIRECT_REMOTE_USER'];
	$trustedOnly = ['HTTP_REMOTE_USER', 'HTTP_X_WEBAUTH_USER'];

	$provided = [];
	foreach ([...$alwaysAccepted, ...$trustedOnly] as $name) {
		if (array_key_exists($name, $_SERVER)) {
			$provided[$name] = $_SERVER[$name];
		}
	}
	if (count(array_unique($provided)) > 1) {
		Minz_Log::warning('Multiple HTTP authentication headers!');
		return '';
	}

	foreach ($alwaysAccepted as $name) {
		$user = $_SERVER[$name] ?? null;
		if (!empty($user) && is_string($user)) {
			return $user;
		}
	}

	if ($onlyTrusted && !self::checkTrustedIP()) {
		return '';
	}
	foreach ($trustedOnly as $name) {
		$user = $_SERVER[$name] ?? null;
		if (!empty($user) && is_string($user)) {
			return $user;
		}
	}
	return '';
}
}