<?php
/**
 * URL utilities: building, validating and resolving URLs, and fetching remote content over HTTP(S).
 *
 * The results of the most recent fetch() call (error text, HTTP code, content type, effective URL, ...)
 * are exposed through the static $fetch_* properties.
 */
class UrlHelper {
    /** URL schemes additionally accepted by rewrite_relative() for the href attribute of A elements. */
    const EXTRA_HREF_SCHEMES = [
        "magnet",
        "mailto",
        "tel"
    ];

    /** URL schemes additionally accepted by rewrite_relative() for a given (feed-specified) content type. */
    const EXTRA_SCHEMES_BY_CONTENT_TYPE = [
        "application/x-bittorrent" => ["magnet"],
    ];

    // TODO: class properties can be switched to PHP typing if/when the minimum PHP_VERSION is raised to 7.4.0+

    /** @var string error description of the last fetch(), reset to "" when fetch() starts */
    public static $fetch_last_error;

    /** @var int HTTP status code of the last fetch(), reset to -1 when fetch() starts */
    public static $fetch_last_error_code;

    /** @var string response body received by the last fetch() when it failed on a non-200 response */
    public static $fetch_last_error_content;

    /** @var string content type reported for the last fetch() */
    public static $fetch_last_content_type;

    /** @var string value of the Last-Modified response header of the last fetch() */
    public static $fetch_last_modified;

    /** @var string URL the last fetch() ended up at (after redirects) */
    public static $fetch_effective_url;

    /** @var string result of gethostbyname() for the host of $fetch_effective_url */
    public static $fetch_effective_ip_addr;

    /** @var bool whether the last fetch() used the cURL transport */
    public static $fetch_curl_used;

    /**
     * Assembles a URL string from parse_url()-style components.
     *
     * 'scheme' and 'host' are required; 'port', 'path', 'query' and 'fragment' are appended when set.
     * Credentials ('user' / 'pass') are intentionally not emitted.
     *
     * @param array<string, string|int> $parts
     */
    public static function build_url(array $parts): string {
        $tmp = $parts['scheme'] . "://" . $parts['host'];

        // keep an explicit port, otherwise http://host:8080/x would silently become http://host/x
        if (isset($parts['port'])) $tmp .= ':' . $parts['port'];

        if (isset($parts['path'])) $tmp .= $parts['path'];
        if (isset($parts['query'])) $tmp .= '?' . $parts['query'];
        if (isset($parts['fragment'])) $tmp .= '#' . $parts['fragment'];

        return $tmp;
    }

    /**
     * Converts a (possibly) relative URL to an absolute one, using provided base URL.
     * Provides some exceptions for additional schemes like data: if called with owning element/attribute.
     *
     * @param string $base_url Base URL (i.e. from where the document is)
     * @param string $rel_url Possibly relative URL in the document
     * @param string $owner_element Owner element tag name (i.e. "a") (optional)
     * @param string $owner_attribute Owner attribute (i.e. "href") (optional)
     * @param string $content_type URL content type as specified by enclosures, etc.
     *
     * @return false|string Absolute URL or false on failure (either during URL parsing or validation)
     */
    public static function rewrite_relative($base_url,
            $rel_url,
            string $owner_element = "",
            string $owner_attribute = "",
            string $content_type = "") {

        // nothing to resolve (checked before parse_url() so it never receives an empty/null value)
        if (!$rel_url) return $base_url;

        $rel_parts = parse_url($rel_url);

        /**
         * If parse_url failed to parse $rel_url return false to match the current "invalid thing" behavior
         * of UrlHelper::validate().
         *
         * TODO: There are many places where a string return value is assumed. We should either update those
         * to account for the possibility of failure, or look into updating this function's return values.
         */
        if ($rel_parts === false) {
            return false;
        }

        $rel_scheme = $rel_parts["scheme"] ?? "";

        // already absolute
        if (!empty($rel_parts['host']) && !empty($rel_parts['scheme'])) {
            return self::validate($rel_url);
        }

        // protocol-relative URL (rare but they exist)
        if (strpos($rel_url, "//") === 0) {
            return self::validate("https:" . $rel_url);
        }

        // allow some extra schemes for A href
        if (in_array($rel_scheme, self::EXTRA_HREF_SCHEMES, true) &&
                $owner_element == "a" &&
                $owner_attribute == "href") {
            return $rel_url;
        }

        // allow some extra schemes for links with feed-specified content type i.e. enclosures
        if ($content_type &&
                isset(self::EXTRA_SCHEMES_BY_CONTENT_TYPE[$content_type]) &&
                in_array($rel_scheme, self::EXTRA_SCHEMES_BY_CONTENT_TYPE[$content_type], true)) {
            return $rel_url;
        }

        // allow limited subset of inline base64-encoded images for IMG elements
        // ("jpeg" is the registered MIME subtype, "jpg" is kept for compatibility)
        if ($rel_scheme == "data" &&
                preg_match('%^image/(webp|gif|jpe?g|png|svg);base64,%', $rel_parts["path"] ?? "") &&
                $owner_element == "img" &&
                $owner_attribute == "src") {
            return $rel_url;
        }

        // genuinely relative reference: take scheme and authority (host + port) from the base URL
        $base_parts = parse_url($base_url);

        $rel_parts['host'] = $base_parts['host'] ?? "";
        $rel_parts['scheme'] = $base_parts['scheme'] ?? "";

        if (isset($base_parts['port'])) {
            $rel_parts['port'] = $base_parts['port'];
        } else {
            unset($rel_parts['port']);
        }

        if ($rel_parts['path'] ?? "") {

            // we append dirname() of base path to relative URL path as per RFC 3986 section 5.2.2
            $base_path = with_trailing_slash(dirname($base_parts['path'] ?? ""));

            // 1. absolute relative path (/test.html) = no-op, proceed as is

            // 2. dotslash relative URI (./test.html) - strip "./", append base path
            if (strpos($rel_parts['path'], './') === 0) {
                $rel_parts['path'] = $base_path . substr($rel_parts['path'], 2);

            // 3. anything else relative (test.html) - append dirname() of base path
            } else if (strpos($rel_parts['path'], '/') !== 0) {
                $rel_parts['path'] = $base_path . $rel_parts['path'];
            }
        }

        return self::validate(self::build_url($rel_parts));
    }

    /**
     * Validates (and normalizes) an absolute http(s) URL.
     *
     * Extended filtering additionally rejects non-standard ports (anything but 80/443) and
     * loopback hosts (localhost, ::1, 127.*).
     *
     * @return false|string false if something went wrong, otherwise the URL string
     */
    public static function validate(string $url, bool $extended_filtering = false) {

        $url = clean($url);

        # fix protocol-relative URLs
        if (strpos($url, "//") === 0)
            $url = "https:" . $url;

        $tokens = parse_url($url);

        // this isn't really necessary because filter_var(... FILTER_VALIDATE_URL) requires host and scheme
        // as per https://php.watch/versions/7.3/filter-var-flag-deprecation but it might save time
        if (empty($tokens['host']))
            return false;

        if (!in_array(strtolower($tokens['scheme'] ?? ''), ['http', 'https'], true))
            return false;

        //convert IDNA hostname to punycode if possible
        if (function_exists("idn_to_ascii")) {
            if (mb_detect_encoding($tokens['host']) != 'ASCII') {
                if (defined('IDNA_NONTRANSITIONAL_TO_ASCII') && defined('INTL_IDNA_VARIANT_UTS46')) {
                    $tokens['host'] = idn_to_ascii($tokens['host'], IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46);
                } else {
                    $tokens['host'] = idn_to_ascii($tokens['host']);
                }

                // if `idn_to_ascii` failed
                if ($tokens['host'] === false) {
                    return false;
                }
            }
        }

        // separate set of tokens with urlencoded 'path' because filter_var() rightfully fails on non-latin characters
        // (used for validation only, we actually request the original URL, in case of urlencode breaking it)
        $tokens_filter_var = $tokens;

        if ($tokens['path'] ?? false) {
            $tokens_filter_var['path'] = implode("/",
                array_map("rawurlencode",
                    array_map("rawurldecode",
                        explode("/", $tokens['path']))));
        }

        $url = self::build_url($tokens);
        $url_filter_var = self::build_url($tokens_filter_var);

        if (filter_var($url_filter_var, FILTER_VALIDATE_URL) === false)
            return false;

        if ($extended_filtering) {
            // parse_url() reports the port as an int; strict comparison avoids loose 0 == '' matches
            if (isset($tokens['port']) && !in_array($tokens['port'], [80, 443], true))
                return false;

            // parse_url() keeps the square brackets of IPv6 literals ("[::1]"), strip them before comparing
            $host = strtolower(trim($tokens['host'], '[]'));

            if ($host === 'localhost' || $host === '::1' || strpos($host, '127.') === 0)
                return false;
        }

        return $url;
    }

    /**
     * Follows Location: headers (HEAD requests, at most 10 hops) and returns the final URL.
     *
     * @return false|string false if the request failed, a redirect target was invalid or
     *                      there were too many redirects
     */
    public static function resolve_redirects(string $url, int $timeout, int $nest = 0) {

        // too many redirects
        if ($nest > 10)
            return false;

        if (version_compare(PHP_VERSION, '7.1.0', '>=')) {
            $context_options = array(
                'http' => array(
                    'header' => array(
                        'Connection: close'
                    ),
                    'method' => 'HEAD',
                    'timeout' => $timeout,
                    'protocol_version' => 1.1)
                );

            if (Config::get(Config::HTTP_PROXY)) {
                $context_options['http']['request_fulluri'] = true;
                $context_options['http']['proxy'] = Config::get(Config::HTTP_PROXY);
            }

            $context = stream_context_create($context_options);

            // PHP 8 changed the second param from int to bool, but we still support PHP >= 7.1.0
            // @phpstan-ignore-next-line
            $headers = get_headers($url, 0, $context);
        } else {
            // PHP 8 changed the second param from int to bool, but we still support PHP >= 7.1.0
            // @phpstan-ignore-next-line
            $headers = get_headers($url, 0);
        }

        // request failed?
        if (!is_array($headers))
            return false;

        $headers = array_reverse($headers); // last one is the correct one

        foreach ($headers as $header) {
            if (stripos($header, 'Location:') === 0) {
                $url = self::rewrite_relative($url, trim(substr($header, strlen('Location:'))));

                // redirect target could not be parsed / validated
                if ($url === false)
                    return false;

                return self::resolve_redirects($url, $timeout, $nest + 1);
            }
        }

        return $url;
    }

    /**
     * Fetches a URL and returns the response body.
     *
     * Uses cURL when available (and open_basedir is not set), file_get_contents() otherwise.
     * Details about the transfer are stored in the static $fetch_* properties.
     *
     * Recognized $options keys: url, type, login, pass, post_query, timeout, last_modified, useragent,
     * followlocation, max_size (bytes), http_accept, http_referrer.
     *
     * TODO: max_size currently only works for CURL transfers
     * TODO: multiple-argument way is deprecated, first parameter is a hash now
     *
     * @param array<string, bool|int|string>|string $options
     * @return false|string false if something went wrong, otherwise string contents
     */
    public static function fetch($options /* previously: 0: $url, 1: $type = false, 2: $login = false, 3: $pass = false,
                4: $post_query = false, 5: $timeout = false, 6: $timestamp = 0, 7: $useragent = false */) {

        self::$fetch_last_error = "";
        self::$fetch_last_error_code = -1;
        self::$fetch_last_error_content = "";
        self::$fetch_last_content_type = "";
        self::$fetch_curl_used = false;
        self::$fetch_last_modified = "";
        self::$fetch_effective_url = "";
        self::$fetch_effective_ip_addr = "";

        if (!is_array($options)) {

            // falling back on compatibility shim
            $option_names = ["url", "type", "login", "pass", "post_query", "timeout", "last_modified", "useragent"];
            $tmp = [];

            // never read past the known positional names
            $num_args = min(func_num_args(), count($option_names));

            for ($i = 0; $i < $num_args; $i++) {
                $tmp[$option_names[$i]] = func_get_arg($i);
            }

            $options = $tmp;
        }

        $url = $options["url"];
        $type = $options["type"] ?? false;
        $login = $options["login"] ?? false;
        $pass = $options["pass"] ?? false;
        $post_query = $options["post_query"] ?? false;
        $timeout = $options["timeout"] ?? false;
        $last_modified = $options["last_modified"] ?? "";
        $useragent = $options["useragent"] ?? false;
        $followlocation = $options["followlocation"] ?? true;
        $max_size = $options["max_size"] ?? Config::get(Config::MAX_DOWNLOAD_FILE_SIZE); // in bytes
        $http_accept = $options["http_accept"] ?? false;
        $http_referrer = $options["http_referrer"] ?? false;

        $url = ltrim($url, ' ');
        $url = str_replace(' ', '%20', $url);

        Debug::log("[UrlHelper] fetching: $url", Debug::LOG_EXTENDED);

        $url = self::validate($url, true);

        if (!$url) {
            self::$fetch_last_error = "Requested URL failed extended validation.";
            return false;
        }

        $url_host = parse_url($url, PHP_URL_HOST);

        // NOTE(review): gethostbyname() returns the unmodified hostname when resolution fails, so the
        // "failed to resolve" half of this check rarely triggers; tightening it would reject IPv6-only hosts.
        $ip_addr = gethostbyname($url_host);

        if (!$ip_addr || strpos($ip_addr, "127.") === 0) {
            self::$fetch_last_error = "URL hostname failed to resolve or resolved to a loopback address ($ip_addr)";
            return false;
        }

        if (function_exists('curl_init') && !ini_get("open_basedir")) {

            self::$fetch_curl_used = true;

            $ch = curl_init($url);

            if (!$ch) return false;

            $curl_http_headers = [];

            if ($last_modified && !$post_query)
                array_push($curl_http_headers, "If-Modified-Since: $last_modified");

            if ($http_accept)
                array_push($curl_http_headers, "Accept: " . $http_accept);

            if (count($curl_http_headers) > 0)
                curl_setopt($ch, CURLOPT_HTTPHEADER, $curl_http_headers);

            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout ? $timeout : Config::get(Config::FILE_FETCH_CONNECT_TIMEOUT));
            curl_setopt($ch, CURLOPT_TIMEOUT, $timeout ? $timeout : Config::get(Config::FILE_FETCH_TIMEOUT));
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $followlocation);
            curl_setopt($ch, CURLOPT_MAXREDIRS, 20);
            curl_setopt($ch, CURLOPT_BINARYTRANSFER, true);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_HEADER, true);
            curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
            curl_setopt($ch, CURLOPT_USERAGENT, $useragent ? $useragent : Config::get_user_agent());
            curl_setopt($ch, CURLOPT_ENCODING, "");
            curl_setopt($ch, CURLOPT_COOKIEJAR, "/dev/null");

            if ($http_referrer)
                curl_setopt($ch, CURLOPT_REFERER, $http_referrer);

            if ($max_size) {
                curl_setopt($ch, CURLOPT_NOPROGRESS, false);
                curl_setopt($ch, CURLOPT_BUFFERSIZE, 16384); // needed to get 5 arguments in progress function?

                // abort the transfer (non-zero return) once more than $max_size bytes were downloaded;
                // download & upload are *expected* sizes respectively, could be zero
                curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, function($curl_handle, $download_size, $downloaded, $upload_size, $uploaded) use (&$max_size, $url) {
                    if ($downloaded > $max_size) {
                        Debug::log("[UrlHelper] fetch error: curl reached max size of $max_size bytes downloading $url, aborting.", Debug::LOG_VERBOSE);
                        return 1;
                    }

                    return 0;
                });
            }

            if (Config::get(Config::HTTP_PROXY)) {
                curl_setopt($ch, CURLOPT_PROXY, Config::get(Config::HTTP_PROXY));
            }

            if ($post_query) {
                curl_setopt($ch, CURLOPT_POST, true);
                curl_setopt($ch, CURLOPT_POSTFIELDS, $post_query);
            }

            if ($login && $pass)
                curl_setopt($ch, CURLOPT_USERPWD, "$login:$pass");

            $ret = @curl_exec($ch);

            $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);

            // CURLAUTH_BASIC didn't work, let's retry with CURLAUTH_ANY in case it's actually something
            // unusual like NTLM...
            if ($http_code == 403 && $login && $pass) {
                curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
                $ret = @curl_exec($ch);
            }

            // 23 = CURLE_WRITE_ERROR, 61 = CURLE_BAD_CONTENT_ENCODING: retry without content decoding
            if (curl_errno($ch) === 23 || curl_errno($ch) === 61) {
                curl_setopt($ch, CURLOPT_ENCODING, 'none');
                $ret = @curl_exec($ch);
            }

            // curl_exec() returns false on failure; treat that as an empty response below
            $response = is_string($ret) ? $ret : "";

            $headers_length = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $headers = explode("\r\n", (string) substr($response, 0, $headers_length));
            $contents = (string) substr($response, $headers_length);

            foreach ($headers as $header) {
                if (strstr($header, ": ") !== false) {
                    // limit of 2 keeps values which themselves contain ": " intact
                    list ($key, $value) = explode(": ", $header, 2);

                    if (strtolower($key) == "last-modified") {
                        self::$fetch_last_modified = $value;
                    }
                }

                if (substr(strtolower($header), 0, 7) == 'http/1.') {
                    self::$fetch_last_error_code = (int) substr($header, 9, 3);
                    self::$fetch_last_error = $header;
                }
            }

            $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);

            // curl reports null when the server sent no Content-Type
            self::$fetch_last_content_type = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

            self::$fetch_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

            if (!self::validate(self::$fetch_effective_url, true)) {
                self::$fetch_last_error = "URL received after redirection failed extended validation.";

                curl_close($ch);
                return false;
            }

            self::$fetch_effective_ip_addr = gethostbyname(parse_url(self::$fetch_effective_url, PHP_URL_HOST));

            if (!self::$fetch_effective_ip_addr || strpos(self::$fetch_effective_ip_addr, "127.") === 0) {
                self::$fetch_last_error = "URL hostname received after redirection failed to resolve or resolved to a loopback address (" . self::$fetch_effective_ip_addr . ")";

                curl_close($ch);
                return false;
            }

            self::$fetch_last_error_code = $http_code;

            if ($http_code != 200 || ($type && strpos(self::$fetch_last_content_type, "$type") === false)) {

                if (curl_errno($ch) != 0) {
                    self::$fetch_last_error .= "; " . curl_errno($ch) . " " . curl_error($ch);
                } else {
                    self::$fetch_last_error = "HTTP Code: $http_code";
                }

                self::$fetch_last_error_content = $contents;
                curl_close($ch);
                return false;
            }

            if (!$contents) {
                if (curl_errno($ch) === 0) {
                    self::$fetch_last_error = 'Successful response, but no content was received.';
                } else {
                    self::$fetch_last_error = curl_errno($ch) . " " . curl_error($ch);
                }

                curl_close($ch);
                return false;
            }

            curl_close($ch);

            if (RSSUtils::is_gzipped($contents)) {
                $tmp = @gzdecode($contents);

                if ($tmp) $contents = $tmp;
            }

            return $contents;

        } else {

            self::$fetch_curl_used = false;

            // embed credentials into the URL (scheme://login:pass@rest)
            if ($login && $pass) {
                $url_parts = array();

                preg_match("/(^[^:]*):\/\/(.*)/", $url, $url_parts);

                $pass = urlencode($pass);

                if ($url_parts[1] && $url_parts[2]) {
                    $url = $url_parts[1] . "://$login:$pass@" . $url_parts[2];
                }
            }

            // TODO: should this support POST requests or not? idk

            $context_options = array(
                'http' => array(
                    'header' => array(
                        'Connection: close'
                    ),
                    'method' => 'GET',
                    'ignore_errors' => true,
                    'timeout' => $timeout ? $timeout : Config::get(Config::FILE_FETCH_TIMEOUT),
                    'protocol_version' => 1.1)
                );

            if (!$post_query && $last_modified)
                array_push($context_options['http']['header'], "If-Modified-Since: $last_modified");

            if ($http_accept)
                array_push($context_options['http']['header'], "Accept: $http_accept");

            if ($http_referrer)
                array_push($context_options['http']['header'], "Referer: $http_referrer");

            if (Config::get(Config::HTTP_PROXY)) {
                $context_options['http']['request_fulluri'] = true;
                $context_options['http']['proxy'] = Config::get(Config::HTTP_PROXY);
            }

            $context = stream_context_create($context_options);

            $old_error = error_get_last();

            $effective_url = self::resolve_redirects($url, $timeout ? $timeout : Config::get(Config::FILE_FETCH_CONNECT_TIMEOUT));

            // resolve_redirects() returns false on failure, keep the property a string
            self::$fetch_effective_url = $effective_url === false ? "" : $effective_url;

            if (!self::validate(self::$fetch_effective_url, true)) {
                self::$fetch_last_error = "URL received after redirection failed extended validation.";

                return false;
            }

            self::$fetch_effective_ip_addr = gethostbyname(parse_url(self::$fetch_effective_url, PHP_URL_HOST));

            if (!self::$fetch_effective_ip_addr || strpos(self::$fetch_effective_ip_addr, "127.") === 0) {
                self::$fetch_last_error = "URL hostname received after redirection failed to resolve or resolved to a loopback address (" . self::$fetch_effective_ip_addr . ")";

                return false;
            }

            $data = @file_get_contents($url, false, $context);

            if ($data === false) {
                self::$fetch_last_error = "'file_get_contents' failed.";
                return false;
            }

            // $http_response_header is populated by file_get_contents() in the local scope
            foreach ($http_response_header as $header) {
                if (strstr($header, ": ") !== false) {
                    // limit of 2 keeps values which themselves contain ": " intact
                    list ($key, $value) = explode(": ", $header, 2);

                    $key = strtolower($key);

                    if ($key == 'content-type') {
                        self::$fetch_last_content_type = $value;
                        // don't abort here b/c there might be more than one
                        // e.g. if we were being redirected -- last one is the right one
                    } else if ($key == 'last-modified') {
                        self::$fetch_last_modified = $value;
                    } else if ($key == 'location') {
                        self::$fetch_effective_url = $value;
                    }
                }

                if (substr(strtolower($header), 0, 7) == 'http/1.') {
                    self::$fetch_last_error_code = (int) substr($header, 9, 3);
                    self::$fetch_last_error = $header;
                }
            }

            if (self::$fetch_last_error_code != 200) {
                // error_get_last() returns null when no error was recorded, so both sides need a fallback
                $error = error_get_last();

                $error_message = $error['message'] ?? '';

                if ($error_message != ($old_error['message'] ?? '')) {
                    self::$fetch_last_error .= "; " . $error_message;
                }

                self::$fetch_last_error_content = $data;

                return false;
            }

            if (!$data) {
                self::$fetch_last_error = 'Successful response, but no content was received.';
                return false;
            }

            if (RSSUtils::is_gzipped($data)) {
                $tmp = @gzdecode($data);

                if ($tmp) $data = $tmp;
            }

            return $data;
        }
    }

    /**
     * Extracts the video ID from a YouTube URL (/v/ID, /embed/ID, /watch?v=ID on www.youtube.com or
     * www.youtube-nocookie.com, and youtu.be/ID).
     *
     * @return false|string false if the provided URL didn't match expected patterns, otherwise the video ID string
     */
    public static function url_to_youtube_vid(string $url) {
        $url = str_replace("youtube.com", "youtube-nocookie.com", $url);

        // '#' is used as the delimiter so slashes need no escaping; note the escaped '?' in watch\?v=
        // (unescaped it would make the preceding "h" optional and never match a real watch URL)
        $regexps = [
            '#//www\.youtube-nocookie\.com/v/([\w-]+)#',
            '#//www\.youtube-nocookie\.com/embed/([\w-]+)#',
            '#//www\.youtube-nocookie\.com/watch\?v=([\w-]+)#',
            '#//youtu\.be/([\w-]+)#',
        ];

        foreach ($regexps as $re) {
            $matches = [];

            if (preg_match($re, $url, $matches)) {
                return $matches[1];
            }
        }

        return false;
    }
}