HttpUtils.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  1. <?php
  /**
   * Download the content of an HTTP URL with a GET request.
   *
   * The request is performed with cURL when curl_init() exists; otherwise it is
   * delegated to get_http_response_fallback().
   *
   * @param string          $url               URL to get (http://...)
   * @param int             $timeout           network timeout (in seconds)
   * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
   * @param callable|string $curlWriteFunction Optional callback handed to cURL as CURLOPT_WRITEFUNCTION.
   *                                           Can be used to add download conditions on the headers
   *                                           (response code, content type, etc.).
   *
   * @return array Two-element array:
   *               [0] = associative array of HTTP response headers (status line at index 0),
   *                     array(0 => error message) on failure,
   *                     or false when the host could not be resolved
   *               [1] = URL content (downloaded data), or false on failure
   *
   * Example:
   *   list($headers, $data) = get_http_response('http://sebauvage.net/');
   *   if (strpos($headers[0], '200 OK') !== false) {
   *       echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
   *   } else {
   *       echo 'There was an error: '.htmlspecialchars($headers[0]);
   *   }
   *
   * @see https://secure.php.net/manual/en/ref.curl.php
   * @see https://secure.php.net/manual/en/functions.anonymous.php
   * @see https://secure.php.net/manual/en/function.preg-split.php
   * @see https://secure.php.net/manual/en/function.explode.php
   * @see http://stackoverflow.com/q/17641073
   * @see http://stackoverflow.com/q/9183178
   * @see http://stackoverflow.com/q/1462720
   */
  function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
  {
      $urlObj = new Url($url);
      $cleanUrl = $urlObj->idnToAscii();

      $isValidHttpUrl = filter_var($cleanUrl, FILTER_VALIDATE_URL) && $urlObj->isHttp();
      if (!$isValidHttpUrl) {
          return array(array(0 => 'Invalid HTTP Url'), false);
      }

      $userAgent = 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)' . ' Gecko/20100101 Firefox/45.0';
      // Two-letter prefix of the current collation locale, then English as fallback.
      $acceptLanguage = substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
      $maxRedirs = 3;

      if (!function_exists('curl_init')) {
          return get_http_response_fallback($cleanUrl, $timeout, $maxBytes, $userAgent, $acceptLanguage, $maxRedirs);
      }

      $ch = curl_init($cleanUrl);
      if ($ch === false) {
          return array(array(0 => 'curl_init() error'), false);
      }

      // General cURL settings
      curl_setopt($ch, CURLOPT_AUTOREFERER, true);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
      curl_setopt($ch, CURLOPT_HEADER, true);
      curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept-Language: ' . $acceptLanguage));
      curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
      curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
      if (is_callable($curlWriteFunction)) {
          curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
      }

      // Max download size management: the progress callback aborts the
      // transfer as soon as more than $maxBytes have been downloaded.
      curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
      curl_setopt($ch, CURLOPT_NOPROGRESS, false);
      // Before PHP 5.5 the callback receives 4 arguments, afterwards 5;
      // the "downloaded so far" value moves from the 2nd to the 3rd position.
      $legacyProgressArgs = version_compare(phpversion(), '5.5', '<');
      curl_setopt(
          $ch,
          CURLOPT_PROGRESSFUNCTION,
          function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes, $legacyProgressArgs) {
              $downloaded = $legacyProgressArgs ? $arg1 : $arg2;
              // Non-zero return stops downloading
              return ($downloaded > $maxBytes) ? 1 : 0;
          }
      );

      $response = curl_exec($ch);
      $errorNo = curl_errno($ch);
      $errorStr = curl_error($ch);
      $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
      curl_close($ch);

      if ($response === false) {
          if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
              /*
               * Workaround to match fallback method behaviour
               * Removing this would require updating
               * GetHttpUrlTest::testGetInvalidRemoteUrl()
               */
              return array(false, false);
          }
          return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
      }

      // Formatting output like the fallback method:
      // split the raw response into its header part and its body.
      $rawHeaders = substr($response, 0, $headSize);
      $content = substr($response, $headSize);

      // One header block per followed redirection; keep only the latest one.
      $headerBlocks = explode("\r\n\r\n", trim($rawHeaders));
      $lastHeaderBlock = $headerBlocks[count($headerBlocks) - 1];

      $headers = array();
      foreach (preg_split('~[\r\n]+~', $lastHeaderBlock) as $line) {
          if (empty($line) || ctype_space($line)) {
              continue;
          }

          $pair = explode(': ', $line, 2);
          if (count($pair) < 2) {
              // No "Name: value" separator (e.g. the status line): store by index.
              $headers[] = $pair[0];
              continue;
          }

          list($name, $value) = $pair;
          if (!array_key_exists($name, $headers)) {
              $headers[$name] = $value;
              continue;
          }

          // Repeated header: collect every value in a list.
          if (!is_array($headers[$name])) {
              $headers[$name] = array(0 => $headers[$name]);
          }
          $headers[$name][] = $value;
      }

      return array($headers, $content);
  }
  /**
   * GET an HTTP URL to retrieve its content (fallback method, without cURL)
   *
   * The headers are first fetched through get_redirected_headers(); when that
   * does not yield a "200 OK" status line, the lookup is retried once with the
   * 'request_fulluri' HTTP context option enabled. The body is then downloaded
   * from the final (post-redirection) URL with file_get_contents().
   *
   * Note: this function calls stream_context_set_default(), i.e. it replaces
   * the default stream context with the options built here.
   *
   * @param string $cleanUrl       URL to get (http://... valid and in ASCII form)
   * @param int    $timeout        network timeout (in seconds)
   * @param int    $maxBytes       maximum downloaded bytes
   * @param string $userAgent      "User-Agent" header
   * @param string $acceptLanguage "Accept-Language" header
   * @param int    $maxRedr        maximum amount of redirections followed
   *
   * @return array HTTP response headers, downloaded content
   *
   * Output format:
   *  [0] = associative array containing HTTP response headers
   *        (false/empty when no headers could be retrieved)
   *  [1] = URL content (downloaded data), or false on failure
   *
   * @see http://php.net/manual/en/function.file-get-contents.php
   * @see http://php.net/manual/en/function.stream-context-create.php
   * @see http://php.net/manual/en/function.get-headers.php
   */
  function get_http_response_fallback(
      $cleanUrl,
      $timeout,
      $maxBytes,
      $userAgent,
      $acceptLanguage,
      $maxRedr
  ) {
      $options = array(
          'http' => array(
              'method' => 'GET',
              'timeout' => $timeout,
              'user_agent' => $userAgent,
              'header' => "Accept: */*\r\n"
                  . 'Accept-Language: ' . $acceptLanguage
          )
      );

      // get_headers() is called without an explicit context, so the options
      // are installed as the default stream context.
      stream_context_set_default($options);
      list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);

      if (! $headers || strpos($headers[0], '200 OK') === false) {
          // Second attempt, sending the full URI in the request line.
          $options['http']['request_fulluri'] = true;
          stream_context_set_default($options);
          list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
      }

      if (! $headers) {
          return array($headers, false);
      }

      try {
          // TODO: catch Exception in calling code (thumbnailer)
          $context = stream_context_create($options);
          // Offset must be 0 (read from the start): since PHP 7.1 a negative
          // offset means "seek from the end of the stream", which fails on
          // remote streams and makes file_get_contents() return false.
          $content = file_get_contents($finalUrl, false, $context, 0, $maxBytes);
      } catch (Exception $exc) {
          return array(array(0 => 'HTTP Error'), $exc->getMessage());
      }

      return array($headers, $content);
  }
  /**
   * Retrieve HTTP headers, following n redirections (temporary and permanent ones).
   *
   * A response is treated as a redirection when its status code is 301, 302,
   * 303, 307 or 308 and it carries a non-empty Location header (matched
   * case-insensitively). Relative Location values are resolved against the
   * current URL with getAbsoluteUrl().
   *
   * @param string $url              initial URL to reach.
   * @param int    $redirectionLimit max redirection follow.
   *
   * @return array Two-element array:
   *               [0] = headers as returned by get_headers($url, 1) for the last
   *                     URL reached, or false if they could not be retrieved
   *               [1] = the last URL reached
   */
  function get_redirected_headers($url, $redirectionLimit = 3)
  {
      $headers = get_headers($url, 1);
      if (empty($headers)) {
          return array($headers, $url);
      }

      // HTTP header names are case-insensitive: expose the redirection target
      // under the canonical 'Location' key whatever casing the server used.
      if (empty($headers['Location'])) {
          foreach ($headers as $name => $value) {
              if (is_string($name) && strcasecmp($name, 'Location') === 0 && !empty($value)) {
                  $headers['Location'] = $value;
                  break;
              }
          }
      }

      // Match the status code itself rather than any "301"/"302" substring.
      $isRedirection = isset($headers[0])
          && preg_match('~^HTTP/\S+\s+30[12378]\b~i', (string) $headers[0]) === 1;

      // Headers found, redirection found, and limit not reached.
      if ($redirectionLimit > 0 && $isRedirection && !empty($headers['Location'])) {
          // Several Location headers may be present: the last one wins.
          $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
          // A redirection to the very same URL would loop: stop here instead.
          if ($redirection != $url) {
              $redirection = getAbsoluteUrl($url, $redirection);
              return get_redirected_headers($redirection, $redirectionLimit - 1);
          }
      }

      return array($headers, $url);
  }
  /**
   * Get an absolute URL from a complete one, and another absolute/relative URL.
   *
   * Handled forms of $newUrl:
   *  - absolute URL (has a scheme): returned unchanged;
   *  - protocol-relative URL ("//host/path"): prefixed with the original scheme;
   *  - root-relative path ("/path"): appended to the original scheme://host[:port];
   *  - relative path ("page"): appended to the directory of the original path.
   *
   * "." and ".." segments are not collapsed, and the query string/fragment of
   * $originalUrl are not carried over.
   *
   * @param string $originalUrl The original complete URL (must have a scheme and a host).
   * @param string $newUrl      The new one, absolute or relative.
   *
   * @return string Final URL:
   *                - $newUrl if it was already an absolute URL.
   *                - if it was relative, absolute URL from $originalUrl path.
   */
  function getAbsoluteUrl($originalUrl, $newUrl)
  {
      $newUrl = (string) $newUrl;

      // Already an absolute URL.
      $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
      if (!empty($newScheme)) {
          return $newUrl;
      }

      $parts = parse_url($originalUrl);

      // Protocol-relative URL: only the scheme is inherited.
      if (strpos($newUrl, '//') === 0) {
          return $parts['scheme'] . ':' . $newUrl;
      }

      $final = $parts['scheme'] . '://' . $parts['host'];
      // The port needs its ':' separator, otherwise it is glued to the host.
      if (!empty($parts['port'])) {
          $final .= ':' . $parts['port'];
      }
      $final .= '/';

      // Relative path (or empty reference): resolve against the directory of
      // the original path, i.e. everything up to and including its last '/'.
      if ($newUrl === '' || $newUrl[0] !== '/') {
          $path = isset($parts['path']) ? $parts['path'] : '';
          $lastSlash = strrpos($path, '/');
          if ($lastSlash !== false) {
              $final .= ltrim(substr($path, 0, $lastSlash + 1), '/');
          }
      }

      return $final . ltrim($newUrl, '/');
  }
  /**
   * Returns the server's base URL: scheme://domain.tld[:port]
   *
   * When an X-Forwarded-Proto header is present (reverse proxy), the URL is
   * built from the X-Forwarded-* headers, using the first entry of each
   * comma-separated list. Otherwise it is built from HTTPS, SERVER_PORT and
   * SERVER_NAME. Standard ports (80 for http, 443 for https) and missing or
   * empty ports are never appended.
   *
   * @param array $server the $_SERVER array
   *
   * @return string the server's base URL
   *
   * @see http://www.ietf.org/rfc/rfc7239.txt
   * @see http://www.ietf.org/rfc/rfc6648.txt
   * @see http://stackoverflow.com/a/3561399
   * @see http://stackoverflow.com/q/452375
   */
  function server_url($server)
  {
      // First entry of a (possibly comma-separated) forwarded header value.
      $firstValue = function ($value) {
          $items = explode(',', (string) $value);
          return trim($items[0]);
      };

      $scheme = 'http';
      $port = '';

      // Shaarli is served behind a proxy
      if (isset($server['HTTP_X_FORWARDED_PROTO'])) {
          // Keep forwarded scheme
          $scheme = $firstValue($server['HTTP_X_FORWARDED_PROTO']);

          if (isset($server['HTTP_X_FORWARDED_PORT'])) {
              // Keep forwarded port
              $port = $firstValue($server['HTTP_X_FORWARDED_PORT']);

              // This is a workaround for proxies that don't forward the scheme properly.
              // Connecting over port 443 has to be in HTTPS.
              // See https://github.com/shaarli/Shaarli/issues/1022
              if ($port == '443') {
                  $scheme = 'https';
              }

              $isCustomPort = ($scheme == 'http' && $port != '80')
                  || ($scheme == 'https' && $port != '443');
              // An empty forwarded port must not leave a dangling ':'.
              $port = ($isCustomPort && $port !== '') ? ':' . $port : '';
          }

          if (isset($server['HTTP_X_FORWARDED_HOST'])) {
              // Keep forwarded host
              $host = $firstValue($server['HTTP_X_FORWARDED_HOST']);
          } else {
              $host = $server['SERVER_NAME'];
          }

          return $scheme.'://'.$host.$port;
      }

      // SERVER_PORT may be absent (e.g. outside of a web server context).
      $serverPort = isset($server['SERVER_PORT']) ? (string) $server['SERVER_PORT'] : '';

      // SSL detection
      if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
          || $serverPort == '443') {
          $scheme = 'https';
      }

      // Do not append standard port values, nor a missing port
      if ($serverPort !== ''
          && (($scheme == 'http' && $serverPort != '80')
              || ($scheme == 'https' && $serverPort != '443'))) {
          $port = ':'.$serverPort;
      }

      return $scheme.'://'.$server['SERVER_NAME'].$port;
  }
  /**
   * Builds the absolute URL of the running script, query string excluded.
   *
   * A trailing "index.php" is dropped from the script name so that the
   * resulting URL looks nicer.
   *
   * @param array $server the $_SERVER array
   *
   * @return string the absolute URL of the current script, without the query
   */
  function index_url($server)
  {
      $script = $server['SCRIPT_NAME'];
      $indexFile = 'index.php';

      $path = endsWith($script, $indexFile)
          ? substr($script, 0, -strlen($indexFile))
          : $script;

      return server_url($server) . $path;
  }
  /**
   * Builds the absolute URL of the running script, query string included.
   *
   * The base is the value of index_url(), so a trailing "index.php" is removed;
   * "?<QUERY_STRING>" is appended only when the query string is not empty.
   *
   * @param array $server the $_SERVER array
   *
   * @return string the absolute URL of the current script, with the query
   */
  function page_url($server)
  {
      $baseUrl = index_url($server);

      if (empty($server['QUERY_STRING'])) {
          return $baseUrl;
      }

      return $baseUrl.'?'.$server['QUERY_STRING'];
  }
  /**
   * Extract the client IP forwarded by a reverse proxy.
   *
   * The X-Forwarded-For header is split on commas, every address listed in
   * $trustedIps is discarded, and the last remaining address is returned.
   *
   * Inspired from: https://github.com/zendframework/zend-http/blob/master/src/PhpEnvironment/RemoteAddress.php
   *
   * @param array $server     $_SERVER array which contains HTTP headers.
   * @param array $trustedIps List of trusted IP from the configuration.
   *
   * @return string|bool The forwarded IP, or false if none could be extracted.
   */
  function getIpAddressFromProxy($server, $trustedIps)
  {
      $headerName = 'HTTP_X_FORWARDED_FOR';
      if (empty($server[$headerName])) {
          return false;
      }

      $forwarded = preg_split('/\s*,\s*/', $server[$headerName]);
      $untrusted = array_diff($forwarded, $trustedIps);

      // The last untrusted entry of the chain is the one reported.
      return empty($untrusted) ? false : end($untrusted);
  }
  /**
   * Returns true if Shaarli's currently browsed in HTTPS.
   * Supports reverse proxies (if the headers are correctly set).
   *
   * HTTPS is detected when any of the following holds:
   *  - the first X-Forwarded-Port entry is 443;
   *  - the first X-Forwarded-Proto entry is "https" (same header server_url() relies on);
   *  - $server['HTTPS'] is set to a non-empty value other than "off".
   *
   * @param array $server $_SERVER.
   *
   * @return bool true if HTTPS, false otherwise.
   */
  function is_https($server)
  {
      // First entry of a (possibly comma-separated) forwarded header value.
      $firstValue = function ($value) {
          $items = explode(',', (string) $value);
          return trim($items[0]);
      };

      if (isset($server['HTTP_X_FORWARDED_PORT'])
          && $firstValue($server['HTTP_X_FORWARDED_PORT']) == '443') {
          return true;
      }

      if (isset($server['HTTP_X_FORWARDED_PROTO'])
          && strtolower($firstValue($server['HTTP_X_FORWARDED_PROTO'])) === 'https') {
          return true;
      }

      // Some servers (ISAPI with IIS) set HTTPS to "off" for plain HTTP
      // requests, so a non-empty value alone is not enough.
      return ! empty($server['HTTPS']) && strtolower((string) $server['HTTPS']) !== 'off';
  }