HttpUtils.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. <?php
  2. /**
  3. * GET an HTTP URL to retrieve its content
  4. * Uses the cURL library or a fallback method
  5. *
  6. * @param string $url URL to get (http://...)
  7. * @param int $timeout network timeout (in seconds)
  8. * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
  9. * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
  10. * Can be used to add download conditions on the
  11. * headers (response code, content type, etc.).
  12. *
  13. * @return array HTTP response headers, downloaded content
  14. *
  15. * Output format:
  16. * [0] = associative array containing HTTP response headers
  17. * [1] = URL content (downloaded data)
  18. *
  19. * Example:
  20. * list($headers, $data) = get_http_response('http://sebauvage.net/');
  21. * if (strpos($headers[0], '200 OK') !== false) {
  22. * echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
  23. * } else {
  24. * echo 'There was an error: '.htmlspecialchars($headers[0]);
  25. * }
  26. *
  27. * @see https://secure.php.net/manual/en/ref.curl.php
  28. * @see https://secure.php.net/manual/en/functions.anonymous.php
  29. * @see https://secure.php.net/manual/en/function.preg-split.php
  30. * @see https://secure.php.net/manual/en/function.explode.php
  31. * @see http://stackoverflow.com/q/17641073
  32. * @see http://stackoverflow.com/q/9183178
  33. * @see http://stackoverflow.com/q/1462720
  34. */
  35. function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
  36. {
  37. $urlObj = new Url($url);
  38. $cleanUrl = $urlObj->idnToAscii();
  39. if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
  40. return array(array(0 => 'Invalid HTTP Url'), false);
  41. }
  42. $userAgent =
  43. 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
  44. . ' Gecko/20100101 Firefox/45.0';
  45. $acceptLanguage =
  46. substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
  47. $maxRedirs = 3;
  48. if (!function_exists('curl_init')) {
  49. return get_http_response_fallback(
  50. $cleanUrl,
  51. $timeout,
  52. $maxBytes,
  53. $userAgent,
  54. $acceptLanguage,
  55. $maxRedirs
  56. );
  57. }
  58. $ch = curl_init($cleanUrl);
  59. if ($ch === false) {
  60. return array(array(0 => 'curl_init() error'), false);
  61. }
  62. // General cURL settings
  63. curl_setopt($ch, CURLOPT_AUTOREFERER, true);
  64. curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  65. curl_setopt($ch, CURLOPT_HEADER, true);
  66. curl_setopt(
  67. $ch,
  68. CURLOPT_HTTPHEADER,
  69. array('Accept-Language: ' . $acceptLanguage)
  70. );
  71. curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
  72. curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  73. curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
  74. curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
  75. if (is_callable($curlWriteFunction)) {
  76. curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
  77. }
  78. // Max download size management
  79. curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
  80. curl_setopt($ch, CURLOPT_NOPROGRESS, false);
  81. curl_setopt(
  82. $ch,
  83. CURLOPT_PROGRESSFUNCTION,
  84. function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
  85. if (version_compare(phpversion(), '5.5', '<')) {
  86. // PHP version lower than 5.5
  87. // Callback has 4 arguments
  88. $downloaded = $arg1;
  89. } else {
  90. // Callback has 5 arguments
  91. $downloaded = $arg2;
  92. }
  93. // Non-zero return stops downloading
  94. return ($downloaded > $maxBytes) ? 1 : 0;
  95. }
  96. );
  97. $response = curl_exec($ch);
  98. $errorNo = curl_errno($ch);
  99. $errorStr = curl_error($ch);
  100. $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
  101. curl_close($ch);
  102. if ($response === false) {
  103. if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
  104. /*
  105. * Workaround to match fallback method behaviour
  106. * Removing this would require updating
  107. * GetHttpUrlTest::testGetInvalidRemoteUrl()
  108. */
  109. return array(false, false);
  110. }
  111. return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
  112. }
  113. // Formatting output like the fallback method
  114. $rawHeaders = substr($response, 0, $headSize);
  115. // Keep only headers from latest redirection
  116. $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
  117. $rawHeadersLastRedir = end($rawHeadersArrayRedirs);
  118. $content = substr($response, $headSize);
  119. $headers = array();
  120. foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
  121. if (empty($line) || ctype_space($line)) {
  122. continue;
  123. }
  124. $splitLine = explode(': ', $line, 2);
  125. if (count($splitLine) > 1) {
  126. $key = $splitLine[0];
  127. $value = $splitLine[1];
  128. if (array_key_exists($key, $headers)) {
  129. if (!is_array($headers[$key])) {
  130. $headers[$key] = array(0 => $headers[$key]);
  131. }
  132. $headers[$key][] = $value;
  133. } else {
  134. $headers[$key] = $value;
  135. }
  136. } else {
  137. $headers[] = $splitLine[0];
  138. }
  139. }
  140. return array($headers, $content);
  141. }
  142. /**
  143. * GET an HTTP URL to retrieve its content (fallback method)
  144. *
  145. * @param string $cleanUrl URL to get (http://... valid and in ASCII form)
  146. * @param int $timeout network timeout (in seconds)
  147. * @param int $maxBytes maximum downloaded bytes
  148. * @param string $userAgent "User-Agent" header
  149. * @param string $acceptLanguage "Accept-Language" header
  150. * @param int $maxRedr maximum amount of redirections followed
  151. *
  152. * @return array HTTP response headers, downloaded content
  153. *
  154. * Output format:
  155. * [0] = associative array containing HTTP response headers
  156. * [1] = URL content (downloaded data)
  157. *
  158. * @see http://php.net/manual/en/function.file-get-contents.php
  159. * @see http://php.net/manual/en/function.stream-context-create.php
  160. * @see http://php.net/manual/en/function.get-headers.php
  161. */
  162. function get_http_response_fallback(
  163. $cleanUrl,
  164. $timeout,
  165. $maxBytes,
  166. $userAgent,
  167. $acceptLanguage,
  168. $maxRedr
  169. ) {
  170. $options = array(
  171. 'http' => array(
  172. 'method' => 'GET',
  173. 'timeout' => $timeout,
  174. 'user_agent' => $userAgent,
  175. 'header' => "Accept: */*\r\n"
  176. . 'Accept-Language: ' . $acceptLanguage
  177. )
  178. );
  179. stream_context_set_default($options);
  180. list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
  181. if (! $headers || strpos($headers[0], '200 OK') === false) {
  182. $options['http']['request_fulluri'] = true;
  183. stream_context_set_default($options);
  184. list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
  185. }
  186. if (! $headers) {
  187. return array($headers, false);
  188. }
  189. try {
  190. // TODO: catch Exception in calling code (thumbnailer)
  191. $context = stream_context_create($options);
  192. $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
  193. } catch (Exception $exc) {
  194. return array(array(0 => 'HTTP Error'), $exc->getMessage());
  195. }
  196. return array($headers, $content);
  197. }
  198. /**
  199. * Retrieve HTTP headers, following n redirections (temporary and permanent ones).
  200. *
  201. * @param string $url initial URL to reach.
  202. * @param int $redirectionLimit max redirection follow.
  203. *
  204. * @return array HTTP headers, or false if it failed.
  205. */
  206. function get_redirected_headers($url, $redirectionLimit = 3)
  207. {
  208. $headers = get_headers($url, 1);
  209. if (!empty($headers['location']) && empty($headers['Location'])) {
  210. $headers['Location'] = $headers['location'];
  211. }
  212. // Headers found, redirection found, and limit not reached.
  213. if ($redirectionLimit-- > 0
  214. && !empty($headers)
  215. && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
  216. && !empty($headers['Location'])) {
  217. $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
  218. if ($redirection != $url) {
  219. $redirection = getAbsoluteUrl($url, $redirection);
  220. return get_redirected_headers($redirection, $redirectionLimit);
  221. }
  222. }
  223. return array($headers, $url);
  224. }
  225. /**
  226. * Get an absolute URL from a complete one, and another absolute/relative URL.
  227. *
  228. * @param string $originalUrl The original complete URL.
  229. * @param string $newUrl The new one, absolute or relative.
  230. *
  231. * @return string Final URL:
  232. * - $newUrl if it was already an absolute URL.
  233. * - if it was relative, absolute URL from $originalUrl path.
  234. */
  235. function getAbsoluteUrl($originalUrl, $newUrl)
  236. {
  237. $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
  238. // Already an absolute URL.
  239. if (!empty($newScheme)) {
  240. return $newUrl;
  241. }
  242. $parts = parse_url($originalUrl);
  243. $final = $parts['scheme'] .'://'. $parts['host'];
  244. $final .= (!empty($parts['port'])) ? $parts['port'] : '';
  245. $final .= '/';
  246. if ($newUrl[0] != '/') {
  247. $final .= substr(ltrim($parts['path'], '/'), 0, strrpos($parts['path'], '/'));
  248. }
  249. $final .= ltrim($newUrl, '/');
  250. return $final;
  251. }
  252. /**
  253. * Returns the server's base URL: scheme://domain.tld[:port]
  254. *
  255. * @param array $server the $_SERVER array
  256. *
  257. * @return string the server's base URL
  258. *
  259. * @see http://www.ietf.org/rfc/rfc7239.txt
  260. * @see http://www.ietf.org/rfc/rfc6648.txt
  261. * @see http://stackoverflow.com/a/3561399
  262. * @see http://stackoverflow.com/q/452375
  263. */
  264. function server_url($server)
  265. {
  266. $scheme = 'http';
  267. $port = '';
  268. // Shaarli is served behind a proxy
  269. if (isset($server['HTTP_X_FORWARDED_PROTO'])) {
  270. // Keep forwarded scheme
  271. if (strpos($server['HTTP_X_FORWARDED_PROTO'], ',') !== false) {
  272. $schemes = explode(',', $server['HTTP_X_FORWARDED_PROTO']);
  273. $scheme = trim($schemes[0]);
  274. } else {
  275. $scheme = $server['HTTP_X_FORWARDED_PROTO'];
  276. }
  277. if (isset($server['HTTP_X_FORWARDED_PORT'])) {
  278. // Keep forwarded port
  279. if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
  280. $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
  281. $port = trim($ports[0]);
  282. } else {
  283. $port = $server['HTTP_X_FORWARDED_PORT'];
  284. }
  285. // This is a workaround for proxies that don't forward the scheme properly.
  286. // Connecting over port 443 has to be in HTTPS.
  287. // See https://github.com/shaarli/Shaarli/issues/1022
  288. if ($port == '443') {
  289. $scheme = 'https';
  290. }
  291. if (($scheme == 'http' && $port != '80')
  292. || ($scheme == 'https' && $port != '443')
  293. ) {
  294. $port = ':' . $port;
  295. } else {
  296. $port = '';
  297. }
  298. }
  299. if (isset($server['HTTP_X_FORWARDED_HOST'])) {
  300. // Keep forwarded host
  301. if (strpos($server['HTTP_X_FORWARDED_HOST'], ',') !== false) {
  302. $hosts = explode(',', $server['HTTP_X_FORWARDED_HOST']);
  303. $host = trim($hosts[0]);
  304. } else {
  305. $host = $server['HTTP_X_FORWARDED_HOST'];
  306. }
  307. } else {
  308. $host = $server['SERVER_NAME'];
  309. }
  310. return $scheme.'://'.$host.$port;
  311. }
  312. // SSL detection
  313. if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
  314. || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) {
  315. $scheme = 'https';
  316. }
  317. // Do not append standard port values
  318. if (($scheme == 'http' && $server['SERVER_PORT'] != '80')
  319. || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) {
  320. $port = ':'.$server['SERVER_PORT'];
  321. }
  322. return $scheme.'://'.$server['SERVER_NAME'].$port;
  323. }
  324. /**
  325. * Returns the absolute URL of the current script, without the query
  326. *
  327. * If the resource is "index.php", then it is removed (for better-looking URLs)
  328. *
  329. * @param array $server the $_SERVER array
  330. *
  331. * @return string the absolute URL of the current script, without the query
  332. */
  333. function index_url($server)
  334. {
  335. $scriptname = $server['SCRIPT_NAME'];
  336. if (endsWith($scriptname, 'index.php')) {
  337. $scriptname = substr($scriptname, 0, -9);
  338. }
  339. return server_url($server) . $scriptname;
  340. }
  341. /**
  342. * Returns the absolute URL of the current script, with the query
  343. *
  344. * If the resource is "index.php", then it is removed (for better-looking URLs)
  345. *
  346. * @param array $server the $_SERVER array
  347. *
  348. * @return string the absolute URL of the current script, with the query
  349. */
  350. function page_url($server)
  351. {
  352. if (! empty($server['QUERY_STRING'])) {
  353. return index_url($server).'?'.$server['QUERY_STRING'];
  354. }
  355. return index_url($server);
  356. }
  357. /**
  358. * Retrieve the initial IP forwarded by the reverse proxy.
  359. *
  360. * Inspired from: https://github.com/zendframework/zend-http/blob/master/src/PhpEnvironment/RemoteAddress.php
  361. *
  362. * @param array $server $_SERVER array which contains HTTP headers.
  363. * @param array $trustedIps List of trusted IP from the configuration.
  364. *
  365. * @return string|bool The forwarded IP, or false if none could be extracted.
  366. */
  367. function getIpAddressFromProxy($server, $trustedIps)
  368. {
  369. $forwardedIpHeader = 'HTTP_X_FORWARDED_FOR';
  370. if (empty($server[$forwardedIpHeader])) {
  371. return false;
  372. }
  373. $ips = preg_split('/\s*,\s*/', $server[$forwardedIpHeader]);
  374. $ips = array_diff($ips, $trustedIps);
  375. if (empty($ips)) {
  376. return false;
  377. }
  378. return array_pop($ips);
  379. }
  380. /**
  381. * Return an identifier based on the advertised client IP address(es)
  382. *
  383. * This aims at preventing session hijacking from users behind the same proxy
  384. * by relying on HTTP headers.
  385. *
  386. * See:
  387. * - https://secure.php.net/manual/en/reserved.variables.server.php
  388. * - https://stackoverflow.com/questions/3003145/how-to-get-the-client-ip-address-in-php
  389. * - https://stackoverflow.com/questions/12233406/preventing-session-hijacking
  390. * - https://stackoverflow.com/questions/21354859/trusting-x-forwarded-for-to-identify-a-visitor
  391. *
  392. * @param array $server The $_SERVER array
  393. *
  394. * @return string An identifier based on client IP address information
  395. */
  396. function client_ip_id($server)
  397. {
  398. $ip = $server['REMOTE_ADDR'];
  399. if (isset($server['HTTP_X_FORWARDED_FOR'])) {
  400. $ip = $ip . '_' . $server['HTTP_X_FORWARDED_FOR'];
  401. }
  402. if (isset($server['HTTP_CLIENT_IP'])) {
  403. $ip = $ip . '_' . $server['HTTP_CLIENT_IP'];
  404. }
  405. return $ip;
  406. }
  407. /**
  408. * Returns true if Shaarli's currently browsed in HTTPS.
  409. * Supports reverse proxies (if the headers are correctly set).
  410. *
  411. * @param array $server $_SERVER.
  412. *
  413. * @return bool true if HTTPS, false otherwise.
  414. */
  415. function is_https($server)
  416. {
  417. if (isset($server['HTTP_X_FORWARDED_PORT'])) {
  418. // Keep forwarded port
  419. if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
  420. $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
  421. $port = trim($ports[0]);
  422. } else {
  423. $port = $server['HTTP_X_FORWARDED_PORT'];
  424. }
  425. if ($port == '443') {
  426. return true;
  427. }
  428. }
  429. return ! empty($server['HTTPS']);
  430. }