HttpUtils.php 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. <?php
  /**
   * GET an HTTP URL to retrieve its content
   * Uses the cURL library or a fallback method
   *
   * @param string $url      URL to get (http://...)
   * @param int    $timeout  network timeout (in seconds)
   * @param int    $maxBytes maximum downloaded bytes (default: 4 MiB)
   *
   * @return array HTTP response headers, downloaded content
   *
   * Output format:
   *  [0] = associative array containing HTTP response headers
   *        (the status line is stored under the numeric key 0,
   *        headers sent several times are stored as an array of values)
   *  [1] = URL content (downloaded data), or false on error
   *
   * Example:
   *  list($headers, $data) = get_http_response('http://sebauvage.net/');
   *  if (strpos($headers[0], '200 OK') !== false) {
   *      echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
   *  } else {
   *      echo 'There was an error: '.htmlspecialchars($headers[0]);
   *  }
   *
   * @see https://secure.php.net/manual/en/ref.curl.php
   * @see https://secure.php.net/manual/en/functions.anonymous.php
   * @see https://secure.php.net/manual/en/function.preg-split.php
   * @see https://secure.php.net/manual/en/function.explode.php
   * @see http://stackoverflow.com/q/17641073
   * @see http://stackoverflow.com/q/9183178
   * @see http://stackoverflow.com/q/1462720
   */
  function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
  {
      $urlObj = new Url($url);
      $cleanUrl = $urlObj->idnToAscii();

      if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
          return array(array(0 => 'Invalid HTTP Url'), false);
      }

      $userAgent =
          'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
          . ' Gecko/20100101 Firefox/45.0';
      $acceptLanguage =
          substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
      $maxRedirs = 3;

      if (!function_exists('curl_init')) {
          return get_http_response_fallback(
              $cleanUrl,
              $timeout,
              $maxBytes,
              $userAgent,
              $acceptLanguage,
              $maxRedirs
          );
      }

      $ch = curl_init($cleanUrl);
      if ($ch === false) {
          return array(array(0 => 'curl_init() error'), false);
      }

      // General cURL settings
      curl_setopt($ch, CURLOPT_AUTOREFERER, true);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
      curl_setopt($ch, CURLOPT_HEADER, true);
      curl_setopt(
          $ch,
          CURLOPT_HTTPHEADER,
          array('Accept-Language: ' . $acceptLanguage)
      );
      curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
      curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

      // Max download size management
      curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);
      curl_setopt($ch, CURLOPT_NOPROGRESS, false);

      // The progress callback receives 4 arguments before PHP 5.5 and 5 afterwards;
      // the check is done once here instead of on every callback invocation.
      $legacyCallback = version_compare(phpversion(), '5.5', '<');
      curl_setopt(
          $ch,
          CURLOPT_PROGRESSFUNCTION,
          function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes, $legacyCallback) {
              $downloaded = $legacyCallback ? $arg1 : $arg2;

              // Non-zero return stops downloading
              return ($downloaded > $maxBytes) ? 1 : 0;
          }
      );

      $response = curl_exec($ch);
      $errorNo = curl_errno($ch);
      $errorStr = curl_error($ch);
      $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
      curl_close($ch);

      if ($response === false) {
          if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
              /*
               * Workaround to match fallback method behaviour
               * Removing this would require updating
               * GetHttpUrlTest::testGetInvalidRemoteUrl()
               */
              return array(false, false);
          }
          return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
      }

      // Formatting output like the fallback method
      $rawHeaders = substr($response, 0, $headSize);

      // Keep only headers from latest redirection
      $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
      $rawHeadersLastRedir = end($rawHeadersArrayRedirs);

      $content = substr($response, $headSize);
      $headers = array();
      foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
          if (empty($line) || ctype_space($line)) {
              continue;
          }

          // Whitespace after the colon is optional, so split on the first ':'
          // instead of the literal ': ' sequence.
          $separator = strpos($line, ':');
          $isStatusLine = strncasecmp($line, 'HTTP/', 5) === 0;
          if ($isStatusLine || $separator === false || $separator === 0) {
              // Status line (or anything that is not a "name: value" pair)
              $headers[] = $line;
              continue;
          }

          $key = substr($line, 0, $separator);
          $value = trim(substr($line, $separator + 1));
          if (array_key_exists($key, $headers)) {
              // Same header sent several times: collect every value
              if (!is_array($headers[$key])) {
                  $headers[$key] = array(0 => $headers[$key]);
              }
              $headers[$key][] = $value;
          } else {
              $headers[$key] = $value;
          }
      }

      return array($headers, $content);
  }
  /**
   * GET an HTTP URL to retrieve its content (fallback method)
   *
   * Relies on PHP stream wrappers: the headers are fetched first (following
   * redirections), then the content of the final URL is downloaded.
   *
   * @param string $cleanUrl       URL to get (http://... valid and in ASCII form)
   * @param int    $timeout        network timeout (in seconds)
   * @param int    $maxBytes       maximum downloaded bytes
   * @param string $userAgent      "User-Agent" header
   * @param string $acceptLanguage "Accept-Language" header
   * @param int    $maxRedr        maximum amount of redirections followed
   *
   * @return array HTTP response headers, downloaded content
   *
   * Output format:
   *  [0] = associative array containing HTTP response headers
   *  [1] = URL content (downloaded data)
   *
   * @see http://php.net/manual/en/function.file-get-contents.php
   * @see http://php.net/manual/en/function.stream-context-create.php
   * @see http://php.net/manual/en/function.get-headers.php
   */
  function get_http_response_fallback(
      $cleanUrl,
      $timeout,
      $maxBytes,
      $userAgent,
      $acceptLanguage,
      $maxRedr
  ) {
      $options = array(
          'http' => array(
              'method' => 'GET',
              'timeout' => $timeout,
              'user_agent' => $userAgent,
              'header' => "Accept: */*\r\n"
                  . 'Accept-Language: ' . $acceptLanguage
          )
      );

      // The options are installed as the default stream context before the
      // headers are fetched (this changes the process-wide default context).
      stream_context_set_default($options);
      list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);

      if (! $headers || strpos($headers[0], '200 OK') === false) {
          // Second attempt, sending the full URI in the request line
          $options['http']['request_fulluri'] = true;
          stream_context_set_default($options);
          list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
      }

      if (! $headers) {
          return array($headers, false);
      }

      try {
          // TODO: catch Exception in calling code (thumbnailer)
          $context = stream_context_create($options);
          // Offset 0 reads from the start of the stream. A negative offset means
          // "seek from the end" since PHP 7.1, which remote streams do not support.
          $content = file_get_contents($finalUrl, false, $context, 0, $maxBytes);
      } catch (Exception $exc) {
          return array(array(0 => 'HTTP Error'), $exc->getMessage());
      }

      return array($headers, $content);
  }
  /**
   * Fetch the HTTP headers of a URL, following 301/302 redirections up to a limit.
   *
   * @param string $url              first URL to request.
   * @param int    $redirectionLimit how many redirections may still be followed.
   *
   * @return array [0] = headers of the last response fetched (false if the request failed),
   *               [1] = URL those headers belong to.
   */
  function get_redirected_headers($url, $redirectionLimit = 3)
  {
      $headers = get_headers($url, 1);

      // Expose a lowercase "location" header under its canonical name
      if (empty($headers['Location']) && !empty($headers['location'])) {
          $headers['Location'] = $headers['location'];
      }

      $mayFollow = $redirectionLimit > 0;
      $remaining = $redirectionLimit - 1;

      // Stop here when the limit is reached or nothing was retrieved
      if (!$mayFollow || empty($headers)) {
          return array($headers, $url);
      }

      $statusLine = $headers[0];
      $isRedirect = strpos($statusLine, '301') !== false
          || strpos($statusLine, '302') !== false;
      if (!$isRedirect || empty($headers['Location'])) {
          return array($headers, $url);
      }

      // Several Location headers: the last one wins
      if (is_array($headers['Location'])) {
          $target = end($headers['Location']);
      } else {
          $target = $headers['Location'];
      }

      // A URL redirecting to itself is not followed
      if ($target == $url) {
          return array($headers, $url);
      }

      return get_redirected_headers(getAbsoluteUrl($url, $target), $remaining);
  }
  /**
   * Get an absolute URL from a complete one, and another absolute/relative URL.
   *
   * @param string $originalUrl The original complete URL.
   * @param string $newUrl      The new one, absolute or relative.
   *
   * @return string Final URL:
   *   - $newUrl if it was already an absolute URL.
   *   - if it is protocol-relative (//host/path), $newUrl prefixed with
   *     the scheme of $originalUrl.
   *   - if it starts with a single '/', $newUrl appended to the
   *     scheme://host[:port] of $originalUrl.
   *   - otherwise, $newUrl resolved against the directory of $originalUrl path.
   */
  function getAbsoluteUrl($originalUrl, $newUrl)
  {
      $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
      // Already an absolute URL.
      if (!empty($newScheme)) {
          return $newUrl;
      }

      $parts = parse_url($originalUrl);

      // Protocol-relative URL: only the scheme is inherited.
      if (strncmp($newUrl, '//', 2) === 0) {
          return $parts['scheme'] . ':' . $newUrl;
      }

      $final = $parts['scheme'] . '://' . $parts['host'];
      if (!empty($parts['port'])) {
          // The ':' separator is required between host and port
          $final .= ':' . $parts['port'];
      }
      $final .= '/';

      // parse_url() does not set 'path' for URLs such as http://host
      $path = isset($parts['path']) ? $parts['path'] : '/';

      if ($newUrl === '' || $newUrl[0] != '/') {
          // Relative to the original document: keep its directory.
          // The offset is computed on the untrimmed path, which keeps the
          // trailing '/' of the directory once the leading '/' is removed.
          $final .= substr(ltrim($path, '/'), 0, strrpos($path, '/'));
      }
      $final .= ltrim($newUrl, '/');

      return $final;
  }
  /**
   * Returns the server's base URL: scheme://domain.tld[:port]
   *
   * When an X-Forwarded-Proto header is present, the scheme, port and host
   * are taken from the X-Forwarded-* headers (first value of each
   * comma-separated list). Otherwise they are derived from HTTPS,
   * SERVER_PORT and SERVER_NAME. Standard ports (80 for http, 443 for https)
   * are never appended.
   *
   * @param array $server the $_SERVER array
   *
   * @return string the server's base URL
   *
   * @see http://www.ietf.org/rfc/rfc7239.txt
   * @see http://www.ietf.org/rfc/rfc6648.txt
   * @see http://stackoverflow.com/a/3561399
   * @see http://stackoverflow.com/q/452375
   */
  function server_url($server)
  {
      // Extracts the first element of a comma-separated header value
      $firstValue = function ($value) {
          $values = explode(',', $value);
          return trim($values[0]);
      };

      // Shaarli is served behind a proxy
      if (isset($server['HTTP_X_FORWARDED_PROTO'])) {
          // Keep forwarded scheme
          $scheme = $firstValue($server['HTTP_X_FORWARDED_PROTO']);
          $port = '';

          if (isset($server['HTTP_X_FORWARDED_PORT'])) {
              // Keep forwarded port, unless it is the default one for the scheme
              $forwardedPort = $firstValue($server['HTTP_X_FORWARDED_PORT']);
              if (($scheme == 'http' && $forwardedPort != '80')
                  || ($scheme == 'https' && $forwardedPort != '443')
              ) {
                  $port = ':' . $forwardedPort;
              }
          }

          if (isset($server['HTTP_X_FORWARDED_HOST'])) {
              // Keep forwarded host
              $host = $firstValue($server['HTTP_X_FORWARDED_HOST']);
          } else {
              $host = $server['SERVER_NAME'];
          }

          return $scheme . '://' . $host . $port;
      }

      $scheme = 'http';
      $port = '';
      // SERVER_PORT may be missing: never read it unguarded
      $serverPort = isset($server['SERVER_PORT']) ? $server['SERVER_PORT'] : '';

      // SSL detection
      if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
          || $serverPort == '443'
      ) {
          $scheme = 'https';
      }

      // Do not append standard port values, nor an empty port
      if ($serverPort !== ''
          && (($scheme == 'http' && $serverPort != '80')
              || ($scheme == 'https' && $serverPort != '443'))
      ) {
          $port = ':' . $serverPort;
      }

      return $scheme . '://' . $server['SERVER_NAME'] . $port;
  }
  /**
   * Builds the absolute URL of the running script, query string excluded.
   *
   * A trailing "index.php" is dropped from the script name so that the
   * resulting URL looks nicer.
   *
   * @param array $server the $_SERVER array
   *
   * @return string the absolute URL of the current script, without the query
   */
  function index_url($server)
  {
      $indexFile = 'index.php';
      $path = $server['SCRIPT_NAME'];

      // Strip the index file name, keeping the directory part only
      $path = endsWith($path, $indexFile)
          ? substr($path, 0, -strlen($indexFile))
          : $path;

      return server_url($server) . $path;
  }
  /**
   * Builds the absolute URL of the running script, query string included.
   *
   * A trailing "index.php" is dropped from the script name so that the
   * resulting URL looks nicer.
   *
   * @param array $server the $_SERVER array
   *
   * @return string the absolute URL of the current script, with the query
   */
  function page_url($server)
  {
      $url = index_url($server);

      // No query string: the script URL is the page URL
      if (empty($server['QUERY_STRING'])) {
          return $url;
      }

      return $url . '?' . $server['QUERY_STRING'];
  }
  /**
   * Retrieve the initial IP forwarded by the reverse proxy.
   *
   * The X-Forwarded-For header is split into its comma-separated entries,
   * the trusted IPs are removed, and the last remaining entry is returned.
   *
   * Inspired from: https://github.com/zendframework/zend-http/blob/master/src/PhpEnvironment/RemoteAddress.php
   *
   * @param array $server     $_SERVER array which contains HTTP headers.
   * @param array $trustedIps List of trusted IP from the configuration.
   *
   * @return string|bool The forwarded IP, or false if none could be extracted.
   */
  function getIpAddressFromProxy($server, $trustedIps)
  {
      $forwardedIpHeader = 'HTTP_X_FORWARDED_FOR';
      if (empty($server[$forwardedIpHeader])) {
          return false;
      }

      // Trim the outer whitespace and drop empty entries, so that values such as
      // "1.2.3.4, " or " 1.2.3.4" still compare equal to the trusted IPs list.
      $ips = preg_split(
          '/\s*,\s*/',
          trim($server[$forwardedIpHeader]),
          -1,
          PREG_SPLIT_NO_EMPTY
      );
      $ips = array_diff($ips, $trustedIps);
      if (empty($ips)) {
          return false;
      }

      // The last untrusted entry of the list is returned
      return array_pop($ips);
  }
  /**
   * Returns true if Shaarli's currently browsed in HTTPS.
   * Supports reverse proxies (if the headers are correctly set).
   *
   * HTTPS is detected when, in this order:
   *  - the first X-Forwarded-Proto value is "https";
   *  - the first X-Forwarded-Port value is 443;
   *  - $server['HTTPS'] is set to a non-empty value other than "off".
   *
   * @param array $server $_SERVER.
   *
   * @return bool true if HTTPS, false otherwise.
   */
  function is_https($server)
  {
      // Extracts the first element of a comma-separated header value
      $firstValue = function ($value) {
          $values = explode(',', $value);
          return trim($values[0]);
      };

      // Forwarded scheme, as honoured by server_url()
      if (isset($server['HTTP_X_FORWARDED_PROTO'])
          && strtolower($firstValue($server['HTTP_X_FORWARDED_PROTO'])) == 'https'
      ) {
          return true;
      }

      // Forwarded port
      if (isset($server['HTTP_X_FORWARDED_PORT'])
          && $firstValue($server['HTTP_X_FORWARDED_PORT']) == '443'
      ) {
          return true;
      }

      // Some servers (ISAPI with IIS) set HTTPS to "off" for plain HTTP requests
      return ! empty($server['HTTPS']) && strtolower($server['HTTPS']) != 'off';
  }