LinkUtils.php 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. <?php
  /**
   * Get cURL callback function for CURLOPT_WRITEFUNCTION
   *
   * @param string   $charset     to extract from the downloaded page (reference)
   * @param string   $title       to extract from the downloaded page (reference)
   * @param callable $curlGetInfo Optionally overrides curl_getinfo function
   *
   * @return Closure
   */
  function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
  {
      // Shared between successive calls of the closure: set once a redirect response has been seen.
      $isRedirected = false;

      /**
       * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
       *
       * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'.
       * Then we extract the title and the charset and stop the download when it's done.
       *
       * The handle is taken by value: cURL does not pass it by reference, and a by-reference
       * parameter prevents the callback from being invoked with a plain value.
       *
       * @param resource $ch   cURL resource
       * @param string   $data chunk of data being downloaded
       *
       * @return int|bool length of $data or false if we need to stop the download
       */
      return function ($ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) {
          $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);

          // Every redirect status must let the transfer continue,
          // otherwise the generic "not 200" check below aborts it before the final page is reached.
          if (!empty($responseCode) && in_array($responseCode, [301, 302, 303, 307, 308])) {
              $isRedirected = true;
              return strlen($data);
          }
          if (!empty($responseCode) && $responseCode !== 200) {
              return false;
          }

          // After a redirection, the content type will keep the previous request value
          // until it finds the next content-type header.
          $contentType = null;
          if (!$isRedirected || stripos($data, 'content-type') !== false) {
              $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
          }
          if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
              return false;
          }

          // The HTTP header charset takes precedence; the HTML <meta> tag is only a fallback.
          if (!empty($contentType) && empty($charset)) {
              $charset = header_extract_charset($contentType);
          }
          if (empty($charset)) {
              $charset = html_extract_charset($data);
          }
          if (empty($title)) {
              $title = html_extract_title($data);
          }

          // We got everything we want, stop the download.
          if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
              return false;
          }
          return strlen($data);
      };
  }
  /**
   * Find the content of the first <title> element in an HTML document.
   *
   * The match is case-insensitive and may span several lines; line feeds inside
   * the title are removed and surrounding whitespace is trimmed.
   *
   * @param string $html HTML content where to look for a title.
   *
   * @return bool|string Extracted title if found, false otherwise.
   */
  function html_extract_title($html)
  {
      $captured = [];
      $found = preg_match('!<title.*?>(.*?)</title>!is', $html, $captured);
      if (!$found) {
          return false;
      }

      $singleLine = str_replace("\n", '', $captured[1]);
      return trim($singleLine);
  }
  /**
   * Extract charset from HTTP header if it's defined.
   *
   * Both the bare form (charset=utf-8) and the quoted form (charset="utf-8")
   * are supported; the surrounding quotes are not part of the returned value.
   *
   * @param string $header HTTP header Content-Type line.
   *
   * @return bool|string Charset string if found (lowercase), false otherwise.
   */
  function header_extract_charset($header)
  {
      // The double quote is excluded from the captured value so that a quoted
      // parameter does not leak its closing quote into the charset name.
      preg_match('/charset="?([^;" ]+)/i', $header, $match);
      if (! empty($match[1])) {
          return strtolower(trim($match[1]));
      }

      return false;
  }
  /**
   * Look for a charset declaration inside a <meta> tag of an HTML document.
   *
   * @param string $html HTML content where to look for charset.
   *
   * @return bool|string Lowercased charset string if found, false otherwise.
   */
  function html_extract_charset($html)
  {
      // Matches any <meta ...> tag carrying a charset=... value.
      $metaPattern = '#<meta .*charset=["\']?([^";\'>/]+)["\']? */?>#Usi';
      preg_match($metaPattern, $html, $found);

      return empty($found[1]) ? false : strtolower($found[1]);
  }
  /**
   * Count how many entries of a linklist are flagged as private.
   *
   * @param array|Countable $links Linklist; each entry is read through its 'private' key.
   *
   * @return int Number of private links.
   */
  function count_private($links)
  {
      $privateCount = 0;
      foreach ($links as $entry) {
          $privateCount += $entry['private'] ? 1 : 0;
      }

      return $privateCount;
  }
  /**
   * Turn every URL found in a string into an HTML anchor.
   *
   * @param string $text       input string.
   * @param string $redirector if a redirector is set, it is prepended to the link target.
   * @param bool   $urlEncode  Use `urlencode()` on the URL after the redirector or not.
   *
   * @return string returns $text with all links converted to HTML links.
   *
   * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722
   */
  function text2clickable($text, $redirector = '', $urlEncode = true)
  {
      $urlPattern = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[a-z0-9\(\)]/?)!si';

      if (!empty($redirector)) {
          // The anchor points to the redirector followed by the (optionally encoded) URL,
          // while the visible label stays the URL as written.
          $wrapWithRedirector = function ($found) use ($redirector, $urlEncode) {
              $target = $found[1];
              if ($urlEncode) {
                  $target = urlencode($target);
              }
              return '<a href="' . $redirector . $target .'">'. $found[1] .'</a>';
          };

          return preg_replace_callback($urlPattern, $wrapWithRedirector, $text);
      }

      return preg_replace($urlPattern, '<a href="$1">$1</a>', $text);
  }
  /**
   * Replace every hashtag of a description with a link adding that tag to the search.
   *
   * @param string $description Given description.
   * @param string $indexUrl    Root URL.
   *
   * @return string Description with auto-linked hashtags.
   */
  function hashtag_autolink($description, $indexUrl = '')
  {
      /*
       * Unicode-aware hashtag pattern, see http://stackoverflow.com/a/35498078/1484919
       *   \p{Pc} - connector punctuation (underscore)
       *   \p{N}  - numeric character in any script
       *   \p{L}  - letter from any language
       *   \p{Mn} - non-spacing mark (accents, umlauts, etc)
       * A hashtag is only recognised at the start of a line or after whitespace.
       */
      $hashtagPattern = '/(^|\s)#([\p{Pc}\p{N}\p{L}\p{Mn}]+)/mui';

      // $1 restores the leading whitespace captured by the pattern, $2 is the tag itself.
      $anchor = '<a href="'. $indexUrl .'?addtag=$2" title="Hashtag $2">#$2</a>';

      return preg_replace($hashtagPattern, '$1' . $anchor, $description);
  }
  /**
   * Make runs of spaces visible in HTML without relying on <pre>.
   *
   * A space that starts a line, or that directly follows another space, is
   * replaced by &nbsp; (used in descriptions to keep the text formatting).
   *
   * @param string $text input text.
   *
   * @return string formatted text.
   */
  function space2nbsp($text)
  {
      $collapsibleSpace = '/(^| ) /m';
      $visibleSpace = '$1&nbsp;';

      return preg_replace($collapsibleSpace, $visibleSpace, $text);
  }
  /**
   * Format Shaarli's description.
   *
   * The description goes through, in order: URL auto-linking, hashtag auto-linking,
   * space preservation, and conversion of line breaks to <br>.
   *
   * @param string $description shaare's description.
   * @param string $redirector  if a redirector is set, use it to generate links.
   * @param bool   $urlEncode   Use `urlencode()` on the URL after the redirector or not.
   * @param string $indexUrl    URL to Shaarli's index.
   *
   * @return string formatted description.
   */
  function format_description($description, $redirector = '', $urlEncode = true, $indexUrl = '')
  {
      $withLinks = text2clickable($description, $redirector, $urlEncode);
      $withHashtags = hashtag_autolink($withLinks, $indexUrl);
      $withSpaces = space2nbsp($withHashtags);

      return nl2br($withSpaces);
  }
  /**
   * Build the small hash identifying a link.
   *
   * The hash is computed by smallHash() from the creation date, formatted with
   * LinkDB::LINK_DATE_FORMAT, immediately followed by the link ID.
   *
   * @param DateTime $date Link creation date.
   * @param int      $id   Link ID.
   *
   * @return string the small hash generated from link data.
   */
  function link_small_hash($date, $id)
  {
      $formattedDate = $date->format(LinkDB::LINK_DATE_FORMAT);

      return smallHash($formattedDate . $id);
  }