<?php
// LinkUtils.php
  /**
   * Extract the title from an HTML document.
   *
   * The first <title> element found is used; the tag may carry attributes
   * (e.g. <title lang="en">). Line breaks inside the title are turned into
   * spaces and surrounding whitespace is trimmed.
   *
   * @param string $html HTML content where to look for a title.
   *
   * @return bool|string Extracted title if found, false otherwise.
   */
  function html_extract_title($html)
  {
      // The capture is lazy on purpose: with the "s" flag a greedy ".*" would
      // run up to the LAST </title> of the document (e.g. one nested in an
      // inline <svg>) and return markup instead of the page title.
      if (preg_match('!<title\b[^>]*>(.*?)</title>!is', $html, $matches)) {
          // Replace "\r\n" first so a Windows line break becomes a single space.
          return trim(str_replace(["\r\n", "\r", "\n"], ' ', $matches[1]));
      }

      return false;
  }
  /**
   * Work out which charset applies to a downloaded page.
   *
   * Sources are consulted in this order, and the first one that yields a
   * value wins:
   *   1. the HTTP headers (Content-Type),
   *   2. the HTML content itself (<meta charset> tag),
   *   3. the supplied default charset.
   *
   * @param array  $headers        HTTP headers array.
   * @param string $htmlContent    HTML content where to look for charset.
   * @param string $defaultCharset Charset returned when neither source provides one.
   *
   * @return string Determined charset.
   */
  function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
  {
      $fromHeaders = headers_extract_charset($headers);
      if ($fromHeaders) {
          return $fromHeaders;
      }

      // The markup is only inspected when the headers gave nothing.
      $fromMarkup = html_extract_charset($htmlContent);

      return $fromMarkup ? $fromMarkup : $defaultCharset;
  }
  /**
   * Extract charset from HTTP headers if it's defined.
   *
   * The Content-Type header is looked up case-insensitively, and the
   * "charset=" parameter is matched case-insensitively as well. The value may
   * be bare or wrapped in single/double quotes; the quotes are not returned.
   *
   * @param array $headers HTTP headers array (header name => value). A value
   *                       may itself be an array when the header was received
   *                       several times; the last occurrence is used then.
   *
   * @return bool|string Charset string if found (lowercase), false otherwise.
   */
  function headers_extract_charset($headers)
  {
      if (! is_array($headers)) {
          return false;
      }

      // HTTP header names are case-insensitive, so normalise the keys before
      // looking up Content-Type.
      $headers = array_change_key_case($headers, CASE_LOWER);
      if (empty($headers['content-type'])) {
          return false;
      }

      $contentType = $headers['content-type'];
      if (is_array($contentType)) {
          // Repeated header: keep the most recent value.
          // NOTE(review): assumes the last entry belongs to the final response - confirm with callers.
          $contentType = end($contentType);
      }
      if (! is_string($contentType)) {
          return false;
      }

      // Quotes are excluded from the capture so that charset="utf-8" yields
      // utf-8 rather than utf-8" (trailing quote).
      preg_match('/charset=["\']?([^"\';\s]+)/i', $contentType, $match);
      if (! empty($match[1])) {
          return strtolower(trim($match[1]));
      }

      return false;
  }
  /**
   * Extract charset from HTML content (<meta> tag).
   *
   * Both the short form (<meta charset="...">) and the http-equiv form
   * (<meta http-equiv="Content-Type" content="text/html; charset=...">) are
   * recognised. The value may be bare, single-quoted or double-quoted, and
   * the tag may carry further attributes after the charset.
   *
   * @param string $html HTML content where to look for charset.
   *
   * @return bool|string Charset string if found (lowercase), false otherwise.
   */
  function html_extract_charset($html)
  {
      // "[^>]*?" keeps the search inside a single <meta ...> tag, and the
      // capture stops at the first quote, space, ";", "/" or ">" so neither
      // quotes nor following attributes end up in the result.
      preg_match('#<meta\s[^>]*?charset\s*=\s*["\']?([^"\'>/;\s]+)#si', $html, $enc);
      if (! empty($enc[1])) {
          return strtolower($enc[1]);
      }

      return false;
  }