# __init__.py — Reddrip: downloads image submissions from subreddits.
  1. import os
  2. import logging
  3. import time
  4. import re
  5. import urlparse
  6. from redis.exceptions import ResponseError
  7. import requests
  8. import praw
  9. log = logging.getLogger(__name__)
  10. class Ripper(object):
  11. timeout = 40
  12. agent = "Reddrip v1.0"
  13. submission_limit = 50
  14. supported_exts = [ "jpg", "png", "gif" ]
  15. def __init__(self, redis_conn, output_dir):
  16. self.redis_conn = redis_conn
  17. self.output_dir = os.path.abspath(output_dir)
  18. self.reddit = praw.Reddit(user_agent=self.agent)
  19. def _clean_title(self, title):
  20. return re.sub(r'([^\s\w]|_)+', '', title)
  21. def _get_ext(self, url):
  22. url_info = urlparse.urlparse(url)
  23. return url_info.path.split(".")[-1].lower()
  24. def _seen(self, sub_id, subreddit):
  25. """ Check if we already have this submission. """
  26. return self.redis_conn.sismember(
  27. "stat:%s:processed:all" % subreddit, sub_id
  28. )
  29. def _save(self, sub_id, url, subreddit, filename, ext):
  30. """ Save submission and mark it as processed. """
  31. log.debug("Processing %s" % sub_id)
  32. subdir = self.output_dir + "/" + subreddit
  33. if not os.path.exists(subdir):
  34. os.makedirs(subdir)
  35. suffix = 2
  36. full_file = "%s/%s.%s" % (subdir, filename, ext)
  37. while os.path.exists(full_file):
  38. full_file = "%s/%s_%s.%s" % (subdir, filename, suffix, ext)
  39. suffix += 1
  40. r = requests.get(url)
  41. with open(full_file, "wb") as output:
  42. output.write(r.content)
  43. # Increment saved number.
  44. self.redis_conn.incr("stat:%s:saved:count" % subreddit)
  45. # Add the download size to the total size.
  46. try:
  47. self.redis_conn.incrby(
  48. "stat:%s:saved:size" % subreddit,
  49. r.headers["content-length"]
  50. )
  51. except (AttributeError, KeyError, ResponseError):
  52. log.debug("Content-length is wrong: %s" % r.headers)
  53. # Add the donwload time.
  54. datehour = time.strftime("%Y%m%d%H", time.localtime())
  55. if not self.redis_conn.sismember(
  56. "stat:%s:dates" % subreddit, datehour
  57. ):
  58. self.redis_conn.sadd(
  59. "stat:%s:dates" % subreddit, datehour
  60. )
  61. self.redis_conn.incr("stat:%s:date:%s" % (subreddit, datehour))
  62. log.info("Saved %s in %s: %s" % (sub_id, subreddit, filename))
  63. time.sleep(1)
  64. def process(self, sub):
  65. log.debug("Processing %s" % sub["name"])
  66. if not self.redis_conn.sismember("subreddits", sub["name"]):
  67. self.redis_conn.sadd("subreddits", sub["name"])
  68. if sub["type"] == "hot":
  69. submissions = self.reddit.get_subreddit(sub["name"]).get_hot(
  70. limit=self.submission_limit
  71. )
  72. elif sub["type"] == "new":
  73. submissions = self.reddit.get_subreddit(sub["name"]).get_new(
  74. limit=self.submission_limit
  75. )
  76. else:
  77. raise Exception("Unsupported type %s" % sub["type"])
  78. for submission in submissions:
  79. if self._seen(submission.id, sub["name"]):
  80. continue
  81. self.redis_conn.sadd(
  82. "stat:%s:processed:all" % sub["name"], submission.id
  83. )
  84. ext = self._get_ext(submission.url)
  85. if ext not in self.supported_exts:
  86. log.debug("Skipped %s, not supported extension in: %s" % (
  87. submission.id, submission.url
  88. ))
  89. continue
  90. self._save(
  91. sub_id=submission.id,
  92. url=submission.url,
  93. subreddit=sub["name"],
  94. filename=self._clean_title(submission.title),
  95. ext=ext,
  96. )
  97. time.sleep(self.timeout)