# media.py
# Copied from social48.tools.
# In the future, import from that project directly.
import hashlib
import os
import shutil
import tempfile
import time
from mimetypes import guess_type

import requests
  10. __all__ = ['checksum', 'guess_mimetype', 'extension', 'save_from_url']
  11. # TODO: rename this module so it doesn't clash with variables named "media"
  12. _extension_lookup = {
  13. "image/jpeg": "jpg",
  14. "image/png": "png",
  15. "image/gif": "gif",
  16. "video/mp4": "mp4",
  17. "video/mpeg4": "mp4"
  18. }
  19. # http://stackoverflow.com/a/24847608/3380530
  20. def hashsum(hash, filename):
  21. with open(filename, 'rb') as infp:
  22. for chunk in iter(lambda: infp.read(128 * hash.block_size), b""):
  23. hash.update(chunk)
  24. return hash.hexdigest()
  25. def md5sum(filename):
  26. md5hash = hashlib.md5()
  27. return hashsum(md5hash, filename)
  28. def blake2bsum(filename):
  29. b2bhash = hashlib.blake2b(digest_size=32)
  30. try:
  31. return hashsum(b2bhash, filename)
  32. except FileNotFoundError:
  33. return ""
  34. checksum = blake2bsum
  35. def guess_mimetype(url):
  36. """
  37. Tries to guess the file type based on extension, assumes jpeg if it can't
  38. """
  39. guess = guess_type(url)[0]
  40. return guess if guess else 'image/jpeg'
  41. def extension(mimetype):
  42. return _extension_lookup.get(mimetype, 'UNKNOWN')
  43. class MediaFileExistsError(FileExistsError):
  44. def __init__(self, *args, size_bytes, checksum, tempfile):
  45. super().__init__(self, *args)
  46. self.size_bytes = size_bytes
  47. self.checksum = checksum
  48. self.tempfile = tempfile
  49. class MediaURLNotFound(Exception):
  50. pass
  51. def save_from_url(url, outfile, tmpdir=None, skip_exists=False, raise_exists=False):
  52. """
  53. Simple media file saver.
  54. Saves directly accessed videos and images to a specified directory.
  55. :param url: URL to download from
  56. :param outfile: Destination to save to
  57. :param tmpdir: Temp directory (e.g. to utilise SSD storage)
  58. :param skip_exists: Checks if the file already exists before downloading, use with caution
  59. :return: A tuple with two elements:
  60. The bytes written if successful, else 0
  61. The checksum, if successful, else ""
  62. Raises:
  63. MediaURLNotFound: URL 404s
  64. MediaFileExistsError:
  65. raised when raise_exists and the destination file exists and checksums do not match
  66. stores the tempfile path, file size, and checksum as tempfile, size_bytes, and checksum attributes
  67. Also may raise any exception raised by requests.Response.raise_for_status()
  68. """
  69. # TODO: return code
  70. if '/' in outfile:
  71. destdir = outfile.rsplit('/', 1)[0]
  72. os.makedirs(destdir, exist_ok=True)
  73. attempts = 0
  74. max_attempts = 5
  75. if skip_exists:
  76. if os.path.exists(outfile):
  77. return os.path.getsize(outfile), checksum(outfile)
  78. while True:
  79. try:
  80. r = requests.get(url, timeout=(3, 10), stream=True)
  81. r.raise_for_status()
  82. except requests.exceptions.HTTPError as e:
  83. # TODO: handle other meaningful error codes
  84. if e.response.status_code == 404:
  85. raise MediaURLNotFound
  86. attempts += 1
  87. error = str(e)
  88. wait = 2 ** attempts
  89. except (requests.exceptions.Timeout, requests.exceptions.ConnectTimeout) as e:
  90. attempts += 1
  91. error = str(e)
  92. wait = 2 ** attempts * 0.5
  93. except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
  94. attempts += 1
  95. max_attempts += 1
  96. error = str(e)
  97. wait = 3 ** attempts * 5
  98. else:
  99. fno, tmppath = tempfile.mkstemp(dir=tmpdir)
  100. bytes_written = 0
  101. with open(tmppath, 'wb') as outfp:
  102. for chunk in r.iter_content(chunk_size=2048):
  103. bytes_written += outfp.write(chunk)
  104. os.close(fno)
  105. cs = checksum(tmppath)
  106. # TODO: test if outfile exists, if so compare first
  107. if os.path.exists(outfile):
  108. if checksum(outfile) == cs:
  109. os.remove(tmppath)
  110. else:
  111. err = "{} exists.\nCheck {} for downloaded file".format(outfile, tmppath)
  112. if raise_exists:
  113. raise MediaFileExistsError(err, size_bytes=bytes_written, checksum=cs, tempfile=tmppath)
  114. else:
  115. print(err)
  116. bytes_written = 0
  117. else:
  118. shutil.move(tmppath, outfile)
  119. return bytes_written, cs
  120. if attempts < max_attempts:
  121. time.sleep(wait) # this definitely needs to happen in another thread
  122. else:
  123. return 0, ""