TubeUp.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. import os
  2. import sys
  3. import re
  4. import glob
  5. import time
  6. import json
  7. import logging
  8. import internetarchive
  9. from internetarchive.config import parse_config_file
  10. from datetime import datetime
  11. from yt_dlp import YoutubeDL
  12. from .utils import (sanitize_identifier, check_is_file_empty,
  13. EMPTY_ANNOTATION_FILE)
  14. from logging import getLogger
  15. from urllib.parse import urlparse
  16. from tubeup import __version__
  17. DOWNLOAD_DIR_NAME = 'downloads'
  18. class TubeUp(object):
  19. def __init__(self,
  20. verbose=False,
  21. dir_path='~/.tubeup',
  22. ia_config_path=None,
  23. output_template=None,
  24. get_comments=False):
  25. """
  26. `tubeup` is a tool to archive YouTube by downloading the videos and
  27. uploading it back to the archive.org.
  28. :param verbose: A boolean, True means all loggings will be
  29. printed out to stdout.
  30. :param dir_path: A path to directory that will be used for
  31. saving the downloaded resources. Default to
  32. '~/.tubeup'.
  33. :param ia_config_path: Path to an internetarchive config file, will
  34. be used in uploading the file.
  35. :param output_template: A template string that will be used to
  36. generate the output filenames.
  37. :param get_comments: A boolean, True means that the comments will
  38. be scraped.
  39. """
  40. self.dir_path = dir_path
  41. self.verbose = verbose
  42. self.ia_config_path = ia_config_path
  43. self.logger = getLogger(__name__)
  44. if output_template is None:
  45. self.output_template = '%(id)s.%(ext)s'
  46. else:
  47. self.output_template = output_template
  48. self.get_comments = get_comments
  49. # Just print errors in quiet mode
  50. if not self.verbose:
  51. self.logger.setLevel(logging.ERROR)
  52. @property
  53. def dir_path(self):
  54. return self._dir_path
  55. @dir_path.setter
  56. def dir_path(self, dir_path):
  57. """
  58. Set a directory to be the saving directory for resources that have
  59. been downloaded.
  60. :param dir_path: Path to a directory that will be used to save the
  61. videos, if it not created yet, the directory
  62. will be created.
  63. """
  64. extended_usr_dir_path = os.path.expanduser(dir_path)
  65. # Create the directories.
  66. os.makedirs(
  67. os.path.join(extended_usr_dir_path, DOWNLOAD_DIR_NAME),
  68. exist_ok=True)
  69. self._dir_path = {
  70. 'root': extended_usr_dir_path,
  71. 'downloads': os.path.join(extended_usr_dir_path,
  72. DOWNLOAD_DIR_NAME)
  73. }
  74. def get_resource_basenames(self, urls,
  75. cookie_file=None, proxy_url=None,
  76. ydl_username=None, ydl_password=None,
  77. use_download_archive=False,
  78. ignore_existing_item=False):
  79. """
  80. Get resource basenames from an url.
  81. :param urls: A list of urls that will be downloaded with
  82. youtubedl.
  83. :param cookie_file: A cookie file for YoutubeDL.
  84. :param proxy_url: A proxy url for YoutubeDL.
  85. :param ydl_username: Username that will be used to download the
  86. resources with youtube_dl.
  87. :param ydl_password: Password of the related username, will be used
  88. to download the resources with youtube_dl.
  89. :param use_download_archive: Record the video url to the download archive.
  90. This will download only videos not listed in
  91. the archive file. Record the IDs of all
  92. downloaded videos in it.
  93. :param ignore_existing_item: Ignores the check for existing items on archive.org.
  94. :return: Set of videos basename that has been downloaded.
  95. """
  96. downloaded_files_basename = set()
  97. def check_if_ia_item_exists(infodict):
  98. itemname = sanitize_identifier('%s-%s' % (infodict['extractor'],
  99. infodict['display_id']))
  100. item = internetarchive.get_item(itemname)
  101. if item.exists and self.verbose:
  102. print("\n:: Item already exists. Not downloading.")
  103. print('Title: %s' % infodict['title'])
  104. print('Video URL: %s\n' % infodict['webpage_url'])
  105. return 1
  106. return 0
  107. def ydl_progress_hook(d):
  108. if d['status'] == 'downloading' and self.verbose:
  109. if d.get('_total_bytes_str') is not None:
  110. msg_template = ('%(_percent_str)s of %(_total_bytes_str)s '
  111. 'at %(_speed_str)s ETA %(_eta_str)s')
  112. elif d.get('_total_bytes_estimate_str') is not None:
  113. msg_template = ('%(_percent_str)s of '
  114. '~%(_total_bytes_estimate_str)s at '
  115. '%(_speed_str)s ETA %(_eta_str)s')
  116. elif d.get('_downloaded_bytes_str') is not None:
  117. if d.get('_elapsed_str'):
  118. msg_template = ('%(_downloaded_bytes_str)s at '
  119. '%(_speed_str)s (%(_elapsed_str)s)')
  120. else:
  121. msg_template = ('%(_downloaded_bytes_str)s '
  122. 'at %(_speed_str)s')
  123. else:
  124. msg_template = ('%(_percent_str)s % at '
  125. '%(_speed_str)s ETA %(_eta_str)s')
  126. process_msg = '\r[download] ' + (msg_template % d) + '\033[K'
  127. sys.stdout.write(process_msg)
  128. sys.stdout.flush()
  129. if d['status'] == 'finished':
  130. msg = 'Downloaded %s' % d['filename']
  131. self.logger.debug(d)
  132. self.logger.info(msg)
  133. if self.verbose:
  134. print(msg)
  135. if d['status'] == 'error':
  136. # TODO: Complete the error message
  137. msg = 'Error when downloading the video'
  138. self.logger.error(msg)
  139. if self.verbose:
  140. print(msg)
  141. ydl_opts = self.generate_ydl_options(ydl_progress_hook,
  142. cookie_file, proxy_url,
  143. ydl_username, ydl_password,
  144. use_download_archive)
  145. with YoutubeDL(ydl_opts) as ydl:
  146. for url in urls:
  147. if not ignore_existing_item:
  148. # Get the info dict of the url, without getting comments
  149. ydl_opts["getcomments"] = False
  150. with YoutubeDL(ydl_opts) as ydl_nocomments:
  151. info_dict = ydl_nocomments.extract_info(url, download=False)
  152. if info_dict.get('_type', 'video') == 'playlist':
  153. for entry in info_dict['entries']:
  154. if ydl.in_download_archive(entry):
  155. continue
  156. if check_if_ia_item_exists(entry) == 0:
  157. ydl.extract_info(entry['webpage_url'])
  158. downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry))
  159. else:
  160. ydl.record_download_archive(entry)
  161. else:
  162. if ydl.in_download_archive(info_dict):
  163. continue
  164. if check_if_ia_item_exists(info_dict) == 0:
  165. ydl.extract_info(url)
  166. downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, info_dict))
  167. else:
  168. ydl.record_download_archive(info_dict)
  169. else:
  170. info_dict = ydl.extract_info(url)
  171. downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, info_dict))
  172. self.logger.debug(
  173. 'Basenames obtained from url (%s): %s'
  174. % (url, downloaded_files_basename))
  175. return downloaded_files_basename
  176. def create_basenames_from_ydl_info_dict(self, ydl, info_dict):
  177. """
  178. Create basenames from YoutubeDL info_dict.
  179. :param ydl: A `youtube_dl.YoutubeDL` instance.
  180. :param info_dict: A ydl info_dict that will be used to create
  181. the basenames.
  182. :return: A set that contains basenames that created from
  183. the `info_dict`.
  184. """
  185. info_type = info_dict.get('_type', 'video')
  186. self.logger.debug('Creating basenames from ydl info dict with type %s'
  187. % info_type)
  188. filenames = set()
  189. if info_type == 'playlist':
  190. # Iterate and get the filenames through the playlist
  191. for video in info_dict['entries']:
  192. filenames.add(ydl.prepare_filename(video))
  193. else:
  194. filenames.add(ydl.prepare_filename(info_dict))
  195. basenames = set()
  196. for filename in filenames:
  197. filename_without_ext = os.path.splitext(filename)[0]
  198. file_basename = re.sub(r'(\.f\d+)', '', filename_without_ext)
  199. basenames.add(file_basename)
  200. return basenames
  201. def generate_ydl_options(self,
  202. ydl_progress_hook,
  203. cookie_file=None,
  204. proxy_url=None,
  205. ydl_username=None,
  206. ydl_password=None,
  207. use_download_archive=False,
  208. ydl_output_template=None):
  209. """
  210. Generate a dictionary that contains options that will be used
  211. by yt-dlp.
  212. :param ydl_progress_hook: A function that will be called during the
  213. download process by youtube_dl.
  214. :param proxy_url: A proxy url for YoutubeDL.
  215. :param ydl_username: Username that will be used to download the
  216. resources with youtube_dl.
  217. :param ydl_password: Password of the related username, will be
  218. used to download the resources with
  219. youtube_dl.
  220. :param use_download_archive: Record the video url to the download archive.
  221. This will download only videos not listed in
  222. the archive file. Record the IDs of all
  223. downloaded videos in it.
  224. :return: A dictionary that contains options that will
  225. be used by youtube_dl.
  226. """
  227. ydl_opts = {
  228. 'outtmpl': os.path.join(self.dir_path['downloads'],
  229. self.output_template),
  230. 'restrictfilenames': True,
  231. 'quiet': not self.verbose,
  232. 'verbose': self.verbose,
  233. 'progress_with_newline': True,
  234. 'forcetitle': True,
  235. 'continuedl': True,
  236. 'retries': 9001,
  237. 'fragment_retries': 9001,
  238. 'forcejson': False,
  239. 'writeinfojson': True,
  240. 'writedescription': True,
  241. 'getcomments': self.get_comments,
  242. 'writethumbnail': True,
  243. 'writeannotations': True,
  244. 'writesubtitles': True,
  245. 'allsubtitles': True,
  246. 'ignoreerrors': True, # Geo-blocked,
  247. # copyrighted/private/deleted
  248. # will be printed to STDOUT and channel
  249. # ripping will continue uninterupted,
  250. # use with verbose off
  251. 'fixup': 'warn', # Slightly more verbosity for debugging
  252. # problems
  253. 'nooverwrites': True, # Don't touch what's already been
  254. # downloaded speeds things
  255. 'consoletitle': True, # Download percentage in console title
  256. 'prefer_ffmpeg': True, # `ffmpeg` is better than `avconv`,
  257. # let's prefer it's use
  258. # Warns on out of date youtube-dl script, helps debugging for
  259. # youtube-dl devs
  260. 'call_home': False,
  261. 'logger': self.logger,
  262. 'progress_hooks': [ydl_progress_hook]
  263. }
  264. if cookie_file is not None:
  265. ydl_opts['cookiefile'] = cookie_file
  266. if proxy_url is not None:
  267. ydl_opts['proxy'] = proxy_url
  268. if ydl_username is not None:
  269. ydl_opts['username'] = ydl_username
  270. if ydl_password is not None:
  271. ydl_opts['password'] = ydl_password
  272. if use_download_archive:
  273. ydl_opts['download_archive'] = os.path.join(self.dir_path['root'],
  274. '.ytdlarchive')
  275. return ydl_opts
  276. def upload_ia(self, videobasename, custom_meta=None):
  277. """
  278. Upload video to archive.org.
  279. :param videobasename: A video base name.
  280. :param custom_meta: A custom meta, will be used by internetarchive
  281. library when uploading to archive.org.
  282. :return: A tuple containing item name and metadata used
  283. when uploading to archive.org and whether the item
  284. already exists.
  285. """
  286. json_metadata_filepath = videobasename + '.info.json'
  287. with open(json_metadata_filepath, 'r', encoding='utf-8') as f:
  288. vid_meta = json.load(f)
  289. itemname = ('%s-%s' % (vid_meta['extractor'],
  290. vid_meta['display_id']))
  291. # Exit if video download did not complete, don't upload .part files to IA
  292. for ext in ['*.part', '*.f303*', '*.f302*', '*.ytdl', '*.f251*', '*.248*', '*.f247*', '*.temp']:
  293. if glob.glob(videobasename + ext):
  294. msg = 'Video download incomplete, re-attempt archival attempt, exiting...'
  295. raise Exception(msg)
  296. # Replace illegal characters within identifer
  297. itemname = sanitize_identifier(itemname)
  298. metadata = self.create_archive_org_metadata_from_youtubedl_meta(
  299. vid_meta)
  300. # Delete empty description file
  301. description_file_path = videobasename + '.description'
  302. if (os.path.exists(description_file_path) and
  303. (('description' in vid_meta and
  304. vid_meta['description'] == '') or
  305. check_is_file_empty(description_file_path))):
  306. os.remove(description_file_path)
  307. # Delete empty annotations.xml file so it isn't uploaded
  308. annotations_file_path = videobasename + '.annotations.xml'
  309. if (os.path.exists(annotations_file_path) and
  310. (('annotations' in vid_meta and
  311. vid_meta['annotations'] in {'', EMPTY_ANNOTATION_FILE}) or
  312. check_is_file_empty(annotations_file_path))):
  313. os.remove(annotations_file_path)
  314. # Upload all files with videobase name: e.g. video.mp4,
  315. # video.info.json, video.srt, etc.
  316. files_to_upload = glob.glob(videobasename + '*')
  317. # Upload the item to the Internet Archive
  318. item = internetarchive.get_item(itemname)
  319. if custom_meta:
  320. metadata.update(custom_meta)
  321. # Parse internetarchive configuration file.
  322. parsed_ia_s3_config = parse_config_file(self.ia_config_path)[2]['s3']
  323. s3_access_key = parsed_ia_s3_config['access']
  324. s3_secret_key = parsed_ia_s3_config['secret']
  325. if None in {s3_access_key, s3_secret_key}:
  326. msg = ('`internetarchive` configuration file is not configured'
  327. ' properly.')
  328. self.logger.error(msg)
  329. if self.verbose:
  330. print(msg)
  331. raise Exception(msg)
  332. item.upload(files_to_upload, metadata=metadata, retries=9001,
  333. request_kwargs=dict(timeout=9001), delete=True,
  334. verbose=self.verbose, access_key=s3_access_key,
  335. secret_key=s3_secret_key)
  336. return itemname, metadata
  337. def archive_urls(self, urls, custom_meta=None,
  338. cookie_file=None, proxy=None,
  339. ydl_username=None, ydl_password=None,
  340. use_download_archive=False,
  341. ignore_existing_item=False):
  342. """
  343. Download and upload videos from youtube_dl supported sites to
  344. archive.org
  345. :param urls: List of url that will be downloaded and uploaded
  346. to archive.org
  347. :param custom_meta: A custom metadata that will be used when
  348. uploading the file with archive.org.
  349. :param cookie_file: A cookie file for YoutubeDL.
  350. :param proxy_url: A proxy url for YoutubeDL.
  351. :param ydl_username: Username that will be used to download the
  352. resources with youtube_dl.
  353. :param ydl_password: Password of the related username, will be used
  354. to download the resources with youtube_dl.
  355. :param use_download_archive: Record the video url to the download archive.
  356. This will download only videos not listed in
  357. the archive file. Record the IDs of all
  358. downloaded videos in it.
  359. :param ignore_existing_item: Ignores the check for existing items on archive.org.
  360. :return: Tuple containing identifier and metadata of the
  361. file that has been uploaded to archive.org.
  362. """
  363. downloaded_file_basenames = self.get_resource_basenames(
  364. urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive,
  365. ignore_existing_item)
  366. for basename in downloaded_file_basenames:
  367. identifier, meta = self.upload_ia(basename, custom_meta)
  368. yield identifier, meta
  369. @staticmethod
  370. def determine_collection_type(url):
  371. """
  372. Determine collection type for an url.
  373. :param url: URL that the collection type will be determined.
  374. :return: String, name of a collection.
  375. """
  376. if urlparse(url).netloc == 'soundcloud.com':
  377. return 'opensource_audio'
  378. return 'opensource_movies'
  379. @staticmethod
  380. def determine_licenseurl(vid_meta):
  381. """
  382. Determine licenseurl for an url
  383. :param vid_meta:
  384. :return:
  385. """
  386. licenseurl = ''
  387. licenses = {
  388. "Creative Commons Attribution license (reuse allowed)": "https://creativecommons.org/licenses/by/3.0/",
  389. "Attribution-NonCommercial-ShareAlike": "https://creativecommons.org/licenses/by-nc-sa/2.0/",
  390. "Attribution-NonCommercial": "https://creativecommons.org/licenses/by-nc/2.0/",
  391. "Attribution-NonCommercial-NoDerivs": "https://creativecommons.org/licenses/by-nc-nd/2.0/",
  392. "Attribution": "https://creativecommons.org/licenses/by/2.0/",
  393. "Attribution-ShareAlike": "https://creativecommons.org/licenses/by-sa/2.0/",
  394. "Attribution-NoDerivs": "https://creativecommons.org/licenses/by-nd/2.0/"
  395. }
  396. if 'license' in vid_meta and vid_meta['license']:
  397. licenseurl = licenses.get(vid_meta['license'])
  398. return licenseurl
  399. @staticmethod
  400. def create_archive_org_metadata_from_youtubedl_meta(vid_meta):
  401. """
  402. Create an archive.org from youtubedl-generated metadata.
  403. :param vid_meta: A dict containing youtubedl-generated metadata.
  404. :return: A dict containing metadata to be used by
  405. internetarchive library.
  406. """
  407. title = '%s' % (vid_meta['title'])
  408. videourl = vid_meta['webpage_url']
  409. collection = TubeUp.determine_collection_type(videourl)
  410. # Some video services don't tell you the uploader,
  411. # use our program's name in that case.
  412. try:
  413. if vid_meta['extractor_key'] == 'TwitchClips' and 'creator' in vid_meta and vid_meta['creator']:
  414. uploader = vid_meta['creator']
  415. elif 'uploader' in vid_meta and vid_meta['uploader']:
  416. uploader = vid_meta['uploader']
  417. elif 'uploader_url' in vid_meta and vid_meta['uploader_url']:
  418. uploader = vid_meta['uploader_url']
  419. else:
  420. uploader = 'tubeup.py'
  421. except TypeError: # apparently uploader is null as well
  422. uploader = 'tubeup.py'
  423. uploader_url = vid_meta.get('uploader_url', videourl)
  424. try: # some videos don't give an upload date
  425. d = datetime.strptime(vid_meta['upload_date'], '%Y%m%d')
  426. upload_date = d.isoformat().split('T')[0]
  427. upload_year = upload_date[:4] # 20150614 -> 2015
  428. except (KeyError, TypeError):
  429. # Use current date and time as default values
  430. upload_date = time.strftime("%Y-%m-%d")
  431. upload_year = time.strftime("%Y")
  432. # load up tags into an IA compatible semicolon-separated string
  433. # example: Youtube;video;
  434. tags_string = '%s;video;' % vid_meta['extractor_key']
  435. if 'categories' in vid_meta:
  436. # add categories as tags as well, if they exist
  437. try:
  438. for category in vid_meta['categories']:
  439. tags_string += '%s;' % category
  440. except Exception:
  441. print("No categories found.")
  442. if 'tags' in vid_meta: # some video services don't have tags
  443. try:
  444. if 'tags' in vid_meta is None:
  445. tags_string += '%s;' % vid_meta['id']
  446. tags_string += '%s;' % 'video'
  447. else:
  448. for tag in vid_meta['tags']:
  449. tags_string += '%s;' % tag
  450. except Exception:
  451. print("Unable to process tags successfully.")
  452. # license
  453. licenseurl = TubeUp.determine_licenseurl(vid_meta)
  454. # if there is no description don't upload the empty .description file
  455. description_text = vid_meta.get('description', '')
  456. if description_text is None:
  457. description_text = ''
  458. # archive.org does not display raw newlines
  459. description_text = re.sub('\r?\n', '<br>', description_text)
  460. description = ('{0} <br/><br/>Source: <a href="{1}">{2}</a>'
  461. '<br/>Uploader: <a href="{3}">{4}</a>').format(
  462. description_text, videourl, videourl, uploader_url, uploader)
  463. metadata = dict(
  464. mediatype=('audio' if collection == 'opensource_audio'
  465. else 'movies'),
  466. creator=uploader,
  467. collection=collection,
  468. title=title,
  469. description=description,
  470. date=upload_date,
  471. year=upload_year,
  472. subject=tags_string,
  473. originalurl=videourl,
  474. licenseurl=licenseurl,
  475. # Set 'scanner' metadata pair to allow tracking of TubeUp
  476. # powered uploads, per request from archive.org
  477. scanner='TubeUp Video Stream Mirroring Application {}'.format(__version__))
  478. return metadata