#!/usr/bin/env python3
"""
Podcast Archiver v0.3: Feed parser for local podcast archive creation

Copyright (c) 2014-2017 Jan Willhaus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import sys
import argparse
from argparse import ArgumentTypeError
import re
import unicodedata
import urllib.error
import xml.etree.ElementTree as etree
from os import path, remove, makedirs, access, W_OK
from shutil import copyfileobj
from urllib.parse import urlparse
from urllib.request import urlopen, Request

import feedparser
from feedparser import CharacterEncodingOverride


class writeable_dir(argparse.Action):
    """Argparse action that verifies the given value is an existing, writable directory."""

    def __call__(self, parser, namespace, values, option_string=None):
        prospective_dir = values
        if not path.isdir(prospective_dir):
            raise ArgumentTypeError("%s is not a valid path" % prospective_dir)
        if access(prospective_dir, W_OK):
            setattr(namespace, self.dest, prospective_dir)
        else:
            raise ArgumentTypeError("%s is not a writeable dir" % prospective_dir)
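
# Note: argparse instantiates an Action subclass passed as `action=` and
# invokes it with the parsed value, so -d/--dir is validated the moment it
# is parsed (see the argument definitions in __main__ below).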


class PodcastArchiver:
    _feed_title = ''
    _feedobj = None
    _feed_info_dict = {}

    _userAgent = 'Podcast-Archiver/0.4 (https://github.com/janwh/podcast-archiver)'
    _headers = {'User-Agent': _userAgent}
    _global_info_keys = ['author', 'language', 'link', 'subtitle', 'title', ]
    _episode_info_keys = ['author', 'link', 'subtitle', 'title', ]
    _date_keys = ['published', ]

    savedir = ''
    verbose = 0
    subdirs = False
    update = False
    progress = False
    maximumEpisodes = None

    feedlist = []

    def __init__(self):
        feedparser.USER_AGENT = self._userAgent

    def addArguments(self, args):
        self.verbose = args.verbose or 0
        if self.verbose > 2:
            print('Input arguments:', args)

        for feed in (args.feed or []):
            self.addFeed(feed)

        for opml in (args.opml or []):
            self.parseOpmlFile(opml)

        if args.dir:
            self.savedir = args.dir

        self.subdirs = args.subdirs
        self.update = args.update
        self.progress = args.progress
        self.slugify = args.slugify
        self.maximumEpisodes = args.max_episodes or None

        if self.verbose > 1:
            print("Verbose level: ", self.verbose)

    def addFeed(self, feed):
        if path.isfile(feed):
            # An existing file is treated as a newline-separated list of feed URLs
            with open(feed, 'r') as feedfile:
                self.feedlist += feedfile.read().strip().splitlines()
        else:
            self.feedlist.append(feed)

    def parseOpmlFile(self, opml):
        with opml as file:
            tree = etree.fromstringlist(file)

        for feed in [node.get('xmlUrl') for node
                     in tree.findall("*/outline/[@type='rss']")
                     if node.get('xmlUrl') is not None]:
            self.addFeed(feed)
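
    # For reference, a minimal OPML shape the XPath above matches
    # (illustrative example, not from the original source; real podcatcher
    # exports vary but generally carry one <outline type="rss" xmlUrl="...">
    # per subscribed feed):
    #
    #   <opml version="2.0">
    #     <body>
    #       <outline type="rss" text="Some Show"
    #                xmlUrl="http://example.org/feed.xml"/>
    #     </body>
    #   </opml>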

    def processFeeds(self):
        if self.verbose > 0 and self.update:
            print("Updating archive")

        for feed in self.feedlist:
            if self.verbose > 0:
                print("\nDownloading archive for: " + feed)
            linklist = self.processPodcastLink(feed)
            self.downloadPodcastFiles(linklist)

        if self.verbose > 0:
            print("\nDone.")

    def parseGlobalFeedInfo(self, feedobj=None):
        if feedobj is None:
            feedobj = self._feedobj

        self._feed_info_dict = {}
        if 'feed' in feedobj:
            for key in self._global_info_keys:
                self._feed_info_dict['feed_' + key] = feedobj['feed'].get(key, None)

        return self._feed_info_dict

    @staticmethod
    def slugifyString(filename):
        # Transliterate to ASCII, drop anything that is not a word character,
        # whitespace, hyphen, or dot, then collapse whitespace/hyphen runs
        filename = unicodedata.normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = re.sub(r'[^\w\s\-\.]', '', filename.decode('ascii')).strip()
        filename = re.sub(r'[-\s]+', '-', filename)
        return filename
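
    # A quick sanity check of the slugifier (illustrative values):
    #
    #   PodcastArchiver.slugifyString('My Podcast: Episode 1.mp3')
    #   # -> 'My-Podcast-Episode-1.mp3'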

    def linkToTargetFilename(self, link, must_have_ext=False):
        # Remove HTTP GET parameters from filename by parsing URL properly
        linkpath = urlparse(link).path
        basename = path.basename(linkpath)
        _, ext = path.splitext(basename)
        if must_have_ext and not ext:
            return None

        # If requested, slugify the filename
        if self.slugify:
            basename = PodcastArchiver.slugifyString(basename)
            self._feed_title = PodcastArchiver.slugifyString(self._feed_title)
        else:
            # str.replace returns a new string; reassign to keep the sanitized name
            basename = basename.replace(path.pathsep, '_').replace(path.sep, '_')
            self._feed_title = self._feed_title.replace(path.pathsep, '_').replace(path.sep, '_')

        # Generate the local target path
        if self.subdirs:
            filename = path.join(self.savedir, self._feed_title, basename)
        else:
            filename = path.join(self.savedir, basename)

        return filename
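
    # Example of the GET-parameter stripping above (illustrative URL):
    #
    #   urlparse('http://example.org/episodes/ep1.mp3?token=abc').path
    #   # -> '/episodes/ep1.mp3', so basename becomes 'ep1.mp3'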

    def parseFeedToNextPage(self, feedobj=None):
        if feedobj is None:
            feedobj = self._feedobj

        # Assuming there will only be one link declared as 'next'
        self._feed_next_page = [link['href'] for link in feedobj['feed']['links']
                                if link['rel'] == 'next']
        if len(self._feed_next_page) > 0:
            self._feed_next_page = self._feed_next_page[0]
        else:
            self._feed_next_page = None

        return self._feed_next_page
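
    # Paged feeds advertise their continuation with an RFC 5005 style
    # <atom:link rel="next" href="..."/> element; feedparser surfaces these
    # under feed['links'], which is what the list comprehension above scans.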

    def parseFeedToLinks(self, feed=None):
        if feed is None:
            feed = self._feedobj

        # Try different feed episode layouts: 'items' or 'entries'
        episodeList = feed.get('items', False) or feed.get('entries', False)
        if episodeList:
            linklist = [self.parseEpisode(episode) for episode in episodeList]
            linklist = [link for link in linklist if len(link) > 0]
        else:
            linklist = []

        return linklist

    def parseEpisode(self, episode):
        url = None
        episode_info = {}
        for link in episode['links']:
            if 'type' in link.keys():
                if link['type'].startswith(('audio', 'video')):
                    url = link['href']

        if url is not None:
            for key in self._episode_info_keys + self._date_keys:
                episode_info[key] = episode.get(key, None)
            episode_info['url'] = url

        return episode_info
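
    # feedparser maps RSS <enclosure> tags into each entry's 'links' list
    # (rel='enclosure', carrying the enclosure's MIME type), so checking the
    # 'type' prefix above picks out the actual media file among the entry's
    # ordinary web links.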

    def processPodcastLink(self, link):
        if self.verbose > 0:
            print("1. Gathering link list ...", end="", flush=True)

        self._feed_title = None
        self._feed_next_page = link
        first_page = True
        linklist = []
        while self._feed_next_page is not None:
            if self.verbose > 0:
                print(".", end="", flush=True)

            self._feedobj = feedparser.parse(self._feed_next_page)

            # Escape improper feed-URL
            if 'status' in self._feedobj.keys() and self._feedobj['status'] >= 400:
                print("\nQuery returned HTTP error", self._feedobj['status'])
                return None

            # Escape malformatted XML. If only the character encoding is
            # wrong, continue as long as the reparsing succeeded.
            if self._feedobj['bozo'] == 1:
                if type(self._feedobj['bozo_exception']) is not CharacterEncodingOverride:
                    print('\nDownloaded feed is malformatted on', self._feed_next_page)
                    return None

            if first_page:
                self.parseGlobalFeedInfo()
                first_page = False

            # Parse the feed object for episodes and the next page
            linklist += self.parseFeedToLinks(self._feedobj)
            self._feed_next_page = self.parseFeedToNextPage(self._feedobj)
            if self._feed_title is None:
                self._feed_title = self._feedobj['feed']['title']

            numberOfLinks = len(linklist)

            # On given option, run an update: break at the first episode that
            # already exists locally and drop it and everything older
            if self.update:
                for index, episode_dict in enumerate(linklist):
                    link = episode_dict['url']
                    filename = self.linkToTargetFilename(link)
                    if path.isfile(filename):
                        del linklist[index:]
                        break
                numberOfLinks = len(linklist)

            # On given option, crop linklist to the maximum number of episodes
            if self.maximumEpisodes is not None and self.maximumEpisodes < numberOfLinks:
                linklist = linklist[0:self.maximumEpisodes]
                numberOfLinks = self.maximumEpisodes

            # In update or max-episodes mode there is no need to walk
            # further pages of the feed
            if self.maximumEpisodes is not None or self.update:
                break

        linklist.reverse()
        if self.verbose > 0:
            print(" %d episodes" % numberOfLinks)

        if self.verbose > 2:
            import json
            print('Feed info:\n%s\n' % json.dumps(self._feed_info_dict, indent=2))

        return linklist

    def downloadPodcastFiles(self, linklist):
        if linklist is None or self._feed_title is None:
            return

        nlinks = len(linklist)
        if nlinks > 0:
            if self.verbose == 1:
                print("2. Downloading content ... ", end="")
            elif self.verbose > 1:
                print("2. Downloading content ...")

        for cnt, episode_dict in enumerate(linklist):
            link = episode_dict['url']
            if self.verbose == 1:
                print("\r2. Downloading content ... {0}/{1}"
                      .format(cnt + 1, nlinks), end="", flush=True)
            elif self.verbose > 1:
                print("\n\tDownloading file no. {0}/{1}:\n\t{2}"
                      .format(cnt + 1, nlinks, link))

                if self.verbose > 2:
                    print('\tEpisode info:')
                    for key in episode_dict.keys():
                        print("\t * %10s: %s" % (key, episode_dict[key]))

            # Check existence once ...
            filename = self.linkToTargetFilename(link)

            if self.verbose > 1:
                print("\tLocal filename:", filename)

            if path.isfile(filename):
                if self.verbose > 1:
                    print("\t✓ Already exists.")
                continue

            # Begin downloading
            prepared_request = Request(link, headers=self._headers)
            try:
                with urlopen(prepared_request) as response:
                    # Check existence another time, with resolved link
                    link = response.geturl()
                    total_size = int(response.getheader('content-length', '0'))
                    new_filename = self.linkToTargetFilename(link, must_have_ext=True)

                    if new_filename and new_filename != filename:
                        filename = new_filename
                        if self.verbose > 1:
                            print("\tResolved filename:", filename)

                        if path.isfile(filename):
                            if self.verbose > 1:
                                print("\t✓ Already exists.")
                            continue

                    # Create the subdir, if it does not exist
                    makedirs(path.dirname(filename), exist_ok=True)

                    if self.progress and total_size > 0:
                        from tqdm import tqdm
                        with tqdm(total=total_size, unit='B',
                                  unit_scale=True, unit_divisor=1024) as progress_bar:
                            with open(filename, 'wb') as outfile:
                                self.prettyCopyfileobj(response, outfile,
                                                       callback=progress_bar.update)
                    else:
                        with open(filename, 'wb') as outfile:
                            copyfileobj(response, outfile)

                if self.verbose > 1:
                    print("\t✓ Download successful.")
            except (urllib.error.HTTPError,
                    urllib.error.URLError) as error:
                if self.verbose > 1:
                    print("\t✗ Download failed. Query returned '%s'" % error)
            except KeyboardInterrupt:
                if self.verbose > 0:
                    print("\n\t✗ Unexpected interruption. Deleting unfinished file.")
                remove(filename)
                raise

    def prettyCopyfileobj(self, fsrc, fdst, callback, block_size=8 * 1024):
        # Like shutil.copyfileobj, but reports each written chunk to the
        # given callback so a progress bar can be advanced
        while True:
            buf = fsrc.read(block_size)
            if not buf:
                break
            fdst.write(buf)
            callback(len(buf))
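
# Typical invocations (illustrative; the URL and paths are placeholders):
#
#   ./podcast_archiver.py -f http://example.org/feed.xml -d ~/podcasts -s -p -v
#   ./podcast_archiver.py -o subscriptions.opml -d ~/podcasts -u
#
# The first archives a single feed into per-podcast subdirectories with a
# progress bar; the second updates an existing archive with the new episodes
# of all feeds listed in an OPML export.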


if __name__ == "__main__":
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-o', '--opml', action='append', type=argparse.FileType('r'),
                            help='''Provide an OPML file (as exported by many other podcatchers)
                            containing your feeds. The parameter can be used multiple
                            times, once for every OPML file.''')
        parser.add_argument('-f', '--feed', action='append',
                            help='''Add a feed URL to the archiver. The parameter can be used
                            multiple times, once for every feed.''')
        parser.add_argument('-d', '--dir', action=writeable_dir,
                            help='''Set the output directory of the podcast archive.''')
        parser.add_argument('-s', '--subdirs', action='store_true',
                            help='''Place downloaded podcasts in separate subdirectories per
                            podcast (named with their title).''')
        parser.add_argument('-u', '--update', action='store_true',
                            help='''Force the archiver to only update the feeds with newly added
                            episodes. As soon as the first old episode is found in the
                            download directory, further downloading is interrupted.''')
        parser.add_argument('-v', '--verbose', action='count',
                            help='''Increase the level of verbosity while downloading.''')
        parser.add_argument('-p', '--progress', action='store_true',
                            help='''Show progress bars while downloading episodes.''')
        parser.add_argument('-S', '--slugify', action='store_true',
                            help='''Clean all folder and file names of potentially weird
                            characters that might cause trouble with one or another
                            target filesystem.''')
        parser.add_argument('-m', '--max-episodes', type=int,
                            help='''Only download the given number of episodes per podcast
                            feed. Useful if you don't really need the entire backlog.''')

        args = parser.parse_args()

        pa = PodcastArchiver()
        pa.addArguments(args)
        pa.processFeeds()
    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')
    except FileNotFoundError as error:
        sys.exit('\nERROR: %s' % error)
    except ArgumentTypeError as error:
        sys.exit('\nERROR: Your config is invalid: %s' % error)