#!/usr/bin/env python3
"""
Podcast Archiver v0.3: Feed parser for local podcast archive creation

Copyright (c) 2014-2017 Jan Willhaus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import sys
import argparse
from argparse import ArgumentTypeError
import feedparser
from feedparser import CharacterEncodingOverride
from urllib.request import urlopen, Request
import urllib.error
from shutil import copyfileobj
from os import path, remove, makedirs, access, W_OK
from urllib.parse import urlparse
import unicodedata
import re
import xml.etree.ElementTree as etree


class writeable_dir(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        prospective_dir = values
        if not path.isdir(prospective_dir):
            raise ArgumentTypeError("%s is not a valid path" % prospective_dir)
        if access(prospective_dir, W_OK):
            setattr(namespace, self.dest, prospective_dir)
        else:
            raise ArgumentTypeError("%s is not a writeable dir" % prospective_dir)


class PodcastArchiver:
    _feed_title = ''
    _feedobj = None
    _feed_info_dict = {}

    _userAgent = 'Podcast-Archiver/0.4 (https://github.com/janwh/podcast-archiver)'
    _headers = {'User-Agent': _userAgent}
    _global_info_keys = ['author', 'language', 'link', 'subtitle', 'title', ]
    _episode_info_keys = ['author', 'link', 'subtitle', 'title', ]
    _date_keys = ['published', ]

    savedir = ''
    verbose = 0
    subdirs = False
    update = False
    progress = False
    maximumEpisodes = None
    feedlist = []

    def __init__(self):
        feedparser.USER_AGENT = self._userAgent
    def addArguments(self, args):
        self.verbose = args.verbose or 0
        if self.verbose > 2:
            print('Input arguments:', args)

        for feed in (args.feed or []):
            self.addFeed(feed)

        for opml in (args.opml or []):
            self.parseOpmlFile(opml)

        if args.dir:
            self.savedir = args.dir

        self.subdirs = args.subdirs
        self.update = args.update
        self.progress = args.progress
        self.slugify = args.slugify
        self.maximumEpisodes = args.max_episodes or None

        if self.verbose > 1:
            print("Verbose level: ", self.verbose)
    def addFeed(self, feed):
        if path.isfile(feed):
            # The argument may also be a text file with one feed URL per line
            with open(feed, 'r') as feedfile:
                self.feedlist += feedfile.read().strip().splitlines()
        else:
            self.feedlist.append(feed)
    def parseOpmlFile(self, opml):
        with opml as file:
            tree = etree.fromstringlist(file)

        for feed in [node.get('xmlUrl') for node
                     in tree.findall("*/outline/[@type='rss']")
                     if node.get('xmlUrl') is not None]:
            self.addFeed(feed)
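
    # For reference, parseOpmlFile() picks up feeds from OPML shaped roughly like the
    # hypothetical example below: <outline type="rss"> nodes carrying an xmlUrl attribute,
    # placed directly inside <body> (or any other element directly under the OPML root).
    #
    #   <opml version="1.0">
    #     <body>
    #       <outline type="rss" text="Some Podcast" xmlUrl="http://example.com/feed.xml"/>
    #     </body>
    #   </opml>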
    def processFeeds(self):
        if self.verbose > 0 and self.update:
            print("Updating archive")

        for feed in self.feedlist:
            if self.verbose > 0:
                print("\nDownloading archive for: " + feed)
            linklist = self.processPodcastLink(feed)
            self.downloadPodcastFiles(linklist)

        if self.verbose > 0:
            print("\nDone.")
    def parseGlobalFeedInfo(self, feedobj=None):
        if feedobj is None:
            feedobj = self._feedobj

        self._feed_info_dict = {}
        if 'feed' in feedobj:
            for key in self._global_info_keys:
                self._feed_info_dict['feed_' + key] = feedobj['feed'].get(key, None)

        return self._feed_info_dict
    @staticmethod
    def slugifyString(filename):
        # Normalize to ASCII, drop characters that tend to upset filesystems,
        # and collapse runs of whitespace and dashes into single dashes
        filename = unicodedata.normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = re.sub(r'[^\w\s\-.]', '', filename.decode('ascii')).strip()
        filename = re.sub(r'[-\s]+', '-', filename)
        return filename
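
    # Illustrative example of the slugification above (hypothetical input):
    #   slugifyString("Episode #42: Füße.mp3")  ->  "Episode-42-Fue.mp3"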
    def linkToTargetFilename(self, link, must_have_ext=False):
        # Remove HTTP GET parameters from the filename by parsing the URL properly
        linkpath = urlparse(link).path
        basename = path.basename(linkpath)
        _, ext = path.splitext(basename)
        if must_have_ext and not ext:
            return None

        # If requested, slugify the filename
        if self.slugify:
            basename = PodcastArchiver.slugifyString(basename)
            self._feed_title = PodcastArchiver.slugifyString(self._feed_title)
        else:
            # str.replace returns a new string, so the results must be reassigned
            basename = basename.replace(path.pathsep, '_').replace(path.sep, '_')
            self._feed_title = self._feed_title.replace(path.pathsep, '_').replace(path.sep, '_')

        # Generate the local target path
        if self.subdirs:
            filename = path.join(self.savedir, self._feed_title, basename)
        else:
            filename = path.join(self.savedir, basename)

        return filename
    def parseFeedToNextPage(self, feedobj=None):
        if feedobj is None:
            feedobj = self._feedobj

        # Assuming there will only be one link declared as 'next'
        self._feed_next_page = [link['href'] for link in feedobj['feed']['links']
                                if link['rel'] == 'next']
        if len(self._feed_next_page) > 0:
            self._feed_next_page = self._feed_next_page[0]
        else:
            self._feed_next_page = None

        return self._feed_next_page
    def parseFeedToLinks(self, feed=None):
        if feed is None:
            feed = self._feedobj

        # Try different feed episode layouts: 'items' or 'entries'
        episodeList = feed.get('items', False) or feed.get('entries', False)
        if episodeList:
            linklist = [self.parseEpisode(episode) for episode in episodeList]
            linklist = [link for link in linklist if len(link) > 0]
        else:
            linklist = []

        return linklist
    def parseEpisode(self, episode):
        url = None
        episode_info = {}
        for link in episode['links']:
            if 'type' in link.keys():
                if link['type'].startswith('audio') or link['type'].startswith('video'):
                    url = link['href']

        if url is not None:
            for key in self._episode_info_keys + self._date_keys:
                episode_info[key] = episode.get(key, None)
            episode_info['url'] = url

        return episode_info
    def processPodcastLink(self, link):
        if self.verbose > 0:
            print("1. Gathering link list ...", end="", flush=True)

        self._feed_title = None
        self._feed_next_page = link
        first_page = True
        linklist = []
        while self._feed_next_page is not None:
            if self.verbose > 0:
                print(".", end="", flush=True)

            self._feedobj = feedparser.parse(self._feed_next_page)

            # Escape improper feed-URL
            if 'status' in self._feedobj.keys() and self._feedobj['status'] >= 400:
                print("\nQuery returned HTTP error", self._feedobj['status'])
                return None

            # Escape malformatted XML
            if self._feedobj['bozo'] == 1:
                # If the character encoding is wrong, continue as long as the reparsing succeeded
                if type(self._feedobj['bozo_exception']) is not CharacterEncodingOverride:
                    print('\nDownloaded feed is malformatted on', self._feed_next_page)
                    return None

            if first_page:
                self.parseGlobalFeedInfo()
                first_page = False

            # Parse the feed object for episodes and the next page
            linklist += self.parseFeedToLinks(self._feedobj)
            self._feed_next_page = self.parseFeedToNextPage(self._feedobj)

            if self._feed_title is None:
                self._feed_title = self._feedobj['feed']['title']

            numberOfLinks = len(linklist)

            # On given option, run an update: break at the first existing episode
            if self.update:
                for index, episode_dict in enumerate(linklist):
                    link = episode_dict['url']
                    filename = self.linkToTargetFilename(link)

                    if path.isfile(filename):
                        del linklist[index:]
                        break
                numberOfLinks = len(linklist)

            # On given option, crop linklist to the maximum number of episodes
            if self.maximumEpisodes is not None and self.maximumEpisodes < numberOfLinks:
                linklist = linklist[0:self.maximumEpisodes]
                numberOfLinks = self.maximumEpisodes

            if self.maximumEpisodes is not None or self.update:
                break

        linklist.reverse()

        if self.verbose > 0:
            print(" %d episodes" % numberOfLinks)

        if self.verbose > 2:
            import json
            print('Feed info:\n%s\n' % json.dumps(self._feed_info_dict, indent=2))

        return linklist
    def downloadPodcastFiles(self, linklist):
        if linklist is None or self._feed_title is None:
            return

        nlinks = len(linklist)
        if nlinks > 0:
            if self.verbose == 1:
                print("2. Downloading content ... ", end="")
            elif self.verbose > 1:
                print("2. Downloading content ...")

        for cnt, episode_dict in enumerate(linklist):
            link = episode_dict['url']
            if self.verbose == 1:
                print("\r2. Downloading content ... {0}/{1}"
                      .format(cnt + 1, nlinks), end="", flush=True)
            elif self.verbose > 1:
                print("\n\tDownloading file no. {0}/{1}:\n\t{2}"
                      .format(cnt + 1, nlinks, link))

            if self.verbose > 2:
                import json
                print('\tEpisode info:')
                for key in episode_dict.keys():
                    print("\t * %10s: %s" % (key, episode_dict[key]))

            # Check existence once ...
            filename = self.linkToTargetFilename(link)

            if self.verbose > 1:
                print("\tLocal filename:", filename)

            if path.isfile(filename):
                if self.verbose > 1:
                    print("\t✓ Already exists.")
                continue

            # Begin downloading
            prepared_request = Request(link, headers=self._headers)
            try:
                with urlopen(prepared_request) as response:

                    # Check existence another time, with the resolved link
                    link = response.geturl()
                    total_size = int(response.getheader('content-length', '0'))
                    new_filename = self.linkToTargetFilename(link, must_have_ext=True)

                    if new_filename and new_filename != filename:
                        filename = new_filename
                        if self.verbose > 1:
                            print("\tResolved filename:", filename)

                        if path.isfile(filename):
                            if self.verbose > 1:
                                print("\t✓ Already exists.")
                            continue

                    # Create the subdir, if it does not exist
                    makedirs(path.dirname(filename), exist_ok=True)

                    if self.progress and total_size > 0:
                        from tqdm import tqdm
                        with tqdm(total=total_size, unit='B',
                                  unit_scale=True, unit_divisor=1024) as progress_bar:
                            with open(filename, 'wb') as outfile:
                                self.prettyCopyfileobj(response, outfile,
                                                       callback=progress_bar.update)
                    else:
                        with open(filename, 'wb') as outfile:
                            copyfileobj(response, outfile)

                if self.verbose > 1:
                    print("\t✓ Download successful.")
            except (urllib.error.HTTPError,
                    urllib.error.URLError) as error:
                if self.verbose > 1:
                    print("\t✗ Download failed. Query returned '%s'" % error)
            except KeyboardInterrupt:
                if self.verbose > 0:
                    print("\n\t✗ Unexpected interruption. Deleting unfinished file.")
                remove(filename)
                raise
    def prettyCopyfileobj(self, fsrc, fdst, callback, block_size=8 * 1024):
        while True:
            buf = fsrc.read(block_size)
            if not buf:
                break
            fdst.write(buf)
            callback(len(buf))


if __name__ == "__main__":
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-o', '--opml', action='append', type=argparse.FileType('r'),
                            help='''Provide an OPML file (as exported by many other podcatchers)
                            containing your feeds. The parameter can be used multiple
                            times, once for every OPML file.''')
        parser.add_argument('-f', '--feed', action='append',
                            help='''Add a feed URL to the archiver. The parameter can be used
                            multiple times, once for every feed.''')
        parser.add_argument('-d', '--dir', action=writeable_dir,
                            help='''Set the output directory of the podcast archive.''')
        parser.add_argument('-s', '--subdirs', action='store_true',
                            help='''Place downloaded podcasts in separate subdirectories per
                            podcast (named with their title).''')
        parser.add_argument('-u', '--update', action='store_true',
                            help='''Force the archiver to only update the feeds with newly added
                            episodes. As soon as the first old episode is found in the
                            download directory, further downloading is interrupted.''')
        parser.add_argument('-v', '--verbose', action='count',
                            help='''Increase the level of verbosity while downloading.''')
        parser.add_argument('-p', '--progress', action='store_true',
                            help='''Show progress bars while downloading episodes.''')
        parser.add_argument('-S', '--slugify', action='store_true',
                            help='''Clean all folders and filenames of potentially weird
                            characters that might cause trouble with one or another
                            target filesystem.''')
        parser.add_argument('-m', '--max-episodes', type=int,
                            help='''Only download the given number of episodes per podcast
                            feed. Useful if you don't really need the entire backlog.''')

        args = parser.parse_args()

        pa = PodcastArchiver()
        pa.addArguments(args)
        pa.processFeeds()
    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')
    except FileNotFoundError as error:
        sys.exit('\nERROR: %s' % error)
    except ArgumentTypeError as error:
        sys.exit('\nERROR: Your config is invalid: %s' % error)
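
# Example invocations (sketch; assumes the script is saved as podcast_archiver.py,
# and the feed URL and paths below are placeholders):
#   ./podcast_archiver.py -f http://example.com/feed.xml -d ~/podcasts -s -p -v
#   ./podcast_archiver.py -o subscriptions.opml -d ~/podcasts -u -m 10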