#!/usr/bin/env python3
"""
Podcast Archiver v0.3: Feed parser for local podcast archive creation

Copyright (c) 2014-2017 Jan Willhaus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import sys
import argparse
from argparse import ArgumentTypeError
import re
import unicodedata
import urllib.error
import xml.etree.ElementTree as etree
from os import path, remove, makedirs, access, W_OK
from shutil import copyfileobj
from urllib.parse import urlparse
from urllib.request import urlopen, Request

import feedparser
from feedparser import CharacterEncodingOverride


class writeable_dir(argparse.Action):
    """Argparse action that verifies the given value is an existing, writable directory."""

    def __call__(self, parser, namespace, values, option_string=None):
        prospective_dir = values
        if not path.isdir(prospective_dir):
            raise ArgumentTypeError("%s is not a valid path" % prospective_dir)
        if access(prospective_dir, W_OK):
            setattr(namespace, self.dest, prospective_dir)
        else:
            raise ArgumentTypeError("%s is not a writeable dir" % prospective_dir)
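
# Note: argparse instantiates an Action subclass passed as `action=` and
# invokes it with the parsed value, so -d/--dir is validated the moment it
# is parsed (see the argument definitions in __main__ below).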


class PodcastArchiver:
    _feed_title = ''
    _feedobj = None
    _feed_info_dict = {}

    _userAgent = 'Podcast-Archiver/0.4 (https://github.com/janwh/podcast-archiver)'
    _headers = {'User-Agent': _userAgent}
    _global_info_keys = ['author', 'language', 'link', 'subtitle', 'title', ]
    _episode_info_keys = ['author', 'link', 'subtitle', 'title', ]
    _date_keys = ['published', ]

    savedir = ''
    verbose = 0
    subdirs = False
    update = False
    progress = False
    maximumEpisodes = None

    feedlist = []

    def __init__(self):
        feedparser.USER_AGENT = self._userAgent

    def addArguments(self, args):
        self.verbose = args.verbose or 0
        if self.verbose > 2:
            print('Input arguments:', args)

        for feed in (args.feed or []):
            self.addFeed(feed)

        for opml in (args.opml or []):
            self.parseOpmlFile(opml)

        if args.dir:
            self.savedir = args.dir

        self.subdirs = args.subdirs
        self.update = args.update
        self.progress = args.progress
        self.slugify = args.slugify
        self.maximumEpisodes = args.max_episodes or None

        if self.verbose > 1:
            print("Verbose level: ", self.verbose)

    def addFeed(self, feed):
        if path.isfile(feed):
            # An existing file is treated as a newline-separated list of feed URLs
            with open(feed, 'r') as feedfile:
                self.feedlist += feedfile.read().strip().splitlines()
        else:
            self.feedlist.append(feed)

    def parseOpmlFile(self, opml):
        with opml as file:
            tree = etree.fromstringlist(file)

        for feed in [node.get('xmlUrl') for node
                     in tree.findall("*/outline/[@type='rss']")
                     if node.get('xmlUrl') is not None]:
            self.addFeed(feed)
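
    # For reference, a minimal OPML shape the XPath above matches
    # (illustrative example, not from the original source; real podcatcher
    # exports vary but generally carry one <outline type="rss" xmlUrl="...">
    # per subscribed feed):
    #
    #   <opml version="2.0">
    #     <body>
    #       <outline type="rss" text="Some Show"
    #                xmlUrl="http://example.org/feed.xml"/>
    #     </body>
    #   </opml>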

    def processFeeds(self):
        if self.verbose > 0 and self.update:
            print("Updating archive")

        for feed in self.feedlist:
            if self.verbose > 0:
                print("\nDownloading archive for: " + feed)
            linklist = self.processPodcastLink(feed)
            self.downloadPodcastFiles(linklist)

        if self.verbose > 0:
            print("\nDone.")

    def parseGlobalFeedInfo(self, feedobj=None):
        if feedobj is None:
            feedobj = self._feedobj

        self._feed_info_dict = {}
        if 'feed' in feedobj:
            for key in self._global_info_keys:
                self._feed_info_dict['feed_' + key] = feedobj['feed'].get(key, None)

        return self._feed_info_dict

    @staticmethod
    def slugifyString(filename):
        # Transliterate to ASCII, drop anything that is not a word character,
        # whitespace, hyphen, or dot, then collapse whitespace/hyphen runs
        filename = unicodedata.normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = re.sub(r'[^\w\s\-\.]', '', filename.decode('ascii')).strip()
        filename = re.sub(r'[-\s]+', '-', filename)
        return filename
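
    # A quick sanity check of the slugifier (illustrative values):
    #
    #   PodcastArchiver.slugifyString('My Podcast: Episode 1.mp3')
    #   # -> 'My-Podcast-Episode-1.mp3'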

    def linkToTargetFilename(self, link, must_have_ext=False):
        # Remove HTTP GET parameters from filename by parsing URL properly
        linkpath = urlparse(link).path
        basename = path.basename(linkpath)
        _, ext = path.splitext(basename)
        if must_have_ext and not ext:
            return None

        # If requested, slugify the filename
        if self.slugify:
            basename = PodcastArchiver.slugifyString(basename)
            self._feed_title = PodcastArchiver.slugifyString(self._feed_title)
        else:
            # str.replace returns a new string; reassign to keep the sanitized name
            basename = basename.replace(path.pathsep, '_').replace(path.sep, '_')
            self._feed_title = self._feed_title.replace(path.pathsep, '_').replace(path.sep, '_')

        # Generate the local target path
        if self.subdirs:
            filename = path.join(self.savedir, self._feed_title, basename)
        else:
            filename = path.join(self.savedir, basename)

        return filename
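
    # Example of the GET-parameter stripping above (illustrative URL):
    #
    #   urlparse('http://example.org/episodes/ep1.mp3?token=abc').path
    #   # -> '/episodes/ep1.mp3', so basename becomes 'ep1.mp3'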

    def parseFeedToNextPage(self, feedobj=None):
        if feedobj is None:
            feedobj = self._feedobj

        # Assuming there will only be one link declared as 'next'
        self._feed_next_page = [link['href'] for link in feedobj['feed']['links']
                                if link['rel'] == 'next']
        if len(self._feed_next_page) > 0:
            self._feed_next_page = self._feed_next_page[0]
        else:
            self._feed_next_page = None

        return self._feed_next_page
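
    # Paged feeds advertise their continuation with an RFC 5005 style
    # <atom:link rel="next" href="..."/> element; feedparser surfaces these
    # under feed['links'], which is what the list comprehension above scans.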

    def parseFeedToLinks(self, feed=None):
        if feed is None:
            feed = self._feedobj

        # Try different feed episode layouts: 'items' or 'entries'
        episodeList = feed.get('items', False) or feed.get('entries', False)
        if episodeList:
            linklist = [self.parseEpisode(episode) for episode in episodeList]
            linklist = [link for link in linklist if len(link) > 0]
        else:
            linklist = []

        return linklist

    def parseEpisode(self, episode):
        url = None
        episode_info = {}
        for link in episode['links']:
            if 'type' in link.keys():
                if link['type'].startswith(('audio', 'video')):
                    url = link['href']

        if url is not None:
            for key in self._episode_info_keys + self._date_keys:
                episode_info[key] = episode.get(key, None)
            episode_info['url'] = url

        return episode_info
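
    # feedparser maps RSS <enclosure> tags into each entry's 'links' list
    # (rel='enclosure', carrying the enclosure's MIME type), so checking the
    # 'type' prefix above picks out the actual media file among the entry's
    # ordinary web links.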

    def processPodcastLink(self, link):
        if self.verbose > 0:
            print("1. Gathering link list ...", end="", flush=True)

        self._feed_title = None
        self._feed_next_page = link
        first_page = True
        linklist = []
        while self._feed_next_page is not None:
            if self.verbose > 0:
                print(".", end="", flush=True)

            self._feedobj = feedparser.parse(self._feed_next_page)

            # Escape improper feed-URL
            if 'status' in self._feedobj.keys() and self._feedobj['status'] >= 400:
                print("\nQuery returned HTTP error", self._feedobj['status'])
                return None

            # Escape malformatted XML. If only the character encoding is
            # wrong, continue as long as the reparsing succeeded.
            if self._feedobj['bozo'] == 1:
                if type(self._feedobj['bozo_exception']) is not CharacterEncodingOverride:
                    print('\nDownloaded feed is malformatted on', self._feed_next_page)
                    return None

            if first_page:
                self.parseGlobalFeedInfo()
                first_page = False

            # Parse the feed object for episodes and the next page
            linklist += self.parseFeedToLinks(self._feedobj)
            self._feed_next_page = self.parseFeedToNextPage(self._feedobj)
            if self._feed_title is None:
                self._feed_title = self._feedobj['feed']['title']

            numberOfLinks = len(linklist)

            # On given option, run an update: break at the first episode that
            # already exists locally and drop it and everything older
            if self.update:
                for index, episode_dict in enumerate(linklist):
                    link = episode_dict['url']
                    filename = self.linkToTargetFilename(link)
                    if path.isfile(filename):
                        del linklist[index:]
                        break
                numberOfLinks = len(linklist)

            # On given option, crop linklist to the maximum number of episodes
            if self.maximumEpisodes is not None and self.maximumEpisodes < numberOfLinks:
                linklist = linklist[0:self.maximumEpisodes]
                numberOfLinks = self.maximumEpisodes

            # In update or max-episodes mode there is no need to walk
            # further pages of the feed
            if self.maximumEpisodes is not None or self.update:
                break

        linklist.reverse()
        if self.verbose > 0:
            print(" %d episodes" % numberOfLinks)

        if self.verbose > 2:
            import json
            print('Feed info:\n%s\n' % json.dumps(self._feed_info_dict, indent=2))

        return linklist

    def downloadPodcastFiles(self, linklist):
        if linklist is None or self._feed_title is None:
            return

        nlinks = len(linklist)
        if nlinks > 0:
            if self.verbose == 1:
                print("2. Downloading content ... ", end="")
            elif self.verbose > 1:
                print("2. Downloading content ...")

        for cnt, episode_dict in enumerate(linklist):
            link = episode_dict['url']
            if self.verbose == 1:
                print("\r2. Downloading content ... {0}/{1}"
                      .format(cnt + 1, nlinks), end="", flush=True)
            elif self.verbose > 1:
                print("\n\tDownloading file no. {0}/{1}:\n\t{2}"
                      .format(cnt + 1, nlinks, link))

                if self.verbose > 2:
                    print('\tEpisode info:')
                    for key in episode_dict.keys():
                        print("\t * %10s: %s" % (key, episode_dict[key]))

            # Check existence once ...
            filename = self.linkToTargetFilename(link)

            if self.verbose > 1:
                print("\tLocal filename:", filename)

            if path.isfile(filename):
                if self.verbose > 1:
                    print("\t✓ Already exists.")
                continue

            # Begin downloading
            prepared_request = Request(link, headers=self._headers)
            try:
                with urlopen(prepared_request) as response:
                    # Check existence another time, with resolved link
                    link = response.geturl()
                    total_size = int(response.getheader('content-length', '0'))
                    new_filename = self.linkToTargetFilename(link, must_have_ext=True)

                    if new_filename and new_filename != filename:
                        filename = new_filename
                        if self.verbose > 1:
                            print("\tResolved filename:", filename)

                        if path.isfile(filename):
                            if self.verbose > 1:
                                print("\t✓ Already exists.")
                            continue

                    # Create the subdir, if it does not exist
                    makedirs(path.dirname(filename), exist_ok=True)

                    if self.progress and total_size > 0:
                        from tqdm import tqdm
                        with tqdm(total=total_size, unit='B',
                                  unit_scale=True, unit_divisor=1024) as progress_bar:
                            with open(filename, 'wb') as outfile:
                                self.prettyCopyfileobj(response, outfile,
                                                       callback=progress_bar.update)
                    else:
                        with open(filename, 'wb') as outfile:
                            copyfileobj(response, outfile)

                if self.verbose > 1:
                    print("\t✓ Download successful.")
            except (urllib.error.HTTPError,
                    urllib.error.URLError) as error:
                if self.verbose > 1:
                    print("\t✗ Download failed. Query returned '%s'" % error)
            except KeyboardInterrupt:
                if self.verbose > 0:
                    print("\n\t✗ Unexpected interruption. Deleting unfinished file.")
                remove(filename)
                raise

    def prettyCopyfileobj(self, fsrc, fdst, callback, block_size=8 * 1024):
        # Like shutil.copyfileobj, but reports each written chunk to the
        # given callback so a progress bar can be advanced
        while True:
            buf = fsrc.read(block_size)
            if not buf:
                break
            fdst.write(buf)
            callback(len(buf))
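
# Typical invocations (illustrative; the URL and paths are placeholders):
#
#   ./podcast_archiver.py -f http://example.org/feed.xml -d ~/podcasts -s -p -v
#   ./podcast_archiver.py -o subscriptions.opml -d ~/podcasts -u
#
# The first archives a single feed into per-podcast subdirectories with a
# progress bar; the second updates an existing archive with the new episodes
# of all feeds listed in an OPML export.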


if __name__ == "__main__":
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-o', '--opml', action='append', type=argparse.FileType('r'),
                            help='''Provide an OPML file (as exported by many other podcatchers)
                            containing your feeds. The parameter can be used multiple
                            times, once for every OPML file.''')
        parser.add_argument('-f', '--feed', action='append',
                            help='''Add a feed URL to the archiver. The parameter can be used
                            multiple times, once for every feed.''')
        parser.add_argument('-d', '--dir', action=writeable_dir,
                            help='''Set the output directory of the podcast archive.''')
        parser.add_argument('-s', '--subdirs', action='store_true',
                            help='''Place downloaded podcasts in separate subdirectories per
                            podcast (named with their title).''')
        parser.add_argument('-u', '--update', action='store_true',
                            help='''Force the archiver to only update the feeds with newly added
                            episodes. As soon as the first old episode is found in the
                            download directory, further downloading is interrupted.''')
        parser.add_argument('-v', '--verbose', action='count',
                            help='''Increase the level of verbosity while downloading.''')
        parser.add_argument('-p', '--progress', action='store_true',
                            help='''Show progress bars while downloading episodes.''')
        parser.add_argument('-S', '--slugify', action='store_true',
                            help='''Clean all folder and file names of potentially weird
                            characters that might cause trouble with one or another
                            target filesystem.''')
        parser.add_argument('-m', '--max-episodes', type=int,
                            help='''Only download the given number of episodes per podcast
                            feed. Useful if you don't really need the entire backlog.''')

        args = parser.parse_args()

        pa = PodcastArchiver()
        pa.addArguments(args)
        pa.processFeeds()
    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')
    except FileNotFoundError as error:
        sys.exit('\nERROR: %s' % error)
    except ArgumentTypeError as error:
        sys.exit('\nERROR: Your config is invalid: %s' % error)