bobain Feb 9 2009 at 18:16

Python, подкасты и велосипеды.

4 min

848

В ходе подготовки ПК к роли домашнего сервера постоянно пытаюсь отказаться от всяких GUI. Дошли руки и до подкастов. Hpodder не вызвал восторга: ломал теги в файлах. Вот и был написан свой велосипед на Python.

Получилось два файла.
Файл настроек.

# -*- coding: utf8 -*-

# Файл настроек для подкаст скрипта.

[settings]

download_dir = /home/bobathecar/scriptForPodcast/episodes/

download_episodes = 1



[podcasts]

budam = budam.rpod.ru/rss.xml

Василий Стрельников = vasilysweekend.rpod.ru/rss.xml

Umputun = feeds.feedburner.com/Umputun

Сергей Петренко = it-thoughts.rpod.ru/rss.xml

iAnime = i-anime.rpod.ru/rss.xml

Radio-T = feeds.feedburner.com/Radio-t

Название подкаста (ключ в секции [podcasts]) будет записано в тег artist.
И, собственно, сам скрипт:

#!/usr/bin/env python
# -*- coding: utf8 -*-
# Скрипт для скачивания последних подкастов 
# Из файла со списком urlов

from xml.dom import minidom
from UserDict import UserDict
import time, urllib, sys, os, re

class Podcast(UserDict):
    """Single podcast episode stored as a dict-like record.

    Keys maintained by this class:
      "linkToFile"   -- URL of the media file, "none" until one is assigned
      "fileType"     -- MIME type of the media, "audio/mpeg" by default
      "path_to_file" -- local path chosen by download(), "none" before that
    Callers are free to store any additional keys.
    """

    # Maps the MIME type reported by the server to the saved file's extension.
    extentionsDict = {
        "audio/mpeg": "mp3",
        "video/mp4":  "mp4",
        "video/quicktime": "mov",
        "video/x-ms-wmv": "wmv"
    }

    # Number of bytes copied per read, so an episode is never held in memory
    # as a whole.
    _chunkSize = 64 * 1024

    def __init__(self):
        UserDict.__init__(self)
        self["linkToFile"] = "none"
        self["fileType"] = "audio/mpeg"
        # Set up front so rewriteTags() can be called safely before download().
        self["path_to_file"] = "none"

    @staticmethod
    def _utf8(text):
        """Return text unchanged if it is a str, otherwise UTF-8 encode it.

        Keeps unicode values from being implicitly ASCII-decoded when they are
        mixed into the non-ASCII byte-string messages printed below.
        """
        if isinstance(text, str):
            return text
        return text.encode("utf8")

    @staticmethod
    def _mediaTypeFromHeaders(headerText):
        """Return the lower-cased type/subtype from a Content-Type header, or None."""
        # The subtype may contain '-', '.' and '+' (e.g. video/x-ms-wmv).
        found = re.search(r"^Content-Type:\s*([\w.+-]+/[\w.+-]+)",
                          headerText, re.IGNORECASE | re.MULTILINE)
        if found is None:
            return None
        return found.group(1).lower()

    @staticmethod
    def _contentLengthFromHeaders(headerText):
        """Return Content-Length as a float, or None unless exactly one is present."""
        found = re.findall(r"^Content-Length:\s*(\d+)",
                           headerText, re.IGNORECASE | re.MULTILINE)
        if len(found) != 1:
            return None
        return float(found[0])

    def _stripMediaExtension(self, fileName):
        """Drop a trailing known media extension from fileName, nothing else.

        A plain os.path.splitext() would also cut "Episode 1.5_09-02-09" at
        the dot and lose the rest of the name.
        """
        base, extension = os.path.splitext(fileName)
        if extension.lstrip(os.path.extsep).lower() in self.extentionsDict.values():
            return base
        return fileName

    def _extensionFor(self, mediaType):
        """Return the file extension for mediaType, or None if it is unknown.

        For a MIME type missing from extentionsDict the extension of the
        download URL is used, but only when it is one of the known ones.
        """
        known = self.extentionsDict.get(mediaType)
        if known is not None:
            return known
        urlPath = self["linkToFile"].split("?", 1)[0].split("#", 1)[0]
        candidate = os.path.splitext(urlPath)[1].lstrip(os.path.extsep).lower()
        if candidate in self.extentionsDict.values():
            return candidate
        return None

    def download(self, *args, **kwargs):
        """Download the episode into a local file.

        Accepted call forms:
          download(file)
          download(dir, file)
          download(file=..., dir=...)   # dir is optional
        dir defaults to the current directory and is created if missing. A
        known media extension on file is replaced by the one matching the
        server's Content-Type.

        The download is skipped when a file already exists at the target path
        and its size is within 10% of the server's Content-Length.

        Returns the path of the local file (downloaded or already present),
        or None when the episode could not be saved.

        Raises:
          LinkNotFound -- "linkToFile" was never set.
          TypeError    -- no file name was given.
          IOError      -- the URL could not be opened.
        """
        # Make sure there is something to download.
        if self["linkToFile"] == "none":
            raise LinkNotFound

        # The output directory defaults to the current directory.
        outputDir = os.path.abspath(os.path.curdir)
        fileName = None
        if len(args) == 1:
            # Only the file name.
            fileName = "%s" % args[0]
        elif len(args) == 2:
            # Output directory and file name.
            outputDir = "%s" % args[0]
            fileName = "%s" % args[1]
        elif "file" in kwargs:
            fileName = kwargs["file"]
            if "dir" in kwargs:
                outputDir = kwargs["dir"]
        if fileName is None:
            # Fail before any network or disk activity happens.
            raise TypeError("download() needs a file name: download(file), "
                            "download(dir, file) or download(file=..., dir=...)")

        fileName = self._stripMediaExtension(fileName)
        # Strip trailing separators, but keep a bare root directory intact.
        outputDir = outputDir.rstrip(os.path.sep) or outputDir

        # Create the output directory, including missing parents.
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        u = urllib.urlopen(self["linkToFile"])
        try:
            # urlopen() can hand back an HTTP error page instead of raising;
            # getcode() is absent on old Python versions, hence the getattr.
            status = getattr(u, "getcode", lambda: None)()
            if status is not None and status >= 400:
                print("Не смогли скачать подкаст")
                return None

            # Response headers as text.
            strInfo = str(u.info())
            mediaType = self._mediaTypeFromHeaders(strInfo)
            if mediaType is not None:
                self["fileType"] = mediaType

            extension = self._extensionFor(self["fileType"])
            if extension is None:
                print("Неизвестный тип файла %s, пропускаем"
                      % self._utf8(self["fileType"]))
                return None

            # Build the full path of the target file.
            path = os.path.join(outputDir,
                                fileName + os.path.extsep + extension)
            self["path_to_file"] = path

            # Skip episodes that are already on disk: the existing file must
            # be within +-10% of the size announced by the server.
            expected = self._contentLengthFromHeaders(strInfo)
            if expected is not None and os.path.exists(path):
                if expected * 0.9 < float(os.path.getsize(path)) < expected * 1.1:
                    print("Подкаст %s уже скачан, пропускаем" % self._utf8(path))
                    return path

            try:
                f = open(path, "wb")
                try:
                    while True:
                        chunk = u.read(self._chunkSize)
                        if not chunk:
                            break
                        f.write(chunk)
                finally:
                    f.close()
            except IOError:
                print("Не смогли скачать подкаст")
                return None
            return path
        finally:
            # Release the connection on every exit path.
            u.close()

    def rewriteTags(self, **kwargs):
        """Overwrite tags of the downloaded file; pass them as tag=value.

        Only "audio/mpeg" files are handled (through the mutagen library).

        Returns True when the tags were saved, False when there is no
        downloaded file, the file type is not supported or the file could not
        be read.
        """
        path = self["path_to_file"]
        if path == "none":
            return False
        if self["fileType"] != "audio/mpeg":
            return False

        # mutagen is imported lazily so the class stays usable without it.
        from mutagen.easyid3 import EasyID3
        from mutagen.mp3 import MP3
        import mutagen.id3
        try:
            tags = MP3(path, ID3=EasyID3)
            try:
                tags.add_tags(ID3=EasyID3)
            except mutagen.id3.error:
                # Presumably raised when the file already carries a tag
                # block; the existing one is reused in that case.
                pass

            for tag, value in kwargs.items():
                tags[tag] = value
                print(self._utf8("%s = %s" % (tag, value)))
            tags.save()
        except IOError:
            print("Нет такого файла %s" % self._utf8(path))
            return False
        return True
class LinkNotFound(Exception):
    """Raised by Podcast.download() when "linkToFile" still holds "none"."""

def _getFirstNodeOrNoneByName(obj, tag):
    els = obj.getElementsByTagName(tag)
    if len(els) > 0:
        return els[0].firstChild.data
    else:
        return "unknown"

if __name__ == "__main__":
    # Script entry point: read ./gpds.conf, fetch every configured RSS feed,
    # then download and re-tag the newest episodes of each one. A broken feed
    # or episode is reported and skipped instead of aborting the whole run.
    from ConfigParser import SafeConfigParser, Error as ConfigError
    from xml.parsers.expat import ExpatError

    # Read the configuration file.
    config = SafeConfigParser()
    # ConfigParser lower-cases option names by default; the podcast names in
    # [podcasts] end up in the artist tag, so keep them exactly as written.
    config.optionxform = str
    config.read("./gpds.conf")
    try:
        download_episodes = int(config.get("settings", "download_episodes"))
    except (ConfigError, ValueError):
        # Option missing or not a number: fetch only the latest episode.
        download_episodes = 1
    try:
        download_dir = config.get("settings", "download_dir")
    except ConfigError:
        download_dir = os.path.abspath(os.curdir)

    try:
        links = config.items("podcasts")
    except ConfigError:
        print("Не смогли прочитать секцию [podcasts] из ./gpds.conf: %s"
              % sys.exc_info()[1])
        links = []

    # Matches the "Mon, 09 Feb 2009 18:16:00" part of an RSS pubDate.
    timePattern = re.compile(r"\w{2,6}, \d\d \w{2,6} \d{2,4} \d\d:\d\d:\d\d")

    # Fetch and parse the podcast feeds.
    for (author, rssLink) in links:
        # urllib treats a URL without a scheme as a local file name.
        if "://" not in rssLink:
            rssLink = "http://" + rssLink
        try:
            rss = urllib.urlopen(rssLink)
            try:
                rssXML = minidom.parseString(rss.read())
            finally:
                rss.close()
        except (IOError, ExpatError):
            print("Не смогли получить ленту %s: %s"
                  % (rssLink, sys.exc_info()[1]))
            continue

        items = rssXML.getElementsByTagName("item")
        for lastXMLPodcast in items[:download_episodes]:
            enclosures = lastXMLPodcast.getElementsByTagName("enclosure")
            if not enclosures:
                # An item without an attached media file has nothing to fetch.
                continue

            p = Podcast()
            p["title"] = _getFirstNodeOrNoneByName(lastXMLPodcast, "title")
            p["author"] = author.decode("utf8")
            p["linkToFile"] = enclosures[0].getAttribute("url")

            pubDate = _getFirstNodeOrNoneByName(lastXMLPodcast, "pubDate")
            try:
                p["pubDate"] = time.strftime(
                    "%d-%m-%y",
                    time.strptime(timePattern.findall(pubDate)[0],
                                  "%a, %d %b %Y %H:%M:%S"))
            except (IndexError, ValueError):
                # No parsable publication date: stamp with today's date.
                p["pubDate"] = "_downloaded_at_%s" % \
                    time.strftime("%d-%m-%y", time.gmtime())

            print("Качаем %s - %s от %s" % (
                p["author"].encode("utf8"),
                p["title"].encode("utf8"),
                p["pubDate"].encode("utf8")))

            # A path separator inside the title would point into a
            # directory that does not exist.
            safeTitle = p["title"].replace(os.path.sep, "_")
            try:
                p.download(os.path.join(download_dir, p["author"]),
                           "%s_%s" % (safeTitle, p["pubDate"]))
                p.rewriteTags(
                    artist=p["author"],
                    title=p["title"],
                    album=u"Подкаст от %s" % p["author"],
                    genre=u"Podcast")
            except (IOError, OSError):
                print("Не смогли скачать подкаст: %s" % sys.exc_info()[1])

В итоге мой велосипед качает указанное количество последних эпизодов подкастов, переименовывает файлы и переписывает теги. Наконец-то у меня в mp3-плеере будет наведён порядок, и подкасты всегда будут там, где я ожидаю их увидеть.

Hubs:

Lumber room