Commit 7904471f authored by Lysander Trischler's avatar Lysander Trischler

Complete Python rewrite due to API changes

parent 3a4a9008
......@@ -6,4 +6,5 @@
*.pyo
boydl.index
boydl.error
README.html
......@@ -2,13 +2,13 @@
boydl – Best Of YouTube DownLoader
==================================
**boydl** is a Bash script to download all newly available videos using
**boydl** is a Python script to download all newly available videos it can
find in the *bestofyoutube.com* RSS feed, using the Python script
**youtube-dl**. An index is used to remember already downloaded videos, so
only new clips are processed.
Written by Lysander Trischler <software@lyse.isobeef.org> and published
under terms of WTF PL.
under terms of WTFPL.
Installation
============
......@@ -45,7 +45,7 @@ contents and save it on your disk.
Configuration
+++++++++++++
To set up **boydl** just put the ``boydl`` main Bash script it in your
To set up **boydl** just put the ``boydl`` main Python script in your
``PATH``.
If you're a *bestofyoutube.com* user, you may want to ignore your already
......@@ -65,7 +65,7 @@ alias to the **boydl** executable in your shell configuration (e.g.
``~/.zshrc`` or ``~/.bashrc``).
You may want to customize your downloaded video filenames. You can do this
in the ``boydl`` Bash script, simply adjust the ``YOUTUBE_DL_PY_COMMAND``
variable at the very top of the script. Read **youtube-dl** documentation
in the ``boydl`` Python script, simply adjust the ``ydl`` variable somewhere
at the top of the script (line 58 atm). Read **youtube-dl** documentation
to get an idea of what can be done.
#!/bin/bash
# Lysander Trischler <software@lyse.isobeef.org>
# Published under WTF PL

# configure your youtube-dl.py command here
YOUTUBE_DL_PY_COMMAND="youtube-dl -wo $HOME/Desktop/%(stitle)s-%(id)s.%(ext)s"

# the URL of the RSS feed to download and parse
RSS_URL='http://bestofyoutube.com/rss.php'

# the index file for successfully downloaded videos and error log
# index file contains HTML URLs from the RSS feed, error log has
# timestamp, HTML URL and video URL
# NOTE(review): $(dirname $0) is unquoted; a script path containing
# whitespace would word-split here -- confirm before relying on it.
INDEX_FILE="$(dirname $0)/boydl.index"
ERROR_FILE="$(dirname $0)/boydl.error"

# match strings for finding the video URL: the embedded player URL in a
# video's HTML page starts and ends with these fixed strings
VIDEO_URL_START_MATCHER='http://www.youtube.com/v/'
VIDEO_URL_END_MATCHER='&fs=1'

# program metadata, echoed by the version/usage/help functions below
__PROGRAM__='boydl'
__VERSION__='1.0'
__RELEASE__='2011-07-17'
__AUTHOR__='Lysander Trischler <software@lyse.isobeef.org>'
__LICENSE__='WTF PL'
# Print the program name, version and release date on stdout.
version() {
	printf '%s Version %s (%s)\n' "$__PROGRAM__" "$__VERSION__" "$__RELEASE__"
}
# Print a one-line usage summary on stdout.
usage() {
	printf 'Usage: %s [ --html-urls | --version | --help ]\n' "$0"
}
# Print the full help text: version banner, author and license line,
# usage summary and the list of supported command line options.
help() {
	version
	cat <<EOF

Written by $__AUTHOR__.
Published under terms of $__LICENSE__.

$(usage)

Options:
 --html-urls Only print HTML URLs, do not download them.
 --version Display $__PROGRAM__ version and exit.
 --help Display this $__PROGRAM__ help and exit.
EOF
}
# command line option parsing
# HTML_URLS holds the literal string "true" or "false" so that it can
# later be *executed* as a command in `$HTML_URLS && ...`.
HTML_URLS=false
while [ $# -gt 0 ]
do
	case $1 in
	--html-urls) HTML_URLS=true;;
	--version) version; exit;;
	--help) help; exit;;
	*) echo "$0: Error: Unknown option \`$1'!"
	   usage
	   exit 2
	   # no ;; needed on the last branch before esac
	esac
	shift
done
# the interesting work begins here
# download the RSS feed and parse the HTML page URLs
# NOTE: the while loop is the downstream end of a pipeline and
# therefore runs in a subshell; variables set inside it are not
# visible after the loop.
wget -qO - "$RSS_URL" | grep '<link>' | while read LINE
do
	# strip the surrounding <link>...</link> tags to get the bare URL
	HTML_URL=${LINE/<link>/}
	HTML_URL=${HTML_URL/<\/link>/}
	# ignore global RSS description part
	[ "$HTML_URL" == "http://bestofyoutube.com" ] && continue
	# check if only HTML URLs are requested
	$HTML_URLS && {
		echo "$HTML_URL"
		continue
	}
#!/usr/bin/python
# encoding: utf-8
"""
boydl – Best Of Youtube DownLoader
This script fetches the Best Of YouTube RSS feed and downloads new videos.
Via an index every video is downloaded only once.
This is a complete Python rewrite from the former Bash implementation due
to RSS feed migrations.
Usage: boydl [ --help | -h | --version | --html-urls | --youtube-urls ]
Options:
--help Display this boydl's help and exit.
--version Display boydl's version and exit.
--html-urls Only print HTML URLs, do not download them.
--youtube-urls Only print YouTube URLs, do not download them.
"""
# NOTE: the module docstring above doubles as the --help text (the
# argument parser prints __doc__ verbatim), so edits to it change the
# program's user-visible output.

# Standard module metadata; __version__ is echoed by --version.
__author__ = "Lysander Trischler"
__copyright__ = "Copyright 2011, Lysander Trischler"
__license__ = "WTFPL"
__maintainer__ = "Lysander Trischler"
__email__ = "software@lyse.isobeef.org"
__version__ = "2.0"
__version_info__ = (2, 0)
# Only os.path and sys are imported up front -- just enough for option
# parsing; the heavier modules are imported after the arguments have
# been validated.
import os.path
import sys

me = os.path.basename(__file__)  # program name used in all messages

# Flags set by the options below. They are not mutually exclusive in
# the parser itself; --html-urls wins later because it is checked
# first in the download loop.
html_urls = False
youtube_urls = False
for arg in sys.argv[1:]:
    if arg in ("--help", "-h"):
        # the module docstring doubles as the help text
        print(__doc__.strip())
        exit(0)
    elif arg == "--version":
        print("%s Version %s" % (me, __version__))
        exit(0)
    elif arg == "--html-urls":
        html_urls = True
    elif arg == "--youtube-urls":
        youtube_urls = True
    else:
        # unknown option: complain on stderr, exit with status 2
        # (mirrors the old Bash implementation)
        sys.stderr.write("%s: Unknown argument `%s'!\n" % (me, arg))
        exit(2)
# Imports for the actual work, deferred until after option parsing.
# NOTE(review): urllib.urlopen is the Python 2 API; under Python 3
# this would need urllib.request.urlopen.
import xml.dom.minidom
import urllib
import re
import os

# URL of the Best of YouTube RSS feed (served via FeedBurner).
rss_url = 'http://feeds.feedburner.com/bestofyoutubedotcom?format=xml'
# youtube-dl command template; adjust the (quoted) output pattern here
# to customize downloaded filenames (see README).
ydl = 'youtube-dl -wo "' + os.path.expanduser("~") + '/Desktop/%(stitle)s-%(id)s.%(ext)s"'
# extracts the first hyperlink target from an item's HTML description
link_regex = re.compile(r'<a href="(.+?)">')
# extracts the embedded YouTube player URL from a video's HTML page
url_regex = re.compile(r'"(http://www\.youtube\.com/v/.+?)&amp;hl=en_US&amp;fs=1"')
# index of already-downloaded HTML URLs and the failure log, both kept
# next to this script
index_file = os.path.join(os.path.dirname(__file__), "boydl.index")
error_file = os.path.join(os.path.dirname(__file__), "boydl.error")

# only http(s) URLs go through urllib; anything else is treated as a
# local file path
if rss_url.startswith(("http://", "https://")):
    rss_stream = urllib.urlopen(rss_url)
else:
    rss_stream = open(rss_url, "rb")
tree = xml.dom.minidom.parse(rss_stream)
rss_stream.close()
for item in channel.getElementsByTagName("item"):
#
# first try to fetch the URL from the description, to bypass feed proxy
#
url = None
m = link_regex.search(item.getElementsByTagName("description")[0].firstChild.data)
if m:
url =m.group(1)
#
# if description link cannot be parsed use feed proxy's URL
#
if url is None:
print("%s: Cannot extract direct link to Best of YouTube website. Using feed proxy's URL." % me)
url = item.getElementsByTagName("link")[0].firstChild.data
# ignore already downloaded videos
[ -e "$INDEX_FILE" ] && grep -q "$HTML_URL" "$INDEX_FILE" && continue
# download HTML page to extract video URL
echo -n "Obtaining video URL for '$HTML_URL'... "
VIDEO_URL=$(wget -qO - "$HTML_URL" | grep "$VIDEO_URL_START_MATCHER" | grep "$VIDEO_URL_END_MATCHER") # NOTE: second grep unnecessary
VIDEO_URL=$(python -c "print '$VIDEO_URL'['$VIDEO_URL'.index('$VIDEO_URL_START_MATCHER'):'$VIDEO_URL'.index('$VIDEO_URL_END_MATCHER')]")
echo "$VIDEO_URL"
#
# skip already downloaded videos
# Always read the index newly so parallel running boydls may
# not download the same video multiple times. Please note this
# implementation is not 100% safe: It is possible two or more
# boydl instances execute this code at the very same time and
# therefore fetch the video twice or more often. Because this
# scenario is considered to happen only very rarely we don't
# care about it here. Most of the time this holds.
#
# NOTE(review): open() raises IOError when the index file does not
# exist yet (e.g. on a first run) -- confirm the file is expected to
# be created beforehand.
cont = False
index_stream = open(index_file, "rb")
for line in index_stream.readlines():
    # one HTML URL per line; strip the trailing newline for comparison
    if line.strip() == url:
        cont = True
        break
index_stream.close()
if cont: continue
#
# check if only HTML page URLs are requested by the user
#
if html_urls:
    print(url)
    continue
#
# download HTML page and extract video URL
# (scan line by line; the first url_regex match wins)
#
yurl = None
html_stream = urllib.urlopen(url)
for line in html_stream.readlines():
    m = url_regex.search(line)
    if m:
        yurl = m.group(1)
        break
html_stream.close()
#
# check if only YouTube video URLs are requested by the user
# (prints "None" if extraction failed above)
#
if youtube_urls:
    print(yurl)
    continue
#
# download video
#
if yurl is None:
    sys.stderr.write("%s: Cannot extract YouTube video URL!\n" % me)
else:
    # Record the HTML URL in the index. NOTE(review): this happens
    # before the actual download is run further down, so a failed
    # download is still marked done and will not be retried on the
    # next run -- confirm this is intended (failures are logged to
    # error_file separately).
    index_stream = open(index_file, "ab")
    index_stream.write("%s\n" % url)
    index_stream.close()
	# download video using youtube-dl.py script and finally add it to
	# downloaded videos list if downloading was successful, to failed
	# list otherwise
	# ($YOUTUBE_DL_PY_COMMAND is intentionally unquoted so that it
	# word-splits into the command name and its options)
	$YOUTUBE_DL_PY_COMMAND "$VIDEO_URL" \
		&& echo "$HTML_URL" >> "$INDEX_FILE" \
		|| echo "$(date "+%Y-%m-%d %H:%M:%S") - $HTML_URL - $VIDEO_URL" >> "$ERROR_FILE"
done
# Build and run the youtube-dl command line.
# NOTE(review): yurl is interpolated unquoted into a shell command; a
# hostile URL could inject shell syntax. subprocess with an argument
# list would be safer.
cmd = ydl + ' ' + yurl
print('—' * 80)  # visual separator between downloads
# os.system returns the raw wait status (exit code << 8 on Unix).
# Since 256 % 255 == 1, taking the status modulo 255 recovers the
# child's exit code for codes 1..254; an exit code of exactly 255
# would wrongly map to 0. NOTE(review): os.WEXITSTATUS is the exact
# way to decode this on Unix.
exit_code = os.system(cmd) % 255
if exit_code != 0:
    sys.stderr.write("'%s' failed with exit code %d\n" % (cmd, exit_code))
    # append a tab-separated record: HTML URL, command, exit code
    error_stream = open(error_file, "ab")
    error_stream.write("%s\t%s\t%d\n" % (url, cmd, exit_code))
    error_stream.close()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment