Commit 7904471f authored by Lysander Trischler

Complete Python rewrite due to API changes

parent 3a4a9008
.gitignore
@@ -6,4 +6,5 @@
 *.pyo
 boydl.index
+boydl.error
 README.html
README
@@ -2,13 +2,13 @@
 boydl – Best Of YouTube DownLoader
 ==================================
 
-**boydl** is a Bash script to download all newly available videos using
+**boydl** is a Python script to download all newly available videos using
 the Python script **youtube-dl** it can find in the *bestofyoutube.com*
 RSS feed. An index is used to remember already downloaded videos, so only
 new clips are processed.
 
 Written by Lysander Trischler <software@lyse.isobeef.org> and published
-under terms of WTF PL.
+under terms of WTFPL.
 
 Installation
 ============
@@ -45,7 +45,7 @@ contents and save it on your disk.
 Configuration
 +++++++++++++
 
-To set up **boydl** just put the ``boydl`` main Bash script it in your
+To set up **boydl**, just put the ``boydl`` main Python script in your
 ``PATH``.
 
 If you're a *bestofyoutube.com* user, you may want to ignore your already
@@ -65,7 +65,7 @@ alias to the **boydl** executable in your shell configuration (e.g.
 ``~/.zshrc`` or ``~/.bashrc``).
 
 You may want to customize your downloaded video filenames. You can do this
-in the ``boydl`` Bash script, simply adjust the ``YOUTUBE_DL_PY_COMMAND``
-variable at the very top of the script. Read **youtube-dl** documentation
+in the ``boydl`` Python script; simply adjust the ``ydl`` variable near the
+top of the script (currently line 58). Read **youtube-dl** documentation
 to get an idea of what can be done.
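As an illustration of the filename customization the README hunk above describes (this snippet is not part of the commit, and the ~/Videos/boydl target directory is an arbitrary example), the ``ydl`` assignment in the new script could be changed to something like:

    # Hypothetical alternative: store downloads under ~/Videos/boydl instead of
    # ~/Desktop. %(stitle)s, %(id)s and %(ext)s are youtube-dl output-template
    # placeholders, exactly as in the original command line; the script already
    # imports os, so os.path.expanduser is available here.
    ydl = 'youtube-dl -wo "' + os.path.expanduser("~") + '/Videos/boydl/%(stitle)s-%(id)s.%(ext)s"'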
boydl (complete rewrite: the Bash implementation was replaced by a Python implementation)

Removed Bash implementation:

#!/bin/bash
# Lysander Trischler <software@lyse.isobeef.org>
# Published under WTF PL

# configure your youtube-dl.py command here
YOUTUBE_DL_PY_COMMAND="youtube-dl -wo $HOME/Desktop/%(stitle)s-%(id)s.%(ext)s"

# the URL of the RSS feed to download and parse
RSS_URL='http://bestofyoutube.com/rss.php'

# the index file for successfully downloaded videos and error log
# index file contains HTML URLs from the RSS feed, error log has
# timestamp, HTML URL and video URL
INDEX_FILE="$(dirname $0)/boydl.index"
ERROR_FILE="$(dirname $0)/boydl.error"

# match strings for finding the video URL
VIDEO_URL_START_MATCHER='http://www.youtube.com/v/'
VIDEO_URL_END_MATCHER='&fs=1'

__PROGRAM__='boydl'
__VERSION__='1.0'
__RELEASE__='2011-07-17'
__AUTHOR__='Lysander Trischler <software@lyse.isobeef.org>'
__LICENSE__='WTF PL'

function version {
    echo "$__PROGRAM__ Version $__VERSION__ ($__RELEASE__)"
}
function usage {
    echo "Usage: $0 [ --html-urls | --version | --help ]"
}
function help {
    version
    echo
    echo "Written by $__AUTHOR__."
    echo "Published under terms of $__LICENSE__."
    echo
    usage
    echo
    echo "Options:"
    echo "  --html-urls  Only print HTML URLs, do not download them."
    echo "  --version    Display $__PROGRAM__ version and exit."
    echo "  --help       Display this $__PROGRAM__ help and exit."
}

# command line option parsing
HTML_URLS=false
while [ $# -gt 0 ]
do
    case $1 in
        --html-urls) HTML_URLS=true;;
        --version) version; exit;;
        --help) help; exit;;
        *) echo "$0: Error: Unknown option \`$1'!"
           usage
           exit 2
    esac
    shift
done

# the interesting work begins here
# download the RSS feed and parse the HTML page URLs
wget -qO - "$RSS_URL" | grep '<link>' | while read LINE
do
    HTML_URL=${LINE/<link>/}
    HTML_URL=${HTML_URL/<\/link>/}
    # ignore global RSS description part
    [ "$HTML_URL" == "http://bestofyoutube.com" ] && continue

    # check if only HTML URLs are requested
    $HTML_URLS && {
        echo "$HTML_URL"
        continue
    }

    # ignore already downloaded videos
    [ -e "$INDEX_FILE" ] && grep -q "$HTML_URL" "$INDEX_FILE" && continue

    # download HTML page to extract video URL
    echo -n "Obtaining video URL for '$HTML_URL'... "
    VIDEO_URL=$(wget -qO - "$HTML_URL" | grep "$VIDEO_URL_START_MATCHER" | grep "$VIDEO_URL_END_MATCHER") # NOTE: second grep unnecessary
    VIDEO_URL=$(python -c "print '$VIDEO_URL'['$VIDEO_URL'.index('$VIDEO_URL_START_MATCHER'):'$VIDEO_URL'.index('$VIDEO_URL_END_MATCHER')]")
    echo "$VIDEO_URL"

    # download video using youtube-dl.py script and finally add it to
    # downloaded videos list if downloading was sucessfull, to failed
    # list otherwise
    $YOUTUBE_DL_PY_COMMAND "$VIDEO_URL" \
        && echo "$HTML_URL" >> "$INDEX_FILE" \
        || echo "$(date "+%Y-%m-%d %H:%M:%S") - $HTML_URL - $VIDEO_URL" >> "$ERROR_FILE"
done

Added Python implementation:

#!/usr/bin/python
# encoding: utf-8

"""
boydl – Best Of YouTube DownLoader

This script fetches the Best Of YouTube RSS feed and downloads new videos.
Via an index every video is downloaded only once.
This is a complete Python rewrite of the former Bash implementation due
to RSS feed migrations.

Usage: boydl [ --help | -h | --version | --html-urls | --youtube-urls ]

Options:
    --help           Display this help and exit.
    --version        Display boydl's version and exit.
    --html-urls      Only print HTML URLs, do not download them.
    --youtube-urls   Only print YouTube URLs, do not download them.
"""

__author__ = "Lysander Trischler"
__copyright__ = "Copyright 2011, Lysander Trischler"
__license__ = "WTFPL"
__maintainer__ = "Lysander Trischler"
__email__ = "software@lyse.isobeef.org"
__version__ = "2.0"
__version_info__ = (2, 0)

import os.path
import sys

me = os.path.basename(__file__)

html_urls = False
youtube_urls = False
for arg in sys.argv[1:]:
    if arg in ("--help", "-h"):
        print(__doc__.strip())
        exit(0)
    elif arg == "--version":
        print("%s Version %s" % (me, __version__))
        exit(0)
    elif arg == "--html-urls":
        html_urls = True
    elif arg == "--youtube-urls":
        youtube_urls = True
    else:
        sys.stderr.write("%s: Unknown argument `%s'!\n" % (me, arg))
        exit(2)

import xml.dom.minidom
import urllib
import re
import os

rss_url = 'http://feeds.feedburner.com/bestofyoutubedotcom?format=xml'
ydl = 'youtube-dl -wo "' + os.path.expanduser("~") + '/Desktop/%(stitle)s-%(id)s.%(ext)s"'
link_regex = re.compile(r'<a href="(.+?)">')
url_regex = re.compile(r'"(http://www\.youtube\.com/v/.+?)&amp;hl=en_US&amp;fs=1"')
index_file = os.path.join(os.path.dirname(__file__), "boydl.index")
error_file = os.path.join(os.path.dirname(__file__), "boydl.error")

if rss_url.startswith(("http://", "https://")):
    rss_stream = urllib.urlopen(rss_url)
else:
    rss_stream = open(rss_url, "rb")
tree = xml.dom.minidom.parse(rss_stream)
rss_stream.close()
channel = tree.getElementsByTagName("rss")[0].getElementsByTagName("channel")[0]

for item in channel.getElementsByTagName("item"):

    #
    # first try to fetch the URL from the description, to bypass the feed proxy
    #
    url = None
    m = link_regex.search(item.getElementsByTagName("description")[0].firstChild.data)
    if m:
        url = m.group(1)

    #
    # if the description link cannot be parsed, use the feed proxy's URL
    #
    if url is None:
        print("%s: Cannot extract direct link to Best of YouTube website. Using feed proxy's URL." % me)
        url = item.getElementsByTagName("link")[0].firstChild.data

    #
    # skip already downloaded videos
    # Always re-read the index so parallel running boydl instances do
    # not download the same video multiple times. Please note this
    # implementation is not 100% safe: it is possible that two or more
    # boydl instances execute this code at the very same time and
    # therefore fetch the video twice or more often. Because this
    # scenario is considered to happen only very rarely we don't
    # care about it here. Most of the time this holds.
    #
    cont = False
    index_stream = open(index_file, "rb")
    for line in index_stream.readlines():
        if line.strip() == url:
            cont = True
            break
    index_stream.close()
    if cont:
        continue

    #
    # check if only HTML page URLs are requested by the user
    #
    if html_urls:
        print(url)
        continue

    #
    # download the HTML page and extract the video URL
    #
    yurl = None
    html_stream = urllib.urlopen(url)
    for line in html_stream.readlines():
        m = url_regex.search(line)
        if m:
            yurl = m.group(1)
            break
    html_stream.close()

    #
    # check if only YouTube video URLs are requested by the user
    #
    if youtube_urls:
        print(yurl)
        continue

    #
    # download the video
    #
    if yurl is None:
        sys.stderr.write("%s: Cannot extract YouTube video URL!\n" % me)
    else:
        index_stream = open(index_file, "ab")
        index_stream.write("%s\n" % url)
        index_stream.close()
        cmd = ydl + ' ' + yurl
        print('—' * 80)
        # os.system() returns the raw wait status; for normal exits the status
        # is code * 256, and since 256 % 255 == 1 the modulo recovers the code.
        exit_code = os.system(cmd) % 255
        if exit_code != 0:
            sys.stderr.write("'%s' failed with exit code %d\n" % (cmd, exit_code))
            error_stream = open(error_file, "ab")
            error_stream.write("%s\t%s\t%d\n" % (url, cmd, exit_code))
            error_stream.close()
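The comment in the new script about parallel boydl runs describes a small race window between reading ``boydl.index`` and appending to it. A minimal sketch of how that window could be closed with an advisory lock; this is an editorial illustration only, not part of the commit, and it assumes a POSIX system (``fcntl`` is Unix-only):

    import fcntl

    def append_if_new(index_file, url):
        """Atomically check the index for url and append it when missing.
        Returns True when url was new, i.e. the video should be downloaded."""
        stream = open(index_file, "a+")            # creates the file if missing
        try:
            fcntl.flock(stream, fcntl.LOCK_EX)     # serialise concurrent boydl runs
            stream.seek(0)
            if any(line.strip() == url for line in stream):
                return False                       # already downloaded
            stream.seek(0, 2)                      # seek to end between read and write
            stream.write("%s\n" % url)
            return True
        finally:
            fcntl.flock(stream, fcntl.LOCK_UN)
            stream.close()

Under that assumption, the per-item index check and the later index append in the script could both be replaced by a single ``append_if_new(index_file, url)`` call.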
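One more note on the download step: ``os.system()`` reports the raw wait status, and the ``% 255`` trick only recovers ordinary exit codes below 255 while conflating them with signal numbers. A sketch of an alternative using the standard ``subprocess`` module, which reports the exit code directly; illustration only, not part of the commit, with ``cmd`` being the youtube-dl command line built in the script:

    import subprocess

    # Same shell semantics as os.system(), but the return value is the child's
    # exit code itself; a negative value means termination by a signal.
    exit_code = subprocess.call(cmd, shell=True)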