#!/usr/bin/python
# encoding: utf-8

"""
boydl – Best Of Youtube DownLoader

This script fetches the Best Of YouTube RSS feed and downloads new videos.
An index file ensures that every video is downloaded only once.

This is a complete Python rewrite of the former Bash implementation,
prompted by RSS feed migrations.

Usage: boydl [ --help | -h | --version | --html-urls | --youtube-urls ]

Options:
    --help            Display this help message and exit.
    --version         Display boydl's version and exit.
    --html-urls       Only print the HTML page URLs, do not download the videos.
    --youtube-urls    Only print the YouTube video URLs, do not download the videos.
"""

__author__       = "Lysander Trischler"
__copyright__    = "Copyright 2011, Lysander Trischler"
__license__      = "WTFPL"
__maintainer__   = "Lysander Trischler"
__email__        = "software@lyse.isobeef.org"
__version__      = "2.0"
__version_info__ = (2, 0)

import os.path
import sys

me = os.path.basename(__file__)

html_urls    = False
youtube_urls = False
for arg in sys.argv[1:]:
	if arg in ("--help", "-h"):
		print(__doc__.strip())
		exit(0)
	elif arg == "--version":
		print("%s Version %s" % (me, __version__))
		exit(0)
	elif arg == "--html-urls":
		html_urls = True
	elif arg == "--youtube-urls":
		youtube_urls = True
	else:
		sys.stderr.write("%s: Unknown argument `%s'!\n" % (me, arg))
		exit(2)

import xml.dom.minidom
import urllib
import re
import os

rss_url    = 'http://feeds.feedburner.com/bestofyoutubedotcom?format=xml'
ydl        = 'youtube-dl -wo "' + os.path.expanduser("~") + '/Desktop/%(stitle)s-%(id)s.%(ext)s"'
link_regex = re.compile(r'<a href="(.+?)">')
url_regex  = re.compile(r'"(http://www\.youtube\.com/v/.+?)&amp;hl=en_US&amp;fs=1"')
index_file = os.path.join(os.path.dirname(__file__), "boydl.index")
error_file = os.path.join(os.path.dirname(__file__), "boydl.error")
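# youtube-dl is invoked with -w (never overwrite existing files) and -o
# setting the output template: simplified title plus video id, saved to the
# user's Desktop. The two bookkeeping files are plain text, one record per
# line: boydl.index holds every already-handled article URL, boydl.error
# holds "<url>\t<command>\t<exit code>" for every failed download.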

if rss_url.startswith(("http://", "https://")):
	rss_stream = urllib.urlopen(rss_url)
else:
	rss_stream = open(rss_url, "rb")
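	# (this branch allows rss_url to point at a local file instead of the
	# live feed, presumably useful for offline testing)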

tree = xml.dom.minidom.parseString(rss_stream.read().replace("&eacute;", "é"))
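# ("&eacute;" is not one of XML's predefined entities, so minidom would
# refuse to parse the raw feed; the entity is resolved up front instead)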
rss_stream.close()
channel = tree.getElementsByTagName("rss")[0].getElementsByTagName("channel")[0]
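# The feed is plain RSS 2.0: every <item> carries a <title>, a <link>
# pointing at the feed proxy and a <description> whose HTML embeds the
# direct link to the Best of YouTube article page.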

for item in channel.getElementsByTagName("item"):

	#
	# first try to fetch the URL from the description, to bypass feed proxy
	#
	url = None
	m = link_regex.search(item.getElementsByTagName("description")[0].firstChild.data)
	if m:
		url = m.group(1)

	#
	# if description link cannot be parsed use feed proxy's URL
	#
	if url is None:
		print("%s: Cannot extract direct link to Best of YouTube website for '%s'."
		      " Using feed proxy's URL:" % (
				me, item.getElementsByTagName('title')[0].firstChild.data)),
		url = item.getElementsByTagName("link")[0].firstChild.data
		print(url)

	#
	# skip already downloaded videos
	# Always re-read the index so parallel running boydl instances do
	# not download the same video multiple times. Please note this
	# implementation is not 100% safe: it is possible that two or more
	# boydl instances execute this code at the very same time and
	# therefore fetch the video twice or more often. Because this
	# scenario is considered very rare we do not care about it here;
	# most of the time this is good enough.
	#
	cont = False
	index_stream = open(index_file, "rb")
	for line in index_stream.readlines():
		if line.strip() == url:
			cont = True
			break
	index_stream.close()
	if cont: continue

	#
	# check if only HTML page URLs are requested by the user
	#
	if html_urls:
		print(url)
		continue

	#
	# download HTML page and extract video URL
	#
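	# (url_regex above matches the page's embedded player URL of the form
	# "http://www.youtube.com/v/<video id>&amp;hl=en_US&amp;fs=1")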
	yurl = None
	html_stream = urllib.urlopen(url)
	for line in html_stream.readlines():
		m = url_regex.search(line)
		if m:
			yurl = m.group(1)
			break
	html_stream.close()

	#
	# check if only YouTube video URLs are requested by the user
	#
	if youtube_urls:
		print(yurl)
		continue

	#
	# download video
	#
	if yurl is None:
		sys.stderr.write("%s: Cannot extract YouTube video URL!\n" % me)
	else:
		index_stream = open(index_file, "ab")
		index_stream.write("%s\n" % url)
		index_stream.close()
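		# Note: the article URL is recorded in the index before youtube-dl
		# runs, so a failed download is not retried on the next run; it is
		# only logged to the error file below.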

		cmd = ydl + ' ' + yurl
		print('—' * 80)
		status = os.system(cmd)
		# os.system() returns a raw wait status; recover the real exit code
		# (treat death by signal as failure as well)
		exit_code = os.WEXITSTATUS(status) if os.WIFEXITED(status) else 1
		if exit_code != 0:
			sys.stderr.write("'%s' failed with exit code %d\n" % (cmd, exit_code))
			error_stream = open(error_file, "ab")
			error_stream.write("%s\t%s\t%d\n" % (url, cmd, exit_code))
			error_stream.close()