Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bbc #60

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open

bbc #60

Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 104 additions & 55 deletions youtube_dl/extractor/bbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,9 +608,10 @@ class BBCIE(BBCCoUkIE):
_VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'

_MEDIASELECTOR_URLS = [
'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/iptv-all/vpid/%s/format/xml/',
# Provides HQ HLS streams but in some cases fails with a geolocation error
# even when the video is not geo restricted at all
'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/legacy-iptv-all/vpid/%s',
# Provides more formats, namely direct mp4 links, but fails on some videos with
# notukerror for non UK (?) users (e.g.
# http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
Expand Down Expand Up @@ -754,7 +755,7 @@ class BBCIE(BBCCoUkIE):
},
'skip': 'Georestricted to UK',
}, {
# single video with playlist.sxml URL in playlist param
# single video with "pid" parameter
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
Expand Down Expand Up @@ -793,10 +794,11 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
'only_matching': True,
}, {
# single video article embedded with data-media-vpid
# single video article embedded with Morph "vpid" parameter
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
'only_matching': True,
}, {
# single video with "vpid" parameter
'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
'info_dict': {
'id': 'p06556y7',
Expand All @@ -809,6 +811,7 @@ class BBCIE(BBCCoUkIE):
}
}, {
# window.__PRELOADED_STATE__
# 404
'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
'info_dict': {
'id': 'b0b9z4vz',
Expand All @@ -819,6 +822,7 @@ class BBCIE(BBCCoUkIE):
'uploader_id': 'bbc_radio_three',
},
}, {
# article with embedded video using data-pid parameter
'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
'info_dict': {
'id': 'p06w9tws',
Expand Down Expand Up @@ -904,6 +908,37 @@ def _real_extract(self, url):

entries = []

initial_data_re = self._search_regex(
r'<script[^>]*>window.__INITIAL_DATA__=(.*?);</script>', webpage,
'initial data', default=None)
if initial_data_re:
initial_data = self._parse_json(initial_data_re, playlist_id)
for key in initial_data['data']:
data = initial_data['data'][key].get('data')
if data and isinstance(data, dict):
mediaItems = []
initialItem = data.get('initialItem')
blocks = data.get('blocks')
if initialItem:
mediaItems.append(initialItem.get('mediaItem'))
if blocks:
for block in blocks:
if block.get('type') == 'media':
mediaItems.append(block.get('model'))
for mediaItem in mediaItems:
title = mediaItem['title']['content'] if mediaItem.get('title') else mediaItem.get('caption')
description = '\n'.join([block['model']['text'] for block in mediaItem['summary']['blocks']]) if mediaItem.get('summary') else None
programme_id = mediaItem['media']['items'][0]['id']
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
entries.append({
'id': programme_id,
'title': title,
'description': description,
'formats': formats,
'subtitles': subtitles,
})

# article with multiple videos embedded with playlist.sxml (e.g.
# http://www.bbc.com/sport/0/football/34475836)
playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
Expand Down Expand Up @@ -977,22 +1012,25 @@ def _real_extract(self, url):
if entries:
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

# http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
group_id = self._search_regex(
r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
webpage, 'group id', default=None)
if playlist_id:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id,
ie=BBCCoUkIE.ie_key())

# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
[r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX,
r'"vpid":"(%s)"' % self._ID_REGEX,
r'"pid":"(%s)"' % self._ID_REGEX],
webpage, 'vpid', default=None)

# bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
initial_data = self._search_regex(
r'<script[^>]+id="initial-data"[^>]+data-json=\'(.+)\'>',
webpage, 'initial data', fatal=False, default=None)
if initial_data:
programme_id = self._search_regex(
r'"versionID":"(%s)"' % self._ID_REGEX,
unescapeHTML(initial_data),
'programme id', fatal=False, default=None)

if programme_id:
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
Expand All @@ -1014,49 +1052,60 @@ def _real_extract(self, url):
'subtitles': subtitles,
}

# http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
group_id = self._search_regex(
r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
webpage, 'group id', default=None)
if group_id:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id,
ie=BBCCoUkIE.ie_key())

# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# Several setPayload calls may be present, but the video
# always seems to be related to the first one
morph_payload = self._parse_json(
self._search_regex(
r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
webpage, 'morph payload', default='{}'),
playlist_id, fatal=False)
if morph_payload:
components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
for component in components:
if not isinstance(component, dict):
continue
lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
if not lead_media:
continue
identifiers = lead_media.get('identifiers')
if not identifiers or not isinstance(identifiers, dict):
continue
programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
if not programme_id:
continue
title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
description = lead_media.get('summary')
uploader = lead_media.get('masterBrand')
uploader_id = lead_media.get('mid')
duration = None
duration_d = lead_media.get('duration')
if isinstance(duration_d, dict):
duration = parse_duration(dict_get(
duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
return {
'id': programme_id,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'formats': formats,
'subtitles': subtitles,
}
morph_payloads = re.findall(
r'Morph\.setPayload\([^,]+,\s*({.+?})\);', webpage)
if morph_payloads:
for morph_payload_text in morph_payloads:
morph_payload = self._parse_json(
morph_payload_text, playlist_id, fatal=False)
if morph_payload:
body_text = try_get(morph_payload, lambda x: x['body']['content']['article']['body']) or None
if not body_text:
continue
body = self._parse_json(
body_text, playlist_id, fatal=False)
if not isinstance(body, list):
continue
for item in body:
if not isinstance(item, dict):
continue
videoData = item.get('videoData')
if videoData:
programme_id = videoData.get('vpid') or videoData.get('playablePid')
if not programme_id:
continue
title = videoData.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
description = videoData.get('caption') or videoData.get('summary')
uploader = videoData.get('masterBrand')
uploader_id = videoData.get('mid')
duration = None
duration_d = videoData.get('duration')
if isinstance(duration_d, dict):
duration = parse_duration(dict_get(
duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
entries.append({
'id': programme_id,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'formats': formats,
'subtitles': subtitles,
})
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

preload_state = self._parse_json(self._search_regex(
r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
Expand Down