From e4788fa663383d6930a526d9140af1d668aebbbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 22 Jul 2021 20:37:05 +0200 Subject: [PATCH] [bbc] add 'gallery' and 'programme' extractors (closes #1706) --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/bbc.py | 80 ++++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + 4 files changed, 88 insertions(+) create mode 100644 gallery_dl/extractor/bbc.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0a4febc5..2e2c4292 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -79,6 +79,12 @@ Consider all sites to be NSFW unless otherwise known. Albums, Artwork Listings, Challenges, individual Images, Likes, Search Results, User Profiles + + BBC + https://bbc.co.uk/ + Galleries, Programmes + + Behance https://www.behance.net/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 42365884..740fb7d7 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -20,6 +20,7 @@ modules = [ "architizer", "artstation", "aryion", + "bbc", "bcy", "behance", "blogger", diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py new file mode 100644 index 00000000..ace8a285 --- /dev/null +++ b/gallery_dl/extractor/bbc.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://bbc.co.uk/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, util +import json + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/" + + +class BbcGalleryExtractor(GalleryExtractor): + """Extractor for a programme gallery on bbc.co.uk""" + category = "bbc" + root = "https://www.bbc.co.uk" + directory_fmt = ("{category}", "{path[0]}", "{path[1]}", "{path[2]}", + "{path[3:]:J - /}") + filename_fmt = "{num:>02}.{extension}" + archive_fmt = "{programme}_{num}" + pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$" + test = ( + ("https://www.bbc.co.uk/programmes/p084qtzs/p085g9kg", { + "pattern": r"https://ichef\.bbci\.co\.uk" + r"/images/ic/976x549_b/\w+\.jpg", + "count": 37, + "keyword": { + "programme": "p084qtzs", + "path": ["BBC One", "Doctor Who", "The Timeless Children"], + }, + }), + ("https://www.bbc.co.uk/programmes/p084qtzs"), + ) + + def metadata(self, page): + data = json.loads(text.extract( + page, '')[0]) + return { + "programme": self.gallery_url.split("/")[4], + "path": list(util.unique_sequence( + element["name"] + for element in data["itemListElement"] + )), + } + + def images(self, page): + return [ + (imgset.rpartition(", ")[2].partition(" ")[0], None) + for imgset in text.extract_iter(page, 'data-image-src-sets="', '"') + ] + + +class BbcProgrammeExtractor(Extractor): + """Extractor for all galleries of a bbc programme""" + category = "bbc" + subcategory = "programme" + root = "https://www.bbc.co.uk" + pattern = BASE_PATTERN + r"[^/?#]+/galleries)" + test = ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", { + "pattern": BbcGalleryExtractor.pattern, + "count": ">= 24", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.galleries_url = self.root + match.group(1) + + def items(self): + page = self.request(self.galleries_url).text + data = {"_extractor": BbcGalleryExtractor} + + for programme_id in text.extract_iter( + page, '