[bbc] extract more metadata (#6582)
Add the metadata fields requested in https://github.com/mikf/gallery-dl/issues/6582#issuecomment-2745905375: title, title_image, description, synopsis.
This commit is contained in:
@@ -27,7 +27,12 @@ class BbcGalleryExtractor(GalleryExtractor):
|
||||
|
||||
def metadata(self, page):
|
||||
data = self._extract_jsonld(page)
|
||||
|
||||
return {
|
||||
"title": text.unescape(text.extr(
|
||||
page, "<h1>", "</h1>").rpartition("</span>")[2]),
|
||||
"description": text.unescape(text.extr(
|
||||
page, 'property="og:description" content="', '"')),
|
||||
"programme": self.gallery_url.split("/")[4],
|
||||
"path": list(util.unique_sequence(
|
||||
element["name"]
|
||||
@@ -40,11 +45,20 @@ class BbcGalleryExtractor(GalleryExtractor):
|
||||
width = width - width % 16 if width else 1920
|
||||
dimensions = "/{}xn/".format(width)
|
||||
|
||||
return [
|
||||
(src.replace("/320x180_b/", dimensions),
|
||||
{"_fallback": self._fallback_urls(src, width)})
|
||||
for src in text.extract_iter(page, 'data-image-src="', '"')
|
||||
]
|
||||
results = []
|
||||
for img in text.extract_iter(page, 'class="gallery__thumbnail', ">"):
|
||||
src = text.extr(img, 'data-image-src="', '"')
|
||||
results.append((
|
||||
src.replace("/320x180_b/", dimensions),
|
||||
{
|
||||
"title_image": text.unescape(text.extr(
|
||||
img, 'data-gallery-title="', '"')),
|
||||
"synopsis": text.unescape(text.extr(
|
||||
img, 'data-gallery-synopsis="', '"')),
|
||||
"_fallback": self._fallback_urls(src, width),
|
||||
},
|
||||
))
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def _fallback_urls(src, max_width):
|
||||
@@ -62,14 +76,11 @@ class BbcProgrammeExtractor(Extractor):
|
||||
pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?"
|
||||
example = "https://www.bbc.co.uk/programmes/ID/galleries"
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self, match)
|
||||
self.path, self.page = match.groups()
|
||||
|
||||
def items(self):
|
||||
path, pnum = self.groups
|
||||
data = {"_extractor": BbcGalleryExtractor}
|
||||
params = {"page": text.parse_int(self.page, 1)}
|
||||
galleries_url = self.root + self.path
|
||||
params = {"page": text.parse_int(pnum, 1)}
|
||||
galleries_url = self.root + path
|
||||
|
||||
while True:
|
||||
page = self.request(galleries_url, params=params).text
|
||||
|
||||
@@ -15,14 +15,52 @@ __tests__ = (
|
||||
"#pattern" : r"https://ichef\.bbci\.co\.uk/images/ic/1920xn/\w+\.jpg",
|
||||
"#count" : 37,
|
||||
|
||||
"programme": "p084qtzs",
|
||||
"path" : [
|
||||
"count" : 37,
|
||||
"num" : range(1, 37),
|
||||
"description": "The Cybermen attack. And for the Doctor, nothing will ever be the same.",
|
||||
"programme" : "p084qtzs",
|
||||
"synopsis" : "The Cybermen attack. And for the Doctor, nothing will ever be the same.",
|
||||
"title" : "The Timeless Children",
|
||||
"title_image": {"The Timeless Children", ": The Timeless Children"},
|
||||
"path" : [
|
||||
"BBC One",
|
||||
"Doctor Who (2005–2022)",
|
||||
"The Timeless Children",
|
||||
],
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://www.bbc.co.uk/programmes/p086f8yf/p086f8j6",
|
||||
"#category": ("", "bbc", "gallery"),
|
||||
"#class" : bbc.BbcGalleryExtractor,
|
||||
"#pattern" : r"https://ichef\.bbci\.co\.uk/images/ic/1920xn/\w+\.jpg",
|
||||
"#range" : "1-2",
|
||||
"#count" : 2,
|
||||
|
||||
"count" : 9,
|
||||
"num" : {1, 2},
|
||||
"description": "Continuing his journey, Colin gives unique insights into the unique animals he finds.",
|
||||
"extension" : "jpg",
|
||||
"filename" : {"p086f7yn", "p086f80n"},
|
||||
"programme" : "p086f8yf",
|
||||
"title" : "Wild Cuba: A Caribbean Journey - Part 2",
|
||||
"title_image": {
|
||||
"Cuba is home to many unique birds",
|
||||
"A Cuban pygmy owl looks out of its tree hole",
|
||||
},
|
||||
"synopsis" : {
|
||||
"This vibrant Cuban tody is just one of more than 300 species of bird found in Cuba.",
|
||||
"Cuban pygmy owls nest in abandoned holes carved out by woodpeckers.",
|
||||
},
|
||||
"path" : [
|
||||
"BBC Two",
|
||||
"Natural World",
|
||||
"2019-2020",
|
||||
"Wild Cuba: A Caribbean Journey - Part 2",
|
||||
"Wildlife camera operator Colin Stafford-Johnson has loved Cuba since he was a little boy"
|
||||
],
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://www.bbc.co.uk/programmes/p084qtzs",
|
||||
"#category": ("", "bbc", "gallery"),
|
||||
|
||||
Reference in New Issue
Block a user