use 'text.extract_from()' in a few places

2019-04-19 23:02:29 +02:00
parent 21a7e395a7
commit f2cf1c1d73
10 changed files with 116 additions and 162 deletions
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -77,42 +77,31 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
        GalleryExtractor.__init__(self, match, url)

    def metadata(self, page):
-        extr = text.extract
-        title, pos = extr(page, '"og:title" content="', '"')
-        thumb, pos = extr(page, '"og:image" content="', '"', pos)
+        extr = text.extract_from(page)
+        title = extr('"og:title" content="', '"')
        title_en, _, title_jp = text.unescape(title).partition("/")
        title_en = title_en.strip()
        title_jp = title_jp.strip()

-        uploader  , pos = extr(page, 'id="Uploader">'  , '</div>', pos)
-        date      , pos = extr(page, 'id="Uploaded">'  , '</div>', pos)
-        rating    , pos = extr(page, 'id="Rating">'    , '</div>', pos)
-        gtype     , pos = extr(page, 'id="Category">'  , '</div>', pos)
-        collection, pos = extr(page, 'id="Collection">', '</div>', pos)
-        group     , pos = extr(page, 'id="Group">'     , '</div>', pos)
-        artist    , pos = extr(page, 'id="Artist">'    , '</div>', pos)
-        parody    , pos = extr(page, 'id="Parody">'    , '</div>', pos)
-        character , pos = extr(page, 'id="Character">' , '</div>', pos)
-        tags      , pos = extr(page, 'id="Tag">'       , '</div>', pos)
-
        return {
            "gallery_id": text.parse_int(self.gallery_id),
-            "title": title_en or title_jp,
-            "title_en": title_en,
-            "title_jp": title_jp,
-            "thumbnail": thumb,
-            "uploader": text.remove_html(uploader),
-            "date": date.strip(),
-            "rating": text.parse_float(rating.partition(" ")[0]),
-            "type": text.remove_html(gtype),
-            "collection": text.remove_html(collection),
-            "group": text.split_html(group),
-            "artist": text.split_html(artist),
-            "parody": text.split_html(parody),
-            "characters": text.split_html(character),
-            "tags": text.split_html(tags),
-            "language": "English",
-            "lang": "en",
+            "title"     : title_en or title_jp,
+            "title_en"  : title_en,
+            "title_jp"  : title_jp,
+            "thumbnail" : extr('"og:image" content="', '"'),
+            "uploader"  : text.remove_html(extr('id="Uploader">', '</div>')),
+            "date"      : extr('id="Uploaded">', '</div>').strip(),
+            "rating"    : text.parse_float(extr(
+                'id="Rating">', '</div>').partition(" ")[0]),
+            "type"      : text.remove_html(extr('id="Category">'  , '</div>')),
+            "collection": text.remove_html(extr('id="Collection">', '</div>')),
+            "group"     : text.split_html(extr('id="Group">'      , '</div>')),
+            "artist"    : text.split_html(extr('id="Artist">'     , '</div>')),
+            "parody"    : text.split_html(extr('id="Parody">'     , '</div>')),
+            "characters": text.split_html(extr('id="Character">'  , '</div>')),
+            "tags"      : text.split_html(extr('id="Tag">'        , '</div>')),
+            "language"  : "English",
+            "lang"      : "en",
        }

    def images(self, page):