diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py index 2c45bf3a..fbf6c6a7 100644 --- a/gallery_dl/extractor/agnph.py +++ b/gallery_dl/extractor/agnph.py @@ -9,10 +9,8 @@ """Extractors for https://agn.ph/""" from . import booru -from .. import text - +from .. import text, util import collections -import re BASE_PATTERN = r"(?:https?://)?agn\.ph" @@ -72,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor): return tags = collections.defaultdict(list) - pattern = re.compile(r'class="(.)typetag">([^<]+)') + pattern = util.re(r'class="(.)typetag">([^<]+)') for tag_type, tag_name in pattern.findall(tag_container): tags[tag_type].append(text.unquote(tag_name).replace(" ", "_")) for key, value in tags.items(): diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py index b1b32600..b7181fe0 100644 --- a/gallery_dl/extractor/arcalive.py +++ b/gallery_dl/extractor/arcalive.py @@ -8,7 +8,6 @@ from .common import Extractor, Message from .. import text, util, exception -import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?arca\.live" @@ -65,8 +64,8 @@ class ArcalivePostExtractor(ArcaliveExtractor): def _extract_files(self, post): files = [] - for video, media in self._extract_media(post["content"]): - + for video, media in util.re(r"<(?:img|vide(o)) ([^>]+)").findall( + post["content"]): if not self.emoticons and 'class="arca-emoticon"' in media: continue @@ -113,11 +112,6 @@ class ArcalivePostExtractor(ArcaliveExtractor): return files - def _extract_media(self, content): - ArcalivePostExtractor._extract_media = extr = re.compile( - r"<(?:img|vide(o)) ([^>]+)").findall - return extr(content) - class ArcaliveBoardExtractor(ArcaliveExtractor): """Extractor for an arca.live board's posts""" diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index a1ad3ae8..03f103aa 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -7,8 +7,7 @@ """Extractors for https://bato.to/""" from .common import Extractor, ChapterExtractor, MangaExtractor -from .. import text, exception -import re +from .. import text, util, exception BASE_PATTERN = (r"(?:https?://)?(" r"(?:ba|d|f|h|j|m|w)to\.to|" @@ -104,9 +103,9 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): info = text.remove_html(extr('link-hover">', "")) info = text.unescape(info) - match = re.match( + match = util.re( r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?" - r"(?:Chapter|Episode)\s*(\d+)([\w.]*)", info) + r"(?:Chapter|Episode)\s*(\d+)([\w.]*)").match(info) if match: volume, chapter, minor = match.groups() else: diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index ef117da7..e755ea1b 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -10,7 +10,6 @@ from .common import BaseExtractor, Message from .. import text, util -import re class BloggerExtractor(BaseExtractor): @@ -33,13 +32,13 @@ class BloggerExtractor(BaseExtractor): blog["date"] = text.parse_datetime(blog["published"]) del blog["selfLink"] - sub = re.compile(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)").sub - findall_image = re.compile( + sub = util.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)").sub + findall_image = util.re( r'src="(https?://(?:' r'blogger\.googleusercontent\.com/img|' r'lh\d+(?:-\w+)?\.googleusercontent\.com|' r'\d+\.bp\.blogspot\.com)/[^"]+)').findall - findall_video = re.compile( + findall_video = util.re( r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall metadata = self.metadata() diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index cf0de99d..461d5b07 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -15,7 +15,6 @@ import collections import mimetypes import binascii import time -import re BASE_PATTERN = ( r"(?:https?://)?(?:" @@ -66,10 +65,13 @@ class DeviantartExtractor(Extractor): if self.quality: if self.quality == "png": self.quality = "-fullview.png?" - self.quality_sub = re.compile(r"-fullview\.[a-z0-9]+\?").sub + self.quality_sub = util.re(r"-fullview\.[a-z0-9]+\?").sub else: self.quality = ",q_{}".format(self.quality) - self.quality_sub = re.compile(r",q_\d+").sub + self.quality_sub = util.re(r",q_\d+").sub + + if self.intermediary: + self.intermediary_subn = util.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn if isinstance(self.original, str) and \ self.original.lower().startswith("image"): @@ -271,7 +273,7 @@ class DeviantartExtractor(Extractor): ) # filename metadata - sub = re.compile(r"\W").sub + sub = util.re(r"\W").sub deviation["filename"] = "".join(( sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["author"]["username"].lower()), "-d", @@ -666,8 +668,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ if content["src"].startswith("https://images-wixmp-"): if self.intermediary and deviation["index"] <= 790677560: # https://github.com/r888888888/danbooru/issues/4069 - intermediary, count = re.subn( - r"(/f/[^/]+/[^/]+)/v\d+/.*", + intermediary, count = self.intermediary_subn( r"/intermediary\1", content["src"], 1) if count: deviation["is_original"] = False @@ -682,8 +683,8 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ @staticmethod def _find_folder(folders, name, uuid): if uuid.isdecimal(): - match = re.compile(name.replace( - "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match + match = util.re( + "(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match for folder in folders: if match(folder["name"]): return folder diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index e24b6435..4551d505 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -10,7 +10,6 @@ from .common import ChapterExtractor, MangaExtractor, Extractor, Message from .. import text, util -import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com" @@ -47,12 +46,11 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) - match = re.match( - (r"(?:]*>)?([^<]+)(?:)?" # manga name - r"(?: ch(\d+)([^:<]*))?" # chapter info - r"(?:: (.+))?"), # title - extr("