From b5c88b3d3ed505aefecf51b5908d6c3503c457dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 6 Jun 2025 12:26:21 +0200 Subject: [PATCH] replace standard library 're' uses with 'util.re()' --- gallery_dl/extractor/agnph.py | 6 ++-- gallery_dl/extractor/arcalive.py | 10 ++---- gallery_dl/extractor/batoto.py | 7 ++-- gallery_dl/extractor/blogger.py | 7 ++-- gallery_dl/extractor/deviantart.py | 17 +++++----- gallery_dl/extractor/dynastyscans.py | 12 +++---- gallery_dl/extractor/everia.py | 7 ++-- gallery_dl/extractor/fanbox.py | 3 +- gallery_dl/extractor/gelbooru_v02.py | 7 ++-- gallery_dl/extractor/generic.py | 11 +++---- gallery_dl/extractor/hatenablog.py | 7 ++-- gallery_dl/extractor/hentai2read.py | 6 ++-- gallery_dl/extractor/hentaihere.py | 6 ++-- gallery_dl/extractor/hiperdex.py | 7 ++-- gallery_dl/extractor/hitomi.py | 13 ++++---- gallery_dl/extractor/imagebam.py | 8 ++--- gallery_dl/extractor/imgbox.py | 6 ++-- gallery_dl/extractor/instagram.py | 3 +- gallery_dl/extractor/kemono.py | 9 +++--- gallery_dl/extractor/komikcast.py | 14 ++++---- gallery_dl/extractor/mangahere.py | 7 ++-- gallery_dl/extractor/mangapark.py | 16 ++++----- gallery_dl/extractor/mangaread.py | 9 +++--- gallery_dl/extractor/moebooru.py | 6 ++-- gallery_dl/extractor/newgrounds.py | 5 ++- gallery_dl/extractor/pillowfort.py | 7 ++-- gallery_dl/extractor/pixiv.py | 3 +- gallery_dl/extractor/plurk.py | 4 +-- gallery_dl/extractor/postmill.py | 7 ++-- gallery_dl/extractor/realbooru.py | 4 +-- gallery_dl/extractor/recursive.py | 5 ++- gallery_dl/extractor/rule34us.py | 5 ++- gallery_dl/extractor/sankaku.py | 11 +++---- gallery_dl/extractor/sankakucomplex.py | 7 ++-- gallery_dl/extractor/speakerdeck.py | 6 ++-- gallery_dl/extractor/subscribestar.py | 9 +++--- gallery_dl/extractor/tumblr.py | 11 +++---- gallery_dl/extractor/vk.py | 5 ++- gallery_dl/extractor/zerochan.py | 3 +- test/results/aryion.py | 2 +- test/results/batoto.py | 45 +++++--------------------- test/results/hitomi.py | 2 +- test/results/mangaread.py | 2 +- test/results/realbooru.py | 6 ++-- test/results/sankaku.py | 10 +++--- 45 files changed, 143 insertions(+), 220 deletions(-) diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py index 2c45bf3a..fbf6c6a7 100644 --- a/gallery_dl/extractor/agnph.py +++ b/gallery_dl/extractor/agnph.py @@ -9,10 +9,8 @@ """Extractors for https://agn.ph/""" from . import booru -from .. import text - +from .. import text, util import collections -import re BASE_PATTERN = r"(?:https?://)?agn\.ph" @@ -72,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor): return tags = collections.defaultdict(list) - pattern = re.compile(r'class="(.)typetag">([^<]+)') + pattern = util.re(r'class="(.)typetag">([^<]+)') for tag_type, tag_name in pattern.findall(tag_container): tags[tag_type].append(text.unquote(tag_name).replace(" ", "_")) for key, value in tags.items(): diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py index b1b32600..b7181fe0 100644 --- a/gallery_dl/extractor/arcalive.py +++ b/gallery_dl/extractor/arcalive.py @@ -8,7 +8,6 @@ from .common import Extractor, Message from .. import text, util, exception -import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?arca\.live" @@ -65,8 +64,8 @@ class ArcalivePostExtractor(ArcaliveExtractor): def _extract_files(self, post): files = [] - for video, media in self._extract_media(post["content"]): - + for video, media in util.re(r"<(?:img|vide(o)) ([^>]+)").findall( + post["content"]): if not self.emoticons and 'class="arca-emoticon"' in media: continue @@ -113,11 +112,6 @@ class ArcalivePostExtractor(ArcaliveExtractor): return files - def _extract_media(self, content): - ArcalivePostExtractor._extract_media = extr = re.compile( - r"<(?:img|vide(o)) ([^>]+)").findall - return extr(content) - class ArcaliveBoardExtractor(ArcaliveExtractor): """Extractor for an arca.live board's posts""" diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index a1ad3ae8..03f103aa 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -7,8 +7,7 @@ """Extractors for https://bato.to/""" from .common import Extractor, ChapterExtractor, MangaExtractor -from .. import text, exception -import re +from .. import text, util, exception BASE_PATTERN = (r"(?:https?://)?(" r"(?:ba|d|f|h|j|m|w)to\.to|" @@ -104,9 +103,9 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): info = text.remove_html(extr('link-hover">', "\ if content["src"].startswith("https://images-wixmp-"): if self.intermediary and deviation["index"] <= 790677560: # https://github.com/r888888888/danbooru/issues/4069 - intermediary, count = re.subn( - r"(/f/[^/]+/[^/]+)/v\d+/.*", + intermediary, count = self.intermediary_subn( r"/intermediary\1", content["src"], 1) if count: deviation["is_original"] = False @@ -682,8 +683,8 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ @staticmethod def _find_folder(folders, name, uuid): if uuid.isdecimal(): - match = re.compile(name.replace( - "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match + match = util.re( + "(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match for folder in folders: if match(folder["name"]): return folder diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index e24b6435..4551d505 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -10,7 +10,6 @@ from .common import ChapterExtractor, MangaExtractor, Extractor, Message from .. import text, util -import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com" @@ -47,12 +46,11 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) - match = re.match( - (r"(?:]*>)?([^<]+)(?:)?" # manga name - r"(?: ch(\d+)([^:<]*))?" # chapter info - r"(?:: (.+))?"), # title - extr("

", ""), - ) + match = util.re( + r"(?:]*>)?([^<]+)(?:)?" # manga name + r"(?: ch(\d+)([^:<]*))?" # chapter info + r"(?:: (.+))?" # title + ).match(extr("

", "")) author = extr(" by ", "") group = extr('"icon-print"> ', '') diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py index d7e097d0..9bae49ca 100644 --- a/gallery_dl/extractor/everia.py +++ b/gallery_dl/extractor/everia.py @@ -7,8 +7,7 @@ """Extractors for https://everia.club""" from .common import Extractor, Message -from .. import text -import re +from .. import text, util BASE_PATTERN = r"(?:https?://)?everia\.club" @@ -26,7 +25,7 @@ class EveriaExtractor(Extractor): return self._pagination(self.groups[0]) def _pagination(self, path, params=None, pnum=1): - find_posts = re.compile(r'thumbnail">\s*\s*', "\s]*)?" # optional query and fragment ) - imageurls_src = re.findall(imageurl_pattern_src, page) - imageurls_ext = re.findall(imageurl_pattern_ext, page) + imageurls_src = util.re(imageurl_pattern_src).findall(page) + imageurls_ext = util.re(imageurl_pattern_ext).findall(page) imageurls = imageurls_src + imageurls_ext # Resolve relative urls @@ -182,8 +181,8 @@ class GenericExtractor(Extractor): # by prepending a suitable base url. # # If the page contains a element, use it as base url - basematch = re.search( - r"(?i)(?:[^\"' >]+)", page) + basematch = util.re( + r"(?i)(?:[^\"' >]+)").search(page) if basematch: self.baseurl = basematch.group('url').rstrip('/') # Otherwise, extract the base url from self.url diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 792f6664..7c899061 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -6,9 +6,8 @@ """Extractors for https://hatenablog.com""" -import re from .common import Extractor, Message -from .. import text +from .. import text, util BASE_PATTERN = ( @@ -31,7 +30,7 @@ class HatenablogExtractor(Extractor): self.domain = match.group(1) or match.group(2) def _init(self): - self._find_img = re.compile(r']+)').finditer + self._find_img = util.re(r']+)').finditer def _handle_article(self, article: str): extr = text.extract_from(article) @@ -74,7 +73,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor): def _init(self): HatenablogExtractor._init(self) - self._find_pager_url = re.compile( + self._find_pager_url = util.re( r' class="pager-next">\s*", "") chapter_id = text.extr(page, 'report/C', '"') chapter, sep, minor = self.chapter.partition(".") - pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " - match = re.match(pattern, title) + match = util.re( + r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by " + r"(.+) at ").match(title) return { "manga": match.group(1), "manga_id": text.parse_int(self.manga_id), diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index f15aab71..de04be3e 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -9,9 +9,8 @@ """Extractors for https://hiperdex.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text +from .. import text, util from ..cache import memcache -import re BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" r"(?:1st)?hiper(?:dex|toon)\d?\.(?:com|net|info|top))") @@ -80,10 +79,10 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): return self.chapter_data(self.chapter) def images(self, page): + pattern = util.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)') return [ (url.strip(), None) - for url in re.findall( - r'id="image-\d+"\s+(?:data-)?src="([^"]+)', page) + for url in pattern.findall(page) ] diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 086b77c1..7b362be2 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -13,7 +13,6 @@ from .nozomi import decode_nozomi from ..cache import memcache from .. import text, util import string -import re class HitomiExtractor(Extractor): @@ -257,8 +256,8 @@ def _parse_gg(extr): m = {} keys = [] - for match in re.finditer( - r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?", page): + for match in util.re_compile( + r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?").finditer(page): key, value = match.groups() keys.append(int(key)) @@ -268,11 +267,11 @@ def _parse_gg(extr): m[key] = value keys.clear() - for match in re.finditer( - r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)", page): + for match in util.re_compile( + r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)").finditer(page): m[int(match.group(1))] = int(match.group(2)) - d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page) - b = re.search(r"b:\s*[\"'](.+)[\"']", page) + d = util.re_compile(r"(?:var\s|default:)\s*o\s*=\s*(\d+)").search(page) + b = util.re_compile(r"b:\s*[\"'](.+)[\"']").search(page) return m, b.group(1).strip("/"), int(d.group(1)) if d else 0 diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index b4ba597f..504dee64 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -9,8 +9,7 @@ """Extractors for https://www.imagebam.com/""" from .common import Extractor, Message -from .. import text -import re +from .. import text, util class ImagebamExtractor(Extractor): @@ -70,9 +69,8 @@ class ImagebamGalleryExtractor(ImagebamExtractor): page, 'id="gallery-name">', '<').strip())} def images(self, page): - findall = re.compile(r'', page)
+        self.image_keys = util.re(
+            r'<a href=').findall(page)
 
         title = text.extr(page, ", "

") title, _, count = title.rpartition(" - ") diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 21bd0594..46b913e8 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -14,7 +14,6 @@ from .. import text, util, exception from ..cache import cache, memcache import itertools import binascii -import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com" USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)" @@ -39,7 +38,7 @@ class InstagramExtractor(Extractor): def _init(self): self.www_claim = "0" self.csrf_token = util.generate_token() - self._find_tags = re.compile(r"#\w+").findall + self._find_tags = util.re(r"#\w+").findall self._logged_in = True self._cursor = None self._user = None diff --git a/gallery_dl/extractor/kemono.py b/gallery_dl/extractor/kemono.py index 1a2b3a36..4853825f 100644 --- a/gallery_dl/extractor/kemono.py +++ b/gallery_dl/extractor/kemono.py @@ -13,7 +13,6 @@ from .. import text, util, exception from ..cache import cache, memcache import itertools import json -import re BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(su|party)" USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)" @@ -44,7 +43,7 @@ class KemonoExtractor(Extractor): order = self.config("order-revisions") self.revisions_reverse = order[0] in ("r", "a") if order else False - self._find_inline = re.compile( + self._find_inline = util.re( r'src="(?:https?://(?:kemono|coomer)\.su)?(/inline/[^"]+' r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall self._json_dumps = json.JSONEncoder( @@ -52,7 +51,7 @@ class KemonoExtractor(Extractor): sort_keys=True, separators=(",", ":")).encode def items(self): - find_hash = re.compile(HASH_PATTERN).match + find_hash = util.re(HASH_PATTERN).match generators = self._build_file_generators(self.config("files")) announcements = True if self.config("announcements") else None archives = True if self.config("archives") else False @@ -409,10 +408,10 @@ class KemonoDiscordExtractor(KemonoExtractor): "parent_id" : channel["parent_channel_id"], } - find_inline = re.compile( + find_inline = util.re( r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall - find_hash = re.compile(HASH_PATTERN).match + find_hash = util.re(HASH_PATTERN).match posts = self.api.discord_channel(channel_id) max_posts = self.config("max-posts") diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 89a1b5e2..e4119fc2 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -9,8 +9,7 @@ """Extractors for https://komikcast.la/""" from .common import ChapterExtractor, MangaExtractor -from .. import text -import re +from .. import text, util BASE_PATTERN = (r"(?:https?://)?(?:www\.)?" r"komikcast\.(?:la|cz|lol|site|mo?e|com)") @@ -24,13 +23,11 @@ class KomikcastBase(): @staticmethod def parse_chapter_string(chapter_string, data=None): """Parse 'chapter_string' value and add its info to 'data'""" - if not data: + if data is None: data = {} - match = re.match( - r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?", - text.unescape(chapter_string), - ) + pattern = util.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?") + match = pattern.match(text.unescape(chapter_string)) manga, chapter, data["chapter_minor"], title = match.groups() if manga: @@ -59,9 +56,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): def images(page): readerarea = text.extr( page, '
]* src=[\"']([^\"']+)") return [ (text.unescape(url), None) - for url in re.findall(r"]* src=[\"']([^\"']+)", readerarea) + for url in pattern.findall(readerarea) ] diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py index 8c94f048..7cab7f61 100644 --- a/gallery_dl/extractor/mangahere.py +++ b/gallery_dl/extractor/mangahere.py @@ -9,8 +9,7 @@ """Extractors for https://www.mangahere.cc/""" from .common import ChapterExtractor, MangaExtractor -from .. import text -import re +from .. import text, util class MangahereBase(): @@ -104,8 +103,8 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor): info, pos = text.extract(page, 'class="title3">', '<', pos) date, pos = text.extract(page, 'class="title2">', '<', pos) - match = re.match( - r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?", info) + match = util.re( + r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?").match(info) if match: volume, chapter, minor, title = match.groups() else: diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index b11f81de..1fb091c2 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -11,7 +11,6 @@ from .common import ChapterExtractor, Extractor, Message from .. import text, util, exception from ..cache import memcache -import re BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:" r"(?:manga|comic|read)park\.(?:com|net|org|me|io|to)|" @@ -22,17 +21,14 @@ BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:" class MangaparkBase(): """Base class for mangapark extractors""" category = "mangapark" - _match_title = None def _parse_chapter_title(self, title): - if not self._match_title: - MangaparkBase._match_title = re.compile( - r"(?i)" - r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" - r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" - r"(?:\s*:\s*(.*))?" - ).match - match = self._match_title(title) + match = util.re( + r"(?i)" + r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" + r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" + r"(?:\s*:\s*(.*))?" + ).match(title) return match.groups() if match else (0, 0, "", "") @memcache(keyarg=1) diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py index 6970b4f8..23a3f7cd 100644 --- a/gallery_dl/extractor/mangaread.py +++ b/gallery_dl/extractor/mangaread.py @@ -7,8 +7,7 @@ """Extractors for https://mangaread.org/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, exception -import re +from .. import text, util, exception class MangareadBase(): @@ -18,9 +17,9 @@ class MangareadBase(): @staticmethod def parse_chapter_string(chapter_string, data): - match = re.match( - r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?", - text.unescape(chapter_string).strip()) + match = util.re( + r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?" + ).match(text.unescape(chapter_string).strip()) manga, chapter, minor, title = match.groups() manga = manga.strip() if manga else "" data["manga"] = data.pop("manga", manga) diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 9fd66e2f..8186d2e0 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -9,11 +9,9 @@ """Extractors for Moebooru based sites""" from .booru import BooruExtractor -from .. import text - +from .. import text, util import collections import datetime -import re class MoebooruExtractor(BooruExtractor): @@ -36,7 +34,7 @@ class MoebooruExtractor(BooruExtractor): return tags = collections.defaultdict(list) - pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") + pattern = util.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") for tag_type, tag_name in pattern.findall(tag_container): tags[tag_type].append(text.unquote(tag_name)) for key, value in tags.items(): diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index f586401b..1bf750a8 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -12,7 +12,6 @@ from .common import Extractor, Message, Dispatch from .. import text, util, exception from ..cache import cache import itertools -import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?newgrounds\.com" USER_PATTERN = r"(?:https?://)?([\w-]+)\.newgrounds\.com" @@ -35,7 +34,7 @@ class NewgroundsExtractor(Extractor): self.user_root = "https://{}.newgrounds.com".format(self.user) def _init(self): - self._extract_comment_urls = re.compile( + self._extract_comment_urls = util.re( r'(?:') tags = collections.defaultdict(list) - pattern = re.compile( - r']*>]*src=[\"']([^\"']+)", content) + return util.re(r"]*src=[\"']([^\"']+)").findall(content) @staticmethod def _extract_embeds(content): return [ "ytdl:" + url for url in - re.findall(r"