replace 'util.re()' with 'text.re()'

remove unnecessary 'util' imports
This commit is contained in:
Mike Fährmann
2025-10-20 17:44:58 +02:00
parent c8fc790028
commit 9bf76c1352
42 changed files with 91 additions and 91 deletions

View File

@@ -9,7 +9,7 @@
"""Extractors for https://agn.ph/""" """Extractors for https://agn.ph/"""
from . import booru from . import booru
from .. import text, util from .. import text
import collections import collections
BASE_PATTERN = r"(?:https?://)?agn\.ph" BASE_PATTERN = r"(?:https?://)?agn\.ph"
@@ -70,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor):
return return
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
pattern = util.re(r'class="(.)typetag">([^<]+)') pattern = text.re(r'class="(.)typetag">([^<]+)')
for tag_type, tag_name in pattern.findall(tag_container): for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name).replace(" ", "_")) tags[tag_type].append(text.unquote(tag_name).replace(" ", "_"))
for key, value in tags.items(): for key, value in tags.items():

View File

@@ -63,7 +63,7 @@ class ArcalivePostExtractor(ArcaliveExtractor):
def _extract_files(self, post): def _extract_files(self, post):
files = [] files = []
for video, media in util.re(r"<(?:img|vide(o)) ([^>]+)").findall( for video, media in text.re(r"<(?:img|vide(o)) ([^>]+)").findall(
post["content"]): post["content"]):
if not self.emoticons and 'class="arca-emoticon"' in media: if not self.emoticons and 'class="arca-emoticon"' in media:
continue continue

View File

@@ -104,7 +104,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
info = text.remove_html(extr('link-hover">', "</")) info = text.remove_html(extr('link-hover">', "</"))
info = text.unescape(info) info = text.unescape(info)
match = util.re( match = text.re(
r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?" r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
r"(?:Chapter|Episode)\s*(\d+)([\w.]*)").match(info) r"(?:Chapter|Episode)\s*(\d+)([\w.]*)").match(info)
if match: if match:

View File

@@ -13,7 +13,7 @@ from .. import text, util
def original(url): def original(url):
return (util.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)") return (text.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)")
.sub(r"\1s0", url) .sub(r"\1s0", url)
.replace("http:", "https:", 1)) .replace("http:", "https:", 1))
@@ -32,7 +32,7 @@ class BloggerExtractor(BaseExtractor):
self.videos = self.config("videos", True) self.videos = self.config("videos", True)
if self.videos: if self.videos:
self.findall_video = util.re( self.findall_video = text.re(
r"""src=["'](https?://www\.blogger\.com""" r"""src=["'](https?://www\.blogger\.com"""
r"""/video\.g\?token=[^"']+)""").findall r"""/video\.g\?token=[^"']+)""").findall
@@ -43,7 +43,7 @@ class BloggerExtractor(BaseExtractor):
blog["date"] = self.parse_datetime_iso(blog["published"]) blog["date"] = self.parse_datetime_iso(blog["published"])
del blog["selfLink"] del blog["selfLink"]
findall_image = util.re( findall_image = text.re(
r'src="(https?://(?:' r'src="(https?://(?:'
r'blogger\.googleusercontent\.com/img|' r'blogger\.googleusercontent\.com/img|'
r'lh\d+(?:-\w+)?\.googleusercontent\.com|' r'lh\d+(?:-\w+)?\.googleusercontent\.com|'

View File

@@ -64,13 +64,13 @@ class DeviantartExtractor(Extractor):
if self.quality: if self.quality:
if self.quality == "png": if self.quality == "png":
self.quality = "-fullview.png?" self.quality = "-fullview.png?"
self.quality_sub = util.re(r"-fullview\.[a-z0-9]+\?").sub self.quality_sub = text.re(r"-fullview\.[a-z0-9]+\?").sub
else: else:
self.quality = f",q_{self.quality}" self.quality = f",q_{self.quality}"
self.quality_sub = util.re(r",q_\d+").sub self.quality_sub = text.re(r",q_\d+").sub
if self.intermediary: if self.intermediary:
self.intermediary_subn = util.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn self.intermediary_subn = text.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn
if isinstance(self.original, str) and \ if isinstance(self.original, str) and \
self.original.lower().startswith("image"): self.original.lower().startswith("image"):
@@ -269,7 +269,7 @@ class DeviantartExtractor(Extractor):
) )
# filename metadata # filename metadata
sub = util.re(r"\W").sub sub = text.re(r"\W").sub
deviation["filename"] = "".join(( deviation["filename"] = "".join((
sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["title"].lower()), "_by_",
sub("_", deviation["author"]["username"].lower()), "-d", sub("_", deviation["author"]["username"].lower()), "-d",
@@ -675,7 +675,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
def _find_folder(self, folders, name, uuid): def _find_folder(self, folders, name, uuid):
if uuid.isdecimal(): if uuid.isdecimal():
match = util.re( match = text.re(
"(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match "(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match
for folder in folders: for folder in folders:
if match(folder["name"]): if match(folder["name"]):

View File

@@ -46,7 +46,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
def metadata(self, page): def metadata(self, page):
extr = text.extract_from(page) extr = text.extract_from(page)
match = util.re( match = text.re(
r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
r"(?: ch(\d+)([^:<]*))?" # chapter info r"(?: ch(\d+)([^:<]*))?" # chapter info
r"(?:: (.+))?" # title r"(?:: (.+))?" # title

View File

@@ -7,7 +7,7 @@
"""Extractors for https://everia.club""" """Extractors for https://everia.club"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util from .. import text
BASE_PATTERN = r"(?:https?://)?everia\.club" BASE_PATTERN = r"(?:https?://)?everia\.club"
@@ -25,7 +25,7 @@ class EveriaExtractor(Extractor):
return self._pagination(self.groups[0]) return self._pagination(self.groups[0])
def _pagination(self, path, params=None, pnum=1): def _pagination(self, path, params=None, pnum=1):
find_posts = util.re(r'thumbnail">\s*<a href="([^"]+)').findall find_posts = text.re(r'thumbnail">\s*<a href="([^"]+)').findall
while True: while True:
if pnum == 1: if pnum == 1:
@@ -52,7 +52,7 @@ class EveriaPostExtractor(EveriaExtractor):
url = self.root + self.groups[0] + "/" url = self.root + self.groups[0] + "/"
page = self.request(url).text page = self.request(url).text
content = text.extr(page, 'itemprop="text">', "<h3") content = text.extr(page, 'itemprop="text">', "<h3")
urls = util.re(r'img.*?lazy-src="([^"]+)').findall(content) urls = text.re(r'img.*?lazy-src="([^"]+)').findall(content)
data = { data = {
"title": text.unescape( "title": text.unescape(

View File

@@ -216,7 +216,7 @@ class FanboxExtractor(Extractor):
def _get_urls_from_post(self, content_body, post): def _get_urls_from_post(self, content_body, post):
num = 0 num = 0
if cover_image := post.get("coverImageUrl"): if cover_image := post.get("coverImageUrl"):
cover_image = util.re("/c/[0-9a-z_]+").sub("", cover_image) cover_image = text.re("/c/[0-9a-z_]+").sub("", cover_image)
final_post = post.copy() final_post = post.copy()
final_post["isCoverImage"] = True final_post["isCoverImage"] = True
final_post["fileUrl"] = cover_image final_post["fileUrl"] = cover_image

View File

@@ -96,7 +96,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
params["pid"] = self.page_start * self.per_page params["pid"] = self.page_start * self.per_page
data = {} data = {}
find_ids = util.re(r"\sid=\"p(\d+)").findall find_ids = text.re(r"\sid=\"p(\d+)").findall
while True: while True:
page = self.request(url, params=params).text page = self.request(url, params=params).text
@@ -136,7 +136,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
return return
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
pattern = util.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)") pattern = text.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)")
for tag_type, tag_name in pattern.findall(tag_container): for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unescape(text.unquote(tag_name))) tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items(): for key, value in tags.items():

View File

@@ -7,7 +7,7 @@
"""Generic information extractor""" """Generic information extractor"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import config, text, util from .. import config, text
import os.path import os.path
@@ -171,8 +171,8 @@ class GenericExtractor(Extractor):
r"(?:[^\"'<>\s]*)?" # optional query and fragment r"(?:[^\"'<>\s]*)?" # optional query and fragment
) )
imageurls_src = util.re(imageurl_pattern_src).findall(page) imageurls_src = text.re(imageurl_pattern_src).findall(page)
imageurls_ext = util.re(imageurl_pattern_ext).findall(page) imageurls_ext = text.re(imageurl_pattern_ext).findall(page)
imageurls = imageurls_src + imageurls_ext imageurls = imageurls_src + imageurls_ext
# Resolve relative urls # Resolve relative urls
@@ -181,7 +181,7 @@ class GenericExtractor(Extractor):
# by prepending a suitable base url. # by prepending a suitable base url.
# #
# If the page contains a <base> element, use it as base url # If the page contains a <base> element, use it as base url
basematch = util.re( basematch = text.re(
r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page) r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page)
if basematch: if basematch:
self.baseurl = basematch['url'].rstrip('/') self.baseurl = basematch['url'].rstrip('/')

View File

@@ -5,7 +5,7 @@
# published by the Free Software Foundation. # published by the Free Software Foundation.
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, exception
from ..cache import cache from ..cache import cache
BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlswithmuscle\.com" BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlswithmuscle\.com"
@@ -155,7 +155,7 @@ class GirlswithmuscleSearchExtractor(GirlswithmuscleExtractor):
raise exception.AuthorizationError(msg) raise exception.AuthorizationError(msg)
page = response.text page = response.text
match = util.re(r"Page (\d+) of (\d+)").search(page) match = text.re(r"Page (\d+) of (\d+)").search(page)
current, total = match.groups() current, total = match.groups()
current, total = text.parse_int(current), text.parse_int(total) current, total = text.parse_int(current), text.parse_int(total)

View File

@@ -7,7 +7,7 @@
"""Extractors for https://hatenablog.com""" """Extractors for https://hatenablog.com"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util from .. import text
BASE_PATTERN = ( BASE_PATTERN = (
@@ -30,7 +30,7 @@ class HatenablogExtractor(Extractor):
self.domain = match[1] or match[2] self.domain = match[1] or match[2]
def _init(self): def _init(self):
self._find_img = util.re(r'<img +([^>]+)').finditer self._find_img = text.re(r'<img +([^>]+)').finditer
def _handle_article(self, article: str): def _handle_article(self, article: str):
extr = text.extract_from(article) extr = text.extract_from(article)
@@ -73,7 +73,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor):
def _init(self): def _init(self):
HatenablogExtractor._init(self) HatenablogExtractor._init(self)
self._find_pager_url = util.re( self._find_pager_url = text.re(
r' class="pager-next">\s*<a href="([^"]+)').search r' class="pager-next">\s*<a href="([^"]+)').search
def items(self): def items(self):

View File

@@ -30,7 +30,7 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
chapter, sep, minor = self.groups[1].partition(".") chapter, sep, minor = self.groups[1].partition(".")
match = util.re( match = text.re(
r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - " r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - "
r"([^:]+): (.+) . Page 1 ").match(title) r"([^:]+): (.+) . Page 1 ").match(title)
if match: if match:

View File

@@ -33,7 +33,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
title = text.extr(page, "<title>", "</title>") title = text.extr(page, "<title>", "</title>")
chapter_id = text.extr(page, 'report/C', '"') chapter_id = text.extr(page, 'report/C', '"')
chapter, sep, minor = self.chapter.partition(".") chapter, sep, minor = self.chapter.partition(".")
match = util.re( match = text.re(
r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by " r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by "
r"(.+) at ").match(title) r"(.+) at ").match(title)
return { return {

View File

@@ -9,7 +9,7 @@
"""Extractors for https://hiperdex.com/""" """Extractors for https://hiperdex.com/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, util from .. import text
from ..cache import memcache from ..cache import memcache
BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
@@ -79,7 +79,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
return self.chapter_data(self.chapter) return self.chapter_data(self.chapter)
def images(self, page): def images(self, page):
pattern = util.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)') pattern = text.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)')
return [ return [
(url.strip(), None) (url.strip(), None)
for url in pattern.findall(page) for url in pattern.findall(page)

View File

@@ -9,7 +9,7 @@
"""Extractors for https://www.imagebam.com/""" """Extractors for https://www.imagebam.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util from .. import text
class ImagebamExtractor(Extractor): class ImagebamExtractor(Extractor):
@@ -69,7 +69,7 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
page, 'id="gallery-name">', '<').strip())} page, 'id="gallery-name">', '<').strip())}
def images(self, page): def images(self, page):
findall = util.re(r'<a href="https://www\.imagebam\.com' findall = text.re(r'<a href="https://www\.imagebam\.com'
r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall
paths = [] paths = []
while True: while True:

View File

@@ -9,7 +9,7 @@
"""Extractors for https://imgbox.com/""" """Extractors for https://imgbox.com/"""
from .common import Extractor, Message, AsynchronousMixin from .common import Extractor, Message, AsynchronousMixin
from .. import text, util, exception from .. import text, exception
class ImgboxExtractor(Extractor): class ImgboxExtractor(Extractor):
@@ -69,7 +69,7 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
page = self.request(self.root + "/g/" + self.gallery_key).text page = self.request(self.root + "/g/" + self.gallery_key).text
if "The specified gallery could not be found." in page: if "The specified gallery could not be found." in page:
raise exception.NotFoundError("gallery") raise exception.NotFoundError("gallery")
self.image_keys = util.re( self.image_keys = text.re(
r'<a href="/([^"]+)"><img alt="').findall(page) r'<a href="/([^"]+)"><img alt="').findall(page)
title = text.extr(page, "<h1>", "</h1>") title = text.extr(page, "<h1>", "</h1>")

View File

@@ -38,7 +38,7 @@ class InstagramExtractor(Extractor):
def _init(self): def _init(self):
self.www_claim = "0" self.www_claim = "0"
self.csrf_token = util.generate_token() self.csrf_token = util.generate_token()
self._find_tags = util.re(r"#\w+").findall self._find_tags = text.re(r"#\w+").findall
self._logged_in = True self._logged_in = True
self._cursor = None self._cursor = None
self._user = None self._user = None

View File

@@ -44,7 +44,7 @@ class KemonoExtractor(Extractor):
order = self.config("order-revisions") order = self.config("order-revisions")
self.revisions_reverse = order[0] in ("r", "a") if order else False self.revisions_reverse = order[0] in ("r", "a") if order else False
self._find_inline = util.re( self._find_inline = text.re(
r'src="(?:https?://(?:kemono\.cr|coomer\.st))?(/inline/[^"]+' r'src="(?:https?://(?:kemono\.cr|coomer\.st))?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
self._json_dumps = json.JSONEncoder( self._json_dumps = json.JSONEncoder(
@@ -52,7 +52,7 @@ class KemonoExtractor(Extractor):
sort_keys=True, separators=(",", ":")).encode sort_keys=True, separators=(",", ":")).encode
def items(self): def items(self):
find_hash = util.re(HASH_PATTERN).match find_hash = text.re(HASH_PATTERN).match
generators = self._build_file_generators(self.config("files")) generators = self._build_file_generators(self.config("files"))
announcements = True if self.config("announcements") else None announcements = True if self.config("announcements") else None
archives = True if self.config("archives") else False archives = True if self.config("archives") else False
@@ -413,10 +413,10 @@ class KemonoDiscordExtractor(KemonoExtractor):
"parent_id" : channel["parent_channel_id"], "parent_id" : channel["parent_channel_id"],
} }
find_inline = util.re( find_inline = text.re(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
find_hash = util.re(HASH_PATTERN).match find_hash = text.re(HASH_PATTERN).match
if (order := self.config("order-posts")) and order[0] in ("r", "d"): if (order := self.config("order-posts")) and order[0] in ("r", "d"):
posts = self.api.discord_channel(channel_id, channel["post_count"]) posts = self.api.discord_channel(channel_id, channel["post_count"])

View File

@@ -9,7 +9,7 @@
"""Extractors for https://komikcast.li/""" """Extractors for https://komikcast.li/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, util from .. import text
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?" BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
r"komikcast\d*\.(?:l(?:i|a|ol)|com|cz|site|mo?e)") r"komikcast\d*\.(?:l(?:i|a|ol)|com|cz|site|mo?e)")
@@ -25,7 +25,7 @@ class KomikcastBase():
if data is None: if data is None:
data = {} data = {}
pattern = util.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?") pattern = text.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?")
match = pattern.match(text.unescape(chapter_string)) match = pattern.match(text.unescape(chapter_string))
manga, chapter, data["chapter_minor"], title = match.groups() manga, chapter, data["chapter_minor"], title = match.groups()
@@ -54,7 +54,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
def images(self, page): def images(self, page):
readerarea = text.extr( readerarea = text.extr(
page, '<div class="main-reading-area', '</div') page, '<div class="main-reading-area', '</div')
pattern = util.re(r"<img[^>]* src=[\"']([^\"']+)") pattern = text.re(r"<img[^>]* src=[\"']([^\"']+)")
return [ return [
(text.unescape(url), None) (text.unescape(url), None)
for url in pattern.findall(readerarea) for url in pattern.findall(readerarea)

View File

@@ -9,7 +9,7 @@
"""Extractors for https://www.mangahere.cc/""" """Extractors for https://www.mangahere.cc/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, util from .. import text
class MangahereBase(): class MangahereBase():
@@ -102,7 +102,7 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor):
info, pos = text.extract(page, 'class="title3">', '<', pos) info, pos = text.extract(page, 'class="title3">', '<', pos)
date, pos = text.extract(page, 'class="title2">', '<', pos) date, pos = text.extract(page, 'class="title2">', '<', pos)
match = util.re( match = text.re(
r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?").match(info) r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?").match(info)
if match: if match:
volume, chapter, minor, title = match.groups() volume, chapter, minor, title = match.groups()

View File

@@ -23,7 +23,7 @@ class MangaparkBase():
category = "mangapark" category = "mangapark"
def _parse_chapter_title(self, title): def _parse_chapter_title(self, title):
match = util.re( match = text.re(
r"(?i)" r"(?i)"
r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?"
r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)"

View File

@@ -7,7 +7,7 @@
"""Extractors for https://mangaread.org/""" """Extractors for https://mangaread.org/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, util, exception from .. import text, exception
class MangareadBase(): class MangareadBase():
@@ -16,7 +16,7 @@ class MangareadBase():
root = "https://www.mangaread.org" root = "https://www.mangaread.org"
def parse_chapter_string(self, chapter_string, data): def parse_chapter_string(self, chapter_string, data):
match = util.re( match = text.re(
r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?" r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?"
).match(text.unescape(chapter_string).strip()) ).match(text.unescape(chapter_string).strip())
manga, chapter, minor, title = match.groups() manga, chapter, minor, title = match.groups()

View File

@@ -9,7 +9,7 @@
"""Extractors for Moebooru based sites""" """Extractors for Moebooru based sites"""
from .booru import BooruExtractor from .booru import BooruExtractor
from .. import text, util, dt from .. import text, dt
import collections import collections
@@ -32,7 +32,7 @@ class MoebooruExtractor(BooruExtractor):
return return
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
pattern = util.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") pattern = text.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
for tag_type, tag_name in pattern.findall(tag_container): for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name)) tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items(): for key, value in tags.items():

View File

@@ -34,7 +34,7 @@ class NewgroundsExtractor(Extractor):
self.user_root = f"https://{self.user}.newgrounds.com" self.user_root = f"https://{self.user}.newgrounds.com"
def _init(self): def _init(self):
self._extract_comment_urls = util.re( self._extract_comment_urls = text.re(
r'(?:<img |data-smartload-)src="([^"]+)').findall r'(?:<img |data-smartload-)src="([^"]+)').findall
self.flash = self.config("flash", True) self.flash = self.config("flash", True)
@@ -321,7 +321,7 @@ class NewgroundsExtractor(Extractor):
def _video_formats(self, sources): def _video_formats(self, sources):
src = sources["360p"][0]["src"] src = sources["360p"][0]["src"]
sub = util.re(r"\.360p\.\w+").sub sub = text.re(r"\.360p\.\w+").sub
for fmt in self.format: for fmt in self.format:
try: try:

View File

@@ -10,7 +10,7 @@
from .common import Extractor, Message from .common import Extractor, Message
from ..cache import cache from ..cache import cache
from .. import text, util, exception from .. import text, exception
BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social" BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social"
@@ -36,7 +36,7 @@ class PillowfortExtractor(Extractor):
external = self.config("external", False) external = self.config("external", False)
if inline: if inline:
inline = util.re(r'src="(https://img\d+\.pillowfort\.social' inline = text.re(r'src="(https://img\d+\.pillowfort\.social'
r'/posts/[^"]+)').findall r'/posts/[^"]+)').findall
for post in self.posts(): for post in self.posts():

View File

@@ -43,7 +43,7 @@ class PixivExtractor(Extractor):
self.meta_captions = self.config("captions") self.meta_captions = self.config("captions")
if self.sanity_workaround or self.meta_captions: if self.sanity_workaround or self.meta_captions:
self.meta_captions_sub = util.re( self.meta_captions_sub = text.re(
r'<a href="/jump\.php\?([^"]+)').sub r'<a href="/jump\.php\?([^"]+)').sub
def items(self): def items(self):

View File

@@ -61,7 +61,7 @@ class PlurkExtractor(Extractor):
if not data: if not data:
raise exception.NotFoundError("user") raise exception.NotFoundError("user")
return util.json_loads( return util.json_loads(
util.re(r"new Date\(([^)]+)\)").sub(r"\1", data)) text.re(r"new Date\(([^)]+)\)").sub(r"\1", data))
class PlurkTimelineExtractor(PlurkExtractor): class PlurkTimelineExtractor(PlurkExtractor):

View File

@@ -7,7 +7,7 @@
"""Extractors for Postmill instances""" """Extractors for Postmill instances"""
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import text, util, exception from .. import text, exception
class PostmillExtractor(BaseExtractor): class PostmillExtractor(BaseExtractor):
@@ -20,8 +20,8 @@ class PostmillExtractor(BaseExtractor):
def _init(self): def _init(self):
self.instance = self.root.partition("://")[2] self.instance = self.root.partition("://")[2]
self.save_link_post_body = self.config("save-link-post-body", False) self.save_link_post_body = self.config("save-link-post-body", False)
self._search_canonical_url = util.re(r"/f/([\w\d_]+)/(\d+)/").search self._search_canonical_url = text.re(r"/f/([\w\d_]+)/(\d+)/").search
self._search_image_tag = util.re( self._search_image_tag = text.re(
r'<a href="[^"]+"\n +class="submission__image-link"').search r'<a href="[^"]+"\n +class="submission__image-link"').search
def items(self): def items(self):

View File

@@ -70,7 +70,7 @@ class RealbooruExtractor(booru.BooruExtractor):
page = post["_html"] page = post["_html"]
tag_container = text.extr(page, 'id="tagLink"', '</div>') tag_container = text.extr(page, 'id="tagLink"', '</div>')
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
pattern = util.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)') pattern = text.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
for tag_type, tag_name in pattern.findall(tag_container): for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unescape(text.unquote(tag_name))) tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items(): for key, value in tags.items():

View File

@@ -9,7 +9,7 @@
"""Recursive extractor""" """Recursive extractor"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util from .. import text
class RecursiveExtractor(Extractor): class RecursiveExtractor(Extractor):
@@ -27,5 +27,5 @@ class RecursiveExtractor(Extractor):
else: else:
page = self.request(text.ensure_http_scheme(url)).text page = self.request(text.ensure_http_scheme(url)).text
for match in util.re(r"https?://[^\s\"']+").finditer(page): for match in text.re(r"https?://[^\s\"']+").finditer(page):
yield Message.Queue, match[0], {} yield Message.Queue, match[0], {}

View File

@@ -9,7 +9,7 @@
"""Extractors for https://rule34.us/""" """Extractors for https://rule34.us/"""
from .booru import BooruExtractor from .booru import BooruExtractor
from .. import text, util from .. import text
import collections import collections
@@ -19,7 +19,7 @@ class Rule34usExtractor(BooruExtractor):
per_page = 42 per_page = 42
def _init(self): def _init(self):
self._find_tags = util.re( self._find_tags = text.re(
r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall
def _parse_post(self, post_id): def _parse_post(self, post_id):

View File

@@ -47,7 +47,7 @@ class SankakuExtractor(BooruExtractor):
self.api = SankakuAPI(self) self.api = SankakuAPI(self)
if self.config("tags") == "extended": if self.config("tags") == "extended":
self._tags = self._tags_extended self._tags = self._tags_extended
self._tags_findall = util.re( self._tags_findall = text.re(
r"tag-type-([^\"' ]+).*?\?tags=([^\"'&]+)").findall r"tag-type-([^\"' ]+).*?\?tags=([^\"'&]+)").findall
def _file_url(self, post): def _file_url(self, post):
@@ -129,10 +129,10 @@ class SankakuTagExtractor(SankakuExtractor):
if "date:" in self.tags: if "date:" in self.tags:
# rewrite 'date:' tags (#1790) # rewrite 'date:' tags (#1790)
self.tags = util.re( self.tags = text.re(
r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)").sub( r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)").sub(
r"date:\3-\2-\1T00:00", self.tags) r"date:\3-\2-\1T00:00", self.tags)
self.tags = util.re( self.tags = text.re(
r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)(?!T)").sub( r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)(?!T)").sub(
r"date:\1-\2-\3T00:00", self.tags) r"date:\1-\2-\3T00:00", self.tags)

View File

@@ -64,19 +64,19 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
yield Message.Url, url, file yield Message.Url, url, file
def _extract_images(self, content): def _extract_images(self, content):
orig_sub = util.re(r"-\d+x\d+\.").sub orig_sub = text.re(r"-\d+x\d+\.").sub
return [ return [
orig_sub(".", url) for url in orig_sub(".", url) for url in
util.unique(text.extract_iter(content, 'data-lazy-src="', '"')) util.unique(text.extract_iter(content, 'data-lazy-src="', '"'))
] ]
def _extract_videos(self, content): def _extract_videos(self, content):
return util.re(r"<source [^>]*src=[\"']([^\"']+)").findall(content) return text.re(r"<source [^>]*src=[\"']([^\"']+)").findall(content)
def _extract_embeds(self, content): def _extract_embeds(self, content):
return [ return [
"ytdl:" + url for url in "ytdl:" + url for url in
util.re(r"<iframe [^>]*src=[\"']([^\"']+)").findall(content) text.re(r"<iframe [^>]*src=[\"']([^\"']+)").findall(content)
] ]

View File

@@ -9,7 +9,7 @@
"""Extractors for https://speakerdeck.com/""" """Extractors for https://speakerdeck.com/"""
from .common import GalleryExtractor from .common import GalleryExtractor
from .. import text, util from .. import text
class SpeakerdeckPresentationExtractor(GalleryExtractor): class SpeakerdeckPresentationExtractor(GalleryExtractor):
@@ -46,7 +46,7 @@ class SpeakerdeckPresentationExtractor(GalleryExtractor):
def images(self, _): def images(self, _):
url = f"{self.root}/player/{self.presentation_id}" url = f"{self.root}/player/{self.presentation_id}"
page = self.request(url).text page = self.request(url).text
page = util.re(r"\s+").sub(" ", page) page = text.re(r"\s+").sub(" ", page)
return [ return [
(url, None) (url, None)
for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"') for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"')

View File

@@ -161,7 +161,7 @@ class SubscribestarExtractor(Extractor):
attachments = text.extr( attachments = text.extr(
html, 'class="uploads-docs"', 'class="post-edit_form"') html, 'class="uploads-docs"', 'class="post-edit_form"')
if attachments: if attachments:
for att in util.re(r'class="doc_preview[" ]').split( for att in text.re(r'class="doc_preview[" ]').split(
attachments)[1:]: attachments)[1:]:
media.append({ media.append({
"id" : text.parse_int(text.extr( "id" : text.parse_int(text.extr(
@@ -175,7 +175,7 @@ class SubscribestarExtractor(Extractor):
audios = text.extr( audios = text.extr(
html, 'class="uploads-audios"', 'class="post-edit_form"') html, 'class="uploads-audios"', 'class="post-edit_form"')
if audios: if audios:
for audio in util.re(r'class="audio_preview-data[" ]').split( for audio in text.re(r'class="audio_preview-data[" ]').split(
audios)[1:]: audios)[1:]:
media.append({ media.append({
"id" : text.parse_int(text.extr( "id" : text.parse_int(text.extr(

View File

@@ -60,16 +60,16 @@ class TumblrExtractor(Extractor):
blog = None blog = None
# pre-compile regular expressions # pre-compile regular expressions
self._sub_video = util.re( self._sub_video = text.re(
r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com" r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
if self.inline: if self.inline:
self._sub_image = util.re( self._sub_image = text.re(
r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?" r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
self._subn_orig_image = util.re(r"/s\d+x\d+/").subn self._subn_orig_image = text.re(r"/s\d+x\d+/").subn
_findall_image = util.re('<img src="([^"]+)"').findall _findall_image = text.re('<img src="([^"]+)"').findall
_findall_video = util.re('<source src="([^"]+)"').findall _findall_video = text.re('<source src="([^"]+)"').findall
for post in self.posts(): for post in self.posts():
if self.date_min > post["timestamp"]: if self.date_min > post["timestamp"]:

View File

@@ -76,7 +76,7 @@ class TwitterExtractor(Extractor):
seen_tweets = set() if self.config("unique", True) else None seen_tweets = set() if self.config("unique", True) else None
if self.twitpic: if self.twitpic:
self._find_twitpic = util.re( self._find_twitpic = text.re(
r"https?(://twitpic\.com/(?!photos/)\w+)").findall r"https?(://twitpic\.com/(?!photos/)\w+)").findall
tweets = self.tweets() tweets = self.tweets()

View File

@@ -9,7 +9,7 @@
"""Extractors for https://vk.com/""" """Extractors for https://vk.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, exception
BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com" BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com"
@@ -36,7 +36,7 @@ class VkExtractor(Extractor):
return num return num
def items(self): def items(self):
subn = util.re(r"/imp[fg]/").subn subn = text.re(r"/imp[fg]/").subn
sizes = "wzyxrqpo" sizes = "wzyxrqpo"
data = self.metadata() data = self.metadata()

View File

@@ -7,7 +7,7 @@
"""Extractors for https://www.xasiat.com""" """Extractors for https://www.xasiat.com"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util from .. import text
import time import time
BASE_PATTERN = r"(?:https?://)?(?:www\.)?xasiat\.com((?:/fr|/ja)?/albums" BASE_PATTERN = r"(?:https?://)?(?:www\.)?xasiat\.com((?:/fr|/ja)?/albums"
@@ -29,7 +29,7 @@ class XasiatExtractor(Extractor):
def _pagination(self, path, pnum=1): def _pagination(self, path, pnum=1):
url = f"{self.root}{path}/" url = f"{self.root}{path}/"
find_posts = util.re(r'class="item ">\s*<a href="([^"]+)').findall find_posts = text.re(r'class="item ">\s*<a href="([^"]+)').findall
while True: while True:
params = { params = {
@@ -69,11 +69,11 @@ class XasiatAlbumExtractor(XasiatExtractor):
data = { data = {
"title": text.unescape(title), "title": text.unescape(title),
"model": util.re( "model": text.re(
r'top_models1"></i>\s*(.+)\s*</span').findall(info), r'top_models1"></i>\s*(.+)\s*</span').findall(info),
"tags": util.re( "tags": text.re(
r'tags/[^"]+\">\s*(.+)\s*</a').findall(info), r'tags/[^"]+\">\s*(.+)\s*</a').findall(info),
"album_category": util.re( "album_category": text.re(
r'categories/[^"]+\">\s*(.+)\s*</a').findall(info)[0], r'categories/[^"]+\">\s*(.+)\s*</a').findall(info)[0],
"album_url": response.url, "album_url": response.url,
"album_id": text.parse_int(album_id), "album_id": text.parse_int(album_id),

View File

@@ -128,7 +128,7 @@ class ZerochanExtractor(BooruExtractor):
return data return data
def _parse_json(self, txt): def _parse_json(self, txt):
txt = util.re(r"[\x00-\x1f\x7f]").sub("", txt) txt = text.re(r"[\x00-\x1f\x7f]").sub("", txt)
main, _, tags = txt.partition('tags": [') main, _, tags = txt.partition('tags": [')
item = {} item = {}

View File

@@ -262,7 +262,7 @@ def parse_command_line(module, argv):
else module.match_filter_func(opts.match_filter)) else module.match_filter_func(opts.match_filter))
if cookiesfrombrowser := getattr(opts, "cookiesfrombrowser", None): if cookiesfrombrowser := getattr(opts, "cookiesfrombrowser", None):
pattern = util.re(r"""(?x) pattern = text.re(r"""(?x)
(?P<name>[^+:]+) (?P<name>[^+:]+)
(?:\s*\+\s*(?P<keyring>[^:]+))? (?:\s*\+\s*(?P<keyring>[^:]+))?
(?:\s*:\s*(?!:)(?P<profile>.+?))? (?:\s*:\s*(?!:)(?P<profile>.+?))?
@@ -528,7 +528,7 @@ def legacy_postprocessors(opts, module, ytdlp, compat_opts):
if len(dur) == 2 and all(t is not None for t in dur): if len(dur) == 2 and all(t is not None for t in dur):
remove_ranges.append(tuple(dur)) remove_ranges.append(tuple(dur))
continue continue
remove_chapters_patterns.append(util.re(regex)) remove_chapters_patterns.append(text.re(regex))
if opts.remove_chapters or sponsorblock_query: if opts.remove_chapters or sponsorblock_query:
postprocessors.append({ postprocessors.append({
"key": "ModifyChapters", "key": "ModifyChapters",