replace standard library 're' uses with 'util.re()'

This commit is contained in:
Mike Fährmann
2025-06-06 12:26:21 +02:00
parent e1f03a5a93
commit b5c88b3d3e
45 changed files with 143 additions and 220 deletions

View File

@@ -9,10 +9,8 @@
"""Extractors for https://agn.ph/""" """Extractors for https://agn.ph/"""
from . import booru from . import booru
from .. import text from .. import text, util
import collections import collections
import re
BASE_PATTERN = r"(?:https?://)?agn\.ph" BASE_PATTERN = r"(?:https?://)?agn\.ph"
@@ -72,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor):
return return
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
pattern = re.compile(r'class="(.)typetag">([^<]+)') pattern = util.re(r'class="(.)typetag">([^<]+)')
for tag_type, tag_name in pattern.findall(tag_container): for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name).replace(" ", "_")) tags[tag_type].append(text.unquote(tag_name).replace(" ", "_"))
for key, value in tags.items(): for key, value in tags.items():

View File

@@ -8,7 +8,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, util, exception
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?arca\.live" BASE_PATTERN = r"(?:https?://)?(?:www\.)?arca\.live"
@@ -65,8 +64,8 @@ class ArcalivePostExtractor(ArcaliveExtractor):
def _extract_files(self, post): def _extract_files(self, post):
files = [] files = []
for video, media in self._extract_media(post["content"]): for video, media in util.re(r"<(?:img|vide(o)) ([^>]+)").findall(
post["content"]):
if not self.emoticons and 'class="arca-emoticon"' in media: if not self.emoticons and 'class="arca-emoticon"' in media:
continue continue
@@ -113,11 +112,6 @@ class ArcalivePostExtractor(ArcaliveExtractor):
return files return files
def _extract_media(self, content):
ArcalivePostExtractor._extract_media = extr = re.compile(
r"<(?:img|vide(o)) ([^>]+)").findall
return extr(content)
class ArcaliveBoardExtractor(ArcaliveExtractor): class ArcaliveBoardExtractor(ArcaliveExtractor):
"""Extractor for an arca.live board's posts""" """Extractor for an arca.live board's posts"""

View File

@@ -7,8 +7,7 @@
"""Extractors for https://bato.to/""" """Extractors for https://bato.to/"""
from .common import Extractor, ChapterExtractor, MangaExtractor from .common import Extractor, ChapterExtractor, MangaExtractor
from .. import text, exception from .. import text, util, exception
import re
BASE_PATTERN = (r"(?:https?://)?(" BASE_PATTERN = (r"(?:https?://)?("
r"(?:ba|d|f|h|j|m|w)to\.to|" r"(?:ba|d|f|h|j|m|w)to\.to|"
@@ -104,9 +103,9 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
info = text.remove_html(extr('link-hover">', "</")) info = text.remove_html(extr('link-hover">', "</"))
info = text.unescape(info) info = text.unescape(info)
match = re.match( match = util.re(
r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?" r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
r"(?:Chapter|Episode)\s*(\d+)([\w.]*)", info) r"(?:Chapter|Episode)\s*(\d+)([\w.]*)").match(info)
if match: if match:
volume, chapter, minor = match.groups() volume, chapter, minor = match.groups()
else: else:

View File

@@ -10,7 +10,6 @@
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import text, util from .. import text, util
import re
class BloggerExtractor(BaseExtractor): class BloggerExtractor(BaseExtractor):
@@ -33,13 +32,13 @@ class BloggerExtractor(BaseExtractor):
blog["date"] = text.parse_datetime(blog["published"]) blog["date"] = text.parse_datetime(blog["published"])
del blog["selfLink"] del blog["selfLink"]
sub = re.compile(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)").sub sub = util.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)").sub
findall_image = re.compile( findall_image = util.re(
r'src="(https?://(?:' r'src="(https?://(?:'
r'blogger\.googleusercontent\.com/img|' r'blogger\.googleusercontent\.com/img|'
r'lh\d+(?:-\w+)?\.googleusercontent\.com|' r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
r'\d+\.bp\.blogspot\.com)/[^"]+)').findall r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
findall_video = re.compile( findall_video = util.re(
r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
metadata = self.metadata() metadata = self.metadata()

View File

@@ -15,7 +15,6 @@ import collections
import mimetypes import mimetypes
import binascii import binascii
import time import time
import re
BASE_PATTERN = ( BASE_PATTERN = (
r"(?:https?://)?(?:" r"(?:https?://)?(?:"
@@ -66,10 +65,13 @@ class DeviantartExtractor(Extractor):
if self.quality: if self.quality:
if self.quality == "png": if self.quality == "png":
self.quality = "-fullview.png?" self.quality = "-fullview.png?"
self.quality_sub = re.compile(r"-fullview\.[a-z0-9]+\?").sub self.quality_sub = util.re(r"-fullview\.[a-z0-9]+\?").sub
else: else:
self.quality = ",q_{}".format(self.quality) self.quality = ",q_{}".format(self.quality)
self.quality_sub = re.compile(r",q_\d+").sub self.quality_sub = util.re(r",q_\d+").sub
if self.intermediary:
self.intermediary_subn = util.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn
if isinstance(self.original, str) and \ if isinstance(self.original, str) and \
self.original.lower().startswith("image"): self.original.lower().startswith("image"):
@@ -271,7 +273,7 @@ class DeviantartExtractor(Extractor):
) )
# filename metadata # filename metadata
sub = re.compile(r"\W").sub sub = util.re(r"\W").sub
deviation["filename"] = "".join(( deviation["filename"] = "".join((
sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["title"].lower()), "_by_",
sub("_", deviation["author"]["username"].lower()), "-d", sub("_", deviation["author"]["username"].lower()), "-d",
@@ -666,8 +668,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
if content["src"].startswith("https://images-wixmp-"): if content["src"].startswith("https://images-wixmp-"):
if self.intermediary and deviation["index"] <= 790677560: if self.intermediary and deviation["index"] <= 790677560:
# https://github.com/r888888888/danbooru/issues/4069 # https://github.com/r888888888/danbooru/issues/4069
intermediary, count = re.subn( intermediary, count = self.intermediary_subn(
r"(/f/[^/]+/[^/]+)/v\d+/.*",
r"/intermediary\1", content["src"], 1) r"/intermediary\1", content["src"], 1)
if count: if count:
deviation["is_original"] = False deviation["is_original"] = False
@@ -682,8 +683,8 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
@staticmethod @staticmethod
def _find_folder(folders, name, uuid): def _find_folder(folders, name, uuid):
if uuid.isdecimal(): if uuid.isdecimal():
match = re.compile(name.replace( match = util.re(
"-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match "(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match
for folder in folders: for folder in folders:
if match(folder["name"]): if match(folder["name"]):
return folder return folder

View File

@@ -10,7 +10,6 @@
from .common import ChapterExtractor, MangaExtractor, Extractor, Message from .common import ChapterExtractor, MangaExtractor, Extractor, Message
from .. import text, util from .. import text, util
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com" BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
@@ -47,12 +46,11 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
def metadata(self, page): def metadata(self, page):
extr = text.extract_from(page) extr = text.extract_from(page)
match = re.match( match = util.re(
(r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
r"(?: ch(\d+)([^:<]*))?" # chapter info r"(?: ch(\d+)([^:<]*))?" # chapter info
r"(?:: (.+))?"), # title r"(?:: (.+))?" # title
extr("<h3 id='chapter-title'><b>", "</b>"), ).match(extr("<h3 id='chapter-title'><b>", "</b>"))
)
author = extr(" by ", "</a>") author = extr(" by ", "</a>")
group = extr('"icon-print"></i> ', '</span>') group = extr('"icon-print"></i> ', '</span>')

View File

@@ -7,8 +7,7 @@
"""Extractors for https://everia.club""" """Extractors for https://everia.club"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text, util
import re
BASE_PATTERN = r"(?:https?://)?everia\.club" BASE_PATTERN = r"(?:https?://)?everia\.club"
@@ -26,7 +25,7 @@ class EveriaExtractor(Extractor):
return self._pagination(self.groups[0]) return self._pagination(self.groups[0])
def _pagination(self, path, params=None, pnum=1): def _pagination(self, path, params=None, pnum=1):
find_posts = re.compile(r'thumbnail">\s*<a href="([^"]+)').findall find_posts = util.re(r'thumbnail">\s*<a href="([^"]+)').findall
while True: while True:
if pnum == 1: if pnum == 1:
@@ -53,7 +52,7 @@ class EveriaPostExtractor(EveriaExtractor):
url = self.root + self.groups[0] + "/" url = self.root + self.groups[0] + "/"
page = self.request(url).text page = self.request(url).text
content = text.extr(page, 'itemprop="text">', "<h3") content = text.extr(page, 'itemprop="text">', "<h3")
urls = re.findall(r'img.*?src="([^"]+)', content) urls = util.re(r'img.*?src="([^"]+)').findall(content)
data = { data = {
"title": text.unescape( "title": text.unescape(

View File

@@ -9,7 +9,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util from .. import text, util
from ..cache import memcache from ..cache import memcache
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?fanbox\.cc" BASE_PATTERN = r"(?:https?://)?(?:www\.)?fanbox\.cc"
USER_PATTERN = ( USER_PATTERN = (
@@ -211,7 +210,7 @@ class FanboxExtractor(Extractor):
num = 0 num = 0
cover_image = post.get("coverImageUrl") cover_image = post.get("coverImageUrl")
if cover_image: if cover_image:
cover_image = re.sub("/c/[0-9a-z_]+", "", cover_image) cover_image = util.re("/c/[0-9a-z_]+").sub("", cover_image)
final_post = post.copy() final_post = post.copy()
final_post["isCoverImage"] = True final_post["isCoverImage"] = True
final_post["fileUrl"] = cover_image final_post["fileUrl"] = cover_image

View File

@@ -10,9 +10,7 @@
from . import booru from . import booru
from .. import text, util, exception from .. import text, util, exception
import collections import collections
import re
class GelbooruV02Extractor(booru.BooruExtractor): class GelbooruV02Extractor(booru.BooruExtractor):
@@ -77,7 +75,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
params["pid"] = self.page_start * self.per_page params["pid"] = self.page_start * self.per_page
data = {} data = {}
find_ids = re.compile(r"\sid=\"p(\d+)").findall find_ids = util.re(r"\sid=\"p(\d+)").findall
while True: while True:
page = self.request(url, params=params).text page = self.request(url, params=params).text
@@ -108,8 +106,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
return return
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
pattern = re.compile( pattern = util.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)")
r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
for tag_type, tag_name in pattern.findall(tag_container): for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unescape(text.unquote(tag_name))) tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items(): for key, value in tags.items():

View File

@@ -7,9 +7,8 @@
"""Generic information extractor""" """Generic information extractor"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import config, text from .. import config, text, util
import os.path import os.path
import re
class GenericExtractor(Extractor): class GenericExtractor(Extractor):
@@ -172,8 +171,8 @@ class GenericExtractor(Extractor):
r"(?:[^\"'<>\s]*)?" # optional query and fragment r"(?:[^\"'<>\s]*)?" # optional query and fragment
) )
imageurls_src = re.findall(imageurl_pattern_src, page) imageurls_src = util.re(imageurl_pattern_src).findall(page)
imageurls_ext = re.findall(imageurl_pattern_ext, page) imageurls_ext = util.re(imageurl_pattern_ext).findall(page)
imageurls = imageurls_src + imageurls_ext imageurls = imageurls_src + imageurls_ext
# Resolve relative urls # Resolve relative urls
@@ -182,8 +181,8 @@ class GenericExtractor(Extractor):
# by prepending a suitable base url. # by prepending a suitable base url.
# #
# If the page contains a <base> element, use it as base url # If the page contains a <base> element, use it as base url
basematch = re.search( basematch = util.re(
r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)", page) r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page)
if basematch: if basematch:
self.baseurl = basematch.group('url').rstrip('/') self.baseurl = basematch.group('url').rstrip('/')
# Otherwise, extract the base url from self.url # Otherwise, extract the base url from self.url

View File

@@ -6,9 +6,8 @@
"""Extractors for https://hatenablog.com""" """Extractors for https://hatenablog.com"""
import re
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text, util
BASE_PATTERN = ( BASE_PATTERN = (
@@ -31,7 +30,7 @@ class HatenablogExtractor(Extractor):
self.domain = match.group(1) or match.group(2) self.domain = match.group(1) or match.group(2)
def _init(self): def _init(self):
self._find_img = re.compile(r'<img +([^>]+)').finditer self._find_img = util.re(r'<img +([^>]+)').finditer
def _handle_article(self, article: str): def _handle_article(self, article: str):
extr = text.extract_from(article) extr = text.extract_from(article)
@@ -74,7 +73,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor):
def _init(self): def _init(self):
HatenablogExtractor._init(self) HatenablogExtractor._init(self)
self._find_pager_url = re.compile( self._find_pager_url = util.re(
r' class="pager-next">\s*<a href="([^"]+)').search r' class="pager-next">\s*<a href="([^"]+)').search
def items(self): def items(self):

View File

@@ -10,7 +10,6 @@
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, util from .. import text, util
import re
class Hentai2readBase(): class Hentai2readBase():
@@ -31,8 +30,9 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
chapter, sep, minor = self.groups[1].partition(".") chapter, sep, minor = self.groups[1].partition(".")
match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - " match = util.re(
r"([^:]+): (.+) . Page 1 ", title) r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - "
r"([^:]+): (.+) . Page 1 ").match(title)
if match: if match:
manga, type, author, _, title = match.groups() manga, type, author, _, title = match.groups()
else: else:

View File

@@ -10,7 +10,6 @@
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, util from .. import text, util
import re
class HentaihereBase(): class HentaihereBase():
@@ -34,8 +33,9 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
title = text.extr(page, "<title>", "</title>") title = text.extr(page, "<title>", "</title>")
chapter_id = text.extr(page, 'report/C', '"') chapter_id = text.extr(page, 'report/C', '"')
chapter, sep, minor = self.chapter.partition(".") chapter, sep, minor = self.chapter.partition(".")
pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " match = util.re(
match = re.match(pattern, title) r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by "
r"(.+) at ").match(title)
return { return {
"manga": match.group(1), "manga": match.group(1),
"manga_id": text.parse_int(self.manga_id), "manga_id": text.parse_int(self.manga_id),

View File

@@ -9,9 +9,8 @@
"""Extractors for https://hiperdex.com/""" """Extractors for https://hiperdex.com/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text from .. import text, util
from ..cache import memcache from ..cache import memcache
import re
BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
r"(?:1st)?hiper(?:dex|toon)\d?\.(?:com|net|info|top))") r"(?:1st)?hiper(?:dex|toon)\d?\.(?:com|net|info|top))")
@@ -80,10 +79,10 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
return self.chapter_data(self.chapter) return self.chapter_data(self.chapter)
def images(self, page): def images(self, page):
pattern = util.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)')
return [ return [
(url.strip(), None) (url.strip(), None)
for url in re.findall( for url in pattern.findall(page)
r'id="image-\d+"\s+(?:data-)?src="([^"]+)', page)
] ]

View File

@@ -13,7 +13,6 @@ from .nozomi import decode_nozomi
from ..cache import memcache from ..cache import memcache
from .. import text, util from .. import text, util
import string import string
import re
class HitomiExtractor(Extractor): class HitomiExtractor(Extractor):
@@ -257,8 +256,8 @@ def _parse_gg(extr):
m = {} m = {}
keys = [] keys = []
for match in re.finditer( for match in util.re_compile(
r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?", page): r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?").finditer(page):
key, value = match.groups() key, value = match.groups()
keys.append(int(key)) keys.append(int(key))
@@ -268,11 +267,11 @@ def _parse_gg(extr):
m[key] = value m[key] = value
keys.clear() keys.clear()
for match in re.finditer( for match in util.re_compile(
r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)", page): r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)").finditer(page):
m[int(match.group(1))] = int(match.group(2)) m[int(match.group(1))] = int(match.group(2))
d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page) d = util.re_compile(r"(?:var\s|default:)\s*o\s*=\s*(\d+)").search(page)
b = re.search(r"b:\s*[\"'](.+)[\"']", page) b = util.re_compile(r"b:\s*[\"'](.+)[\"']").search(page)
return m, b.group(1).strip("/"), int(d.group(1)) if d else 0 return m, b.group(1).strip("/"), int(d.group(1)) if d else 0

View File

@@ -9,8 +9,7 @@
"""Extractors for https://www.imagebam.com/""" """Extractors for https://www.imagebam.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text, util
import re
class ImagebamExtractor(Extractor): class ImagebamExtractor(Extractor):
@@ -70,9 +69,8 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
page, 'id="gallery-name">', '<').strip())} page, 'id="gallery-name">', '<').strip())}
def images(self, page): def images(self, page):
findall = re.compile(r'<a href="https://www\.imagebam\.com' findall = util.re(r'<a href="https://www\.imagebam\.com'
r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall
paths = [] paths = []
while True: while True:
paths += findall(page) paths += findall(page)

View File

@@ -9,8 +9,7 @@
"""Extractors for https://imgbox.com/""" """Extractors for https://imgbox.com/"""
from .common import Extractor, Message, AsynchronousMixin from .common import Extractor, Message, AsynchronousMixin
from .. import text, exception from .. import text, util, exception
import re
class ImgboxExtractor(Extractor): class ImgboxExtractor(Extractor):
@@ -74,7 +73,8 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
page = self.request(self.root + "/g/" + self.gallery_key).text page = self.request(self.root + "/g/" + self.gallery_key).text
if "The specified gallery could not be found." in page: if "The specified gallery could not be found." in page:
raise exception.NotFoundError("gallery") raise exception.NotFoundError("gallery")
self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page) self.image_keys = util.re(
r'<a href="/([^"]+)"><img alt="').findall(page)
title = text.extr(page, "<h1>", "</h1>") title = text.extr(page, "<h1>", "</h1>")
title, _, count = title.rpartition(" - ") title, _, count = title.rpartition(" - ")

View File

@@ -14,7 +14,6 @@ from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
import itertools import itertools
import binascii import binascii
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com" BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)" USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)"
@@ -39,7 +38,7 @@ class InstagramExtractor(Extractor):
def _init(self): def _init(self):
self.www_claim = "0" self.www_claim = "0"
self.csrf_token = util.generate_token() self.csrf_token = util.generate_token()
self._find_tags = re.compile(r"#\w+").findall self._find_tags = util.re(r"#\w+").findall
self._logged_in = True self._logged_in = True
self._cursor = None self._cursor = None
self._user = None self._user = None

View File

@@ -13,7 +13,6 @@ from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
import itertools import itertools
import json import json
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(su|party)" BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(su|party)"
USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)" USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
@@ -44,7 +43,7 @@ class KemonoExtractor(Extractor):
order = self.config("order-revisions") order = self.config("order-revisions")
self.revisions_reverse = order[0] in ("r", "a") if order else False self.revisions_reverse = order[0] in ("r", "a") if order else False
self._find_inline = re.compile( self._find_inline = util.re(
r'src="(?:https?://(?:kemono|coomer)\.su)?(/inline/[^"]+' r'src="(?:https?://(?:kemono|coomer)\.su)?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
self._json_dumps = json.JSONEncoder( self._json_dumps = json.JSONEncoder(
@@ -52,7 +51,7 @@ class KemonoExtractor(Extractor):
sort_keys=True, separators=(",", ":")).encode sort_keys=True, separators=(",", ":")).encode
def items(self): def items(self):
find_hash = re.compile(HASH_PATTERN).match find_hash = util.re(HASH_PATTERN).match
generators = self._build_file_generators(self.config("files")) generators = self._build_file_generators(self.config("files"))
announcements = True if self.config("announcements") else None announcements = True if self.config("announcements") else None
archives = True if self.config("archives") else False archives = True if self.config("archives") else False
@@ -409,10 +408,10 @@ class KemonoDiscordExtractor(KemonoExtractor):
"parent_id" : channel["parent_channel_id"], "parent_id" : channel["parent_channel_id"],
} }
find_inline = re.compile( find_inline = util.re(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
find_hash = re.compile(HASH_PATTERN).match find_hash = util.re(HASH_PATTERN).match
posts = self.api.discord_channel(channel_id) posts = self.api.discord_channel(channel_id)
max_posts = self.config("max-posts") max_posts = self.config("max-posts")

View File

@@ -9,8 +9,7 @@
"""Extractors for https://komikcast.la/""" """Extractors for https://komikcast.la/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text from .. import text, util
import re
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?" BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
r"komikcast\.(?:la|cz|lol|site|mo?e|com)") r"komikcast\.(?:la|cz|lol|site|mo?e|com)")
@@ -24,13 +23,11 @@ class KomikcastBase():
@staticmethod @staticmethod
def parse_chapter_string(chapter_string, data=None): def parse_chapter_string(chapter_string, data=None):
"""Parse 'chapter_string' value and add its info to 'data'""" """Parse 'chapter_string' value and add its info to 'data'"""
if not data: if data is None:
data = {} data = {}
match = re.match( pattern = util.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?")
r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?", match = pattern.match(text.unescape(chapter_string))
text.unescape(chapter_string),
)
manga, chapter, data["chapter_minor"], title = match.groups() manga, chapter, data["chapter_minor"], title = match.groups()
if manga: if manga:
@@ -59,9 +56,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
def images(page): def images(page):
readerarea = text.extr( readerarea = text.extr(
page, '<div class="main-reading-area', '</div') page, '<div class="main-reading-area', '</div')
pattern = util.re(r"<img[^>]* src=[\"']([^\"']+)")
return [ return [
(text.unescape(url), None) (text.unescape(url), None)
for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea) for url in pattern.findall(readerarea)
] ]

View File

@@ -9,8 +9,7 @@
"""Extractors for https://www.mangahere.cc/""" """Extractors for https://www.mangahere.cc/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text from .. import text, util
import re
class MangahereBase(): class MangahereBase():
@@ -104,8 +103,8 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor):
info, pos = text.extract(page, 'class="title3">', '<', pos) info, pos = text.extract(page, 'class="title3">', '<', pos)
date, pos = text.extract(page, 'class="title2">', '<', pos) date, pos = text.extract(page, 'class="title2">', '<', pos)
match = re.match( match = util.re(
r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?", info) r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?").match(info)
if match: if match:
volume, chapter, minor, title = match.groups() volume, chapter, minor, title = match.groups()
else: else:

View File

@@ -11,7 +11,6 @@
from .common import ChapterExtractor, Extractor, Message from .common import ChapterExtractor, Extractor, Message
from .. import text, util, exception from .. import text, util, exception
from ..cache import memcache from ..cache import memcache
import re
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:" BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:"
r"(?:manga|comic|read)park\.(?:com|net|org|me|io|to)|" r"(?:manga|comic|read)park\.(?:com|net|org|me|io|to)|"
@@ -22,17 +21,14 @@ BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:"
class MangaparkBase(): class MangaparkBase():
"""Base class for mangapark extractors""" """Base class for mangapark extractors"""
category = "mangapark" category = "mangapark"
_match_title = None
def _parse_chapter_title(self, title): def _parse_chapter_title(self, title):
if not self._match_title: match = util.re(
MangaparkBase._match_title = re.compile( r"(?i)"
r"(?i)" r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?"
r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)"
r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" r"(?:\s*:\s*(.*))?"
r"(?:\s*:\s*(.*))?" ).match(title)
).match
match = self._match_title(title)
return match.groups() if match else (0, 0, "", "") return match.groups() if match else (0, 0, "", "")
@memcache(keyarg=1) @memcache(keyarg=1)

View File

@@ -7,8 +7,7 @@
"""Extractors for https://mangaread.org/""" """Extractors for https://mangaread.org/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, exception from .. import text, util, exception
import re
class MangareadBase(): class MangareadBase():
@@ -18,9 +17,9 @@ class MangareadBase():
@staticmethod @staticmethod
def parse_chapter_string(chapter_string, data): def parse_chapter_string(chapter_string, data):
match = re.match( match = util.re(
r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?", r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?"
text.unescape(chapter_string).strip()) ).match(text.unescape(chapter_string).strip())
manga, chapter, minor, title = match.groups() manga, chapter, minor, title = match.groups()
manga = manga.strip() if manga else "" manga = manga.strip() if manga else ""
data["manga"] = data.pop("manga", manga) data["manga"] = data.pop("manga", manga)

View File

@@ -9,11 +9,9 @@
"""Extractors for Moebooru based sites""" """Extractors for Moebooru based sites"""
from .booru import BooruExtractor from .booru import BooruExtractor
from .. import text from .. import text, util
import collections import collections
import datetime import datetime
import re
class MoebooruExtractor(BooruExtractor): class MoebooruExtractor(BooruExtractor):
@@ -36,7 +34,7 @@ class MoebooruExtractor(BooruExtractor):
return return
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)") pattern = util.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
for tag_type, tag_name in pattern.findall(tag_container): for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name)) tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items(): for key, value in tags.items():

View File

@@ -12,7 +12,6 @@ from .common import Extractor, Message, Dispatch
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import itertools import itertools
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?newgrounds\.com" BASE_PATTERN = r"(?:https?://)?(?:www\.)?newgrounds\.com"
USER_PATTERN = r"(?:https?://)?([\w-]+)\.newgrounds\.com" USER_PATTERN = r"(?:https?://)?([\w-]+)\.newgrounds\.com"
@@ -35,7 +34,7 @@ class NewgroundsExtractor(Extractor):
self.user_root = "https://{}.newgrounds.com".format(self.user) self.user_root = "https://{}.newgrounds.com".format(self.user)
def _init(self): def _init(self):
self._extract_comment_urls = re.compile( self._extract_comment_urls = util.re(
r'(?:<img |data-smartload-)src="([^"]+)').findall r'(?:<img |data-smartload-)src="([^"]+)').findall
self.flash = self.config("flash", True) self.flash = self.config("flash", True)
@@ -322,7 +321,7 @@ class NewgroundsExtractor(Extractor):
def _video_formats(self, sources): def _video_formats(self, sources):
src = sources["360p"][0]["src"] src = sources["360p"][0]["src"]
sub = re.compile(r"\.360p\.\w+").sub sub = util.re(r"\.360p\.\w+").sub
for fmt in self.format: for fmt in self.format:
try: try:

View File

@@ -10,8 +10,7 @@
from .common import Extractor, Message from .common import Extractor, Message
from ..cache import cache from ..cache import cache
from .. import text, exception from .. import text, util, exception
import re
BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social" BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social"
@@ -37,8 +36,8 @@ class PillowfortExtractor(Extractor):
external = self.config("external", False) external = self.config("external", False)
if inline: if inline:
inline = re.compile(r'src="(https://img\d+\.pillowfort\.social' inline = util.re(r'src="(https://img\d+\.pillowfort\.social'
r'/posts/[^"]+)').findall r'/posts/[^"]+)').findall
for post in self.posts(): for post in self.posts():
if "original_post" in post and not reblogs: if "original_post" in post and not reblogs:

View File

@@ -14,7 +14,6 @@ from ..cache import cache, memcache
from datetime import datetime, timedelta from datetime import datetime, timedelta
import itertools import itertools
import hashlib import hashlib
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?ph?ixiv\.net" BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?ph?ixiv\.net"
USER_PATTERN = BASE_PATTERN + r"/(?:en/)?users/(\d+)" USER_PATTERN = BASE_PATTERN + r"/(?:en/)?users/(\d+)"
@@ -45,7 +44,7 @@ class PixivExtractor(Extractor):
self.meta_captions = self.config("captions") self.meta_captions = self.config("captions")
if self.meta_captions: if self.meta_captions:
self.meta_captions_sub = re.compile( self.meta_captions_sub = util.re(
r'<a href="/jump\.php\?([^"]+)').sub r'<a href="/jump\.php\?([^"]+)').sub
def items(self): def items(self):

View File

@@ -11,7 +11,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, util, exception
import datetime import datetime
import re
class PlurkExtractor(Extractor): class PlurkExtractor(Extractor):
@@ -64,7 +63,8 @@ class PlurkExtractor(Extractor):
def _load(data): def _load(data):
if not data: if not data:
raise exception.NotFoundError("user") raise exception.NotFoundError("user")
return util.json_loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data)) return util.json_loads(
util.re(r"new Date\(([^)]+)\)").sub(r"\1", data))
class PlurkTimelineExtractor(PlurkExtractor): class PlurkTimelineExtractor(PlurkExtractor):

View File

@@ -6,9 +6,8 @@
"""Extractors for Postmill instances""" """Extractors for Postmill instances"""
import re
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import text, exception from .. import text, util, exception
class PostmillExtractor(BaseExtractor): class PostmillExtractor(BaseExtractor):
@@ -21,8 +20,8 @@ class PostmillExtractor(BaseExtractor):
def _init(self): def _init(self):
self.instance = self.root.partition("://")[2] self.instance = self.root.partition("://")[2]
self.save_link_post_body = self.config("save-link-post-body", False) self.save_link_post_body = self.config("save-link-post-body", False)
self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search self._search_canonical_url = util.re(r"/f/([\w\d_]+)/(\d+)/").search
self._search_image_tag = re.compile( self._search_image_tag = util.re(
r'<a href="[^"]+"\n +class="submission__image-link"').search r'<a href="[^"]+"\n +class="submission__image-link"').search
def items(self): def items(self):

View File

@@ -11,7 +11,6 @@
from . import booru from . import booru
from .. import text, util from .. import text, util
import collections import collections
import re
BASE_PATTERN = r"(?:https?://)?realbooru\.com" BASE_PATTERN = r"(?:https?://)?realbooru\.com"
@@ -72,8 +71,7 @@ class RealbooruExtractor(booru.BooruExtractor):
page = post["_html"] page = post["_html"]
tag_container = text.extr(page, 'id="tagLink"', '</div>') tag_container = text.extr(page, 'id="tagLink"', '</div>')
tags = collections.defaultdict(list) tags = collections.defaultdict(list)
pattern = re.compile( pattern = util.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
for tag_type, tag_name in pattern.findall(tag_container): for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unescape(text.unquote(tag_name))) tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items(): for key, value in tags.items():

View File

@@ -9,8 +9,7 @@
"""Recursive extractor""" """Recursive extractor"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text, util
import re
class RecursiveExtractor(Extractor): class RecursiveExtractor(Extractor):
@@ -28,5 +27,5 @@ class RecursiveExtractor(Extractor):
else: else:
page = self.request(text.ensure_http_scheme(url)).text page = self.request(text.ensure_http_scheme(url)).text
for match in re.finditer(r"https?://[^\s\"']+", page): for match in util.re(r"https?://[^\s\"']+").finditer(page):
yield Message.Queue, match.group(0), {} yield Message.Queue, match.group(0), {}

View File

@@ -9,9 +9,8 @@
"""Extractors for https://rule34.us/""" """Extractors for https://rule34.us/"""
from .booru import BooruExtractor from .booru import BooruExtractor
from .. import text from .. import text, util
import collections import collections
import re
class Rule34usExtractor(BooruExtractor): class Rule34usExtractor(BooruExtractor):
@@ -20,7 +19,7 @@ class Rule34usExtractor(BooruExtractor):
per_page = 42 per_page = 42
def _init(self): def _init(self):
self._find_tags = re.compile( self._find_tags = util.re(
r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall
def _parse_post(self, post_id): def _parse_post(self, post_id):

View File

@@ -13,7 +13,6 @@ from .common import Message
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import collections import collections
import re
BASE_PATTERN = r"(?:https?://)?" \ BASE_PATTERN = r"(?:https?://)?" \
r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \
@@ -48,7 +47,7 @@ class SankakuExtractor(BooruExtractor):
self.api = SankakuAPI(self) self.api = SankakuAPI(self)
if self.config("tags") == "extended": if self.config("tags") == "extended":
self._tags = self._tags_extended self._tags = self._tags_extended
self._tags_findall = re.compile( self._tags_findall = util.re(
r"tag-type-([^\"' ]+).*?\?tags=([^\"'&]+)").findall r"tag-type-([^\"' ]+).*?\?tags=([^\"'&]+)").findall
def _file_url(self, post): def _file_url(self, post):
@@ -130,11 +129,11 @@ class SankakuTagExtractor(SankakuExtractor):
if "date:" in self.tags: if "date:" in self.tags:
# rewrite 'date:' tags (#1790) # rewrite 'date:' tags (#1790)
self.tags = re.sub( self.tags = util.re(
r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)", r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)").sub(
r"date:\3-\2-\1T00:00", self.tags) r"date:\3-\2-\1T00:00", self.tags)
self.tags = re.sub( self.tags = util.re(
r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)(?!T)", r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)(?!T)").sub(
r"date:\1-\2-\3T00:00", self.tags) r"date:\1-\2-\3T00:00", self.tags)
def metadata(self): def metadata(self):

View File

@@ -10,7 +10,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util from .. import text, util
import re
class SankakucomplexExtractor(Extractor): class SankakucomplexExtractor(Extractor):
@@ -66,7 +65,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
@staticmethod @staticmethod
def _extract_images(content): def _extract_images(content):
orig_sub = re.compile(r"-\d+x\d+\.").sub orig_sub = util.re(r"-\d+x\d+\.").sub
return [ return [
orig_sub(".", url) for url in orig_sub(".", url) for url in
util.unique(text.extract_iter(content, 'data-lazy-src="', '"')) util.unique(text.extract_iter(content, 'data-lazy-src="', '"'))
@@ -74,13 +73,13 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
@staticmethod @staticmethod
def _extract_videos(content): def _extract_videos(content):
return re.findall(r"<source [^>]*src=[\"']([^\"']+)", content) return util.re(r"<source [^>]*src=[\"']([^\"']+)").findall(content)
@staticmethod @staticmethod
def _extract_embeds(content): def _extract_embeds(content):
return [ return [
"ytdl:" + url for url in "ytdl:" + url for url in
re.findall(r"<iframe [^>]*src=[\"']([^\"']+)", content) util.re(r"<iframe [^>]*src=[\"']([^\"']+)").findall(content)
] ]

View File

@@ -9,8 +9,7 @@
"""Extractors for https://speakerdeck.com/""" """Extractors for https://speakerdeck.com/"""
from .common import GalleryExtractor from .common import GalleryExtractor
from .. import text from .. import text, util
import re
class SpeakerdeckPresentationExtractor(GalleryExtractor): class SpeakerdeckPresentationExtractor(GalleryExtractor):
@@ -48,7 +47,8 @@ class SpeakerdeckPresentationExtractor(GalleryExtractor):
def images(self, _): def images(self, _):
url = "{}/player/{}".format(self.root, self.presentation_id) url = "{}/player/{}".format(self.root, self.presentation_id)
page = re.sub(r"\s+", " ", self.request(url).text) page = self.request(url).text
page = util.re(r"\s+").sub(" ", page)
return [ return [
(url, None) (url, None)
for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"') for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"')

View File

@@ -11,7 +11,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)" BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)"
@@ -158,8 +157,8 @@ class SubscribestarExtractor(Extractor):
attachments = text.extr( attachments = text.extr(
html, 'class="uploads-docs"', 'class="post-edit_form"') html, 'class="uploads-docs"', 'class="post-edit_form"')
if attachments: if attachments:
for att in re.split( for att in util.re(r'class="doc_preview[" ]').split(
r'class="doc_preview[" ]', attachments)[1:]: attachments)[1:]:
media.append({ media.append({
"id" : text.parse_int(text.extr( "id" : text.parse_int(text.extr(
att, 'data-upload-id="', '"')), att, 'data-upload-id="', '"')),
@@ -172,8 +171,8 @@ class SubscribestarExtractor(Extractor):
audios = text.extr( audios = text.extr(
html, 'class="uploads-audios"', 'class="post-edit_form"') html, 'class="uploads-audios"', 'class="post-edit_form"')
if audios: if audios:
for audio in re.split( for audio in util.re(r'class="audio_preview-data[" ]').split(
r'class="audio_preview-data[" ]', audios)[1:]: audios)[1:]:
media.append({ media.append({
"id" : text.parse_int(text.extr( "id" : text.parse_int(text.extr(
audio, 'data-upload-id="', '"')), audio, 'data-upload-id="', '"')),

View File

@@ -11,7 +11,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, oauth, exception from .. import text, util, oauth, exception
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta
import re
BASE_PATTERN = ( BASE_PATTERN = (
@@ -66,16 +65,16 @@ class TumblrExtractor(Extractor):
blog = None blog = None
# pre-compile regular expressions # pre-compile regular expressions
self._sub_video = re.compile( self._sub_video = util.re(
r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com" r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
if self.inline: if self.inline:
self._sub_image = re.compile( self._sub_image = util.re(
r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?" r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn self._subn_orig_image = util.re(r"/s\d+x\d+/").subn
_findall_image = re.compile('<img src="([^"]+)"').findall _findall_image = util.re('<img src="([^"]+)"').findall
_findall_video = re.compile('<source src="([^"]+)"').findall _findall_video = util.re('<source src="([^"]+)"').findall
for post in self.posts(): for post in self.posts():
if self.date_min > post["timestamp"]: if self.date_min > post["timestamp"]:

View File

@@ -9,8 +9,7 @@
"""Extractors for https://vk.com/""" """Extractors for https://vk.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, exception from .. import text, util, exception
import re
BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com" BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com"
@@ -32,7 +31,7 @@ class VkExtractor(Extractor):
return num return num
def items(self): def items(self):
sub = re.compile(r"/imp[fg]/").sub sub = util.re(r"/imp[fg]/").sub
sizes = "wzyxrqpo" sizes = "wzyxrqpo"
data = self.metadata() data = self.metadata()

View File

@@ -12,7 +12,6 @@ from .booru import BooruExtractor
from ..cache import cache from ..cache import cache
from .. import text, util, exception from .. import text, util, exception
import collections import collections
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
@@ -127,7 +126,7 @@ class ZerochanExtractor(BooruExtractor):
return data return data
def _parse_json(self, txt): def _parse_json(self, txt):
txt = re.sub(r"[\x00-\x1f\x7f]", "", txt) txt = util.re(r"[\x00-\x1f\x7f]").sub("", txt)
main, _, tags = txt.partition('tags": [') main, _, tags = txt.partition('tags": [')
item = {} item = {}

View File

@@ -73,7 +73,7 @@ __tests__ = (
"views" : int, "views" : int,
"favorites" : int, "favorites" : int,
"comments" : int, "comments" : int,
"_mtime" : "Sat, 16 Feb 2019 19:30:34 GMT", "_http_lastmodified": "Sat, 16 Feb 2019 19:30:34 GMT",
}, },
{ {

View File

@@ -10,7 +10,6 @@ from gallery_dl import exception
__tests__ = ( __tests__ = (
{ {
"#url" : "https://bato.to/title/86408-i-shall-master-this-family-official/1681030-ch_8", "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official/1681030-ch_8",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
"#count" : 66, "#count" : 66,
@@ -33,7 +32,6 @@ __tests__ = (
{ {
"#url" : "https://bato.to/title/104929-86-eighty-six-official/1943513-vol_1-ch_5", "#url" : "https://bato.to/title/104929-86-eighty-six-official/1943513-vol_1-ch_5",
"#comment" : "volume (vol) in url", "#comment" : "volume (vol) in url",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
"#count" : 7, "#count" : 7,
@@ -46,7 +44,6 @@ __tests__ = (
{ {
"#url" : "https://mto.to/chapter/2584460", "#url" : "https://mto.to/chapter/2584460",
"#comment" : "'-' in manga title (#5200)", "#comment" : "'-' in manga title (#5200)",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
"chapter" : 9, "chapter" : 9,
@@ -64,7 +61,6 @@ __tests__ = (
{ {
"#url" : "https://bato.to/title/90710-new-suitor-for-the-abandoned-wife/2089747-ch_76", "#url" : "https://bato.to/title/90710-new-suitor-for-the-abandoned-wife/2089747-ch_76",
"#comment" : "duplicate info in chapter_minor / title (#5988)", "#comment" : "duplicate info in chapter_minor / title (#5988)",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
"chapter" : 76, "chapter" : 76,
@@ -76,7 +72,6 @@ __tests__ = (
{ {
"#url" : "https://bato.to/title/115494-today-with-you/2631897-ch_38", "#url" : "https://bato.to/title/115494-today-with-you/2631897-ch_38",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
"chapter" : 37, "chapter" : 37,
@@ -94,20 +89,17 @@ __tests__ = (
{ {
"#url" : "https://bato.to/title/86408/1681030", "#url" : "https://bato.to/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://bato.to/chapter/1681030", "#url" : "https://bato.to/chapter/1681030",
"#comment" : "v2 URL", "#comment" : "v2 URL",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official",
"#category": ("", "batoto", "manga"),
"#class" : batoto.BatotoMangaExtractor, "#class" : batoto.BatotoMangaExtractor,
"#options" : {"domain": "xbato.org"}, "#options" : {"domain": "xbato.org"},
"#count" : ">= 21", "#count" : ">= 21",
@@ -122,7 +114,6 @@ __tests__ = (
{ {
"#url" : "https://bato.to/title/104929-86-eighty-six-official", "#url" : "https://bato.to/title/104929-86-eighty-six-official",
"#comment" : "Manga with number in name", "#comment" : "Manga with number in name",
"#category": ("", "batoto", "manga"),
"#class" : batoto.BatotoMangaExtractor, "#class" : batoto.BatotoMangaExtractor,
"#count" : ">= 18", "#count" : ">= 18",
@@ -132,7 +123,6 @@ __tests__ = (
{ {
"#url" : "https://bato.to/title/140046-the-grand-duke-s-fox-princess-mgchan", "#url" : "https://bato.to/title/140046-the-grand-duke-s-fox-princess-mgchan",
"#comment" : "Non-English translation (Indonesian)", "#comment" : "Non-English translation (Indonesian)",
"#category": ("", "batoto", "manga"),
"#class" : batoto.BatotoMangaExtractor, "#class" : batoto.BatotoMangaExtractor,
"#count" : ">= 29", "#count" : ">= 29",
@@ -142,149 +132,130 @@ __tests__ = (
{ {
"#url" : "https://bato.to/title/134270-removed", "#url" : "https://bato.to/title/134270-removed",
"#comment" : "Deleted/removed manga", "#comment" : "Deleted/removed manga",
"#category": ("", "batoto", "manga"),
"#class" : batoto.BatotoMangaExtractor, "#class" : batoto.BatotoMangaExtractor,
"#exception": exception.StopExtraction, "#exception": exception.StopExtraction,
}, },
{ {
"#url" : "https://bato.to/title/86408-i-shall-master-this-family-official", "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official",
"#category": ("", "batoto", "manga"),
"#class" : batoto.BatotoMangaExtractor, "#class" : batoto.BatotoMangaExtractor,
}, },
{ {
"#url" : "https://bato.to/series/86408/i-shall-master-this-family-official", "#url" : "https://bato.to/series/86408/i-shall-master-this-family-official",
"#comment" : "v2 URL", "#comment" : "v2 URL",
"#category": ("", "batoto", "manga"),
"#class" : batoto.BatotoMangaExtractor, "#class" : batoto.BatotoMangaExtractor,
}, },
{ {
"#url" : "https://dto.to/title/86408/1681030", "#url" : "https://dto.to/title/86408/1681030",
"#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor,
},
{
"#url" : "https://fto.to/title/86408/1681030",
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://hto.to/title/86408/1681030", "#url" : "https://hto.to/title/86408/1681030",
"#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor,
},
{
"#url" : "https://jto.to/title/86408/1681030",
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://mto.to/title/86408/1681030", "#url" : "https://mto.to/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://wto.to/title/86408/1681030", "#url" : "https://wto.to/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://mangatoto.com/title/86408/1681030", "#url" : "https://mangatoto.com/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://mangatoto.net/title/86408/1681030", "#url" : "https://mangatoto.net/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://mangatoto.org/title/86408/1681030", "#url" : "https://mangatoto.org/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://batocomic.com/title/86408/1681030", "#url" : "https://batocomic.com/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://batocomic.net/title/86408/1681030", "#url" : "https://batocomic.net/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://batocomic.org/title/86408/1681030", "#url" : "https://batocomic.org/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://readtoto.com/title/86408/1681030", "#url" : "https://readtoto.com/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://readtoto.net/title/86408/1681030", "#url" : "https://readtoto.net/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://readtoto.org/title/86408/1681030", "#url" : "https://readtoto.org/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://xbato.com/title/86408/1681030", "#url" : "https://xbato.com/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://xbato.net/title/86408/1681030", "#url" : "https://xbato.net/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://xbato.org/title/86408/1681030", "#url" : "https://xbato.org/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://zbato.com/title/86408/1681030", "#url" : "https://zbato.com/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://zbato.net/title/86408/1681030", "#url" : "https://zbato.net/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://zbato.org/title/86408/1681030", "#url" : "https://zbato.org/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://comiko.net/title/86408/1681030", "#url" : "https://comiko.net/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://comiko.org/title/86408/1681030", "#url" : "https://comiko.org/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://batotoo.com/title/86408/1681030", "#url" : "https://batotoo.com/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://batotwo.com/title/86408/1681030", "#url" : "https://batotwo.com/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },
{ {
"#url" : "https://battwo.com/title/86408/1681030", "#url" : "https://battwo.com/title/86408/1681030",
"#category": ("", "batoto", "chapter"),
"#class" : batoto.BatotoChapterExtractor, "#class" : batoto.BatotoChapterExtractor,
}, },

View File

@@ -92,7 +92,7 @@ __tests__ = (
"extension" : "avif", "extension" : "avif",
"filename" : str, "filename" : str,
"gallery_id": 1615823, "gallery_id": 1615823,
"group" : [], "group" : ["mofumofuen"],
"lang" : "ja", "lang" : "ja",
"language" : "Japanese", "language" : "Japanese",
"num" : range(1, 22), "num" : range(1, 22),

View File

@@ -116,7 +116,7 @@ __tests__ = (
"#url" : "https://www.mangaread.org/manga/doesnotexist", "#url" : "https://www.mangaread.org/manga/doesnotexist",
"#category": ("", "mangaread", "manga"), "#category": ("", "mangaread", "manga"),
"#class" : mangaread.MangareadMangaExtractor, "#class" : mangaread.MangareadMangaExtractor,
"#exception": exception.HttpError, "#exception": exception.NotFoundError,
}, },
) )

View File

@@ -20,9 +20,9 @@ __tests__ = (
"#category": ("booru", "realbooru", "pool"), "#category": ("booru", "realbooru", "pool"),
"#class" : realbooru.RealbooruPoolExtractor, "#class" : realbooru.RealbooruPoolExtractor,
"#urls" : ( "#urls" : (
"https://realbooru.com//images/bf/d6/bfd682f338691e5254de796040fcba21.mp4", "https://video-cdn.realbooru.com//images/bf/d6/bfd682f338691e5254de796040fcba21.mp4",
"https://realbooru.com//images/cb/7d/cb7d921673ba99f688031ac554777695.mp4", "https://video-cdn.realbooru.com//images/cb/7d/cb7d921673ba99f688031ac554777695.mp4",
"https://realbooru.com//images/9e/14/9e140edc1cb2e4cc734ba5bdc4870955.mp4", "https://video-cdn.realbooru.com//images/9e/14/9e140edc1cb2e4cc734ba5bdc4870955.mp4",
), ),
}, },

View File

@@ -13,7 +13,7 @@ __tests__ = (
"#url" : "https://sankaku.app/?tags=bonocho", "#url" : "https://sankaku.app/?tags=bonocho",
"#category": ("booru", "sankaku", "tag"), "#category": ("booru", "sankaku", "tag"),
"#class" : sankaku.SankakuTagExtractor, "#class" : sankaku.SankakuTagExtractor,
"#pattern" : r"https://s\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}/[0-9a-f]{32}\.\w+\?e=\d+&(expires=\d+&)?m=[^&#]+", "#pattern" : r"https://s\.sankakucomplex\.com/o/[^/]{2}/[^/]{2}/[0-9a-f]{32}\.\w+\?e=\d+&(expires=\d+&)?m=[^&#]+",
"#count" : 5, "#count" : 5,
}, },
@@ -461,7 +461,7 @@ __tests__ = (
"#comment" : "legacy post URL", "#comment" : "legacy post URL",
"#category": ("booru", "sankaku", "post"), "#category": ("booru", "sankaku", "post"),
"#class" : sankaku.SankakuPostExtractor, "#class" : sankaku.SankakuPostExtractor,
"#pattern" : r"https://s\.sankakucomplex\.com/data/ac/8e/ac8e3b92ea328ce9cf7211e69c905bf9\.jpg\?e=.+", "#pattern" : r"https://s\.sankakucomplex\.com/o/ac/8e/ac8e3b92ea328ce9cf7211e69c905bf9\.jpg\?e=.+",
# "id": 360451, # "id": 360451,
"id": "y0abGlDOr2o", "id": "y0abGlDOr2o",
@@ -473,7 +473,7 @@ __tests__ = (
"#category": ("booru", "sankaku", "post"), "#category": ("booru", "sankaku", "post"),
"#class" : sankaku.SankakuPostExtractor, "#class" : sankaku.SankakuPostExtractor,
"#auth" : True, "#auth" : True,
"#pattern" : r"https://s\.sankakucomplex\.com/data/13/3c/133cda3bfde249c504284493903fb985\.jpg", "#pattern" : r"https://s\.sankakucomplex\.com/o/13/3c/133cda3bfde249c504284493903fb985\.jpg",
"md5": "133cda3bfde249c504284493903fb985", "md5": "133cda3bfde249c504284493903fb985",
}, },
@@ -506,7 +506,7 @@ __tests__ = (
"#comment" : "md5 hexdigest instead of ID (#3952)", "#comment" : "md5 hexdigest instead of ID (#3952)",
"#category": ("booru", "sankaku", "post"), "#category": ("booru", "sankaku", "post"),
"#class" : sankaku.SankakuPostExtractor, "#class" : sankaku.SankakuPostExtractor,
"#pattern" : r"https://s\.sankakucomplex\.com/data/f8/ba/f8ba89043078f0e4be2d9c46550b840a\.jpg", "#pattern" : r"https://s\.sankakucomplex\.com/o/f8/ba/f8ba89043078f0e4be2d9c46550b840a\.jpg",
"#count" : 1, "#count" : 1,
# "id" : 33195194, # "id" : 33195194,
@@ -519,7 +519,7 @@ __tests__ = (
"#comment" : "/posts/ instead of /post/show/ (#4688)", "#comment" : "/posts/ instead of /post/show/ (#4688)",
"#category": ("booru", "sankaku", "post"), "#category": ("booru", "sankaku", "post"),
"#class" : sankaku.SankakuPostExtractor, "#class" : sankaku.SankakuPostExtractor,
"#pattern" : r"https://s\.sankakucomplex\.com/data/f8/ba/f8ba89043078f0e4be2d9c46550b840a\.jpg", "#pattern" : r"https://s\.sankakucomplex\.com/o/f8/ba/f8ba89043078f0e4be2d9c46550b840a\.jpg",
"#count" : 1, "#count" : 1,
# "id" : 33195194, # "id" : 33195194,