allow '/' and '?' in URL queries

This commit is contained in:
Mike Fährmann
2022-10-02 19:02:05 +02:00
parent 60cce7462c
commit b8d268f57e
5 changed files with 11 additions and 11 deletions

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,7 @@ class DirectlinkExtractor(Extractor):
archive_fmt = filename_fmt
pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\."
r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))"
-r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$")
+r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")
test = (
(("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), {
"url": "18c5d00077332e98e53be9fed2ee4be66154b88d",
@@ -31,9 +31,9 @@ class DirectlinkExtractor(Extractor):
"keyword": "29dad729c40fb09349f83edafa498dba1297464a",
}),
# more complex example
-("https://example.org/path/to/file.webm?que=1&ry=2#fragment", {
-"url": "114b8f1415cc224b0f26488ccd4c2e7ce9136622",
-"keyword": "06014abd503e3b2b58aa286f9bdcefdd2ae336c0",
+("https://example.org/path/to/file.webm?que=1?&ry=2/#fragment", {
+"url": "6fb1061390f8aada3db01cb24b51797c7ee42b31",
+"keyword": "3d7abc31d45ba324e59bc599c3b4862452d5f29c",
}),
# percent-encoded characters
("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", {

View File

@@ -27,9 +27,9 @@ class GenericExtractor(Extractor):
pattern += r"""
(?P<scheme>https?://)? # optional http(s) scheme
(?P<domain>[-\w\.]+) # required domain
-(?P<path>/[^?&#]*)? # optional path
-(?:\?(?P<query>[^/?#]*))? # optional query
-(?:\#(?P<fragment>.*))?$ # optional fragment
+(?P<path>/[^?#]*)? # optional path
+(?:\?(?P<query>[^#]*))? # optional query
+(?:\#(?P<fragment>.*))? # optional fragment
"""
def __init__(self, match):

View File

@@ -210,7 +210,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor):
class UnsplashSearchExtractor(UnsplashExtractor):
"""Extractor for unsplash search results"""
subcategory = "search"
-pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?"
+pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?"
test = ("https://unsplash.com/s/photos/hair-style", {
"pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",

View File

@@ -52,7 +52,7 @@ class WallhavenSearchExtractor(WallhavenExtractor):
subcategory = "search"
directory_fmt = ("{category}", "{search[q]}")
archive_fmt = "s_{search[q]}_{id}"
-pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^/?#]+))?"
+pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^#]+))?"
test = (
("https://wallhaven.cc/search?q=touhou"),
(("https://wallhaven.cc/search?q=id%3A87"

View File

@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.23.2"
+__version__ = "1.23.3-dev"