Allow '/' and '?' in URL queries (directlink, generic, unsplash, and wallhaven extractor patterns)

2022-10-02 19:02:05 +02:00
parent 60cce7462c
commit b8d268f57e
5 changed files with 11 additions and 11 deletions
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,7 @@ class DirectlinkExtractor(Extractor):
    archive_fmt = filename_fmt
    pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\."
               r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))"
-               r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$")
+               r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")
    test = (
        (("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), {
            "url": "18c5d00077332e98e53be9fed2ee4be66154b88d",
@@ -31,9 +31,9 @@ class DirectlinkExtractor(Extractor):
            "keyword": "29dad729c40fb09349f83edafa498dba1297464a",
        }),
        # more complex example
-        ("https://example.org/path/to/file.webm?que=1&ry=2#fragment", {
-            "url": "114b8f1415cc224b0f26488ccd4c2e7ce9136622",
-            "keyword": "06014abd503e3b2b58aa286f9bdcefdd2ae336c0",
+        ("https://example.org/path/to/file.webm?que=1?&ry=2/#fragment", {
+            "url": "6fb1061390f8aada3db01cb24b51797c7ee42b31",
+            "keyword": "3d7abc31d45ba324e59bc599c3b4862452d5f29c",
        }),
        # percent-encoded characters
        ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", {
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -27,9 +27,9 @@ class GenericExtractor(Extractor):
    pattern += r"""
        (?P<scheme>https?://)?          # optional http(s) scheme
        (?P<domain>[-\w\.]+)            # required domain
-        (?P<path>/[^?&#]*)?             # optional path
-        (?:\?(?P<query>[^/?#]*))?       # optional query
-        (?:\#(?P<fragment>.*))?$        # optional fragment
+        (?P<path>/[^?#]*)?              # optional path
+        (?:\?(?P<query>[^#]*))?         # optional query
+        (?:\#(?P<fragment>.*))?         # optional fragment
        """

    def __init__(self, match):
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -210,7 +210,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor):
 class UnsplashSearchExtractor(UnsplashExtractor):
    """Extractor for unsplash search results"""
    subcategory = "search"
-    pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?"
+    pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?"
    test = ("https://unsplash.com/s/photos/hair-style", {
        "pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+"
                   r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -52,7 +52,7 @@ class WallhavenSearchExtractor(WallhavenExtractor):
    subcategory = "search"
    directory_fmt = ("{category}", "{search[q]}")
    archive_fmt = "s_{search[q]}_{id}"
-    pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^/?#]+))?"
+    pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^#]+))?"
    test = (
        ("https://wallhaven.cc/search?q=touhou"),
        (("https://wallhaven.cc/search?q=id%3A87"
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.23.2"
+__version__ = "1.23.3-dev"