* adding site support for thefap.com * fixing typo in url tld * improve & simplify 'model' extractor * update 'post' extractor * update docs/supportedsites * add tests --------- Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
This commit is contained in:
@@ -1087,6 +1087,12 @@ Consider all listed sites to potentially be NSFW.
|
||||
<td>Posts, Tag Searches</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr id="thefap" title="thefap">
|
||||
<td>TheFap</td>
|
||||
<td>https://thefap.net/</td>
|
||||
<td>Models, Posts</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr id="tiktok" title="tiktok">
|
||||
<td>TikTok</td>
|
||||
<td>https://www.tiktok.com/</td>
|
||||
|
||||
@@ -206,6 +206,7 @@ modules = [
|
||||
"tcbscans",
|
||||
"telegraph",
|
||||
"tenor",
|
||||
"thefap",
|
||||
"thehentaiworld",
|
||||
"tiktok",
|
||||
"tmohentai",
|
||||
|
||||
127
gallery_dl/extractor/thefap.py
Normal file
127
gallery_dl/extractor/thefap.py
Normal file
@@ -0,0 +1,127 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extractors for https://thefap.net/"""
|
||||
|
||||
from .common import Extractor, Message
|
||||
from .. import text, exception
|
||||
|
||||
BASE_PATTERN = r"(?:https?://)?(?:www\.)?thefap\.net"
|
||||
|
||||
|
||||
class ThefapExtractor(Extractor):
    """Base class for thefap extractors"""
    category = "thefap"
    root = "https://thefap.net"
    directory_fmt = ("{category}", "{model_name} ({model_id})")
    filename_fmt = "{model}_{num:>03}.{extension}"
    # Key archive entries on the remote filename. The previous value
    # embedded the literal text "(unknown)", so all files of one model
    # shared a single archive ID and everything after the first download
    # was skipped by '--download-archive'.
    archive_fmt = "{model_id}_{filename}"

    def _normalize_url(self, url):
        """Clean up a scraped image URL and make it absolute.

        - strips resize query parameters ("?w=...") to get the full image
        - upgrades Twitter ":small" thumbnails to ":orig" originals
        - resolves protocol-relative ("//...") and site-relative ("/...")
          URLs against https: / self.root

        Returns "" for empty input, so callers can filter results with a
        simple truth test.
        """
        if not url:
            return ""
        url = url.strip()
        if "?w=" in url:
            # drop the resize query string
            url = url[:url.rfind("?")]
        elif url.endswith(":small"):
            # Twitter media: request the original resolution instead
            url = url[:-6] + ":orig"
        if url.startswith("//"):
            url = "https:" + url
        elif url.startswith("/"):
            url = self.root + url
        return url
|
||||
|
||||
|
||||
class ThefapPostExtractor(ThefapExtractor):
    """Extractor for individual thefap.net posts"""
    subcategory = "post"
    pattern = (BASE_PATTERN +
               r"(/([^/?#]+)-(\d+)/([^/?#]+)/i(\d+))")
    example = "https://thefap.net/MODEL-12345/KIND/i12345"

    def items(self):
        path, model, model_id, kind, post_id = self.groups

        page = self.request(self.root + path).text
        if "Not Found" in page:
            raise exception.NotFoundError("post")

        # prefer the display name from the page title;
        # otherwise derive one from the URL slug
        model_name = text.extr(page, "<title>", " / ")
        if model_name:
            model_name = text.unescape(model_name)
        else:
            model_name = text.unquote(model).replace(".", " ")

        data = {
            "model"     : model,
            "model_id"  : text.parse_int(model_id),
            "model_name": model_name,
            "kind"      : kind,
            "post_id"   : text.parse_int(post_id),
            "_http_headers": {"Referer": None},
        }
        yield Message.Directory, "", data

        data["num"] = 0
        # restrict extraction to the post body following the page header
        body = text.extract(
            page, "\n</div>", "\n<!---->", page.index("</header>"))[0]
        for src in text.extract_iter(body, '<img src="', '"'):
            url = self._normalize_url(src)
            if url:
                data["num"] += 1
                yield Message.Url, url, text.nameext_from_url(url, data)
|
||||
|
||||
|
||||
class ThefapModelExtractor(ThefapExtractor):
    """Extractor for thefap.net model pages"""
    subcategory = "model"
    pattern = BASE_PATTERN + r"/([^/?#]+)-(\d+)"
    example = "https://thefap.net/MODEL-12345/"

    def items(self):
        model, model_id = self.groups

        page = self.request(f"{self.root}/{model}-{model_id}/").text
        if 'id="content"' not in page:
            raise exception.NotFoundError("model")

        # display name from the first <h2>; otherwise use the URL slug
        heading = text.extr(page, "<h2", "</h2>")
        if heading:
            model_name = text.unescape(heading[heading.find(">")+1:])
        else:
            model_name = text.unquote(model).replace(".", " ")

        data = {
            "model"     : model,
            "model_id"  : text.parse_int(model_id),
            "model_name": model_name,
            "_http_headers": {"Referer": None},
        }
        yield Message.Directory, "", data

        # further pages are served by the site's AJAX endpoint
        base = f"{self.root}/ajax/model/{model_id}/page-"
        headers = {
            "X-Requested-With": "XMLHttpRequest",
            "Sec-Fetch-Dest"  : "empty",
            "Sec-Fetch-Mode"  : "cors",
            "Sec-Fetch-Site"  : "same-origin",
        }

        # page 1 lazy-loads its images through 'data-src' attributes
        content = text.extr(page, '<div id="content"', '<div id="showmore"')
        images = text.extract_iter(content, 'data-src="', '"')
        page_num = 1
        data["num"] = 0

        while True:
            for src in images:
                url = self._normalize_url(src)
                if url:
                    data["num"] += 1
                    yield Message.Url, url, text.nameext_from_url(url, data)

            # an empty AJAX response marks the end of pagination
            page_num += 1
            content = self.request(base + str(page_num), headers=headers).text
            if not content:
                break
            images = text.extract_iter(content, '<img src="', '"')
|
||||
@@ -193,6 +193,7 @@ CATEGORY_MAP = {
|
||||
"thebarchive" : "The /b/ Archive",
|
||||
"thecollection" : "The /co/llection",
|
||||
"thecollectionS" : "The /co/llection",
|
||||
"thefap" : "TheFap",
|
||||
"thehentaiworld" : "The Hentai World",
|
||||
"tiktok" : "TikTok",
|
||||
"tmohentai" : "TMOHentai",
|
||||
|
||||
86
test/results/thefap.py
Normal file
86
test/results/thefap.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
from gallery_dl.extractor import thefap
|
||||
|
||||
|
||||
__tests__ = (
# post with an xpics.me CDN image
{
    "#url"    : "https://thefap.net/zoey.curly-374261/xpics/i8",
    "#class"  : thefap.ThefapPostExtractor,
    "#results": "https://cdn31.xpics.me/photo/2024/10/01/09/CR98EY1fSquX.jpg",

    "extension" : "jpg",
    "filename"  : "CR98EY1fSquX",
    "kind"      : "xpics",
    "model"     : "zoey.curly",
    "model_id"  : 374261,
    "model_name": "Zoey Curly",
    "num"       : 1,
    "post_id"   : 8,
},

# post with a wp.com-proxied reddit image
{
    "#url"    : "https://thefap.net/analovesbananaas-979268/fap-onlyfans-0-1qcckka/i2",
    "#class"  : thefap.ThefapPostExtractor,
    "#results": "https://i0.wp.com/i.redd.it/b4o1olbgi8dg1.jpg",

    "extension" : "jpg",
    "kind"      : "fap-onlyfans-0-1qcckka",
    "model"     : "analovesbananaas",
    "model_id"  : 979268,
    "model_name": "analovesbananaas",
    "num"       : 1,
    "post_id"   : 2,
},

# post with a Twitter image (':small' upgraded to ':orig')
{
    "#url"    : "https://thefap.net/tatted-mamma-979518/twpornstars/i1",
    "#class"  : thefap.ThefapPostExtractor,
    "#results": "https://pbs.twimg.com/media/GFmqJn2a8AAAtKu.jpg:orig",

    "extension" : "jpg:orig",
    "filename"  : "GFmqJn2a8AAAtKu",
    "kind"      : "twpornstars",
    "model"     : "tatted-mamma",
    "model_id"  : 979518,
    "model_name": "tatted_mamma",
    "num"       : 1,
    "post_id"   : 1,
},

# model page with multiple AJAX result pages
{
    "#url"  : "https://thefap.net/zoey.curly-374261/",
    "#class": thefap.ThefapModelExtractor,
    "#range": "1-100",
    "#count": 100,

    "extension" : "jpg",
    "filename"  : str,
    "model"     : "zoey.curly",
    "model_id"  : 374261,
    "model_name": "Zoey Curly",
    "num"       : range(1, 100),
},

# model page with a single page of results
{
    "#url"    : "https://thefap.net/analovesbananaas-979268/",
    "#class"  : thefap.ThefapModelExtractor,
    "#results": (
        "https://i0.wp.com/i.redd.it/icndsjbgi8dg1.jpg",
        "https://i0.wp.com/i.redd.it/b4o1olbgi8dg1.jpg",
        "https://i0.wp.com/i.redd.it/aqilnkbgi8dg1.jpg",
    ),

    "extension" : "jpg",
    "filename"  : str,
    "model"     : "analovesbananaas",
    "model_id"  : 979268,
    "model_name": "analovesbananaas",
    "num"       : range(1, 3),
},

)
|
||||
Reference in New Issue
Block a user