Merge branch 'master' into jpgchurch-extractor

This commit is contained in:
enduser420
2022-08-01 09:10:38 +05:30
70 changed files with 1372 additions and 482 deletions

View File

@@ -30,7 +30,7 @@ jobs:
PYV: ${{ matrix.python-version }} PYV: ${{ matrix.python-version }}
run: | run: |
pip install -r requirements.txt pip install -r requirements.txt
pip install flake8 pip install "flake8<4"
pip install youtube-dl pip install youtube-dl
if [[ "$PYV" != "3.4" && "$PYV" != "3.5" ]]; then pip install yt-dlp; fi if [[ "$PYV" != "3.4" && "$PYV" != "3.5" ]]; then pip install yt-dlp; fi

View File

@@ -1,5 +1,75 @@
# Changelog # Changelog
## 1.22.4 - 2022-07-15
### Additions
- [instagram] add `pinned` metadata field ([#2752](https://github.com/mikf/gallery-dl/issues/2752))
- [itaku] categorize sections by group ([#1842](https://github.com/mikf/gallery-dl/issues/1842))
- [khinsider] extract `platform` metadata
- [tumblr] support `/blog/view` URLs ([#2760](https://github.com/mikf/gallery-dl/issues/2760))
- [twitter] implement `strategy` option ([#2712](https://github.com/mikf/gallery-dl/issues/2712))
- [twitter] add `count` metadata field ([#2741](https://github.com/mikf/gallery-dl/issues/2741))
- [formatter] implement `O` format specifier ([#2736](https://github.com/mikf/gallery-dl/issues/2736))
- [postprocessor:mtime] add `value` option ([#2739](https://github.com/mikf/gallery-dl/issues/2739))
- add `--no-postprocessors` command-line option ([#2725](https://github.com/mikf/gallery-dl/issues/2725))
- implement `format-separator` option ([#2737](https://github.com/mikf/gallery-dl/issues/2737))
### Changes
- [pinterest] handle section pins with separate extractors ([#2684](https://github.com/mikf/gallery-dl/issues/2684))
- [postprocessor:ugoira] enable `mtime` by default ([#2714](https://github.com/mikf/gallery-dl/issues/2714))
### Fixes
- [bunkr] fix extraction ([#2732](https://github.com/mikf/gallery-dl/issues/2732))
- [hentaifoundry] fix metadata extraction
- [itaku] fix user caching ([#1842](https://github.com/mikf/gallery-dl/issues/1842))
- [itaku] fix `date` parsing
- [kemonoparty] ensure all files have an `extension` ([#2740](https://github.com/mikf/gallery-dl/issues/2740))
- [komikcast] update domain
- [mangakakalot] update domain
- [newgrounds] only attempt to login if necessary ([#2715](https://github.com/mikf/gallery-dl/issues/2715))
- [newgrounds] prevent exception on empty results ([#2727](https://github.com/mikf/gallery-dl/issues/2727))
- [nozomi] reduce memory consumption during searches ([#2754](https://github.com/mikf/gallery-dl/issues/2754))
- [pixiv] fix default `background` filenames
- [sankaku] rewrite file URLs to s.sankakucomplex.com ([#2746](https://github.com/mikf/gallery-dl/issues/2746))
- [slideshare] fix `description` extraction
- [twitter] ignore previously seen Tweets ([#2712](https://github.com/mikf/gallery-dl/issues/2712))
- [twitter] unescape HTML entities in `content` ([#2757](https://github.com/mikf/gallery-dl/issues/2757))
- [weibo] handle invalid or broken status objects
- [postprocessor:zip] ensure target directory exists ([#2758](https://github.com/mikf/gallery-dl/issues/2758))
- make `brotli` an *optional* dependency ([#2716](https://github.com/mikf/gallery-dl/issues/2716))
- limit path length for `--write-pages` output on Windows ([#2733](https://github.com/mikf/gallery-dl/issues/2733))
### Removals
- [foolfuuka] remove archive.wakarimasen.moe
## 1.22.3 - 2022-06-28
### Changes
- [twitter] revert strategy changes for user URLs ([#2712](https://github.com/mikf/gallery-dl/issues/2712), [#2710](https://github.com/mikf/gallery-dl/issues/2710))
- update default User-Agent headers
## 1.22.2 - 2022-06-27
### Additions
- [cyberdrop] add fallback URLs ([#2668](https://github.com/mikf/gallery-dl/issues/2668))
- [horne] add support for horne.red ([#2700](https://github.com/mikf/gallery-dl/issues/2700))
- [itaku] add `gallery` and `image` extractors ([#1842](https://github.com/mikf/gallery-dl/issues/1842))
- [poipiku] add `user` and `post` extractors ([#1602](https://github.com/mikf/gallery-dl/issues/1602))
- [skeb] add `following` extractor ([#2698](https://github.com/mikf/gallery-dl/issues/2698))
- [twitter] implement `expand` option ([#2665](https://github.com/mikf/gallery-dl/issues/2665))
- [twitter] implement `csrf` option ([#2676](https://github.com/mikf/gallery-dl/issues/2676))
- [unsplash] add `collection_title` and `collection_id` metadata fields ([#2670](https://github.com/mikf/gallery-dl/issues/2670))
- [weibo] support `tabtype=video` listings ([#2601](https://github.com/mikf/gallery-dl/issues/2601))
- [formatter] implement slice operator as format specifier
- support cygwin/BSD/etc for `--cookies-from-browser`
### Fixes
- [instagram] improve metadata generated by `_parse_post_api()` ([#2695](https://github.com/mikf/gallery-dl/issues/2695), [#2660](https://github.com/mikf/gallery-dl/issues/2660))
- [instagram] fix `tag` extractor ([#2659](https://github.com/mikf/gallery-dl/issues/2659))
- [instagram] automatically invalidate expired login sessions
- [twitter] fix pagination for conversation tweets
- [twitter] improve `"replies": "self"` ([#2665](https://github.com/mikf/gallery-dl/issues/2665))
- [twitter] improve strategy for user URLs ([#2665](https://github.com/mikf/gallery-dl/issues/2665))
- [vk] take URLs from `*_src` entries ([#2535](https://github.com/mikf/gallery-dl/issues/2535))
- [weibo] fix URLs generated by `user` extractor ([#2601](https://github.com/mikf/gallery-dl/issues/2601))
- [weibo] fix retweets ([#2601](https://github.com/mikf/gallery-dl/issues/2601))
- [downloader:ytdl] update `_set_outtmpl()` ([#2692](https://github.com/mikf/gallery-dl/issues/2692))
- [formatter] fix `!j` conversion for non-serializable types ([#2624](https://github.com/mikf/gallery-dl/issues/2624))
- [snap] Fix missing libslang dependency ([#2655](https://github.com/mikf/gallery-dl/issues/2655))
## 1.22.1 - 2022-06-04 ## 1.22.1 - 2022-06-04
### Additions ### Additions
- [gfycat] add support for collections ([#2629](https://github.com/mikf/gallery-dl/issues/2629)) - [gfycat] add support for collections ([#2629](https://github.com/mikf/gallery-dl/issues/2629))

View File

@@ -25,6 +25,7 @@ Optional
- FFmpeg_: Pixiv Ugoira to WebM conversion - FFmpeg_: Pixiv Ugoira to WebM conversion
- yt-dlp_ or youtube-dl_: Video downloads - yt-dlp_ or youtube-dl_: Video downloads
- PySocks_: SOCKS proxy support - PySocks_: SOCKS proxy support
- brotli_ or brotlicffi_: Brotli compression support
Installation Installation
@@ -65,8 +66,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and Prebuilt executable files with a Python interpreter and
required Python packages included are available for required Python packages included are available for
- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.22.1/gallery-dl.exe>`__ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.22.4/gallery-dl.exe>`__
- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.22.1/gallery-dl.bin>`__ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.22.4/gallery-dl.bin>`__
| Executables built from the latest commit can be found at | Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -217,7 +218,8 @@ and optional for
``subscribestar``, ``subscribestar``,
``tapas``, ``tapas``,
``tsumino``, ``tsumino``,
and ``twitter``. ``twitter``,
and ``zerochan``.
You can set the necessary information in your configuration file You can set the necessary information in your configuration file
(cf. gallery-dl.conf_) (cf. gallery-dl.conf_)
@@ -332,6 +334,8 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
.. _yt-dlp: https://github.com/yt-dlp/yt-dlp .. _yt-dlp: https://github.com/yt-dlp/yt-dlp
.. _youtube-dl: https://ytdl-org.github.io/youtube-dl/ .. _youtube-dl: https://ytdl-org.github.io/youtube-dl/
.. _PySocks: https://pypi.org/project/PySocks/ .. _PySocks: https://pypi.org/project/PySocks/
.. _brotli: https://github.com/google/brotli
.. _brotlicffi: https://github.com/python-hyper/brotlicffi
.. _pyOpenSSL: https://pyopenssl.org/ .. _pyOpenSSL: https://pyopenssl.org/
.. _Snapd: https://docs.snapcraft.io/installing-snapd .. _Snapd: https://docs.snapcraft.io/installing-snapd
.. _OAuth: https://en.wikipedia.org/wiki/OAuth .. _OAuth: https://en.wikipedia.org/wiki/OAuth

View File

@@ -377,6 +377,7 @@ Description
* ``tapas`` * ``tapas``
* ``tsumino`` * ``tsumino``
* ``twitter`` * ``twitter``
* ``zerochan``
These values can also be specified via the These values can also be specified via the
``-u/--username`` and ``-p/--password`` command-line options or ``-u/--username`` and ``-p/--password`` command-line options or
@@ -496,7 +497,7 @@ extractor.*.user-agent
Type Type
``string`` ``string``
Default Default
``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"`` ``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"``
Description Description
User-Agent header value to be used for HTTP requests. User-Agent header value to be used for HTTP requests.
@@ -560,6 +561,20 @@ Description
with a ``metadata`` post processor, etc. with a ``metadata`` post processor, etc.
extractor.*.path-metadata
-------------------------
Type
``string``
Default
``null``
Description
Insert a reference to the current `PathFormat <https://github.com/mikf/gallery-dl/blob/v1.22.4/gallery_dl/path.py#L20>`__
data structure into metadata dictionaries as the given name.
For example, setting this option to ``"gdl_path"`` would make it possible
to access the current file's filename as ``"{gdl_path.filename}"``.
extractor.*.category-transfer extractor.*.category-transfer
----------------------------- -----------------------------
Type Type
@@ -1313,6 +1328,18 @@ Description
* ``"new"``: Expect the *new* site layout * ``"new"``: Expect the *new* site layout
extractor.gelbooru.api-key & .user-id
-------------------------------------
Type
``string``
Default
``null``
Description
Values from the API Access Credentials section found at the bottom of your
`Account Options <https://gelbooru.com/index.php?page=account&s=options>`__
page.
extractor.generic.enabled extractor.generic.enabled
------------------------- -------------------------
Type Type
@@ -2129,6 +2156,16 @@ Description
Download videos. Download videos.
extractor.skeb.article
----------------------
Type
``bool``
Default
``false``
Description
Download article images.
extractor.skeb.sent-requests extractor.skeb.sent-requests
---------------------------- ----------------------------
Type Type
@@ -2375,6 +2412,22 @@ Description
will be taken from the original Tweets, not the Retweets. will be taken from the original Tweets, not the Retweets.
extractor.twitter.timeline.strategy
-----------------------------------
Type
``string``
Default
``"auto"``
Description
Controls the strategy / tweet source used for user URLs
(``https://twitter.com/USER``).
* ``"tweets"``: `/tweets <https://twitter.com/USER/tweets>`__ timeline + search
* ``"media"``: `/media <https://twitter.com/USER/media>`__ timeline + search
* ``"with_replies"``: `/with_replies <https://twitter.com/USER/with_replies>`__ timeline + search
* ``"auto"``: ``"tweets"`` or ``"media"``, depending on `retweets <extractor.twitter.retweets_>`__ and `text-tweets <extractor.twitter.text-tweets_>`__ settings
extractor.twitter.text-tweets extractor.twitter.text-tweets
----------------------------- -----------------------------
Type Type
@@ -2399,6 +2452,16 @@ Description
Extract `TwitPic <https://twitpic.com/>`__ embeds. Extract `TwitPic <https://twitpic.com/>`__ embeds.
extractor.twitter.unique
------------------------
Type
``bool``
Default
``true``
Description
Ignore previously seen Tweets.
extractor.twitter.users extractor.twitter.users
----------------------- -----------------------
Type Type
@@ -3274,13 +3337,15 @@ Type
Default Default
``"json"`` ``"json"``
Description Description
Select how to write metadata. Selects how to process metadata.
* ``"json"``: all metadata using `json.dump() * ``"json"``: write metadata using `json.dump()
<https://docs.python.org/3/library/json.html#json.dump>`_ <https://docs.python.org/3/library/json.html#json.dump>`_
* ``"tags"``: ``tags`` separated by newlines * ``"tags"``: write ``tags`` separated by newlines
* ``"custom"``: result of applying `metadata.content-format`_ * ``"custom"``: write the result of applying `metadata.content-format`_
to a file's metadata dictionary to a file's metadata dictionary
* ``"modify"``: add or modify metadata entries
* ``"delete"``: remove metadata entries
metadata.filename metadata.filename
@@ -3369,6 +3434,32 @@ Description
e.g. a Tweet on Twitter or a post on Patreon. e.g. a Tweet on Twitter or a post on Patreon.
metadata.fields
---------------
Type
* ``list`` of ``strings``
* ``object`` (`field name` -> `format string`_)
Example
* .. code:: json
["blocked", "watching", "status[creator][name]"]
* .. code:: json
{
"blocked" : "***",
"watching" : "\fE 'yes' if watching else 'no'",
"status[username]": "{status[creator][name]!l}"
}
Description
* ``"mode": "delete"``:
A list of metadata field names to remove.
* ``"mode": "modify"``:
An object with metadata field names mapping to a `format string`_
whose result is assigned to said field name.
metadata.content-format metadata.content-format
----------------------- -----------------------
Type Type
@@ -3438,6 +3529,24 @@ Description
This value must either be a UNIX timestamp or a This value must either be a UNIX timestamp or a
|datetime|_ object. |datetime|_ object.
Note: This option gets ignored if `mtime.value`_ is set.
mtime.value
-----------
Type
``string``
Default
``null``
Example
* ``"{status[date]}"``
* ``"{content[0:6]:R22/2022/D%Y%m%d/}"``
Description
A `format string`_ whose value should be used.
The resulting value must either be a UNIX timestamp or a
|datetime|_ object.
ugoira.extension ugoira.extension
---------------- ----------------
@@ -3567,7 +3676,7 @@ ugoira.mtime
Type Type
``bool`` ``bool``
Default Default
``false`` ``true``
Description Description
Set modification times of generated ugoira animations. Set modification times of generated ugoira animations.
@@ -3666,6 +3775,20 @@ Description
this cache. this cache.
format-separator
----------------
Type
``string``
Default
``"/"``
Description
Character(s) used as argument separator in format string
`format specifiers <formatting.md#format-specifiers>`__.
For example, setting this option to ``"#"`` would allow a replacement
operation to be ``Rold#new#`` instead of the default ``Rold/new/``.
signals-ignore signals-ignore
-------------- --------------
Type Type
@@ -3678,6 +3801,17 @@ Description
as signal handler for. as signal handler for.
warnings
--------
Type
``string``
Default
``"default"``
Description
The `Warnings Filter action <https://docs.python.org/3/library/warnings.html#the-warnings-filter>`__
used for (urllib3) warnings.
pyopenssl pyopenssl
--------- ---------
Type Type

View File

@@ -173,10 +173,16 @@ Format specifiers can be used for advanced formatting by using the options provi
<td><code>{updated:D%b %d %Y %I:%M %p/}</code></td> <td><code>{updated:D%b %d %Y %I:%M %p/}</code></td>
<td><code>2010-01-01 00:00:00</code></td> <td><code>2010-01-01 00:00:00</code></td>
</tr> </tr>
<tr>
<td><code>O&lt;offset&gt;/</code></td>
<td>Apply <code>&lt;offset&gt;</code> to a <code>datetime</code> object, either as <code>±HH:MM</code> or <code>local</code> for local UTC offset</td>
<td><code>{date:O-06:30/}</code></td>
<td><code>2009-12-31 17:30:00</code></td>
</tr>
</tbody> </tbody>
</table> </table>
All special format specifiers (`?`, `L`, `J`, `R`, `D`) can be chained and combined with one another, but must always come before any standard format specifiers: All special format specifiers (`?`, `L`, `J`, `R`, `D`, `O`) can be chained and combined with one another, but must always come before any standard format specifiers:
For example `{foo:?//RF/B/Ro/e/> 10}` -> `   Bee Bar` For example `{foo:?//RF/B/Ro/e/> 10}` -> `   Bee Bar`
- `?//` - Tests if `foo` has a value - `?//` - Tests if `foo` has a value

View File

@@ -10,7 +10,7 @@
"proxy": null, "proxy": null,
"skip": true, "skip": true,
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
"retries": 4, "retries": 4,
"timeout": 30.0, "timeout": 30.0,
"verify": true, "verify": true,
@@ -109,6 +109,11 @@
"include": "gallery", "include": "gallery",
"layout": "auto" "layout": "auto"
}, },
"gelbooru":
{
"api-key": null,
"user-id": null
},
"gfycat": "gfycat":
{ {
"format": ["mp4", "webm", "mobile", "gif"] "format": ["mp4", "webm", "mobile", "gif"]
@@ -248,6 +253,12 @@
"username": null, "username": null,
"password": null "password": null
}, },
"skeb":
{
"article": false,
"sent-requests": false,
"thumbnails": false
},
"smugmug": "smugmug":
{ {
"videos": true "videos": true
@@ -285,8 +296,10 @@
"quoted": false, "quoted": false,
"replies": true, "replies": true,
"retweets": false, "retweets": false,
"strategy": null,
"text-tweets": false, "text-tweets": false,
"twitpic": false, "twitpic": false,
"unique": true,
"users": "timeline", "users": "timeline",
"videos": true "videos": true
}, },
@@ -322,6 +335,11 @@
"module": null, "module": null,
"raw-options": null "raw-options": null
}, },
"zerochan":
{
"username": null,
"password": null
},
"booru": "booru":
{ {
"tags": false, "tags": false,

View File

@@ -103,6 +103,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Blogs, Posts, Search Results</td> <td>Blogs, Posts, Search Results</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Bunkr</td>
<td>https://bunkr.is/</td>
<td>Albums</td>
<td></td>
</tr>
<tr> <tr>
<td>Comic Vine</td> <td>Comic Vine</td>
<td>https://comicvine.gamespot.com/</td> <td>https://comicvine.gamespot.com/</td>
@@ -370,7 +376,7 @@ Consider all sites to be NSFW unless otherwise known.
<tr> <tr>
<td>Instagram</td> <td>Instagram</td>
<td>https://www.instagram.com/</td> <td>https://www.instagram.com/</td>
<td>Channels, Highlights, Posts, Reels, Saved Posts, Stories, Tag Searches, Tagged Posts, User Profiles</td> <td>Channels, Collections, Highlights, Posts, Reels, Saved Posts, Stories, Tag Searches, Tagged Posts, User Profiles</td>
<td>Supported</td> <td>Supported</td>
</tr> </tr>
<tr> <tr>
@@ -417,7 +423,7 @@ Consider all sites to be NSFW unless otherwise known.
</tr> </tr>
<tr> <tr>
<td>Komikcast</td> <td>Komikcast</td>
<td>https://komikcast.com/</td> <td>https://komikcast.me/</td>
<td>Chapters, Manga</td> <td>Chapters, Manga</td>
<td></td> <td></td>
</tr> </tr>
@@ -889,6 +895,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Galleries, User Profiles</td> <td>Galleries, User Profiles</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Zerochan</td>
<td>https://www.zerochan.net/</td>
<td>individual Images, Tag Searches</td>
<td></td>
</tr>
<tr> <tr>
<td>かべうち</td> <td>かべうち</td>
<td>https://kabe-uchiroom.com/</td> <td>https://kabe-uchiroom.com/</td>
@@ -1149,12 +1161,6 @@ Consider all sites to be NSFW unless otherwise known.
<td>Boards, Galleries, Search Results, Threads</td> <td>Boards, Galleries, Search Results, Threads</td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>Wakarimasen Archive</td>
<td>https://archive.wakarimasen.moe/</td>
<td>Boards, Galleries, Search Results, Threads</td>
<td></td>
</tr>
<tr> <tr>
<td colspan="4"><strong>FoOlSlide Instances</strong></td> <td colspan="4"><strong>FoOlSlide Instances</strong></td>
@@ -1267,12 +1273,6 @@ Consider all sites to be NSFW unless otherwise known.
<tr> <tr>
<td colspan="4"><strong>lolisafe and chibisafe</strong></td> <td colspan="4"><strong>lolisafe and chibisafe</strong></td>
</tr> </tr>
<tr>
<td>Bunkr</td>
<td>https://app.bunkr.is/</td>
<td>Albums</td>
<td></td>
</tr>
<tr> <tr>
<td>ZzZz</td> <td>ZzZz</td>
<td>https://zz.ht/</td> <td>https://zz.ht/</td>

View File

@@ -177,6 +177,12 @@ def main():
extractor.modules = modules extractor.modules = modules
extractor._module_iter = iter(modules) extractor._module_iter = iter(modules)
# format string separator
separator = config.get((), "format-separator")
if separator:
from . import formatter
formatter._SEPARATOR = separator
# loglevels # loglevels
output.configure_logging(args.loglevel) output.configure_logging(args.loglevel)
if args.loglevel >= logging.ERROR: if args.loglevel >= logging.ERROR:

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017-2021 Mike Fährmann # Copyright 2017-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -13,7 +13,7 @@ from .. import text
class _2chanThreadExtractor(Extractor): class _2chanThreadExtractor(Extractor):
"""Extractor for images from threads on www.2chan.net""" """Extractor for 2chan threads"""
category = "2chan" category = "2chan"
subcategory = "thread" subcategory = "thread"
directory_fmt = ("{category}", "{board_name}", "{thread}") directory_fmt = ("{category}", "{board_name}", "{thread}")
@@ -21,9 +21,24 @@ class _2chanThreadExtractor(Extractor):
archive_fmt = "{board}_{thread}_{tim}" archive_fmt = "{board}_{thread}_{tim}"
url_fmt = "https://{server}.2chan.net/{board}/src/{filename}" url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
pattern = r"(?:https?://)?([\w-]+)\.2chan\.net/([^/]+)/res/(\d+)" pattern = r"(?:https?://)?([\w-]+)\.2chan\.net/([^/]+)/res/(\d+)"
test = ("http://dec.2chan.net/70/res/4752.htm", { test = ("https://dec.2chan.net/70/res/14565.htm", {
"url": "f49aa31340e9a3429226af24e19e01f5b819ca1f", "pattern": r"https://dec\.2chan\.net/70/src/\d{13}\.jpg",
"keyword": "44599c21b248e79692b2eb2da12699bd0ed5640a", "count": ">= 3",
"keyword": {
"board": "70",
"board_name": "新板提案",
"com": str,
"fsize": r"re:\d+",
"name": "名無し",
"no": r"re:1[45]\d\d\d",
"now": r"re:22/../..\(.\)..:..:..",
"post": "無題",
"server": "dec",
"thread": "14565",
"tim": r"re:^\d{13}$",
"time": r"re:^\d{10}$",
"title": "ヒロアカ板"
},
}) })
def __init__(self, match): def __init__(self, match):

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2019-2021 Mike Fährmann # Copyright 2019-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -261,7 +261,7 @@ class _500pxImageExtractor(_500pxExtractor):
"tags": list, "tags": list,
"taken_at": "2017-05-04T17:36:51+00:00", "taken_at": "2017-05-04T17:36:51+00:00",
"times_viewed": int, "times_viewed": int,
"url": "/photo/222049255/Queen-Of-Coasts-by-Olesya-Nabieva", "url": "/photo/222049255/Queen-Of-Coasts-by-Alice-Nabieva",
"user": dict, "user": dict,
"user_id": 12847235, "user_id": 12847235,
"votes_count": int, "votes_count": int,

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2020 Mike Fährmann # Copyright 2020-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -26,10 +26,10 @@ class _8kunThreadExtractor(Extractor):
"count": ">= 8", "count": ">= 8",
}), }),
# old-style file URLs (#1101) # old-style file URLs (#1101)
("https://8kun.top/d/res/13258.html", { # ("https://8kun.top/d/res/13258.html", {
"pattern": r"https://media\.8kun\.top/d/src/\d+(-\d)?\.\w+", # "pattern": r"https://media\.8kun\.top/d/src/\d+(-\d)?\.\w+",
"range": "1-20", # "range": "1-20",
}), # }),
) )
def __init__(self, match): def __init__(self, match):

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2019-2021 Mike Fährmann # Copyright 2019-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -55,7 +55,7 @@ class _8musesAlbumExtractor(Extractor):
}, },
}), }),
# custom sorting # custom sorting
("https://www.8muses.com/comics/album/Fakku-Comics/9?sort=az", { ("https://www.8muses.com/comics/album/Fakku-Comics/11?sort=az", {
"count": ">= 70", "count": ">= 70",
"keyword": {"name": r"re:^[R-Zr-z]"}, "keyword": {"name": r"re:^[R-Zr-z]"},
}), }),

View File

@@ -25,6 +25,7 @@ modules = [
"bcy", "bcy",
"behance", "behance",
"blogger", "blogger",
"bunkr",
"comicvine", "comicvine",
"cyberdrop", "cyberdrop",
"danbooru", "danbooru",
@@ -150,6 +151,7 @@ modules = [
"wikieat", "wikieat",
"xhamster", "xhamster",
"xvideos", "xvideos",
"zerochan",
"booru", "booru",
"moebooru", "moebooru",
"foolfuuka", "foolfuuka",

View File

@@ -32,9 +32,11 @@ class ArtstationExtractor(Extractor):
data = self.metadata() data = self.metadata()
for project in self.projects(): for project in self.projects():
for asset in self.get_project_assets(project["hash_id"]): for num, asset in enumerate(
self.get_project_assets(project["hash_id"]), 1):
asset.update(data) asset.update(data)
adict = asset["asset"] adict = asset["asset"]
asset["num"] = num
yield Message.Directory, asset yield Message.Directory, asset
if adict["has_embedded_player"] and self.external: if adict["has_embedded_player"] and self.external:
@@ -85,6 +87,7 @@ class ArtstationExtractor(Extractor):
assets = data["assets"] assets = data["assets"]
del data["assets"] del data["assets"]
data["count"] = len(assets)
if len(assets) == 1: if len(assets) == 1:
data["asset"] = assets[0] data["asset"] = assets[0]
yield data yield data
@@ -147,14 +150,15 @@ class ArtstationUserExtractor(ArtstationExtractor):
r"/(?!artwork|projects|search)([^/?#]+)(?:/albums/all)?" r"/(?!artwork|projects|search)([^/?#]+)(?:/albums/all)?"
r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$") r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$")
test = ( test = (
("https://www.artstation.com/gaerikim/", { ("https://www.artstation.com/sungchoi/", {
"pattern": r"https://\w+\.artstation\.com/p/assets/images" "pattern": r"https://\w+\.artstation\.com/p/assets/images"
r"/images/\d+/\d+/\d+/(4k|large|medium|small)/[^/]+", r"/images/\d+/\d+/\d+/(4k|large|medium|small)/[^/]+",
"count": ">= 6", "range": "1-10",
"count": ">= 10",
}), }),
("https://www.artstation.com/gaerikim/albums/all/"), ("https://www.artstation.com/sungchoi/albums/all/"),
("https://gaerikim.artstation.com/"), ("https://sungchoi.artstation.com/"),
("https://gaerikim.artstation.com/projects/"), ("https://sungchoi.artstation.com/projects/"),
) )
def projects(self): def projects(self):
@@ -400,7 +404,7 @@ class ArtstationFollowingExtractor(ArtstationExtractor):
subcategory = "following" subcategory = "following"
pattern = (r"(?:https?://)?(?:www\.)?artstation\.com" pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
r"/(?!artwork|projects|search)([^/?#]+)/following") r"/(?!artwork|projects|search)([^/?#]+)/following")
test = ("https://www.artstation.com/gaerikim/following", { test = ("https://www.artstation.com/sungchoi/following", {
"pattern": ArtstationUserExtractor.pattern, "pattern": ArtstationUserExtractor.pattern,
"count": ">= 50", "count": ">= 50",
}) })

View File

@@ -67,9 +67,6 @@ class BloggerExtractor(Extractor):
key=lambda x: x["format_id"], key=lambda x: x["format_id"],
)["play_url"]) )["play_url"])
if not files:
continue
post["author"] = post["author"]["displayName"] post["author"] = post["author"]["displayName"]
post["replies"] = post["replies"]["totalItems"] post["replies"] = post["replies"]["totalItems"]
post["content"] = text.remove_html(content) post["content"] = text.remove_html(content)
@@ -136,9 +133,9 @@ class BloggerPostExtractor(BloggerExtractor):
"pattern": r"https://.+\.googlevideo\.com/videoplayback", "pattern": r"https://.+\.googlevideo\.com/videoplayback",
}), }),
# image URLs with width/height (#1061) # image URLs with width/height (#1061)
("https://aaaninja.blogspot.com/2020/08/altera-boob-press-2.html", { # ("https://aaaninja.blogspot.com/2020/08/altera-boob-press-2.html", {
"pattern": r"https://1.bp.blogspot.com/.+/s0/altera_.+png", # "pattern": r"https://1.bp.blogspot.com/.+/s0/altera_.+png",
}), # }),
# new image domain (#2204) # new image domain (#2204)
(("https://randomthingsthroughmyletterbox.blogspot.com/2022/01" (("https://randomthingsthroughmyletterbox.blogspot.com/2022/01"
"/bitter-flowers-by-gunnar-staalesen-blog.html"), { "/bitter-flowers-by-gunnar-staalesen-blog.html"), {

View File

@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://bunkr.is/"""
from .lolisafe import LolisafeAlbumExtractor
from .. import text
import json
class BunkrAlbumExtractor(LolisafeAlbumExtractor):
"""Extractor for bunkr.is albums"""
category = "bunkr"
root = "https://bunkr.is"
pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:is|to)/a/([^/?#]+)"
test = (
("https://bunkr.is/a/Lktg9Keq", {
"pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
"keyword": {
"album_id": "Lktg9Keq",
"album_name": 'test テスト "&>',
"count": 1,
"filename": 'test-テスト-"&>-QjgneIQv',
"id": "QjgneIQv",
"name": 'test-テスト-"&>',
"num": int,
},
}),
# mp4 (#2239)
("https://app.bunkr.is/a/ptRHaCn2", {
"pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
"content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
}),
("https://bunkr.to/a/Lktg9Keq"),
)
def fetch_album(self, album_id):
if "//app." in self.root:
return self._fetch_album_api(album_id)
else:
return self._fetch_album_site(album_id)
def _fetch_album_api(self, album_id):
files, data = LolisafeAlbumExtractor.fetch_album(self, album_id)
for file in files:
url = file["file"]
if url.endswith(".mp4"):
file["file"] = url.replace(
"//cdn.bunkr.is/", "//media-files.bunkr.is/", 1)
else:
file["_fallback"] = (url.replace("//cdn.", "//cdn3.", 1),)
return files, data
def _fetch_album_site(self, album_id):
url = self.root + "/a/" + self.album_id
try:
data = json.loads(text.extract(
self.request(url).text,
'id="__NEXT_DATA__" type="application/json">', '<')[0])
props = data["props"]["pageProps"]
album = props["album"]
files = props["files"]
except Exception as exc:
self.log.debug(exc.__class__.__name__, exc)
self.root = self.root.replace("bunkr", "app.bunkr", 1)
return self._fetch_album_api(album_id)
for file in files:
name = file["name"]
cdn = file["cdn"]
if name.endswith(".mp4"):
cdn = cdn.replace("//cdn", "//media-files")
file["file"] = cdn + "/" + name
return files, {
"album_id" : self.album_id,
"album_name" : text.unescape(album["name"]),
"description": text.unescape(album["description"]),
"count" : len(files),
}

View File

@@ -8,6 +8,7 @@
"""Common classes and constants used by extractor modules.""" """Common classes and constants used by extractor modules."""
import os
import re import re
import ssl import ssl
import time import time
@@ -224,7 +225,9 @@ class Extractor():
headers.clear() headers.clear()
ssl_options = ssl_ciphers = 0 ssl_options = ssl_ciphers = 0
browser = self.config("browser") or self.browser browser = self.config("browser")
if browser is None:
browser = self.browser
if browser and isinstance(browser, str): if browser and isinstance(browser, str):
browser, _, platform = browser.lower().partition(":") browser, _, platform = browser.lower().partition(":")
@@ -256,9 +259,13 @@ class Extractor():
else: else:
headers["User-Agent"] = self.config("user-agent", ( headers["User-Agent"] = self.config("user-agent", (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; " "Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
"rv:91.0) Gecko/20100101 Firefox/91.0")) "rv:102.0) Gecko/20100101 Firefox/102.0"))
headers["Accept"] = "*/*" headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5" headers["Accept-Language"] = "en-US,en;q=0.5"
if BROTLI:
headers["Accept-Encoding"] = "gzip, deflate, br"
else:
headers["Accept-Encoding"] = "gzip, deflate" headers["Accept-Encoding"] = "gzip, deflate"
custom_headers = self.config("headers") custom_headers = self.config("headers")
@@ -473,11 +480,16 @@ class Extractor():
fname = "{:>02}_{}".format( fname = "{:>02}_{}".format(
Extractor._dump_index, Extractor._dump_index,
Extractor._dump_sanitize('_', response.url) Extractor._dump_sanitize('_', response.url),
)[:250] )
if util.WINDOWS:
path = os.path.abspath(fname)[:255]
else:
path = fname[:251]
try: try:
with open(fname + ".dump", 'wb') as fp: with open(path + ".txt", 'wb') as fp:
util.dump_response( util.dump_response(
response, fp, headers=(self._write_pages == "all")) response, fp, headers=(self._write_pages == "all"))
except Exception as e: except Exception as e:
@@ -713,16 +725,21 @@ _browser_cookies = {}
HTTP_HEADERS = { HTTP_HEADERS = {
"firefox": ( "firefox": (
("User-Agent", "Mozilla/5.0 ({}; rv:91.0) " ("User-Agent", "Mozilla/5.0 ({}; rv:102.0) "
"Gecko/20100101 Firefox/91.0"), "Gecko/20100101 Firefox/102.0"),
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
"image/avif,*/*;q=0.8"), "image/avif,image/webp,*/*;q=0.8"),
("Accept-Language", "en-US,en;q=0.5"), ("Accept-Language", "en-US,en;q=0.5"),
("Accept-Encoding", "gzip, deflate"), ("Accept-Encoding", None),
("Referer", None), ("Referer", None),
("DNT", "1"),
("Connection", "keep-alive"), ("Connection", "keep-alive"),
("Upgrade-Insecure-Requests", "1"), ("Upgrade-Insecure-Requests", "1"),
("Cookie", None), ("Cookie", None),
("Sec-Fetch-Dest", "empty"),
("Sec-Fetch-Mode", "no-cors"),
("Sec-Fetch-Site", "same-origin"),
("TE", "trailers"),
), ),
"chrome": ( "chrome": (
("Upgrade-Insecure-Requests", "1"), ("Upgrade-Insecure-Requests", "1"),
@@ -731,7 +748,7 @@ HTTP_HEADERS = {
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
"image/webp,image/apng,*/*;q=0.8"), "image/webp,image/apng,*/*;q=0.8"),
("Referer", None), ("Referer", None),
("Accept-Encoding", "gzip, deflate"), ("Accept-Encoding", None),
("Accept-Language", "en-US,en;q=0.9"), ("Accept-Language", "en-US,en;q=0.9"),
("Cookie", None), ("Cookie", None),
), ),
@@ -755,8 +772,7 @@ SSL_CIPHERS = {
"AES128-GCM-SHA256:" "AES128-GCM-SHA256:"
"AES256-GCM-SHA384:" "AES256-GCM-SHA384:"
"AES128-SHA:" "AES128-SHA:"
"AES256-SHA:" "AES256-SHA"
"DES-CBC3-SHA"
), ),
"chrome": ( "chrome": (
"TLS_AES_128_GCM_SHA256:" "TLS_AES_128_GCM_SHA256:"
@@ -779,6 +795,24 @@ SSL_CIPHERS = {
} }
urllib3 = requests.packages.urllib3
# detect brotli support
try:
BROTLI = urllib3.response.brotli is not None
except AttributeError:
BROTLI = False
# set (urllib3) warnings filter
action = config.get((), "warnings", "default")
if action:
try:
import warnings
warnings.simplefilter(action, urllib3.exceptions.HTTPWarning)
except Exception:
pass
del action
# Undo automatic pyOpenSSL injection by requests # Undo automatic pyOpenSSL injection by requests
pyopenssl = config.get((), "pyopenssl", False) pyopenssl = config.get((), "pyopenssl", False)
if not pyopenssl: if not pyopenssl:

View File

@@ -181,7 +181,8 @@ class DanbooruTagExtractor(DanbooruExtractor):
# 'external' option (#1747) # 'external' option (#1747)
("https://danbooru.donmai.us/posts?tags=pixiv_id%3A1476533", { ("https://danbooru.donmai.us/posts?tags=pixiv_id%3A1476533", {
"options": (("external", True),), "options": (("external", True),),
"pattern": r"http://img16.pixiv.net/img/takaraakihito/1476533.jpg", "pattern": r"https://i\.pximg\.net/img-original/img"
r"/2008/08/28/02/35/48/1476533_p0\.jpg",
}), }),
("https://e621.net/posts?tags=anry", { ("https://e621.net/posts?tags=anry", {
"url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba", "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",

View File

@@ -671,12 +671,12 @@ class DeviantartCollectionExtractor(DeviantartExtractor):
test = ( test = (
(("https://www.deviantart.com/pencilshadings/favourites" (("https://www.deviantart.com/pencilshadings/favourites"
"/70595441/3D-Favorites"), { "/70595441/3D-Favorites"), {
"count": ">= 20", "count": ">= 15",
"options": (("original", False),), "options": (("original", False),),
}), }),
(("https://www.deviantart.com/pencilshadings/favourites" (("https://www.deviantart.com/pencilshadings/favourites"
"/F050486B-CB62-3C66-87FB-1105A7F6379F/3D Favorites"), { "/F050486B-CB62-3C66-87FB-1105A7F6379F/3D Favorites"), {
"count": ">= 20", "count": ">= 15",
"options": (("original", False),), "options": (("original", False),),
}), }),
("https://pencilshadings.deviantart.com" ("https://pencilshadings.deviantart.com"
@@ -876,8 +876,9 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
}), }),
# GIF (#242) # GIF (#242)
(("https://www.deviantart.com/skatergators/art/COM-Moni-781571783"), { (("https://www.deviantart.com/skatergators/art/COM-Moni-781571783"), {
"pattern": (r"https://images-wixmp-\w+\.wixmp\.com" "pattern": r"https://wixmp-\w+\.wixmp\.com/f/03fd2413-efe9-4e5c-"
r"/f/[^/]+/[^.]+\.gif\?token="), r"8734-2b72605b3fbb/dcxbsnb-1bbf0b38-42af-4070-8878-"
r"f30961955bec\.gif\?token=ey...",
}), }),
# Flash animation with GIF preview (#1731) # Flash animation with GIF preview (#1731)
("https://www.deviantart.com/yuumei/art/Flash-Comic-214724929", { ("https://www.deviantart.com/yuumei/art/Flash-Comic-214724929", {
@@ -1127,11 +1128,18 @@ class DeviantartOAuthAPI():
self._folders((deviation,)) self._folders((deviation,))
return deviation return deviation
def deviation_content(self, deviation_id, public=False): def deviation_content(self, deviation_id, public=True):
"""Get extended content of a single Deviation""" """Get extended content of a single Deviation"""
endpoint = "/deviation/content" endpoint = "/deviation/content"
params = {"deviationid": deviation_id} params = {"deviationid": deviation_id}
return self._call(endpoint, params=params, public=public) content = self._call(endpoint, params=params, public=public)
if public and content["html"].startswith(
' <span class=\"username-with-symbol'):
if self.refresh_token_key:
content = self._call(endpoint, params=params, public=False)
else:
self.log.warning("Private Journal")
return content
def deviation_download(self, deviation_id, public=True): def deviation_download(self, deviation_id, public=True):
"""Get the original file download (if allowed)""" """Get the original file download (if allowed)"""

View File

@@ -122,7 +122,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"date": "dt:2018-03-18 20:15:00", "date": "dt:2018-03-18 20:15:00",
"eh_category": "Non-H", "eh_category": "Non-H",
"expunged": False, "expunged": False,
"favorites": "21", "favorites": r"re:^[12]\d$",
"filecount": "4", "filecount": "4",
"filesize": 1488978, "filesize": 1488978,
"gid": 1200119, "gid": 1200119,

View File

@@ -6,6 +6,7 @@
"""Extractors for https://www.fanbox.cc/""" """Extractors for https://www.fanbox.cc/"""
import re
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text
@@ -78,6 +79,7 @@ class FanboxExtractor(Extractor):
num = 0 num = 0
cover_image = post.get("coverImageUrl") cover_image = post.get("coverImageUrl")
if cover_image: if cover_image:
cover_image = re.sub("/c/[0-9a-z_]+", "", cover_image)
final_post = post.copy() final_post = post.copy()
final_post["isCoverImage"] = True final_post["isCoverImage"] = True
final_post["fileUrl"] = cover_image final_post["fileUrl"] = cover_image

View File

@@ -100,10 +100,6 @@ BASE_PATTERN = FoolfuukaExtractor.update({
"root": "https://thebarchive.com", "root": "https://thebarchive.com",
"pattern": r"thebarchive\.com", "pattern": r"thebarchive\.com",
}, },
"wakarimasen": {
"root": "https://archive.wakarimasen.moe",
"pattern": r"archive\.wakarimasen\.moe",
},
}) })
@@ -149,9 +145,6 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
("https://thebarchive.com/b/thread/739772332/", { ("https://thebarchive.com/b/thread/739772332/", {
"url": "e8b18001307d130d67db31740ce57c8561b5d80c", "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
}), }),
("https://archive.wakarimasen.moe/a/thread/223157648/", {
"url": "fef0758d2eb81b1ba783051fd5ec491d70107a78",
}),
) )
def __init__(self, match): def __init__(self, match):
@@ -190,7 +183,6 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
("https://archive.nyafuu.org/c/"), ("https://archive.nyafuu.org/c/"),
("https://rbt.asia/g/"), ("https://rbt.asia/g/"),
("https://thebarchive.com/b/"), ("https://thebarchive.com/b/"),
("https://archive.wakarimasen.moe/a/"),
) )
def __init__(self, match): def __init__(self, match):
@@ -234,7 +226,6 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
("https://archive.nyafuu.org/_/search/text/test/"), ("https://archive.nyafuu.org/_/search/text/test/"),
("https://rbt.asia/_/search/text/test/"), ("https://rbt.asia/_/search/text/test/"),
("https://thebarchive.com/_/search/text/test/"), ("https://thebarchive.com/_/search/text/test/"),
("https://archive.wakarimasen.moe/a/search/text/test/"),
) )
def __init__(self, match): def __init__(self, match):
@@ -300,7 +291,6 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor):
("https://archive.nyafuu.org/c/gallery/7"), ("https://archive.nyafuu.org/c/gallery/7"),
("https://rbt.asia/g/gallery/8"), ("https://rbt.asia/g/gallery/8"),
("https://thebarchive.com/b/gallery/9"), ("https://thebarchive.com/b/gallery/9"),
("https://archive.wakarimasen.moe/a/gallery/10"),
) )
def __init__(self, match): def __init__(self, match):

View File

@@ -21,10 +21,15 @@ class GelbooruBase():
root = "https://gelbooru.com" root = "https://gelbooru.com"
def _api_request(self, params): def _api_request(self, params):
params["api_key"] = self.api_key
params["user_id"] = self.user_id
url = self.root + "/index.php?page=dapi&s=post&q=index&json=1" url = self.root + "/index.php?page=dapi&s=post&q=index&json=1"
data = self.request(url, params=params).json() data = self.request(url, params=params).json()
if "post" not in data: if "post" not in data:
return () return ()
posts = data["post"] posts = data["post"]
if not isinstance(posts, list): if not isinstance(posts, list):
return (posts,) return (posts,)

View File

@@ -21,6 +21,9 @@ class GelbooruV02Extractor(booru.BooruExtractor):
def __init__(self, match): def __init__(self, match):
booru.BooruExtractor.__init__(self, match) booru.BooruExtractor.__init__(self, match)
self.api_key = self.config("api-key")
self.user_id = self.config("user-id")
try: try:
self.api_root = INSTANCES[self.category]["api_root"] self.api_root = INSTANCES[self.category]["api_root"]
except KeyError: except KeyError:
@@ -149,7 +152,7 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
test = ( test = (
("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
"content": "622e80be3f496672c44aab5c47fbc6941c61bc79", "content": "5c6ae9ee13e6d4bc9cb8bdce224c84e67fbfa36c",
"pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
"count": 2, "count": 2,
}), }),

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2020 Mike Fährmann # Copyright 2015-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -84,12 +84,12 @@ class HentaifoundryExtractor(Extractor):
.replace("\r\n", "\n"), "", "")), .replace("\r\n", "\n"), "", "")),
"ratings" : [text.unescape(r) for r in text.extract_iter(extr( "ratings" : [text.unescape(r) for r in text.extract_iter(extr(
"class='ratings_box'", "</div>"), "title='", "'")], "class='ratings_box'", "</div>"), "title='", "'")],
"media" : text.unescape(extr("Media</b></td>\t\t<td>", "<")),
"date" : text.parse_datetime(extr("datetime='", "'")), "date" : text.parse_datetime(extr("datetime='", "'")),
"views" : text.parse_int(extr("Views</b></td>\t\t<td>", "<")), "views" : text.parse_int(extr(">Views</span>", "<")),
"score" : text.parse_int(extr(">Vote Score</span>", "<")),
"media" : text.unescape(extr(">Media</span>", "<").strip()),
"tags" : text.split_html(extr( "tags" : text.split_html(extr(
"<td><b>Keywords</b></td>", "</tr>"))[::2], ">Tags </span>", "</div>")),
"score" : text.parse_int(extr('Score</b></td>\t\t<td>', '<')),
} }
return text.nameext_from_url(data["src"], data) return text.nameext_from_url(data["src"], data)
@@ -292,7 +292,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
"media" : "Other digital art", "media" : "Other digital art",
"ratings": ["Sexual content", "Contains female nudity"], "ratings": ["Sexual content", "Contains female nudity"],
"score" : int, "score" : int,
"tags" : ["kancolle", "kantai", "collection", "shimakaze"], "tags" : ["collection", "kancolle", "kantai", "shimakaze"],
"title" : "shimakaze", "title" : "shimakaze",
"user" : "Tenpura", "user" : "Tenpura",
"views" : int, "views" : int,

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2020-2021 Mike Fährmann # Copyright 2020-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -22,12 +22,12 @@ class HentaihandGalleryExtractor(GalleryExtractor):
(("https://hentaihand.com/en/comic/c75-takumi-na-muchi-choudenji-hou-" (("https://hentaihand.com/en/comic/c75-takumi-na-muchi-choudenji-hou-"
"no-aishi-kata-how-to-love-a-super-electromagnetic-gun-toaru-kagaku-" "no-aishi-kata-how-to-love-a-super-electromagnetic-gun-toaru-kagaku-"
"no-railgun-english"), { "no-railgun-english"), {
"pattern": r"https://cdn.hentaihand.com/.*/images/360468/\d+.jpg$", "pattern": r"https://cdn.hentaihand.com/.*/images/37387/\d+.jpg$",
"count": 50, "count": 50,
"keyword": { "keyword": {
"artists" : ["Takumi Na Muchi"], "artists" : ["Takumi Na Muchi"],
"date" : "dt:2014-06-28 00:00:00", "date" : "dt:2014-06-28 00:00:00",
"gallery_id": 360468, "gallery_id": 37387,
"lang" : "en", "lang" : "en",
"language" : "English", "language" : "English",
"parodies" : ["Toaru Kagaku No Railgun"], "parodies" : ["Toaru Kagaku No Railgun"],

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2021 Mike Fährmann # Copyright 2016-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -36,13 +36,13 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
test = ( test = (
("https://www.imagefap.com/pictures/7102714", { ("https://www.imagefap.com/pictures/7102714", {
"pattern": r"https://cdnh\.imagefap\.com" "pattern": r"https://cdnh?\.imagefap\.com"
r"/images/full/\d+/\d+/\d+\.jpg", r"/images/full/\d+/\d+/\d+\.jpg",
"keyword": "2ba96e84c2952c4750e9fa94a3f2b1f965cec2f3", "keyword": "2ba96e84c2952c4750e9fa94a3f2b1f965cec2f3",
"content": "694a0a57385980a6f90fbc296cadcd6c11ba2dab", "content": "694a0a57385980a6f90fbc296cadcd6c11ba2dab",
}), }),
("https://www.imagefap.com/gallery/5486966", { ("https://www.imagefap.com/gallery/5486966", {
"pattern": r"https://cdnh\.imagefap\.com" "pattern": r"https://cdnh?\.imagefap\.com"
r"/images/full/\d+/\d+/\d+\.jpg", r"/images/full/\d+/\d+/\d+\.jpg",
"keyword": "3e24eace5b09639b881ebd393165862feb46adde", "keyword": "3e24eace5b09639b881ebd393165862feb46adde",
}), }),
@@ -107,7 +107,7 @@ class ImagefapImageExtractor(ImagefapExtractor):
pattern = BASE_PATTERN + r"/photo/(\d+)" pattern = BASE_PATTERN + r"/photo/(\d+)"
test = ( test = (
("https://www.imagefap.com/photo/1369341772/", { ("https://www.imagefap.com/photo/1369341772/", {
"pattern": r"https://cdnh\.imagefap\.com" "pattern": r"https://cdnh?\.imagefap\.com"
r"/images/full/\d+/\d+/\d+\.jpg", r"/images/full/\d+/\d+/\d+\.jpg",
"keyword": "8894e45f7262020d8d66ce59917315def1fc475b", "keyword": "8894e45f7262020d8d66ce59917315def1fc475b",
}), }),
@@ -156,10 +156,10 @@ class ImagefapUserExtractor(ImagefapExtractor):
r"|usergallery\.php\?userid=(\d+))") r"|usergallery\.php\?userid=(\d+))")
test = ( test = (
("https://www.imagefap.com/profile/LucyRae/galleries", { ("https://www.imagefap.com/profile/LucyRae/galleries", {
"url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd", "url": "822cb6cbb6f474ca2d0f58d1d6d253bc2338937a",
}), }),
("https://www.imagefap.com/usergallery.php?userid=1862791", { ("https://www.imagefap.com/usergallery.php?userid=1862791", {
"url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd", "url": "822cb6cbb6f474ca2d0f58d1d6d253bc2338937a",
}), }),
("https://www.imagefap.com/profile.php?user=LucyRae"), ("https://www.imagefap.com/profile.php?user=LucyRae"),
("https://beta.imagefap.com/profile.php?user=LucyRae"), ("https://beta.imagefap.com/profile.php?user=LucyRae"),

View File

@@ -186,11 +186,17 @@ class InstagramExtractor(Extractor):
media = next(self._media_by_id(post["id"])) media = next(self._media_by_id(post["id"]))
return self._parse_post_api(media) return self._parse_post_api(media)
pinned = post.get("pinned_for_users", ())
if pinned:
for index, user in enumerate(pinned):
pinned[index] = int(user["id"])
owner = post["owner"] owner = post["owner"]
data = { data = {
"typename" : typename, "typename" : typename,
"date" : text.parse_timestamp(post["taken_at_timestamp"]), "date" : text.parse_timestamp(post["taken_at_timestamp"]),
"likes" : post["edge_media_preview_like"]["count"], "likes" : post["edge_media_preview_like"]["count"],
"pinned" : pinned,
"owner_id" : owner["id"], "owner_id" : owner["id"],
"username" : owner.get("username"), "username" : owner.get("username"),
"fullname" : owner.get("full_name"), "fullname" : owner.get("full_name"),
@@ -263,6 +269,7 @@ class InstagramExtractor(Extractor):
"post_id" : post["pk"], "post_id" : post["pk"],
"post_shortcode": post["code"], "post_shortcode": post["code"],
"likes": post["like_count"], "likes": post["like_count"],
"pinned": post.get("timeline_pinned_user_ids", ()),
} }
caption = post["caption"] caption = post["caption"]
@@ -392,6 +399,8 @@ class InstagramExtractor(Extractor):
self.log.debug("Cursor: %s", self._cursor) self.log.debug("Cursor: %s", self._cursor)
def _pagination_api(self, endpoint, params=None): def _pagination_api(self, endpoint, params=None):
if params is None:
params = {}
while True: while True:
data = self._request_api(endpoint, params=params) data = self._request_api(endpoint, params=params)
yield from data["items"] yield from data["items"]
@@ -502,7 +511,7 @@ class InstagramChannelExtractor(InstagramExtractor):
class InstagramSavedExtractor(InstagramExtractor): class InstagramSavedExtractor(InstagramExtractor):
"""Extractor for ProfilePage saved media""" """Extractor for ProfilePage saved media"""
subcategory = "saved" subcategory = "saved"
pattern = USER_PATTERN + r"/saved" pattern = USER_PATTERN + r"/saved/?$"
test = ("https://www.instagram.com/instagram/saved/",) test = ("https://www.instagram.com/instagram/saved/",)
def posts(self): def posts(self):
@@ -511,6 +520,30 @@ class InstagramSavedExtractor(InstagramExtractor):
return self._pagination_graphql(query_hash, variables) return self._pagination_graphql(query_hash, variables)
class InstagramCollectionExtractor(InstagramExtractor):
    """Extractor for ProfilePage saved collection media"""
    subcategory = "collection"
    pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)"
    test = (
        "https://www.instagram.com/instagram/saved/collection_name/123456789/",
    )

    def __init__(self, match):
        InstagramExtractor.__init__(self, match)
        # the URL pattern captures user, collection name and collection ID
        user, name, collection_id = match.groups()
        self.user = user
        self.collection_name = name
        self.collection_id = collection_id

    def metadata(self):
        """Return the collection's ID and its unescaped name"""
        collection_name = text.unescape(self.collection_name)
        return {
            "collection_id"  : self.collection_id,
            "collection_name": collection_name,
        }

    def posts(self):
        """Yield the "media" object of every item in the collection feed"""
        endpoint = "/v1/feed/collection/{}/posts/".format(self.collection_id)
        yield from (
            item["media"]
            for item in self._pagination_api(endpoint)
        )
class InstagramTagExtractor(InstagramExtractor): class InstagramTagExtractor(InstagramExtractor):
"""Extractor for TagPage""" """Extractor for TagPage"""
subcategory = "tag" subcategory = "tag"

View File

@@ -34,11 +34,19 @@ class ItakuExtractor(Extractor):
for post in self.posts(): for post in self.posts():
post["date"] = text.parse_datetime( post["date"] = text.parse_datetime(
post["date_added"], "%Y-%m-%dT%H:%M:%S.%f") post["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
for category, tags in post.pop("categorized_tags").items(): for category, tags in post.pop("categorized_tags").items():
post["tags_" + category.lower()] = [t["name"] for t in tags] post["tags_" + category.lower()] = [t["name"] for t in tags]
post["tags"] = [t["name"] for t in post["tags"]] post["tags"] = [t["name"] for t in post["tags"]]
post["sections"] = [s["title"] for s in post["sections"]]
sections = []
for s in post["sections"]:
group = s["group"]
if group:
sections.append(group["title"] + "/" + s["title"])
else:
sections.append(s["title"])
post["sections"] = sections
if post["video"] and self.videos: if post["video"] and self.videos:
url = post["video"]["video"] url = post["video"]["video"]
@@ -79,12 +87,13 @@ class ItakuImageExtractor(ItakuExtractor):
"is_blacklisted": False "is_blacklisted": False
}, },
"can_reshare": True, "can_reshare": True,
"date": "dt:2022-05-05 19:21:17",
"date_added": "2022-05-05T19:21:17.674148Z", "date_added": "2022-05-05T19:21:17.674148Z",
"date_edited": "2022-05-25T14:37:46.220612Z", "date_edited": "2022-05-25T14:37:46.220612Z",
"description": "sketch from drawpile", "description": "sketch from drawpile",
"extension": "png", "extension": "png",
"filename": "220504_oUNIAFT", "filename": "220504_oUNIAFT",
"hotness_score": 11507.4691939, "hotness_score": float,
"id": 100471, "id": 100471,
"image": "https://d1wmr8tlk3viaj.cloudfront.net/gallery_imgs" "image": "https://d1wmr8tlk3viaj.cloudfront.net/gallery_imgs"
"/220504_oUNIAFT.png", "/220504_oUNIAFT.png",
@@ -102,7 +111,7 @@ class ItakuImageExtractor(ItakuExtractor):
"owner_displayname": "Piku", "owner_displayname": "Piku",
"owner_username": "piku", "owner_username": "piku",
"reshared_by_you": False, "reshared_by_you": False,
"sections": ["Miku"], "sections": ["Fanart/Miku"],
"tags": list, "tags": list,
"tags_character": ["hatsune_miku"], "tags_character": ["hatsune_miku"],
"tags_copyright": ["vocaloid"], "tags_copyright": ["vocaloid"],
@@ -152,10 +161,10 @@ class ItakuAPI():
return self._pagination(endpoint, params, self.image) return self._pagination(endpoint, params, self.image)
def image(self, image_id): def image(self, image_id):
endpoint = "/galleries/images/" + str(image_id) endpoint = "/galleries/images/{}/".format(image_id)
return self._call(endpoint) return self._call(endpoint)
@memcache() @memcache(keyarg=1)
def user(self, username): def user(self, username):
return self._call("/user_profiles/{}/".format(username)) return self._call("/user_profiles/{}/".format(username))

View File

@@ -96,12 +96,14 @@ class KemonopartyExtractor(Extractor):
post["num"] += 1 post["num"] += 1
post["_http_headers"] = headers post["_http_headers"] = headers
text.nameext_from_url(file.get("name", url), post)
if not post["extension"]:
post["extension"] = text.ext_from_url(url)
if url[0] == "/": if url[0] == "/":
url = self.root + "/data" + url url = self.root + "/data" + url
elif url.startswith(self.root): elif url.startswith(self.root):
url = self.root + "/data" + url[20:] url = self.root + "/data" + url[20:]
text.nameext_from_url(file.get("name", url), post)
yield Message.Url, url, post yield Message.Url, url, post
def login(self): def login(self):
@@ -377,12 +379,15 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
for post["num"], file in enumerate(files, 1): for post["num"], file in enumerate(files, 1):
post["type"] = file["type"] post["type"] = file["type"]
url = file["path"] url = file["path"]
text.nameext_from_url(file.get("name", url), post)
if not post["extension"]:
post["extension"] = text.ext_from_url(url)
if url[0] == "/": if url[0] == "/":
url = self.root + "/data" + url url = self.root + "/data" + url
elif url.startswith(self.root): elif url.startswith(self.root):
url = self.root + "/data" + url[20:] url = self.root + "/data" + url[20:]
text.nameext_from_url(file["name"], post)
yield Message.Url, url, post yield Message.Url, url, post
def posts(self): def posts(self):

View File

@@ -30,7 +30,8 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
"album": { "album": {
"count": 1, "count": 1,
"date": "Sep 18th, 2016", "date": "Sep 18th, 2016",
"name": "Horizon Riders (Wii)", "name": "Horizon Riders",
"platform": "Wii",
"size": 26214400, "size": 26214400,
"type": "Gamerip", "type": "Gamerip",
}, },
@@ -60,6 +61,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
extr = text.extract_from(page) extr = text.extract_from(page)
return {"album": { return {"album": {
"name" : text.unescape(extr("<h2>", "<")), "name" : text.unescape(extr("<h2>", "<")),
"platform": extr("Platforms: <a", "<").rpartition(">")[2],
"count": text.parse_int(extr("Number of Files: <b>", "<")), "count": text.parse_int(extr("Number of Files: <b>", "<")),
"size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]), "size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]),
"date" : extr("Date Added: <b>", "<"), "date" : extr("Date Added: <b>", "<"),

View File

@@ -1,22 +1,24 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2018-2021 Mike Fährmann # Copyright 2018-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extract manga-chapters and entire manga from https://komikcast.com/""" """Extractors for https://komikcast.me/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text from .. import text
import re import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:me|com)"
class KomikcastBase(): class KomikcastBase():
"""Base class for komikcast extractors""" """Base class for komikcast extractors"""
category = "komikcast" category = "komikcast"
root = "https://komikcast.com" root = "https://komikcast.me"
@staticmethod @staticmethod
def parse_chapter_string(chapter_string, data=None): def parse_chapter_string(chapter_string, data=None):
@@ -44,18 +46,18 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
"""Extractor for manga-chapters from komikcast.com""" """Extractor for manga-chapters from komikcast.me"""
pattern = r"(?:https?://)?(?:www\.)?komikcast\.com(/chapter/[^/?#]+/)" pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
test = ( test = (
(("https://komikcast.com/chapter/" (("https://komikcast.me/chapter"
"apotheosis-chapter-02-2-bahasa-indonesia/"), { "/apotheosis-chapter-02-2-bahasa-indonesia/"), {
"url": "f6b43fbc027697749b3ea1c14931c83f878d7936", "url": "74eca5c9b27b896816497f9b2d847f2a1fcfc209",
"keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4", "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4",
}), }),
(("https://komikcast.com/chapter/" (("https://komikcast.me/chapter"
"solo-spell-caster-chapter-37-bahasa-indonesia/"), { "/soul-land-ii-chapter-300-1-bahasa-indonesia/"), {
"url": "c3d30de6c796ff6ff36eb86e2e6fa2f8add8e829", "url": "243a5250e210b40d17217e83b7547cefea5638bd",
"keyword": "ed8a0ff73098776988bf66fb700381a2c748f910", "keyword": "cb646cfed3d45105bd645ab38b2e9f7d8c436436",
}), }),
) )
@@ -74,16 +76,15 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
"""Extractor for manga from komikcast.com""" """Extractor for manga from komikcast.me"""
chapterclass = KomikcastChapterExtractor chapterclass = KomikcastChapterExtractor
pattern = (r"(?:https?://)?(?:www\.)?komikcast\.com" pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
r"(/(?:komik/)?[^/?#]+)/?$")
test = ( test = (
("https://komikcast.com/komik/090-eko-to-issho/", { ("https://komikcast.me/komik/090-eko-to-issho/", {
"url": "dc798d107697d1f2309b14ca24ca9dba30c6600f", "url": "08204f0a703ec5272121abcf0632ecacba1e588f",
"keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1", "keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1",
}), }),
("https://komikcast.com/tonari-no-kashiwagi-san/"), ("https://komikcast.me/tonari-no-kashiwagi-san/"),
) )
def chapters(self, page): def chapters(self, page):

View File

@@ -20,10 +20,6 @@ class LolisafeExtractor(BaseExtractor):
BASE_PATTERN = LolisafeExtractor.update({ BASE_PATTERN = LolisafeExtractor.update({
"bunkr": {
"root": "https://app.bunkr.is",
"pattern": r"(?:app\.)?bunkr\.(?:is|to)",
},
"zzzz" : { "zzzz" : {
"root": "https://zz.ht", "root": "https://zz.ht",
"pattern": r"zz\.(?:ht|fo)", "pattern": r"zz\.(?:ht|fo)",
@@ -35,25 +31,6 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
subcategory = "album" subcategory = "album"
pattern = BASE_PATTERN + "/a/([^/?#]+)" pattern = BASE_PATTERN + "/a/([^/?#]+)"
test = ( test = (
("https://app.bunkr.is/a/Lktg9Keq", {
"pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
"keyword": {
"album_id": "Lktg9Keq",
"album_name": 'test テスト "&>',
"count": 1,
"filename": 'test-テスト-"&>-QjgneIQv',
"id": "QjgneIQv",
"name": 'test-テスト-"&>',
"num": int,
},
}),
# mp4 (#2239)
("https://bunkr.is/a/ptRHaCn2", {
"pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
"content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
}),
("https://bunkr.to/a/Lktg9Keq"),
("https://zz.ht/a/lop7W6EZ", { ("https://zz.ht/a/lop7W6EZ", {
"pattern": r"https://z\.zz\.fo/(4anuY|ih560)\.png", "pattern": r"https://z\.zz\.fo/(4anuY|ih560)\.png",
"count": 2, "count": 2,
@@ -71,11 +48,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
domain = self.config("domain") domain = self.config("domain")
if domain is None or domain == "auto": if domain is None or domain == "auto":
if self.category == "bunkr": self.root = text.root_from_url(match.group(0))
self.root = "https://app.bunkr.is"
else:
self.root = text.root_from_url(match.group(0))
else: else:
self.root = text.ensure_http_scheme(domain) self.root = text.ensure_http_scheme(domain)
@@ -89,10 +62,6 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
data["_fallback"] = file["_fallback"] data["_fallback"] = file["_fallback"]
text.nameext_from_url(url, data) text.nameext_from_url(url, data)
data["name"], sep, data["id"] = data["filename"].rpartition("-") data["name"], sep, data["id"] = data["filename"].rpartition("-")
if data["extension"] == "mp4":
url = url.replace(
"//cdn.bunkr.is/", "//media-files.bunkr.is/", 1)
yield Message.Url, url, data yield Message.Url, url, data
def fetch_album(self, album_id): def fetch_album(self, album_id):

View File

@@ -95,9 +95,9 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor):
r"(/manga/[^/?#]+/?)(?:#.*)?$") r"(/manga/[^/?#]+/?)(?:#.*)?$")
test = ( test = (
("https://www.mangahere.cc/manga/aria/", { ("https://www.mangahere.cc/manga/aria/", {
"url": "dc7f8954efbe87d9fd670c54e5edb5230c01f767", "url": "9c2e54ec42e9a87ad53096c328b33c90750af3e4",
"keyword": "864524eed2dc6a73e366f6ba400b80d894f99b5a", "keyword": "71503c682c5d0c277a50409a8c5fd78e871e3d69",
"count": 69, "count": 71,
}), }),
("https://www.mangahere.cc/manga/hiyokoi/#50", { ("https://www.mangahere.cc/manga/hiyokoi/#50", {
"url": "654850570aa03825cd57e2ae2904af489602c523", "url": "654850570aa03825cd57e2ae2904af489602c523",

View File

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2020 Jake Mannens # Copyright 2020 Jake Mannens
# Copyright 2021 Mike Fährmann # Copyright 2021-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -13,30 +13,27 @@ from .common import ChapterExtractor, MangaExtractor
from .. import text from .. import text
import re import re
BASE_PATTERN = r"(?:https?://)?(?:ww[\dw]?\.)?mangakakalot\.tv"
class MangakakalotBase(): class MangakakalotBase():
"""Base class for mangakakalot extractors""" """Base class for mangakakalot extractors"""
category = "mangakakalot" category = "mangakakalot"
root = "https://ww.mangakakalot.tv" root = "https://ww3.mangakakalot.tv"
class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor):
"""Extractor for manga chapters from mangakakalot.tv""" """Extractor for manga chapters from mangakakalot.tv"""
pattern = (r"(?:https?://)?(?:www?\.)?mangakakalot\.tv" pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/chapter[_-][^/?#]+)"
r"(/chapter/[^/?#]+/chapter[_-][^/?#]+)")
test = ( test = (
("https://ww.mangakakalot.tv/chapter/manga-hl984546/chapter-6", { ("https://ww3.mangakakalot.tv/chapter/manga-jk986845/chapter-34.2", {
"pattern": r"https://cm\.blazefast\.co" "pattern": r"https://cm\.blazefast\.co"
r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.jpg", r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.jpg",
"keyword": "e9646a76a210f1eb4a71b4134664814c99d65d48", "keyword": "0f1586ff52f0f9cbbb25306ae64ab718f8a6a633",
"count": 14, "count": 9,
}),
(("https://mangakakalot.tv/chapter"
"/hatarakanai_futari_the_jobless_siblings/chapter_20.1"), {
"keyword": "14c430737ff600b26a3811815905f34dd6a6c8c6",
"content": "b3eb1f139caef98d9dcd8ba6a5ee146a13deebc4",
"count": 2,
}), }),
("https://mangakakalot.tv/chapter"
"/hatarakanai_futari_the_jobless_siblings/chapter_20.1"),
) )
def __init__(self, match): def __init__(self, match):
@@ -78,16 +75,13 @@ class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor):
class MangakakalotMangaExtractor(MangakakalotBase, MangaExtractor): class MangakakalotMangaExtractor(MangakakalotBase, MangaExtractor):
"""Extractor for manga from mangakakalot.tv""" """Extractor for manga from mangakakalot.tv"""
chapterclass = MangakakalotChapterExtractor chapterclass = MangakakalotChapterExtractor
pattern = (r"(?:https?://)?(?:www?\.)?mangakakalot\.tv" pattern = BASE_PATTERN + r"(/manga/[^/?#]+)"
r"(/manga/[^/?#]+)")
test = ( test = (
("https://ww.mangakakalot.tv/manga/lk921810", { ("https://ww3.mangakakalot.tv/manga/manga-jk986845", {
"url": "654d040c17728c9c8756fce7092b084e8dcf67d2",
}),
("https://mangakakalot.tv/manga/manga-jk986845", {
"pattern": MangakakalotChapterExtractor.pattern, "pattern": MangakakalotChapterExtractor.pattern,
"count": ">= 30", "count": ">= 30",
}), }),
("https://mangakakalot.tv/manga/lk921810"),
) )
def chapters(self, page): def chapters(self, page):

View File

@@ -21,16 +21,12 @@ class ManganeloChapterExtractor(ChapterExtractor):
pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)" pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)"
test = ( test = (
("https://readmanganato.com/manga-gn983696/chapter-23", { ("https://readmanganato.com/manga-gn983696/chapter-23", {
"pattern": r"https://s\d+\.\w+\.com/mangakakalot/g\d+/gq921227/" "pattern": r"https://v\d+\.mkklcdnv6tempv5\.com/img/tab_17/03/23"
r"vol3_chapter_23_24_yen/\d+\.jpg", r"/39/gn983696/vol_3_chapter_23_24_yen/\d+-[no]\.jpg",
"keyword": "3748087cf41abc97f991530e6fd53b291490d6d0", "keyword": "2c5cd59342f149375df9bcb50aa416b4d04a43cf",
"count": 25, "count": 25,
}), }),
("https://manganelo.com/chapter/gamers/chapter_15", { ("https://manganelo.com/chapter/gamers/chapter_15"),
"keyword": "8f59f88d516247011fe122e05746c27e203c8191",
"content": "fbec629c71f66b246bfa0604204407c0d1c8ae38",
"count": 39,
}),
("https://manganelo.com/chapter/gq921227/chapter_23"), ("https://manganelo.com/chapter/gq921227/chapter_23"),
) )
@@ -81,14 +77,11 @@ class ManganeloMangaExtractor(MangaExtractor):
chapterclass = ManganeloChapterExtractor chapterclass = ManganeloChapterExtractor
pattern = BASE_PATTERN + r"(/(?:manga[-/]|read_)\w+)/?$" pattern = BASE_PATTERN + r"(/(?:manga[-/]|read_)\w+)/?$"
test = ( test = (
("https://manganato.com/manga-gu983703", { ("https://readmanganato.com/manga-gn983696", {
"pattern": ManganeloChapterExtractor.pattern, "pattern": ManganeloChapterExtractor.pattern,
"count": ">= 70", "count": ">= 25",
}),
("https://manganelo.com/manga/read_otome_no_teikoku", {
"pattern": ManganeloChapterExtractor.pattern,
"count": ">= 40",
}), }),
("https://manganelo.com/manga/read_otome_no_teikoku"),
("https://manganelo.com/manga/ol921234/"), ("https://manganelo.com/manga/ol921234/"),
) )

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2021 Mike Fährmann # Copyright 2015-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -121,8 +121,8 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
r"(/manga/[^/?#]+)/?$") r"(/manga/[^/?#]+)/?$")
test = ( test = (
("https://mangapark.net/manga/aria", { ("https://mangapark.net/manga/aria", {
"url": "b8f7db2f581404753c4af37af66c049a41273b94", "url": "51c6d82aed5c3c78e0d3f980b09a998e6a2a83ee",
"keyword": "2c0d28efaf84fcfe62932b6931ef3c3987cd48c0", "keyword": "cabc60cf2efa82749d27ac92c495945961e4b73c",
}), }),
("https://mangapark.me/manga/aria"), ("https://mangapark.me/manga/aria"),
("https://mangapark.com/manga/aria"), ("https://mangapark.com/manga/aria"),

View File

@@ -179,12 +179,11 @@ class MastodonAPI():
try: try:
access_token = INSTANCES[extractor.category]["access-token"] access_token = INSTANCES[extractor.category]["access-token"]
except (KeyError, TypeError): except (KeyError, TypeError):
raise exception.StopExtraction( pass
"Missing access token.\n" if access_token:
"Run 'gallery-dl oauth:mastodon:%s' to obtain one.", self.headers = {"Authorization": "Bearer " + access_token}
extractor.instance) else:
self.headers = None
self.headers = {"Authorization": "Bearer " + access_token}
def account_id_by_username(self, username): def account_id_by_username(self, username):
if username.startswith("id:"): if username.startswith("id:"):
@@ -232,6 +231,11 @@ class MastodonAPI():
if code < 400: if code < 400:
return response return response
if code == 401:
raise exception.StopExtraction(
"Invalid or missing access token.\n"
"Run 'gallery-dl oauth:mastodon:%s' to obtain one.",
self.extractor.instance)
if code == 404: if code == 404:
raise exception.NotFoundError() raise exception.NotFoundError()
if code == 429: if code == 429:

View File

@@ -4,7 +4,7 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extract hentai-gallery from https://myhentaigallery.com/""" """Extractors for https://myhentaigallery.com/"""
from .common import GalleryExtractor from .common import GalleryExtractor
from .. import text, exception from .. import text, exception
@@ -18,8 +18,8 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
r"/gallery/(?:thumbnails|show)/(\d+)") r"/gallery/(?:thumbnails|show)/(\d+)")
test = ( test = (
("https://myhentaigallery.com/gallery/thumbnails/16247", { ("https://myhentaigallery.com/gallery/thumbnails/16247", {
"pattern": r"https://images.myhentaigrid.com/imagesgallery/images" "pattern": r"https://images.myhentaicomics\.com/imagesgallery"
r"/[^/]+/original/\d+\.jpg", r"/images/[^/]+/original/\d+\.jpg",
"keyword": { "keyword": {
"artist" : list, "artist" : list,
"count" : 11, "count" : 11,

View File

@@ -72,6 +72,8 @@ class NewgroundsExtractor(Extractor):
"""Return general metadata""" """Return general metadata"""
def login(self): def login(self):
if self._check_cookies(self.cookienames):
return
username, password = self._get_auth_info() username, password = self._get_auth_info()
if username: if username:
self._update_cookies(self._login_impl(username, password)) self._update_cookies(self._login_impl(username, password))
@@ -81,10 +83,13 @@ class NewgroundsExtractor(Extractor):
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
url = self.root + "/passport/" url = self.root + "/passport/"
page = self.request(url).text response = self.request(url)
headers = {"Origin": self.root, "Referer": url} if response.history and response.url.endswith("/social"):
return self.session.cookies
url = text.urljoin(self.root, text.extract(page, 'action="', '"')[0]) headers = {"Origin": self.root, "Referer": url}
url = text.urljoin(self.root, text.extract(
response.text, 'action="', '"')[0])
data = { data = {
"username": username, "username": username,
"password": password, "password": password,
@@ -260,7 +265,11 @@ class NewgroundsExtractor(Extractor):
msg = ", ".join(text.unescape(e) for e in data["errors"]) msg = ", ".join(text.unescape(e) for e in data["errors"])
raise exception.StopExtraction(msg) raise exception.StopExtraction(msg)
for year, items in data["items"].items(): items = data.get("items")
if not items:
return
for year, items in items.items():
for item in items: for item in items:
page_url = text.extract(item, 'href="', '"')[0] page_url = text.extract(item, 'href="', '"')[0]
if page_url[0] == "/": if page_url[0] == "/":

View File

@@ -193,25 +193,28 @@ class NozomiSearchExtractor(NozomiExtractor):
return {"search_tags": self.tags} return {"search_tags": self.tags}
def posts(self): def posts(self):
index = None result = None
result = set() positive = []
negative = []
def nozomi(path): def nozomi(path):
url = "https://j.nozomi.la/" + path + ".nozomi" url = "https://j.nozomi.la/" + path + ".nozomi"
return decode_nozomi(self.request(url).content) return decode_nozomi(self.request(url).content)
for tag in self.tags: for tag in self.tags:
tag = tag.replace("/", "") (negative if tag[0] == "-" else positive).append(
if tag[0] == "-": tag.replace("/", ""))
if not index:
index = set(nozomi("index"))
items = index.difference(nozomi("nozomi/" + tag[1:]))
else:
items = nozomi("nozomi/" + tag)
if result: for tag in positive:
result.intersection_update(items) ids = nozomi("nozomi/" + tag)
if result is None:
result = set(ids)
else: else:
result.update(items) result.intersection_update(ids)
return sorted(result, reverse=True) if result is None:
result = set(nozomi("index"))
for tag in negative:
result.difference_update(nozomi("nozomi/" + tag[1:]))
return sorted(result, reverse=True) if result else ()

View File

@@ -118,11 +118,11 @@ class PhilomenaPostExtractor(PhilomenaExtractor):
"source_url": "https://www.deviantart.com/speccysy/art" "source_url": "https://www.deviantart.com/speccysy/art"
"/Afternoon-Flight-215193985", "/Afternoon-Flight-215193985",
"spoilered": False, "spoilered": False,
"tag_count": 42, "tag_count": int,
"tag_ids": list, "tag_ids": list,
"tags": list, "tags": list,
"thumbnails_generated": True, "thumbnails_generated": True,
"updated_at": "2021-09-30T20:04:01Z", "updated_at": "2022-04-25T09:30:57Z",
"uploader": "Clover the Clever", "uploader": "Clover the Clever",
"uploader_id": 211188, "uploader_id": 211188,
"upvotes": int, "upvotes": int,

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2018-2021 Mike Fährmann # Copyright 2018-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -108,7 +108,7 @@ class PiczelImageExtractor(PiczelExtractor):
pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)" pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)"
test = ("https://piczel.tv/gallery/image/7807", { test = ("https://piczel.tv/gallery/image/7807", {
"pattern": r"https://(\w+\.)?piczel\.tv/static/uploads/gallery_image" "pattern": r"https://(\w+\.)?piczel\.tv/static/uploads/gallery_image"
r"/32920/image/7807/25737334-Lulena\.png", r"/32920/image/7807/1532236438-Lulena\.png",
"content": "df9a053a24234474a19bce2b7e27e0dec23bff87", "content": "df9a053a24234474a19bce2b7e27e0dec23bff87",
"keyword": { "keyword": {
"created_at": "2018-07-22T05:13:58.000Z", "created_at": "2018-07-22T05:13:58.000Z",

View File

@@ -35,6 +35,12 @@ class PinterestExtractor(Extractor):
yield Message.Directory, data yield Message.Directory, data
for pin in self.pins(): for pin in self.pins():
if isinstance(pin, tuple):
url, data = pin
yield Message.Queue, url, data
continue
pin.update(data) pin.update(data)
carousel_data = pin.get("carousel_data") carousel_data = pin.get("carousel_data")
@@ -170,14 +176,17 @@ class PinterestBoardExtractor(PinterestExtractor):
def pins(self): def pins(self):
board = self.board board = self.board
pins = self.api.board_pins(board["id"])
if board["section_count"] and self.config("sections", True): if board["section_count"] and self.config("sections", True):
pins = [self.api.board_pins(board["id"])] base = "{}/{}/{}/id:".format(
for section in self.api.board_sections(board["id"]): self.root, board["owner"]["username"], board["name"])
pins.append(self.api.board_section_pins(section["id"])) data = {"_extractor": PinterestSectionExtractor}
return itertools.chain.from_iterable(pins) sections = [(base + section["id"], data)
else: for section in self.api.board_sections(board["id"])]
return self.api.board_pins(board["id"]) pins = itertools.chain(pins, sections)
return pins
class PinterestUserExtractor(PinterestExtractor): class PinterestUserExtractor(PinterestExtractor):
@@ -245,8 +254,12 @@ class PinterestSectionExtractor(PinterestExtractor):
self.section = None self.section = None
def metadata(self): def metadata(self):
section = self.section = self.api.board_section( if self.section_slug.startswith("id:"):
self.user, self.board_slug, self.section_slug) section = self.section = self.api.board_section(
self.section_slug[3:])
else:
section = self.section = self.api.board_section_by_name(
self.user, self.board_slug, self.section_slug)
section.pop("preview_pins", None) section.pop("preview_pins", None)
return {"board": section.pop("board"), "section": section} return {"board": section.pop("board"), "section": section}
@@ -398,8 +411,13 @@ class PinterestAPI():
options = {"board_id": board_id} options = {"board_id": board_id}
return self._pagination("BoardFeed", options) return self._pagination("BoardFeed", options)
def board_section(self, user, board_slug, section_slug): def board_section(self, section_id):
"""Yield a specific board section""" """Yield a specific board section"""
options = {"section_id": section_id}
return self._call("BoardSection", options)["resource_response"]["data"]
def board_section_by_name(self, user, board_slug, section_slug):
"""Yield a board section by name"""
options = {"board_slug": board_slug, "section_slug": section_slug, options = {"board_slug": board_slug, "section_slug": section_slug,
"username": user} "username": user}
return self._call("BoardSection", options)["resource_response"]["data"] return self._call("BoardSection", options)["resource_response"]["data"]

View File

@@ -238,7 +238,7 @@ class PixivAvatarExtractor(PixivExtractor):
class PixivBackgroundExtractor(PixivExtractor): class PixivBackgroundExtractor(PixivExtractor):
"""Extractor for pixiv background banners""" """Extractor for pixiv background banners"""
subcategory = "background" subcategory = "background"
filename_fmt = "background{date?_//:%Y-%m-%d}.{extension}" filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}"
archive_fmt = "background_{user[id]}_{date}" archive_fmt = "background_{user[id]}_{date}"
pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
r"/(?:en/)?users/(\d+)/background") r"/(?:en/)?users/(\d+)/background")

View File

@@ -50,8 +50,9 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
subcategory = "issue" subcategory = "issue"
pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?)([^#]+)" pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?)([^#]+)"
test = ("https://readcomiconline.li/Comic/W-i-t-c-h/Issue-130?id=22289", { test = ("https://readcomiconline.li/Comic/W-i-t-c-h/Issue-130?id=22289", {
"url": "30d29c5afc65043bfd384c010257ec2d0ecbafa6", "pattern": r"https://2\.bp\.blogspot\.com/[\w-]+=s0\?.+",
"keyword": "2d9ec81ce1b11fac06ebf96ce33cdbfca0e85eb5", "keyword": "2d9ec81ce1b11fac06ebf96ce33cdbfca0e85eb5",
"count": 36,
}) })
def __init__(self, match): def __init__(self, match):

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2021 Mike Fährmann # Copyright 2014-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -49,6 +49,8 @@ class SankakuExtractor(BooruExtractor):
self.log.warning( self.log.warning(
"Login required to download 'contentious_content' posts") "Login required to download 'contentious_content' posts")
SankakuExtractor._warning = False SankakuExtractor._warning = False
elif url[8] == "v":
url = "https://s.sankakucomplex.com" + url[url.index("/", 8):]
return url return url
@staticmethod @staticmethod
@@ -79,8 +81,8 @@ class SankakuTagExtractor(SankakuExtractor):
test = ( test = (
("https://sankaku.app/?tags=bonocho", { ("https://sankaku.app/?tags=bonocho", {
"count": 5, "count": 5,
"pattern": r"https://v\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" "pattern": r"https://s\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
r"/[^/]{32}\.\w+\?e=\d+&expires=\d+&m=[^&#]+", r"/[^/]{32}\.\w+\?e=\d+&(expires=\d+&)?m=[^&#]+",
}), }),
("https://beta.sankakucomplex.com/?tags=bonocho"), ("https://beta.sankakucomplex.com/?tags=bonocho"),
("https://chan.sankakucomplex.com/?tags=bonocho"), ("https://chan.sankakucomplex.com/?tags=bonocho"),
@@ -160,7 +162,7 @@ class SankakuPostExtractor(SankakuExtractor):
}), }),
# 'contentious_content' # 'contentious_content'
("https://sankaku.app/post/show/21418978", { ("https://sankaku.app/post/show/21418978", {
"pattern": r"https://v\.sankakucomplex\.com" "pattern": r"https://s\.sankakucomplex\.com"
r"/data/13/3c/133cda3bfde249c504284493903fb985\.jpg", r"/data/13/3c/133cda3bfde249c504284493903fb985\.jpg",
}), }),
# empty tags (#1617) # empty tags (#1617)

View File

@@ -23,6 +23,7 @@ class SkebExtractor(Extractor):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.user_name = match.group(1) self.user_name = match.group(1)
self.thumbnails = self.config("thumbnails", False) self.thumbnails = self.config("thumbnails", False)
self.article = self.config("article", False)
def items(self): def items(self):
for user_name, post_num in self.posts(): for user_name, post_num in self.posts():
@@ -105,6 +106,12 @@ class SkebExtractor(Extractor):
post["file_url"] = resp["og_image_url"] post["file_url"] = resp["og_image_url"]
yield post yield post
if self.article and "article_image_url" in resp:
post["content_category"] = "article"
post["file_id"] = "article"
post["file_url"] = resp["article_image_url"]
yield post
for preview in resp["previews"]: for preview in resp["previews"]:
post["content_category"] = "preview" post["content_category"] = "preview"
post["file_id"] = preview["id"] post["file_id"] = preview["id"]
@@ -134,11 +141,12 @@ class SkebPostExtractor(SkebExtractor):
"anonymous": False, "anonymous": False,
"body": "re:はじめまして。私はYouTubeにてVTuberとして活動をしている湊ラ", "body": "re:はじめまして。私はYouTubeにてVTuberとして活動をしている湊ラ",
"client": { "client": {
"avatar_url": "https://pbs.twimg.com/profile_images" "avatar_url": r"re:https://pbs.twimg.com/profile_images"
"/1471184042791895042/f0DcWFGl.jpg", r"/\d+/\w+\.jpg",
"header_url": None, "header_url": r"re:https://pbs.twimg.com/profile_banners"
r"/1375007870291300358/\d+/1500x500",
"id": 1196514, "id": 1196514,
"name": "湊ラギ", "name": str,
"screen_name": "minato_ragi", "screen_name": "minato_ragi",
}, },
"completed_at": "2022-02-27T14:03:45.442Z", "completed_at": "2022-02-27T14:03:45.442Z",
@@ -208,3 +216,30 @@ class SkebUserExtractor(SkebExtractor):
posts = itertools.chain(posts, self._pagination(url, params)) posts = itertools.chain(posts, self._pagination(url, params))
return posts return posts
class SkebFollowingExtractor(SkebExtractor):
"""Extractor for all creators followed by a skeb user"""
subcategory = "following"
pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/following_creators"
test = ("https://skeb.jp/@user/following_creators",)
def items(self):
for user in self.users():
url = "{}/@{}".format(self.root, user["screen_name"])
user["_extractor"] = SkebUserExtractor
yield Message.Queue, url, user
def users(self):
url = "{}/api/users/{}/following_creators".format(
self.root, self.user_name)
headers = {"Referer": self.root, "Authorization": "Bearer null"}
params = {"sort": "date", "offset": 0, "limit": 90}
while True:
data = self.request(url, params=params, headers=headers).json()
yield from data
if len(data) < params["limit"]:
return
params["offset"] += params["limit"]

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2019-2020 Mike Fährmann # Copyright 2019-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -36,12 +36,15 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
pattern = BASE_PATTERN + r"/albums/([^/?#]+)" pattern = BASE_PATTERN + r"/albums/([^/?#]+)"
test = ( test = (
("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", { ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", {
"url": "58bd94ebc80fd906e9879826970b408d54c6da07", "pattern": r"https://stored-cf\.slickpic\.com/NDk5MjNmYTc1MzU0MQ,,"
"keyword": "54a9d6f9e42ae43c644aa9316186fb9d9955fe53", r"/20160807/\w+/p/o/JSBFSS-\d+\.jpg",
"keyword": "c37c4ce9c54c09abc6abdf295855d46f11529cbf",
"count": 102,
}), }),
("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", { ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", {
"range": "34", "range": "34",
"content": ("cec6630e659dc72db1ee1a9a6f3b525189261988", "content": ("52b5a310587de1048030ab13a912f6a3a9cc7dab",
"cec6630e659dc72db1ee1a9a6f3b525189261988",
"6f81e1e74c6cd6db36844e7211eef8e7cd30055d", "6f81e1e74c6cd6db36844e7211eef8e7cd30055d",
"22e83645fc242bc3584eca7ec982c8a53a4d8a44"), "22e83645fc242bc3584eca7ec982c8a53a4d8a44"),
}), }),

View File

@@ -79,10 +79,9 @@ class SlidesharePresentationExtractor(GalleryExtractor):
views = extr('content="UserPageVisits:', '"') views = extr('content="UserPageVisits:', '"')
if descr.endswith(""): if descr.endswith(""):
alt_descr = extr( alt_descr = extr('id="slideshow-description-text"', '</p>')
'id="slideshow-description-text" class="notranslate">', '</p>')
if alt_descr: if alt_descr:
descr = text.remove_html(alt_descr).strip() descr = text.remove_html(alt_descr.partition(">")[2]).strip()
return { return {
"user": self.user, "user": self.user,

View File

@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2018-2021 Mike Fährmann # Copyright 2018-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extract images from https://www.smugmug.com/""" """Extractors for https://www.smugmug.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, oauth, exception from .. import text, oauth, exception
@@ -111,13 +111,13 @@ class SmugmugImageExtractor(SmugmugExtractor):
test = ( test = (
("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", { ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", {
"url": "e6408fd2c64e721fd146130dceb56a971ceb4259", "url": "e6408fd2c64e721fd146130dceb56a971ceb4259",
"keyword": "b15af021186b7234cebcac758d2a4fd8462f9912", "keyword": "460a773f5addadd3e216bda346fc524fe4eedc52",
"content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0", "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0",
}), }),
# video # video
("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", { ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
"url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee", "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee",
"keyword": "f6967cc5a46c3e130a4f8de7c5c971f72e07fe61", "keyword": "eb74e5cf6780d5152ab8f11b431ec1b17fa8f69b",
}), }),
) )

View File

@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extract images from https://www.tumblr.com/""" """Extractors for https://www.tumblr.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, oauth, exception from .. import text, oauth, exception
@@ -35,7 +35,10 @@ POST_TYPES = frozenset((
BASE_PATTERN = ( BASE_PATTERN = (
r"(?:tumblr:(?:https?://)?([^/]+)|" r"(?:tumblr:(?:https?://)?([^/]+)|"
r"(?:https?://)?([\w-]+\.tumblr\.com))") r"(?:https?://)?"
r"(?:www\.tumblr\.com/blog/(?:view/)?([\w-]+)|"
r"([\w-]+\.tumblr\.com)))"
)
class TumblrExtractor(Extractor): class TumblrExtractor(Extractor):
@@ -48,9 +51,14 @@ class TumblrExtractor(Extractor):
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.blog = match.group(1) or match.group(2)
self.api = TumblrAPI(self)
name = match.group(2)
if name:
self.blog = name + ".tumblr.com"
else:
self.blog = match.group(1) or match.group(3)
self.api = TumblrAPI(self)
self.types = self._setup_posttypes() self.types = self._setup_posttypes()
self.avatar = self.config("avatar", False) self.avatar = self.config("avatar", False)
self.inline = self.config("inline", True) self.inline = self.config("inline", True)
@@ -102,7 +110,12 @@ class TumblrExtractor(Extractor):
for photo in photos: for photo in photos:
post["photo"] = photo post["photo"] = photo
photo.update(photo["original_size"]) best_photo = photo["original_size"]
for alt_photo in photo["alt_sizes"]:
if (alt_photo["height"] > best_photo["height"] or
alt_photo["width"] > best_photo["width"]):
best_photo = alt_photo
photo.update(best_photo)
del photo["original_size"] del photo["original_size"]
del photo["alt_sizes"] del photo["alt_sizes"]
yield self._prepare_image(photo["url"], post) yield self._prepare_image(photo["url"], post)
@@ -232,6 +245,8 @@ class TumblrUserExtractor(TumblrExtractor):
("https://demo.tumblr.com/archive"), ("https://demo.tumblr.com/archive"),
("tumblr:http://www.b-authentique.com/"), ("tumblr:http://www.b-authentique.com/"),
("tumblr:www.b-authentique.com"), ("tumblr:www.b-authentique.com"),
("https://www.tumblr.com/blog/view/smarties-art"),
("https://www.tumblr.com/blog/smarties-art"),
) )
def posts(self): def posts(self):
@@ -241,7 +256,7 @@ class TumblrUserExtractor(TumblrExtractor):
class TumblrPostExtractor(TumblrExtractor): class TumblrPostExtractor(TumblrExtractor):
"""Extractor for images from a single post on tumblr""" """Extractor for images from a single post on tumblr"""
subcategory = "post" subcategory = "post"
pattern = BASE_PATTERN + r"/(?:post|image)/(\d+)" pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)"
test = ( test = (
("http://demo.tumblr.com/post/459265350", { ("http://demo.tumblr.com/post/459265350", {
"pattern": (r"https://\d+\.media\.tumblr\.com" "pattern": (r"https://\d+\.media\.tumblr\.com"
@@ -273,11 +288,12 @@ class TumblrPostExtractor(TumblrExtractor):
"exception": exception.NotFoundError, # HTML response (#297) "exception": exception.NotFoundError, # HTML response (#297)
}), }),
("http://demo.tumblr.com/image/459265350"), ("http://demo.tumblr.com/image/459265350"),
("https://www.tumblr.com/blog/view/smarties-art/686047436641353728"),
) )
def __init__(self, match): def __init__(self, match):
TumblrExtractor.__init__(self, match) TumblrExtractor.__init__(self, match)
self.post_id = match.group(3) self.post_id = match.group(4)
self.reblogs = True self.reblogs = True
self.date_min = 0 self.date_min = 0
@@ -293,14 +309,18 @@ class TumblrTagExtractor(TumblrExtractor):
"""Extractor for images from a tumblr-user by tag""" """Extractor for images from a tumblr-user by tag"""
subcategory = "tag" subcategory = "tag"
pattern = BASE_PATTERN + r"/tagged/([^/?#]+)" pattern = BASE_PATTERN + r"/tagged/([^/?#]+)"
test = ("http://demo.tumblr.com/tagged/Times%20Square", { test = (
"pattern": (r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg"), ("http://demo.tumblr.com/tagged/Times%20Square", {
"count": 1, "pattern": r"https://\d+\.media\.tumblr\.com"
}) r"/tumblr_[^/_]+_1280.jpg",
"count": 1,
}),
("https://www.tumblr.com/blog/view/smarties-art/tagged/undertale"),
)
def __init__(self, match): def __init__(self, match):
TumblrExtractor.__init__(self, match) TumblrExtractor.__init__(self, match)
self.tag = text.unquote(match.group(3).replace("-", " ")) self.tag = text.unquote(match.group(4).replace("-", " "))
def posts(self): def posts(self):
return self.api.posts(self.blog, {"tag": self.tag}) return self.api.posts(self.blog, {"tag": self.tag})
@@ -312,9 +332,12 @@ class TumblrLikesExtractor(TumblrExtractor):
directory_fmt = ("{category}", "{blog_name}", "likes") directory_fmt = ("{category}", "{blog_name}", "likes")
archive_fmt = "f_{blog[name]}_{id}_{num}" archive_fmt = "f_{blog[name]}_{id}_{num}"
pattern = BASE_PATTERN + r"/likes" pattern = BASE_PATTERN + r"/likes"
test = ("http://mikf123.tumblr.com/likes", { test = (
"count": 1, ("http://mikf123.tumblr.com/likes", {
}) "count": 1,
}),
("https://www.tumblr.com/blog/view/mikf123/likes"),
)
def posts(self): def posts(self):
return self.api.likes(self.blog) return self.api.likes(self.blog)

View File

@@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor):
"tag_ids": list, "tag_ids": list,
"tags": list, "tags": list,
"thumbnails_generated": True, "thumbnails_generated": True,
"updated_at": "2022-02-03T15:49:07.110Z", "updated_at": "2022-05-13T00:43:19.791Z",
"upvotes": int, "upvotes": int,
"view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png", "view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png",
"width": 576, "width": 576,

View File

@@ -11,6 +11,7 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import itertools
import json import json
BASE_PATTERN = ( BASE_PATTERN = (
@@ -40,7 +41,7 @@ class TwitterExtractor(Extractor):
self.quoted = self.config("quoted", False) self.quoted = self.config("quoted", False)
self.videos = self.config("videos", True) self.videos = self.config("videos", True)
self.cards = self.config("cards", False) self.cards = self.config("cards", False)
self._user_id = None self._user = self._user_obj = None
self._user_cache = {} self._user_cache = {}
self._init_sizes() self._init_sizes()
@@ -64,6 +65,11 @@ class TwitterExtractor(Extractor):
tweets = self._expand_tweets(self.tweets()) tweets = self._expand_tweets(self.tweets())
self.tweets = lambda : tweets self.tweets = lambda : tweets
if self.config("unique", True):
seen_tweets = set()
else:
seen_tweets = None
for tweet in self.tweets(): for tweet in self.tweets():
if "legacy" in tweet: if "legacy" in tweet:
@@ -71,6 +77,11 @@ class TwitterExtractor(Extractor):
else: else:
data = tweet data = tweet
if seen_tweets is not None:
if data["id_str"] in seen_tweets:
continue
seen_tweets.add(data["id_str"])
if not self.retweets and "retweeted_status_id_str" in data: if not self.retweets and "retweeted_status_id_str" in data:
self.log.debug("Skipping %s (retweet)", data["id_str"]) self.log.debug("Skipping %s (retweet)", data["id_str"])
continue continue
@@ -80,8 +91,9 @@ class TwitterExtractor(Extractor):
if "in_reply_to_user_id_str" in data and ( if "in_reply_to_user_id_str" in data and (
not self.replies or ( not self.replies or (
self.replies == "self" and self.replies == "self" and
(self._user_id or data["in_reply_to_user_id_str"]) != data["user_id_str"] !=
data["user_id_str"] (self._user_obj["rest_id"] if self._user else
data["in_reply_to_user_id_str"])
) )
): ):
self.log.debug("Skipping %s (reply)", data["id_str"]) self.log.debug("Skipping %s (reply)", data["id_str"])
@@ -100,6 +112,7 @@ class TwitterExtractor(Extractor):
tdata = self._transform_tweet(tweet) tdata = self._transform_tweet(tweet)
tdata.update(metadata) tdata.update(metadata)
tdata["count"] = len(files)
yield Message.Directory, tdata yield Message.Directory, tdata
for tdata["num"], file in enumerate(files, 1): for tdata["num"], file in enumerate(files, 1):
file.update(tdata) file.update(tdata)
@@ -218,11 +231,13 @@ class TwitterExtractor(Extractor):
files.append({"url": url}) files.append({"url": url})
def _transform_tweet(self, tweet): def _transform_tweet(self, tweet):
if "core" in tweet: if "author" in tweet:
user = self._transform_user( author = tweet["author"]
tweet["core"]["user_results"]["result"]) elif "core" in tweet:
author = tweet["core"]["user_results"]["result"]
else: else:
user = self._transform_user(tweet["user"]) author = tweet["user"]
author = self._transform_user(author)
if "legacy" in tweet: if "legacy" in tweet:
tweet = tweet["legacy"] tweet = tweet["legacy"]
@@ -234,12 +249,13 @@ class TwitterExtractor(Extractor):
"retweet_id" : text.parse_int( "retweet_id" : text.parse_int(
tget("retweeted_status_id_str")), tget("retweeted_status_id_str")),
"quote_id" : text.parse_int( "quote_id" : text.parse_int(
tget("quoted_status_id_str")), tget("quoted_by_id_str")),
"reply_id" : text.parse_int( "reply_id" : text.parse_int(
tget("in_reply_to_status_id_str")), tget("in_reply_to_status_id_str")),
"date" : text.parse_datetime( "date" : text.parse_datetime(
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"), tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
"user" : user, "user" : self._user or author,
"author" : author,
"lang" : tweet["lang"], "lang" : tweet["lang"],
"favorite_count": tget("favorite_count"), "favorite_count": tget("favorite_count"),
"quote_count" : tget("quote_count"), "quote_count" : tget("quote_count"),
@@ -259,7 +275,7 @@ class TwitterExtractor(Extractor):
"nick": u["name"], "nick": u["name"],
} for u in mentions] } for u in mentions]
content = tget("full_text") or tget("text") or "" content = text.unescape(tget("full_text") or tget("text") or "")
urls = entities.get("urls") urls = entities.get("urls")
if urls: if urls:
for url in urls: for url in urls:
@@ -269,13 +285,8 @@ class TwitterExtractor(Extractor):
if "in_reply_to_screen_name" in tweet: if "in_reply_to_screen_name" in tweet:
tdata["reply_to"] = tweet["in_reply_to_screen_name"] tdata["reply_to"] = tweet["in_reply_to_screen_name"]
if "quoted_by_id_str" in tweet: if "quoted_by" in tweet:
tdata["quote_by"] = text.parse_int(tweet["quoted_by_id_str"]) tdata["quote_by"] = tweet["quoted_by"]
if "author" in tweet:
tdata["author"] = self._transform_user(tweet["author"])
else:
tdata["author"] = tdata["user"]
return tdata return tdata
@@ -325,6 +336,10 @@ class TwitterExtractor(Extractor):
return udata return udata
def _assign_user(self, user):
self._user_obj = user
self._user = self._transform_user(user)
def _users_result(self, users): def _users_result(self, users):
userfmt = self.config("users") userfmt = self.config("users")
if not userfmt or userfmt == "timeline": if not userfmt or userfmt == "timeline":
@@ -440,45 +455,43 @@ class TwitterTimelineExtractor(TwitterExtractor):
self.user = "id:" + user_id self.user = "id:" + user_id
def tweets(self): def tweets(self):
if self.retweets or self.textonly:
tweets = (self.api.user_tweets_and_replies if self.replies else
self.api.user_tweets)
else:
tweets = self.api.user_media
# yield initial batch of (media) tweets # yield initial batch of (media) tweets
tweet = None tweet = None
for tweet in tweets(self.user): for tweet in self._select_tweet_source()(self.user):
yield tweet yield tweet
if tweet is None: if tweet is None:
return return
# get username
if not self.user.startswith("id:"):
username = self.user
elif "core" in tweet:
username = (tweet["core"]["user_results"]["result"]
["legacy"]["screen_name"])
else:
username = tweet["user"]["screen_name"]
# get tweet data
if "legacy" in tweet:
tweet = tweet["legacy"]
# build search query # build search query
query = "from:{} max_id:{}".format(username, tweet["id_str"]) query = "from:{} max_id:{}".format(
self._user["name"], tweet["rest_id"])
if self.retweets: if self.retweets:
query += " include:retweets include:nativeretweets" query += " include:retweets include:nativeretweets"
if not self.textonly:
query += (" (filter:images OR"
" filter:native_video OR"
" card_name:animated_gif)")
# yield search results starting from last tweet id if not self.textonly:
# try to search for media-only tweets
tweet = None
for tweet in self.api.search_adaptive(query + " filter:links"):
yield tweet
if tweet is not None:
return
# yield unfiltered search results
yield from self.api.search_adaptive(query) yield from self.api.search_adaptive(query)
def _select_tweet_source(self):
strategy = self.config("strategy")
if strategy is None or strategy == "auto":
if self.retweets or self.textonly:
return self.api.user_tweets
else:
return self.api.user_media
if strategy == "tweets":
return self.api.user_tweets
if strategy == "with_replies":
return self.api.user_tweets_and_replies
return self.api.user_media
class TwitterTweetsExtractor(TwitterExtractor): class TwitterTweetsExtractor(TwitterExtractor):
"""Extractor for Tweets from a user's Tweets timeline""" """Extractor for Tweets from a user's Tweets timeline"""
@@ -607,7 +620,25 @@ class TwitterSearchExtractor(TwitterExtractor):
return {"search": text.unquote(self.user)} return {"search": text.unquote(self.user)}
def tweets(self): def tweets(self):
return self.api.search_adaptive(text.unquote(self.user)) query = text.unquote(self.user)
user = None
for item in query.split():
item = item.strip("()")
if item.startswith("from:"):
if user:
user = None
break
else:
user = item[5:]
if user is not None:
try:
self._assign_user(self.api.user_by_screen_name(user))
except KeyError:
raise exception.NotFoundError("user")
return self.api.search_adaptive(query)
class TwitterEventExtractor(TwitterExtractor): class TwitterEventExtractor(TwitterExtractor):
@@ -675,7 +706,7 @@ class TwitterTweetExtractor(TwitterExtractor):
}), }),
("https://twitter.com/i/web/status/1424898916156284928", { ("https://twitter.com/i/web/status/1424898916156284928", {
"options": (("replies", "self"),), "options": (("replies", "self"),),
"count": 0, "count": 1,
}), }),
# "quoted" option (#854) # "quoted" option (#854)
("https://twitter.com/StobiesGalaxy/status/1270755918330896395", { ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
@@ -759,20 +790,38 @@ class TwitterTweetExtractor(TwitterExtractor):
def tweets(self): def tweets(self):
if self.config("conversations", False): if self.config("conversations", False):
return self.api.tweet_detail(self.tweet_id) return self._tweets_conversation(self.tweet_id)
else:
return self._tweets_single(self.tweet_id)
def _tweets_single(self, tweet_id):
tweets = [] tweets = []
tweet_id = self.tweet_id
for tweet in self.api.tweet_detail(tweet_id): for tweet in self.api.tweet_detail(tweet_id):
if tweet["rest_id"] == tweet_id or \ if tweet["rest_id"] == tweet_id or \
tweet.get("_retweet_id_str") == tweet_id: tweet.get("_retweet_id_str") == tweet_id:
self._assign_user(tweet["core"]["user_results"]["result"])
tweets.append(tweet) tweets.append(tweet)
tweet_id = tweet["legacy"].get("quoted_status_id_str") tweet_id = tweet["legacy"].get("quoted_status_id_str")
if not tweet_id: if not tweet_id:
break break
return tweets return tweets
def _tweets_conversation(self, tweet_id):
tweets = self.api.tweet_detail(tweet_id)
buffer = []
for tweet in tweets:
buffer.append(tweet)
if tweet["rest_id"] == tweet_id or \
tweet.get("_retweet_id_str") == tweet_id:
self._assign_user(tweet["core"]["user_results"]["result"])
break
return itertools.chain(buffer, tweets)
class TwitterImageExtractor(Extractor): class TwitterImageExtractor(Extractor):
category = "twitter" category = "twitter"
@@ -870,7 +919,6 @@ class TwitterAPI():
self._nsfw_warning = True self._nsfw_warning = True
self._syndication = extractor.config("syndication") self._syndication = extractor.config("syndication")
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
self._user = None
cookies = extractor.session.cookies cookies = extractor.session.cookies
cookiedomain = extractor.cookiedomain cookiedomain = extractor.cookiedomain
@@ -1032,13 +1080,13 @@ class TwitterAPI():
def _user_id_by_screen_name(self, screen_name): def _user_id_by_screen_name(self, screen_name):
if screen_name.startswith("id:"): if screen_name.startswith("id:"):
self._user = util.SENTINEL
user_id = screen_name[3:] user_id = screen_name[3:]
user = self.user_by_rest_id(user_id)
else: else:
user = () user = ()
try: try:
user = self._user = self.user_by_screen_name(screen_name) user = self.user_by_screen_name(screen_name)
user_id = user["rest_id"] user_id = user["rest_id"]
except KeyError: except KeyError:
if "unavailable_message" in user: if "unavailable_message" in user:
@@ -1048,7 +1096,7 @@ class TwitterAPI():
else: else:
raise exception.NotFoundError("user") raise exception.NotFoundError("user")
self.extractor._user_id = user_id self.extractor._assign_user(user)
return user_id return user_id
@cache(maxage=3600) @cache(maxage=3600)
@@ -1165,7 +1213,7 @@ class TwitterAPI():
if quoted: if quoted:
quoted = quoted.copy() quoted = quoted.copy()
quoted["author"] = users[quoted["user_id_str"]] quoted["author"] = users[quoted["user_id_str"]]
quoted["user"] = tweet["user"] quoted["quoted_by"] = tweet["user"]["screen_name"]
quoted["quoted_by_id_str"] = tweet["id_str"] quoted["quoted_by_id_str"] = tweet["id_str"]
yield quoted yield quoted
@@ -1208,17 +1256,10 @@ class TwitterAPI():
except LookupError: except LookupError:
extr.log.debug(data) extr.log.debug(data)
if self._user: user = extr._user_obj
user = self._user if user:
if user is util.SENTINEL: user = user["legacy"]
try: if user.get("blocked_by"):
user = self.user_by_rest_id(variables["userId"])
except KeyError:
raise exception.NotFoundError("user")
user = user.get("legacy")
if not user:
pass
elif user.get("blocked_by"):
if self.headers["x-twitter-auth-type"] and \ if self.headers["x-twitter-auth-type"] and \
extr.config("logout"): extr.config("logout"):
guest_token = self._guest_token() guest_token = self._guest_token()
@@ -1304,7 +1345,7 @@ class TwitterAPI():
try: try:
legacy["retweeted_status_id_str"] = \ legacy["retweeted_status_id_str"] = \
retweet["rest_id"] retweet["rest_id"]
legacy["author"] = \ tweet["author"] = \
retweet["core"]["user_results"]["result"] retweet["core"]["user_results"]["result"]
if "extended_entities" in retweet["legacy"] and \ if "extended_entities" in retweet["legacy"] and \
"extended_entities" not in legacy: "extended_entities" not in legacy:
@@ -1318,9 +1359,9 @@ class TwitterAPI():
if "quoted_status_result" in tweet: if "quoted_status_result" in tweet:
try: try:
quoted = tweet["quoted_status_result"]["result"] quoted = tweet["quoted_status_result"]["result"]
quoted["legacy"]["author"] = \ quoted["legacy"]["quoted_by"] = (
quoted["core"]["user_results"]["result"] tweet["core"]["user_results"]["result"]
quoted["core"] = tweet["core"] ["legacy"]["screen_name"])
quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"] quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
yield quoted yield quoted
except KeyError: except KeyError:
@@ -1421,6 +1462,6 @@ class TwitterAPI():
return { return {
"rest_id": tweet["id_str"], "rest_id": tweet["id_str"],
"legacy" : tweet, "legacy" : tweet,
"user" : tweet["user"], "core" : {"user_results": {"result": tweet["user"]}},
"_retweet_id_str": retweet_id, "_retweet_id_str": retweet_id,
} }

View File

@@ -40,10 +40,15 @@ class VkExtractor(Extractor):
continue continue
try: try:
photo["url"], photo["width"], photo["height"] = photo[size] photo["url"] = photo[size + "src"]
except KeyError:
self.log.warning("no photo URL found (%s)", photo.get("id"))
continue
try:
_, photo["width"], photo["height"] = photo[size]
except ValueError: except ValueError:
# photo without width/height entries (#2535) # photo without width/height entries (#2535)
photo["url"] = photo[size + "src"]
photo["width"] = photo["height"] = 0 photo["width"] = photo["height"] = 0
photo["id"] = photo["id"].rpartition("_")[2] photo["id"] = photo["id"].rpartition("_")[2]

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2019-2020 Mike Fährmann # Copyright 2019-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -119,7 +119,8 @@ class VscoUserExtractor(VscoExtractor):
pattern = BASE_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])" pattern = BASE_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])"
test = ( test = (
("https://vsco.co/missuri/gallery", { ("https://vsco.co/missuri/gallery", {
"pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+", "pattern": r"https://image(-aws.+)?\.vsco\.co"
r"/[0-9a-f/]+/[\w-]+\.\w+",
"range": "1-80", "range": "1-80",
"count": 80, "count": 80,
}), }),
@@ -150,7 +151,7 @@ class VscoCollectionExtractor(VscoExtractor):
archive_fmt = "c_{user}_{id}" archive_fmt = "c_{user}_{id}"
pattern = BASE_PATTERN + r"/collection/" pattern = BASE_PATTERN + r"/collection/"
test = ("https://vsco.co/vsco/collection/1", { test = ("https://vsco.co/vsco/collection/1", {
"pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+\.\w+", "pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/[\w-]+\.\w+",
"range": "1-80", "range": "1-80",
"count": 80, "count": 80,
}) })

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2018-2021 Mike Fährmann # Copyright 2018-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -58,7 +58,7 @@ class WallhavenSearchExtractor(WallhavenExtractor):
(("https://wallhaven.cc/search?q=id%3A87" (("https://wallhaven.cc/search?q=id%3A87"
"&categories=111&purity=100&sorting=date_added&order=asc&page=3"), { "&categories=111&purity=100&sorting=date_added&order=asc&page=3"), {
"pattern": r"https://w.wallhaven.cc/full/\w\w/wallhaven-\w+\.\w+", "pattern": r"https://w.wallhaven.cc/full/\w\w/wallhaven-\w+\.\w+",
"count": "<= 20", "count": "<= 30",
}), }),
) )

View File

@@ -99,13 +99,14 @@ class WeiboExtractor(Extractor):
else: else:
yield pic["largest"].copy() yield pic["largest"].copy()
if "page_info" in status: if "page_info" in status and self.videos:
page_info = status["page_info"] try:
if "media_info" not in page_info or not self.videos: media = max(status["page_info"]["media_info"]["playback_list"],
return key=lambda m: m["meta"]["quality_index"])
media = max(page_info["media_info"]["playback_list"], except KeyError:
key=lambda m: m["meta"]["quality_index"]) pass
yield media["play_info"].copy() else:
yield media["play_info"].copy()
def _status_by_id(self, status_id): def _status_by_id(self, status_id):
url = "{}/ajax/statuses/show?id={}".format(self.root, status_id) url = "{}/ajax/statuses/show?id={}".format(self.root, status_id)
@@ -310,7 +311,8 @@ class WeiboAlbumExtractor(WeiboExtractor):
subcategory = "album" subcategory = "album"
pattern = USER_PATTERN + r"\?tabtype=album" pattern = USER_PATTERN + r"\?tabtype=album"
test = ("https://weibo.com/1758989602?tabtype=album", { test = ("https://weibo.com/1758989602?tabtype=album", {
"pattern": r"https://wx\d+\.sinaimg\.cn/large/\w{32}\.(jpg|png|gif)", "pattern": r"https://(wx\d+\.sinaimg\.cn/large/\w{32}\.(jpg|png|gif)"
r"|g\.us\.sinaimg\.cn/../\w+\.mp4)",
"range": "1-3", "range": "1-3",
"count": 3, "count": 3,
}) })
@@ -324,7 +326,11 @@ class WeiboAlbumExtractor(WeiboExtractor):
mid = image["mid"] mid = image["mid"]
if mid not in seen: if mid not in seen:
seen.add(mid) seen.add(mid)
yield self._status_by_id(mid) status = self._status_by_id(mid)
if status.get("ok") != 1:
self.log.debug("Skipping status %s (%s)", mid, status)
else:
yield status
class WeiboStatusExtractor(WeiboExtractor): class WeiboStatusExtractor(WeiboExtractor):
@@ -341,7 +347,7 @@ class WeiboStatusExtractor(WeiboExtractor):
}), }),
# unavailable video (#427) # unavailable video (#427)
("https://m.weibo.cn/status/4268682979207023", { ("https://m.weibo.cn/status/4268682979207023", {
"exception": exception.HttpError, "exception": exception.NotFoundError,
}), }),
# non-numeric status ID (#664) # non-numeric status ID (#664)
("https://weibo.com/3314883543/Iy7fj4qVg"), ("https://weibo.com/3314883543/Iy7fj4qVg"),
@@ -361,12 +367,20 @@ class WeiboStatusExtractor(WeiboExtractor):
"pattern": r"https://g\.us\.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM0104" "pattern": r"https://g\.us\.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM0104"
r"120005tc0E010\.mp4\?label=gif_mp4", r"120005tc0E010\.mp4\?label=gif_mp4",
}), }),
# missing 'playback_list' (#2792)
("https://weibo.com/2909128931/4409545658754086", {
"count": 9,
}),
("https://m.weibo.cn/status/4339748116375525"), ("https://m.weibo.cn/status/4339748116375525"),
("https://m.weibo.cn/5746766133/4339748116375525"), ("https://m.weibo.cn/5746766133/4339748116375525"),
) )
def statuses(self): def statuses(self):
return (self._status_by_id(self.user),) status = self._status_by_id(self.user)
if status.get("ok") != 1:
self.log.debug(status)
raise exception.NotFoundError("status")
return (status,)
@cache(maxage=356*86400) @cache(maxage=356*86400)

View File

@@ -0,0 +1,153 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://www.zerochan.net/"""
from .booru import BooruExtractor
from ..cache import cache
from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
class ZerochanExtractor(BooruExtractor):
"""Base class for zerochan extractors"""
category = "zerochan"
root = "https://www.zerochan.net"
filename_fmt = "{id}.{extension}"
archive_fmt = "{id}"
cookiedomain = ".zerochan.net"
cookienames = ("z_id", "z_hash")
def login(self):
if not self._check_cookies(self.cookienames):
username, password = self._get_auth_info()
if username:
self._update_cookies(self._login_impl(username, password))
# force legacy layout
self.session.cookies.set("v3", "0", domain=self.cookiedomain)
@cache(maxage=90*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
url = self.root + "/login"
headers = {
"Origin" : self.root,
"Referer" : url,
}
data = {
"ref" : "/",
"name" : username,
"password": password,
"login" : "Login",
}
response = self.request(url, method="POST", headers=headers, data=data)
if not response.history:
raise exception.AuthenticationError()
return response.cookies
def _parse_entry_page(self, entry_id):
url = "{}/{}".format(self.root, entry_id)
extr = text.extract_from(self.request(url).text)
return {
"id" : entry_id,
"author": extr('"author": "', '"'),
"file_url": extr('"contentUrl": "', '"'),
"date" : text.parse_datetime(extr(
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
"width" : extr('"width": "', ' '),
"height": extr('"height": "', ' '),
"size" : extr('"contentSize": "', 'B'),
}
class ZerochanTagExtractor(ZerochanExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", {
"pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)",
"count": "> 24",
"keywords": {
"extension": r"re:jpg|png",
"file_url": "",
"filename": r"re:Perth.\(Kantai.Collection\).full.\d+",
"height": r"re:^\d+$",
"id": r"re:^\d+$",
"name": "Perth (Kantai Collection)",
"search_tags": "Perth (Kantai Collection)",
"size": r"re:^\d+k$",
"width": r"re:^\d+$",
},
})
def __init__(self, match):
ZerochanExtractor.__init__(self, match)
self.search_tag, self.query = match.groups()
def metadata(self):
return {"search_tags": text.unquote(
self.search_tag.replace("+", " "))}
def posts(self):
url = self.root + "/" + self.search_tag
params = text.parse_query(self.query)
params["p"] = text.parse_int(params.get("p"), 1)
while True:
page = self.request(url, params=params).text
thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0]
extr = text.extract_from(thumbs)
while True:
post = extr('<li class="', '>')
if not post:
break
yield {
"id" : extr('href="/', '"'),
"name" : extr('alt="', '"'),
"width" : extr('title="', 'x'),
"height": extr('', ' '),
"size" : extr('', 'B'),
"file_url": "https://static." + extr(
'<a href="https://static.', '"'),
}
if 'rel="next"' not in page:
break
params["p"] += 1
class ZerochanImageExtractor(ZerochanExtractor):
subcategory = "image"
pattern = BASE_PATTERN + r"/(\d+)"
test = ("https://www.zerochan.net/2920445", {
"pattern": r"https://static\.zerochan\.net/"
r"Perth\.%28Kantai\.Collection%29\.full.2920445\.jpg",
"keyword": {
"author": "YukinoTokisaki",
"date": "dt:2020-04-24 21:33:44",
"file_url": str,
"filename": "Perth.(Kantai.Collection).full.2920445",
"height": "1366",
"id": "2920445",
"size": "1929k",
"width": "1920",
},
})
def __init__(self, match):
ZerochanExtractor.__init__(self, match)
self.image_id = match.group(1)
def posts(self):
return (self._parse_entry_page(self.image_id),)

View File

@@ -10,6 +10,7 @@
import os import os
import json import json
import time
import string import string
import _string import _string
import datetime import datetime
@@ -17,17 +18,9 @@ import operator
import functools import functools
from . import text, util from . import text, util
_CACHE = {}
_CONVERSIONS = None
_GLOBALS = {
"_env": lambda: os.environ,
"_lit": lambda: _literal,
"_now": datetime.datetime.now,
}
def parse(format_string, default=None, fmt=format):
def parse(format_string, default=None): key = format_string, default, fmt
key = format_string, default
try: try:
return _CACHE[key] return _CACHE[key]
@@ -48,7 +41,7 @@ def parse(format_string, default=None):
elif kind == "F": elif kind == "F":
cls = FStringFormatter cls = FStringFormatter
formatter = _CACHE[key] = cls(format_string, default) formatter = _CACHE[key] = cls(format_string, default, fmt)
return formatter return formatter
@@ -95,8 +88,9 @@ class StringFormatter():
Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r") Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r")
""" """
def __init__(self, format_string, default=None): def __init__(self, format_string, default=None, fmt=format):
self.default = default self.default = default
self.format = fmt
self.result = [] self.result = []
self.fields = [] self.fields = []
@@ -126,7 +120,7 @@ class StringFormatter():
return "".join(result) return "".join(result)
def _field_access(self, field_name, format_spec, conversion): def _field_access(self, field_name, format_spec, conversion):
fmt = parse_format_spec(format_spec, conversion) fmt = self._parse_format_spec(format_spec, conversion)
if "|" in field_name: if "|" in field_name:
return self._apply_list([ return self._apply_list([
@@ -184,27 +178,38 @@ class StringFormatter():
return fmt(obj) return fmt(obj)
return wrap return wrap
def _parse_format_spec(self, format_spec, conversion):
fmt = _build_format_func(format_spec, self.format)
if not conversion:
return fmt
conversion = _CONVERSIONS[conversion]
if fmt is self.format:
return conversion
else:
return lambda obj: fmt(conversion(obj))
class TemplateFormatter(StringFormatter): class TemplateFormatter(StringFormatter):
"""Read format_string from file""" """Read format_string from file"""
def __init__(self, path, default=None): def __init__(self, path, default=None, fmt=format):
with open(util.expand_path(path)) as fp: with open(util.expand_path(path)) as fp:
format_string = fp.read() format_string = fp.read()
StringFormatter.__init__(self, format_string, default) StringFormatter.__init__(self, format_string, default, fmt)
class ExpressionFormatter(): class ExpressionFormatter():
"""Generate text by evaluating a Python expression""" """Generate text by evaluating a Python expression"""
def __init__(self, expression, default=None): def __init__(self, expression, default=None, fmt=None):
self.format_map = util.compile_expression(expression) self.format_map = util.compile_expression(expression)
class ModuleFormatter(): class ModuleFormatter():
"""Generate text by calling an external function""" """Generate text by calling an external function"""
def __init__(self, function_spec, default=None): def __init__(self, function_spec, default=None, fmt=None):
module_name, _, function_name = function_spec.partition(":") module_name, _, function_name = function_spec.partition(":")
module = __import__(module_name) module = __import__(module_name)
self.format_map = getattr(module, function_name) self.format_map = getattr(module, function_name)
@@ -213,7 +218,7 @@ class ModuleFormatter():
class FStringFormatter(): class FStringFormatter():
"""Generate text by evaluaring an f-string literal""" """Generate text by evaluaring an f-string literal"""
def __init__(self, fstring, default=None): def __init__(self, fstring, default=None, fmt=None):
self.format_map = util.compile_expression("f'''" + fstring + "'''") self.format_map = util.compile_expression("f'''" + fstring + "'''")
@@ -251,81 +256,37 @@ def _slice(indices):
) )
def parse_format_spec(format_spec, conversion): def _build_format_func(format_spec, default):
fmt = build_format_func(format_spec)
if not conversion:
return fmt
global _CONVERSIONS
if _CONVERSIONS is None:
_CONVERSIONS = {
"l": str.lower,
"u": str.upper,
"c": str.capitalize,
"C": string.capwords,
"j": functools.partial(json.dumps, default=str),
"t": str.strip,
"T": util.datetime_to_timestamp_string,
"d": text.parse_timestamp,
"U": text.unescape,
"S": util.to_string,
"s": str,
"r": repr,
"a": ascii,
}
conversion = _CONVERSIONS[conversion]
if fmt is format:
return conversion
else:
def chain(obj):
return fmt(conversion(obj))
return chain
def build_format_func(format_spec):
if format_spec: if format_spec:
fmt = format_spec[0] return _FORMAT_SPECIFIERS.get(
if fmt == "?": format_spec[0], _default_format)(format_spec, default)
return _parse_optional(format_spec) return default
if fmt == "[":
return _parse_slice(format_spec)
if fmt == "L":
return _parse_maxlen(format_spec)
if fmt == "J":
return _parse_join(format_spec)
if fmt == "R":
return _parse_replace(format_spec)
if fmt == "D":
return _parse_datetime(format_spec)
return _default_format(format_spec)
return format
def _parse_optional(format_spec): def _parse_optional(format_spec, default):
before, after, format_spec = format_spec.split("/", 2) before, after, format_spec = format_spec.split(_SEPARATOR, 2)
before = before[1:] before = before[1:]
fmt = build_format_func(format_spec) fmt = _build_format_func(format_spec, default)
def optional(obj): def optional(obj):
return before + fmt(obj) + after if obj else "" return before + fmt(obj) + after if obj else ""
return optional return optional
def _parse_slice(format_spec): def _parse_slice(format_spec, default):
indices, _, format_spec = format_spec.partition("]") indices, _, format_spec = format_spec.partition("]")
slice = _slice(indices[1:]) slice = _slice(indices[1:])
fmt = build_format_func(format_spec) fmt = _build_format_func(format_spec, default)
def apply_slice(obj): def apply_slice(obj):
return fmt(obj[slice]) return fmt(obj[slice])
return apply_slice return apply_slice
def _parse_maxlen(format_spec): def _parse_maxlen(format_spec, default):
maxlen, replacement, format_spec = format_spec.split("/", 2) maxlen, replacement, format_spec = format_spec.split(_SEPARATOR, 2)
maxlen = text.parse_int(maxlen[1:]) maxlen = text.parse_int(maxlen[1:])
fmt = build_format_func(format_spec) fmt = _build_format_func(format_spec, default)
def mlen(obj): def mlen(obj):
obj = fmt(obj) obj = fmt(obj)
@@ -333,37 +294,58 @@ def _parse_maxlen(format_spec):
return mlen return mlen
def _parse_join(format_spec): def _parse_join(format_spec, default):
separator, _, format_spec = format_spec.partition("/") separator, _, format_spec = format_spec.partition(_SEPARATOR)
separator = separator[1:] separator = separator[1:]
fmt = build_format_func(format_spec) fmt = _build_format_func(format_spec, default)
def join(obj): def join(obj):
return fmt(separator.join(obj)) return fmt(separator.join(obj))
return join return join
def _parse_replace(format_spec): def _parse_replace(format_spec, default):
old, new, format_spec = format_spec.split("/", 2) old, new, format_spec = format_spec.split(_SEPARATOR, 2)
old = old[1:] old = old[1:]
fmt = build_format_func(format_spec) fmt = _build_format_func(format_spec, default)
def replace(obj): def replace(obj):
return fmt(obj.replace(old, new)) return fmt(obj.replace(old, new))
return replace return replace
def _parse_datetime(format_spec): def _parse_datetime(format_spec, default):
dt_format, _, format_spec = format_spec.partition("/") dt_format, _, format_spec = format_spec.partition(_SEPARATOR)
dt_format = dt_format[1:] dt_format = dt_format[1:]
fmt = build_format_func(format_spec) fmt = _build_format_func(format_spec, default)
def dt(obj): def dt(obj):
return fmt(text.parse_datetime(obj, dt_format)) return fmt(text.parse_datetime(obj, dt_format))
return dt return dt
def _default_format(format_spec): def _parse_offset(format_spec, default):
offset, _, format_spec = format_spec.partition(_SEPARATOR)
offset = offset[1:]
fmt = _build_format_func(format_spec, default)
if not offset or offset == "local":
is_dst = time.daylight and time.localtime().tm_isdst > 0
offset = -(time.altzone if is_dst else time.timezone)
else:
hours, _, minutes = offset.partition(":")
offset = 3600 * int(hours)
if minutes:
offset += 60 * (int(minutes) if offset > 0 else -int(minutes))
offset = datetime.timedelta(seconds=offset)
def off(obj):
return fmt(obj + offset)
return off
def _default_format(format_spec, default):
def wrap(obj): def wrap(obj):
return format(obj, format_spec) return format(obj, format_spec)
return wrap return wrap
@@ -379,3 +361,35 @@ class Literal():
_literal = Literal() _literal = Literal()
_CACHE = {}
_SEPARATOR = "/"
_GLOBALS = {
"_env": lambda: os.environ,
"_lit": lambda: _literal,
"_now": datetime.datetime.now,
}
_CONVERSIONS = {
"l": str.lower,
"u": str.upper,
"c": str.capitalize,
"C": string.capwords,
"j": functools.partial(json.dumps, default=str),
"t": str.strip,
"T": util.datetime_to_timestamp_string,
"d": text.parse_timestamp,
"U": text.unescape,
"S": util.to_string,
"s": str,
"r": repr,
"a": ascii,
}
_FORMAT_SPECIFIERS = {
"?": _parse_optional,
"[": _parse_slice,
"D": _parse_datetime,
"L": _parse_maxlen,
"J": _parse_join,
"O": _parse_offset,
"R": _parse_replace,
}

View File

@@ -35,10 +35,13 @@ class Job():
self.status = 0 self.status = 0
self.url_key = extr.config("url-metadata") self.url_key = extr.config("url-metadata")
path_key = extr.config("path-metadata")
path_proxy = output.PathfmtProxy(self)
self._logger_extra = { self._logger_extra = {
"job" : self, "job" : self,
"extractor": extr, "extractor": extr,
"path" : output.PathfmtProxy(self), "path" : path_proxy,
"keywords" : output.KwdictProxy(self), "keywords" : output.KwdictProxy(self),
} }
extr.log = self._wrap_logger(extr.log) extr.log = self._wrap_logger(extr.log)
@@ -58,6 +61,8 @@ class Job():
kwdict = extr.config("keywords") kwdict = extr.config("keywords")
if kwdict: if kwdict:
self.kwdict.update(kwdict) self.kwdict.update(kwdict)
if path_key:
self.kwdict[path_key] = path_proxy
# predicates # predicates
self.pred_url = self._prepare_predicates("image", True) self.pred_url = self._prepare_predicates("image", True)
@@ -439,6 +444,9 @@ class DownloadJob(Job):
if self.archive: if self.archive:
self.archive.check = pathfmt.exists self.archive.check = pathfmt.exists
if not cfg("postprocess", True):
return
postprocessors = extr.config_accumulate("postprocessors") postprocessors = extr.config_accumulate("postprocessors")
if postprocessors: if postprocessors:
self.hooks = collections.defaultdict(list) self.hooks = collections.defaultdict(list)

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017-2021 Mike Fährmann # Copyright 2017-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -283,6 +283,11 @@ def build_parser():
dest="download", nargs=0, action=ConfigConstAction, const=False, dest="download", nargs=0, action=ConfigConstAction, const=False,
help=("Do not download any files") help=("Do not download any files")
) )
downloader.add_argument(
"--no-postprocessors",
dest="postprocess", nargs=0, action=ConfigConstAction, const=False,
help=("Do not run any post processors")
)
downloader.add_argument( downloader.add_argument(
"--no-check-certificate", "--no-check-certificate",
dest="verify", nargs=0, action=ConfigConstAction, const=False, dest="verify", nargs=0, action=ConfigConstAction, const=False,

View File

@@ -73,6 +73,12 @@ class PathfmtProxy():
pathfmt = object.__getattribute__(self, "job").pathfmt pathfmt = object.__getattribute__(self, "job").pathfmt
return pathfmt.__dict__.get(name) if pathfmt else None return pathfmt.__dict__.get(name) if pathfmt else None
def __str__(self):
pathfmt = object.__getattribute__(self, "job").pathfmt
if pathfmt:
return pathfmt.path or pathfmt.directory
return ""
class KwdictProxy(): class KwdictProxy():
__slots__ = ("job",) __slots__ = ("job",)

View File

@@ -30,6 +30,17 @@ class MetadataPP(PostProcessor):
elif mode == "tags": elif mode == "tags":
self.write = self._write_tags self.write = self._write_tags
ext = "txt" ext = "txt"
elif mode == "modify":
self.run = self._run_modify
self.fields = {
name: formatter.parse(value, None, util.identity).format_map
for name, value in options.get("fields").items()
}
ext = None
elif mode == "delete":
self.run = self._run_delete
self.fields = options.get("fields")
ext = None
else: else:
self.write = self._write_json self.write = self._write_json
self.indent = options.get("indent", 4) self.indent = options.get("indent", 4)
@@ -114,6 +125,32 @@ class MetadataPP(PostProcessor):
def _run_stdout(self, pathfmt): def _run_stdout(self, pathfmt):
self.write(sys.stdout, pathfmt.kwdict) self.write(sys.stdout, pathfmt.kwdict)
def _run_modify(self, pathfmt):
kwdict = pathfmt.kwdict
for key, func in self.fields.items():
obj = kwdict
try:
while "[" in key:
name, _, key = key.partition("[")
obj = obj[name]
key = key.rstrip("]")
obj[key] = func(kwdict)
except Exception:
pass
def _run_delete(self, pathfmt):
kwdict = pathfmt.kwdict
for key in self.fields:
obj = kwdict
try:
while "[" in key:
name, _, key = key.partition("[")
obj = obj[name]
key = key.rstrip("]")
del obj[key]
except Exception:
pass
def _directory(self, pathfmt): def _directory(self, pathfmt):
return pathfmt.realdirectory return pathfmt.realdirectory

View File

@@ -9,7 +9,7 @@
"""Use metadata as file modification time""" """Use metadata as file modification time"""
from .common import PostProcessor from .common import PostProcessor
from .. import text, util from .. import text, util, formatter
from datetime import datetime from datetime import datetime
@@ -17,7 +17,12 @@ class MtimePP(PostProcessor):
def __init__(self, job, options): def __init__(self, job, options):
PostProcessor.__init__(self, job) PostProcessor.__init__(self, job)
self.key = options.get("key", "date") value = options.get("value")
if value:
self._get = formatter.parse(value, None, util.identity).format_map
else:
key = options.get("key", "date")
self._get = lambda kwdict: kwdict.get(key)
events = options.get("event") events = options.get("event")
if events is None: if events is None:
@@ -27,7 +32,7 @@ class MtimePP(PostProcessor):
job.register_hooks({event: self.run for event in events}, options) job.register_hooks({event: self.run for event in events}, options)
def run(self, pathfmt): def run(self, pathfmt):
mtime = pathfmt.kwdict.get(self.key) mtime = self._get(pathfmt.kwdict)
pathfmt.kwdict["_mtime"] = ( pathfmt.kwdict["_mtime"] = (
util.datetime_to_timestamp(mtime) util.datetime_to_timestamp(mtime)
if isinstance(mtime, datetime) else if isinstance(mtime, datetime) else

View File

@@ -35,7 +35,7 @@ class UgoiraPP(PostProcessor):
self.output = options.get("ffmpeg-output", True) self.output = options.get("ffmpeg-output", True)
self.delete = not options.get("keep-files", False) self.delete = not options.get("keep-files", False)
self.repeat = options.get("repeat-last-frame", True) self.repeat = options.get("repeat-last-frame", True)
self.mtime = options.get("mtime") self.mtime = options.get("mtime", True)
ffmpeg = options.get("ffmpeg-location") ffmpeg = options.get("ffmpeg-location")
self.ffmpeg = util.expand_path(ffmpeg) if ffmpeg else "ffmpeg" self.ffmpeg = util.expand_path(ffmpeg) if ffmpeg else "ffmpeg"

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2018-2021 Mike Fährmann # Copyright 2018-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,7 @@
from .common import PostProcessor from .common import PostProcessor
from .. import util from .. import util
import zipfile import zipfile
import os
class ZipPP(PostProcessor): class ZipPP(PostProcessor):
@@ -34,30 +35,38 @@ class ZipPP(PostProcessor):
algorithm = "store" algorithm = "store"
self.zfile = None self.zfile = None
self.path = job.pathfmt.realdirectory self.path = job.pathfmt.realdirectory[:-1]
self.args = (self.path[:-1] + ext, "a", self.args = (self.path + ext, "a",
self.COMPRESSION_ALGORITHMS[algorithm], True) self.COMPRESSION_ALGORITHMS[algorithm], True)
job.register_hooks({ job.register_hooks({
"file": "file": (self.write_safe if options.get("mode") == "safe" else
self.write_safe if options.get("mode") == "safe" else self.write, self.write_fast),
}, options) }, options)
job.hooks["finalize"].append(self.finalize) job.hooks["finalize"].append(self.finalize)
def write(self, pathfmt, zfile=None): def open(self):
try:
return zipfile.ZipFile(*self.args)
except FileNotFoundError:
os.makedirs(os.path.dirname(self.path))
return zipfile.ZipFile(*self.args)
def write(self, pathfmt, zfile):
# 'NameToInfo' is not officially documented, but it's available # 'NameToInfo' is not officially documented, but it's available
# for all supported Python versions and using it directly is a lot # for all supported Python versions and using it directly is a lot
# faster than calling getinfo() # faster than calling getinfo()
if zfile is None:
if self.zfile is None:
self.zfile = zipfile.ZipFile(*self.args)
zfile = self.zfile
if pathfmt.filename not in zfile.NameToInfo: if pathfmt.filename not in zfile.NameToInfo:
zfile.write(pathfmt.temppath, pathfmt.filename) zfile.write(pathfmt.temppath, pathfmt.filename)
pathfmt.delete = self.delete pathfmt.delete = self.delete
def write_fast(self, pathfmt):
if self.zfile is None:
self.zfile = self.open()
self.write(pathfmt, self.zfile)
def write_safe(self, pathfmt): def write_safe(self, pathfmt):
with zipfile.ZipFile(*self.args) as zfile: with self.open() as zfile:
self.write(pathfmt, zfile) self.write(pathfmt, zfile)
def finalize(self, pathfmt, status): def finalize(self, pathfmt, status):

View File

@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
__version__ = "1.22.2-dev" __version__ = "1.23.0-dev"

View File

@@ -9,12 +9,13 @@
import os import os
import sys import sys
import time
import unittest import unittest
import datetime import datetime
import tempfile import tempfile
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gallery_dl import formatter # noqa E402 from gallery_dl import formatter, text, util # noqa E402
class TestFormatter(unittest.TestCase): class TestFormatter(unittest.TestCase):
@@ -98,6 +99,14 @@ class TestFormatter(unittest.TestCase):
self._run_test("{missing[key]}", replacement, default) self._run_test("{missing[key]}", replacement, default)
self._run_test("{missing:?a//}", "a" + default, default) self._run_test("{missing:?a//}", "a" + default, default)
def test_fmt_func(self):
self._run_test("{t}" , self.kwdict["t"] , None, int)
self._run_test("{t}" , self.kwdict["t"] , None, util.identity)
self._run_test("{dt}", self.kwdict["dt"], None, util.identity)
self._run_test("{ds}", self.kwdict["dt"], None, text.parse_datetime)
self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z}", self.kwdict["dt"],
None, util.identity)
def test_alternative(self): def test_alternative(self):
self._run_test("{a|z}" , "hElLo wOrLd") self._run_test("{a|z}" , "hElLo wOrLd")
self._run_test("{z|a}" , "hElLo wOrLd") self._run_test("{z|a}" , "hElLo wOrLd")
@@ -184,6 +193,31 @@ class TestFormatter(unittest.TestCase):
self._run_test("{ds:D%Y}", "2010-01-01T01:00:00+0100") self._run_test("{ds:D%Y}", "2010-01-01T01:00:00+0100")
self._run_test("{l:D%Y}", "None") self._run_test("{l:D%Y}", "None")
def test_offset(self):
self._run_test("{dt:O 01:00}", "2010-01-01 01:00:00")
self._run_test("{dt:O+02:00}", "2010-01-01 02:00:00")
self._run_test("{dt:O-03:45}", "2009-12-31 20:15:00")
self._run_test("{dt:O12}", "2010-01-01 12:00:00")
self._run_test("{dt:O-24}", "2009-12-31 00:00:00")
self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z/O1}", "2010-01-01 01:00:00")
self._run_test("{t!d:O2}", "2010-01-01 02:00:00")
orig_daylight = time.daylight
orig_timezone = time.timezone
orig_altzone = time.altzone
try:
time.daylight = False
time.timezone = -3600
self._run_test("{dt:O}", "2010-01-01 01:00:00")
time.timezone = 7200
self._run_test("{dt:Olocal}", "2009-12-31 22:00:00")
finally:
time.daylight = orig_daylight
time.timezone = orig_timezone
time.altzone = orig_altzone
def test_chain_special(self): def test_chain_special(self):
# multiple replacements # multiple replacements
self._run_test("{a:Rh/C/RE/e/RL/l/}", "Cello wOrld") self._run_test("{a:Rh/C/RE/e/RL/l/}", "Cello wOrld")
@@ -202,6 +236,26 @@ class TestFormatter(unittest.TestCase):
# parse and format datetime # parse and format datetime
self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z/%Y%m%d}", "20100101") self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z/%Y%m%d}", "20100101")
def test_separator(self):
orig_separator = formatter._SEPARATOR
try:
formatter._SEPARATOR = "|"
self._run_test("{a:Rh|C|RE|e|RL|l|}", "Cello wOrld")
self._run_test("{d[b]!s:R1|Q|R2|A|R0|Y|}", "Y")
formatter._SEPARATOR = "##"
self._run_test("{l:J-##Rb##E##}", "a-E-c")
self._run_test("{l:J-##[1:-1]}", "-b-")
formatter._SEPARATOR = "\0"
self._run_test("{d[a]:?<\0>\0L1\0too long\0}", "<too long>")
self._run_test("{d[c]:?<\0>\0L5\0too long\0}", "")
formatter._SEPARATOR = "?"
self._run_test("{ds:D%Y-%m-%dT%H:%M:%S%z?%Y%m%d}", "20100101")
finally:
formatter._SEPARATOR = orig_separator
def test_globals_env(self): def test_globals_env(self):
os.environ["FORMATTER_TEST"] = value = self.kwdict["a"] os.environ["FORMATTER_TEST"] = value = self.kwdict["a"]
@@ -316,8 +370,8 @@ def noarg():
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
self.assertEqual(fmt3.format_map(self.kwdict), "") self.assertEqual(fmt3.format_map(self.kwdict), "")
def _run_test(self, format_string, result, default=None): def _run_test(self, format_string, result, default=None, fmt=format):
fmt = formatter.parse(format_string, default) fmt = formatter.parse(format_string, default, fmt)
output = fmt.format_map(self.kwdict) output = fmt.format_map(self.kwdict)
self.assertEqual(output, result, format_string) self.assertEqual(output, result, format_string)

View File

@@ -339,6 +339,52 @@ class MetadataTest(BasePostprocessorTest):
{"category": "test", "extension": "ext", "filename": "file"} {"category": "test", "extension": "ext", "filename": "file"}
""") """)
def test_metadata_modify(self):
kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3}}
self._create({
"mode": "modify",
"fields": {
"foo" : "{filename}-{foo!s}",
"foo2" : "\fE bar['bax'] + 122",
"bar[baz]": "{_now}",
"bar[ba2]": "test",
},
}, kwdict)
pdict = self.pathfmt.kwdict
self.assertIsNot(kwdict, pdict)
self.assertEqual(pdict["foo"], kwdict["foo"])
self.assertEqual(pdict["bar"], kwdict["bar"])
self._trigger()
self.assertEqual(pdict["foo"] , "file-0")
self.assertEqual(pdict["foo2"] , 123)
self.assertEqual(pdict["bar"]["ba2"], "test")
self.assertIsInstance(pdict["bar"]["baz"], datetime)
def test_metadata_delete(self):
kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3}}
self._create({"mode": "delete", "fields": ["foo", "bar[baz]"]}, kwdict)
pdict = self.pathfmt.kwdict
self.assertIsNot(kwdict, pdict)
self.assertEqual(pdict["foo"], kwdict["foo"])
self.assertEqual(pdict["bar"], kwdict["bar"])
del kwdict["foo"]
del kwdict["bar"]["baz"]
self._trigger()
self.assertNotIn("foo", pdict)
self.assertNotIn("baz", pdict["bar"])
self.assertEqual(kwdict["bar"], pdict["bar"])
self._trigger()
self.assertNotIn("foo", pdict)
self.assertNotIn("baz", pdict["bar"])
self.assertEqual(kwdict["bar"], pdict["bar"])
@staticmethod @staticmethod
def _output(mock): def _output(mock):
return "".join( return "".join(
@@ -350,10 +396,6 @@ class MetadataTest(BasePostprocessorTest):
class MtimeTest(BasePostprocessorTest): class MtimeTest(BasePostprocessorTest):
def test_mtime_default(self):
pp = self._create()
self.assertEqual(pp.key, "date")
def test_mtime_datetime(self): def test_mtime_datetime(self):
self._create(None, {"date": datetime(1980, 1, 1)}) self._create(None, {"date": datetime(1980, 1, 1)})
self._trigger() self._trigger()
@@ -364,31 +406,36 @@ class MtimeTest(BasePostprocessorTest):
self._trigger() self._trigger()
self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
def test_mtime_custom(self): def test_mtime_key(self):
self._create({"key": "foo"}, {"foo": 315532800}) self._create({"key": "foo"}, {"foo": 315532800})
self._trigger() self._trigger()
self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
def test_mtime_value(self):
self._create({"value": "{foo}"}, {"foo": 315532800})
self._trigger()
self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
class ZipTest(BasePostprocessorTest): class ZipTest(BasePostprocessorTest):
def test_zip_default(self): def test_zip_default(self):
pp = self._create() pp = self._create()
self.assertEqual(self.job.hooks["file"][0], pp.write) self.assertEqual(self.job.hooks["file"][0], pp.write_fast)
self.assertEqual(pp.path, self.pathfmt.realdirectory) self.assertEqual(pp.path, self.pathfmt.realdirectory[:-1])
self.assertEqual(pp.delete, True) self.assertEqual(pp.delete, True)
self.assertEqual(pp.args, ( self.assertEqual(pp.args, (
pp.path[:-1] + ".zip", "a", zipfile.ZIP_STORED, True, pp.path + ".zip", "a", zipfile.ZIP_STORED, True,
)) ))
self.assertTrue(pp.args[0].endswith("/test.zip")) self.assertTrue(pp.args[0].endswith("/test.zip"))
def test_zip_safe(self): def test_zip_safe(self):
pp = self._create({"mode": "safe"}) pp = self._create({"mode": "safe"})
self.assertEqual(self.job.hooks["file"][0], pp.write_safe) self.assertEqual(self.job.hooks["file"][0], pp.write_safe)
self.assertEqual(pp.path, self.pathfmt.realdirectory) self.assertEqual(pp.path, self.pathfmt.realdirectory[:-1])
self.assertEqual(pp.delete, True) self.assertEqual(pp.delete, True)
self.assertEqual(pp.args, ( self.assertEqual(pp.args, (
pp.path[:-1] + ".zip", "a", zipfile.ZIP_STORED, True, pp.path + ".zip", "a", zipfile.ZIP_STORED, True,
)) ))
self.assertTrue(pp.args[0].endswith("/test.zip")) self.assertTrue(pp.args[0].endswith("/test.zip"))
@@ -400,7 +447,7 @@ class ZipTest(BasePostprocessorTest):
}) })
self.assertEqual(pp.delete, False) self.assertEqual(pp.delete, False)
self.assertEqual(pp.args, ( self.assertEqual(pp.args, (
pp.path[:-1] + ".cbz", "a", zipfile.ZIP_DEFLATED, True, pp.path + ".cbz", "a", zipfile.ZIP_DEFLATED, True,
)) ))
self.assertTrue(pp.args[0].endswith("/test.cbz")) self.assertTrue(pp.args[0].endswith("/test.cbz"))
@@ -439,9 +486,9 @@ class ZipTest(BasePostprocessorTest):
with zipfile.ZipFile(pp.zfile.filename) as file: with zipfile.ZipFile(pp.zfile.filename) as file:
nti = file.NameToInfo nti = file.NameToInfo
self.assertEqual(len(pp.zfile.NameToInfo), 3) self.assertEqual(len(pp.zfile.NameToInfo), 3)
self.assertIn("file0.ext", pp.zfile.NameToInfo) self.assertIn("file0.ext", nti)
self.assertIn("file1.ext", pp.zfile.NameToInfo) self.assertIn("file1.ext", nti)
self.assertIn("file2.ext", pp.zfile.NameToInfo) self.assertIn("file2.ext", nti)
os.unlink(pp.zfile.filename) os.unlink(pp.zfile.filename)