Merge branch 'archive'
This commit is contained in:
@@ -353,6 +353,20 @@ Description Additional key-value pairs to be added to each metadata dictionary.
|
||||
=========== =====
|
||||
|
||||
|
||||
extractor.*.archive
|
||||
-------------------
|
||||
=========== =====
|
||||
Type ``string``
|
||||
Default ``null``
|
||||
Description File to store IDs of downloaded files in. Downloads of files
|
||||
already recorded in this archive file will be skipped_.
|
||||
|
||||
The resulting archive file is not a plain text file but an SQLite3
|
||||
database, as lookup operations are significantly faster when the
|
||||
number of stored IDs gets reasonably large.
|
||||
=========== =====
|
||||
|
||||
|
||||
Extractor-specific Options
|
||||
==========================
|
||||
|
||||
@@ -788,6 +802,7 @@ How To - login and visit Tumblr's Applications_ section
|
||||
.. |datetime.max| replace:: ``datetime.max``
|
||||
.. |strptime| replace:: strftime() and strptime() Behavior
|
||||
|
||||
.. _skipped: `extractor.*.skip`_
|
||||
.. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_
|
||||
.. _date-format: extractor.reddit.date-format_
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
},
|
||||
"extractor":
|
||||
{
|
||||
"archive": null,
|
||||
"skip": true,
|
||||
"sleep": 0,
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
# Copyright 2017-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -17,8 +17,10 @@ class FutabaThreadExtractor(Extractor):
|
||||
category = "2chan"
|
||||
subcategory = "thread"
|
||||
directory_fmt = ["{category}", "{board_name}", "{thread}"]
|
||||
pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"]
|
||||
filename_fmt = "{tim}.{extension}"
|
||||
archive_fmt = "{board}_{thread}_{tim}"
|
||||
urlfmt = "https://{server}.2chan.net/{board}/src/{filename}"
|
||||
pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"]
|
||||
test = [("http://dec.2chan.net/70/res/947.htm", {
|
||||
"url": "c5c12b80b290e224b6758507b3bb952044f4595b",
|
||||
"keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
|
||||
|
||||
@@ -20,6 +20,7 @@ class BooruExtractor(SharedConfigExtractor):
|
||||
"""Base class for all booru extractors"""
|
||||
basecategory = "booru"
|
||||
filename_fmt = "{category}_{id}_{md5}.{extension}"
|
||||
archive_fmt = "{id}"
|
||||
api_url = ""
|
||||
per_page = 50
|
||||
page_start = 1
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015-2017 Mike Fährmann
|
||||
# Copyright 2015-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -19,7 +19,8 @@ class ChanThreadExtractor(Extractor):
|
||||
category = "chan"
|
||||
subcategory = "thread"
|
||||
directory_fmt = ["{category}", "{board}", "{thread} - {title}"]
|
||||
filename_fmt = "{tim}-{filename}{ext}"
|
||||
filename_fmt = "{tim}-{filename}.{extension}"
|
||||
archive_fmt = "{board}_{thread}_{tim}"
|
||||
api_url = ""
|
||||
file_url = ""
|
||||
|
||||
@@ -69,6 +70,7 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
|
||||
directory_fmt = ["{category}", "{board[shortname]}",
|
||||
"{thread_num}{title:? - //}"]
|
||||
filename_fmt = "{media[media]}"
|
||||
# Single braces so that str.format() substitutes the board's short name;
# doubled braces are an escape and would emit the literal text
# "{board[shortname]}" for every board, dropping the board from the key.
archive_fmt = "{board[shortname]}_{num}_{timestamp}"
|
||||
root = ""
|
||||
referer = True
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ class Extractor():
|
||||
categorytransfer = False
|
||||
directory_fmt = ["{category}"]
|
||||
filename_fmt = "{name}.{extension}"
|
||||
archive_fmt = ""
|
||||
cookiedomain = ""
|
||||
|
||||
def __init__(self):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015-2017 Mike Fährmann
|
||||
# Copyright 2015-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -20,8 +20,9 @@ import re
|
||||
class DeviantartExtractor(Extractor):
|
||||
"""Base class for deviantart extractors"""
|
||||
category = "deviantart"
|
||||
filename_fmt = "{category}_{index}_{title}.{extension}"
|
||||
directory_fmt = ["{category}", "{author[username]!l}"]
|
||||
filename_fmt = "{category}_{index}_{title}.{extension}"
|
||||
archive_fmt = "{index}.{extension}"
|
||||
|
||||
def __init__(self, match=None):
|
||||
Extractor.__init__(self)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014-2017 Mike Fährmann
|
||||
# Copyright 2014-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -22,6 +22,7 @@ class ExhentaiGalleryExtractor(Extractor):
|
||||
subcategory = "gallery"
|
||||
directory_fmt = ["{category}", "{gallery_id}"]
|
||||
filename_fmt = "{gallery_id}_{num:>04}_{image_token}_{name}.{extension}"
|
||||
archive_fmt = "{gallery_id}_{num}"
|
||||
pattern = [r"(?:https?://)?(g\.e-|e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]
|
||||
test = [
|
||||
("https://exhentai.org/g/960460/4f0e369d82/", {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
# Copyright 2017-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -16,6 +16,7 @@ class FlickrExtractor(Extractor):
|
||||
"""Base class for flickr extractors"""
|
||||
category = "flickr"
|
||||
filename_fmt = "{category}_{id}.{extension}"
|
||||
archive_fmt = "{id}"
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self)
|
||||
|
||||
@@ -62,6 +62,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
|
||||
directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
|
||||
filename_fmt = (
|
||||
"{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
|
||||
archive_fmt = "{id}"
|
||||
method = "default"
|
||||
|
||||
def __init__(self, match, url=None):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014-2017 Mike Fährmann
|
||||
# Copyright 2014-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ class GelbooruExtractor(SharedConfigExtractor):
|
||||
basecategory = "booru"
|
||||
category = "gelbooru"
|
||||
filename_fmt = "{category}_{id}_{md5}.{extension}"
|
||||
archive_fmt = "{id}"
|
||||
api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"
|
||||
|
||||
def __init__(self):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
# Copyright 2017-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -15,6 +15,7 @@ from .. import exception
|
||||
class GfycatExtractor(Extractor):
|
||||
"""Base class for gfycat extractors"""
|
||||
category = "gfycat"
|
||||
archive_fmt = "{gfyName}"
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self)
|
||||
|
||||
@@ -69,6 +69,7 @@ class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor):
|
||||
directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"]
|
||||
filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
|
||||
"{page:>03}.{extension}")
|
||||
archive_fmt = "{manga_id}_{chapter}_{page}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"]
|
||||
test = [("http://www.hbrowse.com/10363/c00000", {
|
||||
"url": "634f4800858913f097bc3b62a8fedaf74b5254bd",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015-2017 Mike Fährmann
|
||||
# Copyright 2015-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ class HentaifoundryUserExtractor(Extractor):
|
||||
subcategory = "user"
|
||||
directory_fmt = ["{category}", "{artist}"]
|
||||
filename_fmt = "{category}_{index}_{title}.{extension}"
|
||||
archive_fmt = "{index}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com/"
|
||||
r"(?:pictures/user/([^/]+)/?$|user/([^/]+)/profile)"]
|
||||
test = [
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015-2017 Mike Fährmann
|
||||
# Copyright 2015-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -19,6 +19,7 @@ class HitomiGalleryExtractor(Extractor):
|
||||
subcategory = "gallery"
|
||||
directory_fmt = ["{category}", "{gallery_id} {title}"]
|
||||
filename_fmt = "{category}_{gallery_id}_{num:>03}_{name}.{extension}"
|
||||
archive_fmt = "{gallery_id}_{num}"
|
||||
pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html"]
|
||||
test = [("https://hitomi.la/galleries/867789.html", {
|
||||
"url": "e42a47dfadda93e4bf37e82b1dc9ad29edfa9130",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014-2017 Mike Fährmann
|
||||
# Copyright 2014-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -17,7 +17,8 @@ class ImagebamGalleryExtractor(AsynchronousExtractor):
|
||||
category = "imagebam"
|
||||
subcategory = "gallery"
|
||||
directory_fmt = ["{category}", "{title} - {gallery_key}"]
|
||||
filename_fmt = "{num:>03}-{filename}"
|
||||
filename_fmt = "{num:>03}-{name}.{extension}"
|
||||
archive_fmt = "{image_id}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"]
|
||||
test = [(("http://www.imagebam.com/"
|
||||
"gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), {
|
||||
@@ -76,6 +77,7 @@ class ImagebamImageExtractor(Extractor):
|
||||
"""Extractor for single images from imagebam.com"""
|
||||
category = "imagebam"
|
||||
subcategory = "image"
|
||||
archive_fmt = "{token}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/image/([0-9a-f]{15})"]
|
||||
test = [("http://www.imagebam.com/image/94d56c502511890", {
|
||||
"url": "b384893c35a01a09c58018db71ddc4cf2480be95",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2016-2017 Mike Fährmann
|
||||
# Copyright 2016-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -13,12 +13,17 @@ from .. import text, util
|
||||
import json
|
||||
|
||||
|
||||
class ImagefapGalleryExtractor(Extractor):
|
||||
"""Extractor for image galleries from imagefap.com"""
|
||||
class ImagefapExtractor(Extractor):
|
||||
"""Base class for imagefap extractors"""
|
||||
category = "imagefap"
|
||||
subcategory = "gallery"
|
||||
directory_fmt = ["{category}", "{gallery_id} {title}"]
|
||||
filename_fmt = "{category}_{gallery_id}_{name}.{extension}"
|
||||
archive_fmt = "{gallery_id}_{image_id}"
|
||||
|
||||
|
||||
class ImagefapGalleryExtractor(ImagefapExtractor):
|
||||
"""Extractor for image galleries from imagefap.com"""
|
||||
subcategory = "gallery"
|
||||
pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/"
|
||||
r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)")]
|
||||
test = [
|
||||
@@ -35,7 +40,7 @@ class ImagefapGalleryExtractor(Extractor):
|
||||
]
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self)
|
||||
ImagefapExtractor.__init__(self)
|
||||
self.gid = match.group(1)
|
||||
self.image_id = ""
|
||||
|
||||
@@ -80,12 +85,9 @@ class ImagefapGalleryExtractor(Extractor):
|
||||
params["idx"] += 24
|
||||
|
||||
|
||||
class ImagefapImageExtractor(Extractor):
|
||||
class ImagefapImageExtractor(ImagefapExtractor):
|
||||
"""Extractor for single images from imagefap.com"""
|
||||
category = "imagefap"
|
||||
subcategory = "image"
|
||||
directory_fmt = ["{category}", "{gallery_id} {title}"]
|
||||
filename_fmt = "{category}_{gallery_id}_{name}.{extension}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)"]
|
||||
test = [("http://www.imagefap.com/photo/1369341772/", {
|
||||
"url": "24cc4312e4a5084f39f1e35af5ba92e5f7c1ad3c",
|
||||
@@ -93,7 +95,7 @@ class ImagefapImageExtractor(Extractor):
|
||||
})]
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self)
|
||||
ImagefapExtractor.__init__(self)
|
||||
self.image_id = match.group(1)
|
||||
|
||||
def items(self):
|
||||
@@ -132,9 +134,8 @@ class ImagefapImageExtractor(Extractor):
|
||||
return json_dict
|
||||
|
||||
|
||||
class ImagefapUserExtractor(Extractor):
|
||||
class ImagefapUserExtractor(ImagefapExtractor):
|
||||
"""Extractor for all galleries from a user at imagefap.com"""
|
||||
category = "imagefap"
|
||||
subcategory = "user"
|
||||
categorytransfer = True
|
||||
pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/"
|
||||
@@ -146,7 +147,7 @@ class ImagefapUserExtractor(Extractor):
|
||||
})]
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self)
|
||||
ImagefapExtractor.__init__(self)
|
||||
try:
|
||||
self.user_id = int(match.group(1))
|
||||
self.user = None
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2016-2017 Mike Fährmann
|
||||
# Copyright 2016-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ from urllib.parse import urljoin
|
||||
class ImagehostImageExtractor(Extractor):
|
||||
"""Base class for single-image extractors for various imagehosts"""
|
||||
subcategory = "image"
|
||||
archive_fmt = "{token}"
|
||||
https = False
|
||||
method = "post"
|
||||
params = "simple"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014-2017 Mike Fährmann
|
||||
# Copyright 2014-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -16,6 +16,7 @@ import re
|
||||
class ImgboxExtractor(Extractor):
|
||||
"""Base class for imgbox extractors"""
|
||||
category = "imgbox"
|
||||
archive_fmt = "{image_key}"
|
||||
root = "https://imgbox.com"
|
||||
|
||||
def items(self):
|
||||
@@ -62,7 +63,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor):
|
||||
"""Extractor for image galleries from imgbox.com"""
|
||||
subcategory = "gallery"
|
||||
directory_fmt = ["{category}", "{title} - {gallery_key}"]
|
||||
filename_fmt = "{num:>03}-{filename}"
|
||||
filename_fmt = "{num:>03}-{name}.{extension}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"]
|
||||
test = [
|
||||
("https://imgbox.com/g/JaX5V5HX7g", {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014-2017 Mike Fährmann
|
||||
# Copyright 2014-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -15,6 +15,7 @@ from .. import text
|
||||
class ImgchiliExtractor(Extractor):
|
||||
"""Base class for imgchili extractors"""
|
||||
category = "imgchili"
|
||||
archive_fmt = "{image_id}"
|
||||
root = "https://imgchili.net"
|
||||
|
||||
def __init__(self, match):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015-2017 Mike Fährmann
|
||||
# Copyright 2015-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ class ImgthGalleryExtractor(Extractor):
|
||||
subcategory = "gallery"
|
||||
directory_fmt = ["{category}", "{gallery_id} {title}"]
|
||||
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
|
||||
archive_fmt = "{gallery_id}_{num}"
|
||||
pattern = [r"(?:https?://)?imgth\.com/gallery/(\d+)"]
|
||||
test = [("http://imgth.com/gallery/37/wallpaper-anime", {
|
||||
"url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",
|
||||
|
||||
@@ -16,6 +16,7 @@ import json
|
||||
class ImgurExtractor(Extractor):
|
||||
"""Base class for imgur extractors"""
|
||||
category = "imgur"
|
||||
archive_fmt = "{hash}"
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2016-2017 Mike Fährmann
|
||||
# Copyright 2016-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousExtractor):
|
||||
category = "khinsider"
|
||||
subcategory = "soundtrack"
|
||||
directory_fmt = ["{category}", "{album}"]
|
||||
archive_fmt = "{album}_{name}"
|
||||
pattern = [r"(?:https?://)?downloads\.khinsider\.com/"
|
||||
r"game-soundtracks/album/([^/?&#]+)"]
|
||||
test = [(("https://downloads.khinsider.com/game-soundtracks/"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2016-2017 Mike Fährmann
|
||||
# Copyright 2016-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ class LusciousAlbumExtractor(AsynchronousExtractor):
|
||||
subcategory = "album"
|
||||
directory_fmt = ["{category}", "{gallery_id} {title}"]
|
||||
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
|
||||
archive_fmt = "{gallery_id}_{image_id}"
|
||||
pattern = [(r"(?:https?://)?(?:www\.|members\.)?luscious\.net/"
|
||||
r"(?:c/[^/?&#]+/)?(?:pictures/album|albums)/([^/?&#]+_(\d+))")]
|
||||
test = [
|
||||
|
||||
@@ -59,6 +59,7 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
|
||||
|
||||
class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
|
||||
"""Extractor for manga-chapters from mangareader.net"""
|
||||
archive_fmt = "{manga}_{chapter}_{page}"
|
||||
pattern = [
|
||||
(r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"),
|
||||
(r"(?:https?://)?(?:www\.)?mangareader\.net"
|
||||
|
||||
@@ -16,6 +16,7 @@ from urllib.parse import urljoin
|
||||
class MangastreamChapterExtractor(ChapterExtractor):
|
||||
"""Extractor for manga-chapters from mangastream.com"""
|
||||
category = "mangastream"
|
||||
archive_fmt = "{chapter_id}_{page}"
|
||||
pattern = [(r"(?:https?://)?(?:www\.)?(?:readms|mangastream)\.(?:com|net)/"
|
||||
r"r(?:ead)?/([^/]*/([^/]+)/(\d+))")]
|
||||
test = [("https://readms.net/r/onepunch_man/087/4874/1", None)]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015-2017 Mike Fährmann
|
||||
# Copyright 2015-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -19,6 +19,7 @@ class NhentaiGalleryExtractor(Extractor):
|
||||
subcategory = "gallery"
|
||||
directory_fmt = ["{category}", "{gallery_id} {title}"]
|
||||
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
|
||||
archive_fmt = "{gallery_id}_{num}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?nhentai\.net/g/(\d+)"]
|
||||
test = [("http://nhentai.net/g/147850/", {
|
||||
"url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0",
|
||||
|
||||
@@ -18,6 +18,7 @@ class NijieExtractor(AsynchronousExtractor):
|
||||
category = "nijie"
|
||||
directory_fmt = ["{category}", "{artist_id}"]
|
||||
filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}"
|
||||
archive_fmt = "{image_id}_{index}"
|
||||
cookiedomain = "nijie.info"
|
||||
popup_url = "https://nijie.info/view_popup.php?id="
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ class PahealExtractor(SharedConfigExtractor):
|
||||
basecategory = "booru"
|
||||
category = "paheal"
|
||||
filename_fmt = "{category}_{id}_{md5}.{extension}"
|
||||
archive_fmt = "{id}"
|
||||
root = "http://rule34.paheal.net"
|
||||
|
||||
def items(self):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
# Copyright 2017-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -17,6 +17,7 @@ class PawooExtractor(Extractor):
|
||||
category = "pawoo"
|
||||
directory_fmt = ["{category}", "{account[username]}"]
|
||||
filename_fmt = "{category}_{id}_{media[id]}.{extension}"
|
||||
archive_fmt = "{media[id]}"
|
||||
|
||||
def __init__(self):
|
||||
Extractor.__init__(self)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2016-2017 Mike Fährmann
|
||||
# Copyright 2016-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -16,6 +16,7 @@ class PinterestExtractor(Extractor):
|
||||
"""Base class for pinterest extractors"""
|
||||
category = "pinterest"
|
||||
filename_fmt = "{category}_{pin_id}.{extension}"
|
||||
archive_fmt = "{pin_id}"
|
||||
|
||||
def __init__(self):
|
||||
Extractor.__init__(self)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014-2017 Mike Fährmann
|
||||
# Copyright 2014-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -19,6 +19,7 @@ class PixivExtractor(Extractor):
|
||||
category = "pixiv"
|
||||
directory_fmt = ["{category}", "{user[id]} {user[account]}"]
|
||||
filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}"
|
||||
archive_fmt = "{id}{num}"
|
||||
illust_url = "https://www.pixiv.net/member_illust.php?mode=medium"
|
||||
|
||||
def __init__(self):
|
||||
|
||||
@@ -18,6 +18,7 @@ class ReadcomiconlineBase():
|
||||
category = "readcomiconline"
|
||||
directory_fmt = ["{category}", "{comic}", "{issue:>03}"]
|
||||
filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
|
||||
archive_fmt = "{comic}_{issue}_{page}"
|
||||
root = "http://readcomiconline.to"
|
||||
useragent = "Wget/1.19.2 (linux-gnu)"
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
# Copyright 2017-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -118,6 +118,7 @@ class RedditImageExtractor(Extractor):
|
||||
"""Extractor for reddit-hosted images"""
|
||||
category = "reddit"
|
||||
subcategory = "image"
|
||||
archive_fmt = "{name}"
|
||||
pattern = [r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)"
|
||||
r"/[^/?&#]+(?:\?[^#]*)?"]
|
||||
test = [
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014-2017 Mike Fährmann
|
||||
# Copyright 2014-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -20,6 +20,7 @@ class SankakuExtractor(SharedConfigExtractor):
|
||||
basecategory = "booru"
|
||||
category = "sankaku"
|
||||
filename_fmt = "{category}_{id}_{md5}.{extension}"
|
||||
archive_fmt = "{id}"
|
||||
cookienames = ("login", "pass_hash")
|
||||
cookiedomain = "chan.sankakucomplex.com"
|
||||
subdomain = "chan"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2016-2017 Mike Fährmann
|
||||
# Copyright 2016-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -16,6 +16,7 @@ from ..cache import cache
|
||||
class SeigaExtractor(Extractor):
|
||||
"""Base class for seiga extractors"""
|
||||
category = "seiga"
|
||||
archive_fmt = "{image_id}"
|
||||
cookiedomain = ".nicovideo.jp"
|
||||
|
||||
def __init__(self):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2016-2017 Mike Fährmann
|
||||
# Copyright 2016-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ class SenmangaChapterExtractor(Extractor):
|
||||
subcategory = "chapter"
|
||||
directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
|
||||
filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
|
||||
archive_fmt = "{manga}_{chapter_string}_{page}"
|
||||
pattern = [r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"]
|
||||
test = [
|
||||
("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
|
||||
|
||||
@@ -18,6 +18,7 @@ class SlideshareExtractor(Extractor):
|
||||
subcategory = "presentation"
|
||||
directory_fmt = ["{category}", "{user}"]
|
||||
filename_fmt = "{presentation}-{num:>02}.{extension}"
|
||||
archive_fmt = "{presentation}_{num}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net"
|
||||
r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)"]
|
||||
test = [
|
||||
|
||||
@@ -38,7 +38,7 @@ class SpectrumnexusChapterExtractor(ChapterExtractor):
|
||||
category = "spectrumnexus"
|
||||
directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
|
||||
filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
|
||||
|
||||
archive_fmt = "{manga}_{chapter_string}_{page}"
|
||||
pattern = [r"(?:https?://)?view\.thespectrum\.net/series/"
|
||||
r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"]
|
||||
test = [(("http://view.thespectrum.net/series/"
|
||||
|
||||
@@ -17,7 +17,7 @@ import re
|
||||
def _original_image(url):
|
||||
match = re.match(
|
||||
r"https?://\d+\.media\.tumblr\.com"
|
||||
r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+)_\d+\.([0-9a-z]+)",
|
||||
r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+_)\d+(\.[0-9a-z]+)",
|
||||
url)
|
||||
|
||||
if not match:
|
||||
@@ -26,8 +26,8 @@ def _original_image(url):
|
||||
path, key, ext = match.groups()
|
||||
|
||||
return (
|
||||
"".join((root, path, "_raw." if key else "_1280.", ext)),
|
||||
"".join((root, path, "_500.", ext)),
|
||||
"".join((root, path, "raw" if key else "1280", ext)),
|
||||
"".join((root, path, "500", ext)),
|
||||
url,
|
||||
)
|
||||
|
||||
@@ -53,6 +53,7 @@ class TumblrExtractor(Extractor):
|
||||
category = "tumblr"
|
||||
directory_fmt = ["{category}", "{name}"]
|
||||
filename_fmt = "{category}_{blog_name}_{id}o{offset}.{extension}"
|
||||
archive_fmt = "{id}_{offset}"
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2016-2017 Mike Fährmann
|
||||
# Copyright 2016-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ class TwitterTweetExtractor(Extractor):
|
||||
subcategory = "tweet"
|
||||
directory_fmt = ["{category}", "{user}"]
|
||||
filename_fmt = "{tweet_id}_{num}.{extension}"
|
||||
archive_fmt = "{tweet_id}_{num}"
|
||||
pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/"
|
||||
r"(([^/]+)/status/(\d+))"]
|
||||
test = [
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
# Copyright 2017-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -18,6 +18,7 @@ class WarosuThreadExtractor(Extractor):
|
||||
subcategory = "thread"
|
||||
directory_fmt = ["{category}", "{board}", "{thread} - {title}"]
|
||||
filename_fmt = "{tim}-{filename}{ext}"
|
||||
archive_fmt = "{board}_{thread}_{tim}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"]
|
||||
test = [
|
||||
("https://warosu.org/jp/thread/16656025", {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
# Copyright 2017-2018 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -29,6 +29,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
|
||||
subcategory = "gallery"
|
||||
directory_fmt = ["{category}", "{user[name]}", "{title}"]
|
||||
filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
|
||||
archive_fmt = "{gallery_id}_{num}"
|
||||
pattern = [r"(?:https?://)?(?:www\.)?xvideos\.com"
|
||||
r"/profiles/([^/?&#]+)/photos/(\d+)"]
|
||||
test = [
|
||||
|
||||
@@ -10,6 +10,7 @@ import sys
|
||||
import time
|
||||
import json
|
||||
import hashlib
|
||||
import logging
|
||||
from . import extractor, downloader, config, util, output, exception
|
||||
from .extractor.message import Message
|
||||
|
||||
@@ -152,34 +153,57 @@ class DownloadJob(Job):
|
||||
|
||||
def __init__(self, url, parent=None):
|
||||
Job.__init__(self, url, parent)
|
||||
self.log = logging.getLogger("download")
|
||||
self.pathfmt = None
|
||||
self.archive = None
|
||||
self.sleep = None
|
||||
self.downloaders = {}
|
||||
self.out = output.select()
|
||||
|
||||
def handle_url(self, url, keywords):
|
||||
def handle_url(self, url, keywords, fallback=None):
|
||||
"""Download the resource specified in 'url'"""
|
||||
if self._prepare_download(keywords):
|
||||
dlobj = self.get_downloader(url)
|
||||
if not dlobj.download(url, self.pathfmt):
|
||||
self._report_failure(dlobj)
|
||||
# prepare download
|
||||
self.pathfmt.set_keywords(keywords)
|
||||
|
||||
if self.pathfmt.exists(self.archive):
|
||||
self.out.skip(self.pathfmt.path)
|
||||
return
|
||||
|
||||
if self.sleep:
|
||||
time.sleep(self.sleep)
|
||||
|
||||
# download from URL
|
||||
if not self.get_downloader(url).download(url, self.pathfmt):
|
||||
|
||||
# use fallback URLs if available
|
||||
for num, url in enumerate(fallback or (), 1):
|
||||
self.log.info("Trying fallback URL #%d", num)
|
||||
if self.get_downloader(url).download(url, self.pathfmt):
|
||||
break
|
||||
else:
|
||||
# download failed
|
||||
self.log.error(
|
||||
"Failed to download %s", self.pathfmt.filename)
|
||||
return
|
||||
|
||||
# download succeeded
|
||||
if self.archive:
|
||||
self.archive.add()
|
||||
|
||||
def handle_urllist(self, urls, keywords):
|
||||
"""Download the resource specified in 'url'"""
|
||||
if self._prepare_download(keywords):
|
||||
for num, url in enumerate(urls):
|
||||
dlobj = self.get_downloader(url)
|
||||
if num:
|
||||
dlobj.log.info("Trying fallback URL #%d", num)
|
||||
if dlobj.download(url, self.pathfmt):
|
||||
return
|
||||
self._report_failure(dlobj)
|
||||
fallback = iter(urls)
|
||||
url = next(fallback)
|
||||
self.handle_url(url, keywords, fallback)
|
||||
|
||||
def handle_directory(self, keywords):
    """Set and create the target directory for downloads"""
    if self.pathfmt:
        self.pathfmt.set_directory(keywords)
        return

    # first call: build the path formatter and read per-extractor options
    extr = self.extractor
    self.pathfmt = util.PathFormat(extr)
    self.sleep = extr.config("sleep")
    archive_path = extr.config("archive")
    if archive_path:
        self.archive = util.DownloadArchive(extr, archive_path)
    self.pathfmt.set_directory(keywords)
|
||||
|
||||
def handle_queue(self, url, keywords):
|
||||
@@ -201,18 +225,6 @@ class DownloadJob(Job):
|
||||
self.downloaders[scheme] = instance
|
||||
return instance
|
||||
|
||||
def _prepare_download(self, keywords):
|
||||
self.pathfmt.set_keywords(keywords)
|
||||
if self.pathfmt.exists():
|
||||
self.out.skip(self.pathfmt.path)
|
||||
return False
|
||||
if self.sleep:
|
||||
time.sleep(self.sleep)
|
||||
return True
|
||||
|
||||
def _report_failure(self, dlobj):
|
||||
dlobj.log.error("Failed to download %s", self.pathfmt.filename)
|
||||
|
||||
|
||||
class KeywordJob(Job):
|
||||
"""Print available keywords"""
|
||||
|
||||
@@ -206,6 +206,12 @@ def build_parser():
|
||||
)
|
||||
|
||||
selection = parser.add_argument_group("Selection Options")
|
||||
selection.add_argument(
|
||||
"--download-archive",
|
||||
metavar="FILE", dest="archive", action=ConfigAction,
|
||||
help=("Record all downloaded files in the archive file and "
|
||||
"skip downloading any file already in it.")
|
||||
)
|
||||
selection.add_argument(
|
||||
"--range",
|
||||
metavar="RANGE", dest="image_range",
|
||||
|
||||
@@ -19,6 +19,7 @@ import shutil
|
||||
import string
|
||||
import _string
|
||||
import hashlib
|
||||
import sqlite3
|
||||
import datetime
|
||||
import itertools
|
||||
import urllib.parse
|
||||
@@ -373,22 +374,31 @@ class PathFormat():
|
||||
if os.altsep:
|
||||
self.basedirectory = self.basedirectory.replace(os.altsep, os.sep)
|
||||
|
||||
skipmode = extractor.config("skip", True)
|
||||
if skipmode == "abort":
|
||||
self.exists = self._exists_abort
|
||||
elif skipmode == "exit":
|
||||
self.exists = self._exists_exit
|
||||
elif not skipmode:
|
||||
self.exists = lambda: False
|
||||
skip = extractor.config("skip", True)
|
||||
if skip:
|
||||
if skip == "abort":
|
||||
self._skipexc = exception.StopExtraction
|
||||
elif skip == "exit":
|
||||
self._skipexc = exit
|
||||
else:
|
||||
self._skipexc = None
|
||||
else:
|
||||
self.exists = lambda x=None: False
|
||||
|
||||
def open(self, mode="wb"):
    """Open the target file in 'mode' and return its file object

    The part-file path is used if one is set, the real path otherwise.
    """
    target = self.partpath or self.realpath
    return open(target, mode)
|
||||
|
||||
def exists(self, archive=None):
    """Return True if the current file should be considered downloaded

    That is the case if 'path' is complete (has an extension) and
    refers to an existing path, or if 'archive' is given and its
    check() reports the current keywords as known.

    Raises the exception configured in '_skipexc', if any, instead of
    returning True.
    """
    if (self.has_extension and os.path.exists(self.realpath) or
            archive and archive.check(self.keywords)):
        if self._skipexc:
            raise self._skipexc()
        if not self.has_extension:
            # archive hit without a known extension: finalize the path
            # with an empty extension and drop the dangling "."
            self.set_extension("")
            # endswith() instead of [-1] so an empty path cannot raise
            if self.path.endswith("."):
                self.path = self.path[:-1]
        return True
    return False
|
||||
|
||||
def set_directory(self, keywords):
|
||||
@@ -473,16 +483,6 @@ class PathFormat():
|
||||
shutil.copyfile(self.partpath, self.realpath)
|
||||
os.unlink(self.partpath)
|
||||
|
||||
def _exists_abort(self):
|
||||
if self.has_extension and os.path.exists(self.realpath):
|
||||
raise exception.StopExtraction()
|
||||
return False
|
||||
|
||||
def _exists_exit(self):
|
||||
if self.has_extension and os.path.exists(self.realpath):
|
||||
exit()
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def adjust_path(path):
|
||||
"""Enable longer-than-260-character paths on windows"""
|
||||
@@ -535,3 +535,30 @@ class OAuthSession():
|
||||
OAuthSession.quote(str(key)) + "=" + OAuthSession.quote(str(value))
|
||||
for key, value in sorted(params.items()) if value
|
||||
)
|
||||
|
||||
|
||||
class DownloadArchive():
    """Record of downloaded files, stored in an SQLite3 database

    Each file is identified by a key built from the extractor's category
    followed by its 'archive_fmt' (or 'filename_fmt' if 'archive_fmt' is
    empty), formatted with the file's keyword dictionary.
    """

    def __init__(self, extractor, path):
        """Open (and create, if necessary) the archive database at 'path'"""
        # keep the connection around so close() can release it
        self.connection = con = sqlite3.connect(path)
        # isolation_level None: no implicit transactions, every INSERT
        # is written to the database right away
        con.isolation_level = None
        self.cursor = con.cursor()
        self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
                            "(entry PRIMARY KEY) WITHOUT ROWID")
        self.keygen = (
            extractor.category +
            (extractor.archive_fmt or extractor.filename_fmt)
        ).format_map
        # key of the item last passed to check() or add()
        self._key = None

    def check(self, kwdict):
        """Return True if item described by 'kwdict' exists in archive"""
        self._key = self.keygen(kwdict)
        self.cursor.execute(
            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (self._key,))
        # fetchone() yields a row tuple or None; expose a real boolean
        return self.cursor.fetchone() is not None

    def add(self, kwdict=None):
        """Add an item to the archive

        Without 'kwdict', the item last used in check() is added;
        otherwise the item described by 'kwdict'. Does nothing if
        neither is available.
        """
        if kwdict is not None:
            self._key = self.keygen(kwdict)
        if self._key is None:
            return
        self.cursor.execute(
            "INSERT OR IGNORE INTO archive VALUES (?)", (self._key,))

    def close(self):
        """Close the underlying database connection"""
        self.connection.close()
|
||||
|
||||
Reference in New Issue
Block a user