Merge branch 'archive'

2018-02-12 18:07:58 +01:00
parent 20af86b2ea 4d2fadfb6f
commit 3cec533c28
46 changed files with 199 additions and 95 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -353,6 +353,20 @@ Description Additional key-value pairs to be added to each metadata dictionary.
 =========== =====


+extractor.*.archive
+-------------------
+=========== =====
+Type        ``string``
+Default     ``null``
+Description File to store IDs of downloaded files in. Downloads of files
+            already recorded in this archive file will be skipped_.
+
+            The resulting archive file is not a plain text file but an SQLite3
+            database, as lookup operations are significantly faster when the
+            number of stored IDs becomes large.
+=========== =====
+
+
 Extractor-specific Options
 ==========================

@@ -788,6 +802,7 @@ How To      - login and visit Tumblr's Applications_ section
 .. |datetime.max| replace:: ``datetime.max``
 .. |strptime| replace:: strftime() and strptime() Behavior

+.. _skipped: `extractor.*.skip`_
 .. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_
 .. _date-format: extractor.reddit.date-format_

--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -16,6 +16,7 @@
    },
    "extractor":
    {
+        "archive": null,
        "skip": true,
        "sleep": 0,

--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -17,8 +17,10 @@ class FutabaThreadExtractor(Extractor):
    category = "2chan"
    subcategory = "thread"
    directory_fmt = ["{category}", "{board_name}", "{thread}"]
-    pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"]
+    filename_fmt = "{tim}.{extension}"
+    archive_fmt = "{board}_{thread}_{tim}"
    urlfmt = "https://{server}.2chan.net/{board}/src/{filename}"
+    pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"]
    test = [("http://dec.2chan.net/70/res/947.htm", {
        "url": "c5c12b80b290e224b6758507b3bb952044f4595b",
        "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -20,6 +20,7 @@ class BooruExtractor(SharedConfigExtractor):
    """Base class for all booru extractors"""
    basecategory = "booru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
+    archive_fmt = "{id}"
    api_url = ""
    per_page = 50
    page_start = 1
--- a/gallery_dl/extractor/chan.py
+++ b/gallery_dl/extractor/chan.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,8 @@ class ChanThreadExtractor(Extractor):
    category = "chan"
    subcategory = "thread"
    directory_fmt = ["{category}", "{board}", "{thread} - {title}"]
-    filename_fmt = "{tim}-{filename}{ext}"
+    filename_fmt = "{tim}-{filename}.{extension}"
+    archive_fmt = "{board}_{thread}_{tim}"
    api_url = ""
    file_url = ""

@@ -69,6 +70,7 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
    directory_fmt = ["{category}", "{board[shortname]}",
                     "{thread_num}{title:? - //}"]
    filename_fmt = "{media[media]}"
+    archive_fmt = "{board[shortname]}_{num}_{timestamp}"
    root = ""
    referer = True

--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -28,6 +28,7 @@ class Extractor():
    categorytransfer = False
    directory_fmt = ["{category}"]
    filename_fmt = "{name}.{extension}"
+    archive_fmt = ""
    cookiedomain = ""

    def __init__(self):
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -20,8 +20,9 @@ import re
 class DeviantartExtractor(Extractor):
    """Base class for deviantart extractors"""
    category = "deviantart"
-    filename_fmt = "{category}_{index}_{title}.{extension}"
    directory_fmt = ["{category}", "{author[username]!l}"]
+    filename_fmt = "{category}_{index}_{title}.{extension}"
+    archive_fmt = "{index}.{extension}"

    def __init__(self, match=None):
        Extractor.__init__(self)
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -22,6 +22,7 @@ class ExhentaiGalleryExtractor(Extractor):
    subcategory = "gallery"
    directory_fmt = ["{category}", "{gallery_id}"]
    filename_fmt = "{gallery_id}_{num:>04}_{image_token}_{name}.{extension}"
+    archive_fmt = "{gallery_id}_{num}"
    pattern = [r"(?:https?://)?(g\.e-|e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]
    test = [
        ("https://exhentai.org/g/960460/4f0e369d82/", {
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -16,6 +16,7 @@ class FlickrExtractor(Extractor):
    """Base class for flickr extractors"""
    category = "flickr"
    filename_fmt = "{category}_{id}.{extension}"
+    archive_fmt = "{id}"

    def __init__(self, match):
        Extractor.__init__(self)
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -62,6 +62,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
    directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
    filename_fmt = (
        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
+    archive_fmt = "{id}"
    method = "default"

    def __init__(self, match, url=None):
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ class GelbooruExtractor(SharedConfigExtractor):
    basecategory = "booru"
    category = "gelbooru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
+    archive_fmt = "{id}"
    api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"

    def __init__(self):
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -15,6 +15,7 @@ from .. import exception
 class GfycatExtractor(Extractor):
    """Base class for gfycat extractors"""
    category = "gfycat"
+    archive_fmt = "{gfyName}"

    def __init__(self, match):
        Extractor.__init__(self)
--- a/gallery_dl/extractor/hbrowse.py
+++ b/gallery_dl/extractor/hbrowse.py
@@ -69,6 +69,7 @@ class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor):
    directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"]
    filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
                    "{page:>03}.{extension}")
+    archive_fmt = "{manga_id}_{chapter}_{page}"
    pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"]
    test = [("http://www.hbrowse.com/10363/c00000", {
        "url": "634f4800858913f097bc3b62a8fedaf74b5254bd",
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ class HentaifoundryUserExtractor(Extractor):
    subcategory = "user"
    directory_fmt = ["{category}", "{artist}"]
    filename_fmt = "{category}_{index}_{title}.{extension}"
+    archive_fmt = "{index}"
    pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com/"
               r"(?:pictures/user/([^/]+)/?$|user/([^/]+)/profile)"]
    test = [
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -19,6 +19,7 @@ class HitomiGalleryExtractor(Extractor):
    subcategory = "gallery"
    directory_fmt = ["{category}", "{gallery_id} {title}"]
    filename_fmt = "{category}_{gallery_id}_{num:>03}_{name}.{extension}"
+    archive_fmt = "{gallery_id}_{num}"
    pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html"]
    test = [("https://hitomi.la/galleries/867789.html", {
        "url": "e42a47dfadda93e4bf37e82b1dc9ad29edfa9130",
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -17,7 +17,8 @@ class ImagebamGalleryExtractor(AsynchronousExtractor):
    category = "imagebam"
    subcategory = "gallery"
    directory_fmt = ["{category}", "{title} - {gallery_key}"]
-    filename_fmt = "{num:>03}-{filename}"
+    filename_fmt = "{num:>03}-{name}.{extension}"
+    archive_fmt = "{image_id}"
    pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"]
    test = [(("http://www.imagebam.com/"
              "gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), {
@@ -76,6 +77,7 @@ class ImagebamImageExtractor(Extractor):
    """Extractor for single images from imagebam.com"""
    category = "imagebam"
    subcategory = "image"
+    archive_fmt = "{token}"
    pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/image/([0-9a-f]{15})"]
    test = [("http://www.imagebam.com/image/94d56c502511890", {
        "url": "b384893c35a01a09c58018db71ddc4cf2480be95",
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -13,12 +13,17 @@ from .. import text, util
 import json


-class ImagefapGalleryExtractor(Extractor):
-    """Extractor for image galleries from imagefap.com"""
+class ImagefapExtractor(Extractor):
+    """Base class for imagefap extractors"""
    category = "imagefap"
-    subcategory = "gallery"
    directory_fmt = ["{category}", "{gallery_id} {title}"]
    filename_fmt = "{category}_{gallery_id}_{name}.{extension}"
+    archive_fmt = "{gallery_id}_{image_id}"
+
+
+class ImagefapGalleryExtractor(ImagefapExtractor):
+    """Extractor for image galleries from imagefap.com"""
+    subcategory = "gallery"
    pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/"
                r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)")]
    test = [
@@ -35,7 +40,7 @@ class ImagefapGalleryExtractor(Extractor):
    ]

    def __init__(self, match):
-        Extractor.__init__(self)
+        ImagefapExtractor.__init__(self)
        self.gid = match.group(1)
        self.image_id = ""

@@ -80,12 +85,9 @@ class ImagefapGalleryExtractor(Extractor):
            params["idx"] += 24


-class ImagefapImageExtractor(Extractor):
+class ImagefapImageExtractor(ImagefapExtractor):
    """Extractor for single images from imagefap.com"""
-    category = "imagefap"
    subcategory = "image"
-    directory_fmt = ["{category}", "{gallery_id} {title}"]
-    filename_fmt = "{category}_{gallery_id}_{name}.{extension}"
    pattern = [r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)"]
    test = [("http://www.imagefap.com/photo/1369341772/", {
        "url": "24cc4312e4a5084f39f1e35af5ba92e5f7c1ad3c",
@@ -93,7 +95,7 @@ class ImagefapImageExtractor(Extractor):
    })]

    def __init__(self, match):
-        Extractor.__init__(self)
+        ImagefapExtractor.__init__(self)
        self.image_id = match.group(1)

    def items(self):
@@ -132,9 +134,8 @@ class ImagefapImageExtractor(Extractor):
        return json_dict


-class ImagefapUserExtractor(Extractor):
+class ImagefapUserExtractor(ImagefapExtractor):
    """Extractor for all galleries from a user at imagefap.com"""
-    category = "imagefap"
    subcategory = "user"
    categorytransfer = True
    pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com/"
@@ -146,7 +147,7 @@ class ImagefapUserExtractor(Extractor):
    })]

    def __init__(self, match):
-        Extractor.__init__(self)
+        ImagefapExtractor.__init__(self)
        try:
            self.user_id = int(match.group(1))
            self.user = None
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ from urllib.parse import urljoin
 class ImagehostImageExtractor(Extractor):
    """Base class for single-image extractors for various imagehosts"""
    subcategory = "image"
+    archive_fmt = "{token}"
    https = False
    method = "post"
    params = "simple"
--- a/gallery_dl/extractor/imgbox.py
+++ b/gallery_dl/extractor/imgbox.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -16,6 +16,7 @@ import re
 class ImgboxExtractor(Extractor):
    """Base class for imgbox extractors"""
    category = "imgbox"
+    archive_fmt = "{image_key}"
    root = "https://imgbox.com"

    def items(self):
@@ -62,7 +63,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor):
    """Extractor for image galleries from imgbox.com"""
    subcategory = "gallery"
    directory_fmt = ["{category}", "{title} - {gallery_key}"]
-    filename_fmt = "{num:>03}-{filename}"
+    filename_fmt = "{num:>03}-{name}.{extension}"
    pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"]
    test = [
        ("https://imgbox.com/g/JaX5V5HX7g", {
--- a/gallery_dl/extractor/imgchili.py
+++ b/gallery_dl/extractor/imgchili.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -15,6 +15,7 @@ from .. import text
 class ImgchiliExtractor(Extractor):
    """Base class for imgchili extractors"""
    category = "imgchili"
+    archive_fmt = "{image_id}"
    root = "https://imgchili.net"

    def __init__(self, match):
--- a/gallery_dl/extractor/imgth.py
+++ b/gallery_dl/extractor/imgth.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ class ImgthGalleryExtractor(Extractor):
    subcategory = "gallery"
    directory_fmt = ["{category}", "{gallery_id} {title}"]
    filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+    archive_fmt = "{gallery_id}_{num}"
    pattern = [r"(?:https?://)?imgth\.com/gallery/(\d+)"]
    test = [("http://imgth.com/gallery/37/wallpaper-anime", {
        "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -16,6 +16,7 @@ import json
 class ImgurExtractor(Extractor):
    """Base class for imgur extractors"""
    category = "imgur"
+    archive_fmt = "{hash}"

    def __init__(self, match):
        Extractor.__init__(self)
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousExtractor):
    category = "khinsider"
    subcategory = "soundtrack"
    directory_fmt = ["{category}", "{album}"]
+    archive_fmt = "{album}_{name}"
    pattern = [r"(?:https?://)?downloads\.khinsider\.com/"
               r"game-soundtracks/album/([^/?&#]+)"]
    test = [(("https://downloads.khinsider.com/game-soundtracks/"
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ class LusciousAlbumExtractor(AsynchronousExtractor):
    subcategory = "album"
    directory_fmt = ["{category}", "{gallery_id} {title}"]
    filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+    archive_fmt = "{gallery_id}_{image_id}"
    pattern = [(r"(?:https?://)?(?:www\.|members\.)?luscious\.net/"
                r"(?:c/[^/?&#]+/)?(?:pictures/album|albums)/([^/?&#]+_(\d+))")]
    test = [
--- a/gallery_dl/extractor/mangareader.py
+++ b/gallery_dl/extractor/mangareader.py
@@ -59,6 +59,7 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):

 class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
    """Extractor for manga-chapters from mangareader.net"""
+    archive_fmt = "{manga}_{chapter}_{page}"
    pattern = [
        (r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"),
        (r"(?:https?://)?(?:www\.)?mangareader\.net"
--- a/gallery_dl/extractor/mangastream.py
+++ b/gallery_dl/extractor/mangastream.py
@@ -16,6 +16,7 @@ from urllib.parse import urljoin
 class MangastreamChapterExtractor(ChapterExtractor):
    """Extractor for manga-chapters from mangastream.com"""
    category = "mangastream"
+    archive_fmt = "{chapter_id}_{page}"
    pattern = [(r"(?:https?://)?(?:www\.)?(?:readms|mangastream)\.(?:com|net)/"
                r"r(?:ead)?/([^/]*/([^/]+)/(\d+))")]
    test = [("https://readms.net/r/onepunch_man/087/4874/1", None)]
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -19,6 +19,7 @@ class NhentaiGalleryExtractor(Extractor):
    subcategory = "gallery"
    directory_fmt = ["{category}", "{gallery_id} {title}"]
    filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+    archive_fmt = "{gallery_id}_{num}"
    pattern = [r"(?:https?://)?(?:www\.)?nhentai\.net/g/(\d+)"]
    test = [("http://nhentai.net/g/147850/", {
        "url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0",
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -18,6 +18,7 @@ class NijieExtractor(AsynchronousExtractor):
    category = "nijie"
    directory_fmt = ["{category}", "{artist_id}"]
    filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}"
+    archive_fmt = "{image_id}_{index}"
    cookiedomain = "nijie.info"
    popup_url = "https://nijie.info/view_popup.php?id="

--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -17,6 +17,7 @@ class PahealExtractor(SharedConfigExtractor):
    basecategory = "booru"
    category = "paheal"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
+    archive_fmt = "{id}"
    root = "http://rule34.paheal.net"

    def items(self):
--- a/gallery_dl/extractor/pawoo.py
+++ b/gallery_dl/extractor/pawoo.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -17,6 +17,7 @@ class PawooExtractor(Extractor):
    category = "pawoo"
    directory_fmt = ["{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
+    archive_fmt = "{media[id]}"

    def __init__(self):
        Extractor.__init__(self)
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -16,6 +16,7 @@ class PinterestExtractor(Extractor):
    """Base class for pinterest extractors"""
    category = "pinterest"
    filename_fmt = "{category}_{pin_id}.{extension}"
+    archive_fmt = "{pin_id}"

    def __init__(self):
        Extractor.__init__(self)
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -19,6 +19,7 @@ class PixivExtractor(Extractor):
    category = "pixiv"
    directory_fmt = ["{category}", "{user[id]} {user[account]}"]
    filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}"
+    archive_fmt = "{id}{num}"
    illust_url = "https://www.pixiv.net/member_illust.php?mode=medium"

    def __init__(self):
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -18,6 +18,7 @@ class ReadcomiconlineBase():
    category = "readcomiconline"
    directory_fmt = ["{category}", "{comic}", "{issue:>03}"]
    filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
+    archive_fmt = "{comic}_{issue}_{page}"
    root = "http://readcomiconline.to"
    useragent = "Wget/1.19.2 (linux-gnu)"

--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -118,6 +118,7 @@ class RedditImageExtractor(Extractor):
    """Extractor for reddit-hosted images"""
    category = "reddit"
    subcategory = "image"
+    archive_fmt = "{name}"
    pattern = [r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)"
               r"/[^/?&#]+(?:\?[^#]*)?"]
    test = [
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -20,6 +20,7 @@ class SankakuExtractor(SharedConfigExtractor):
    basecategory = "booru"
    category = "sankaku"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
+    archive_fmt = "{id}"
    cookienames = ("login", "pass_hash")
    cookiedomain = "chan.sankakucomplex.com"
    subdomain = "chan"
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -16,6 +16,7 @@ from ..cache import cache
 class SeigaExtractor(Extractor):
    """Base class for seiga extractors"""
    category = "seiga"
+    archive_fmt = "{image_id}"
    cookiedomain = ".nicovideo.jp"

    def __init__(self):
--- a/gallery_dl/extractor/senmanga.py
+++ b/gallery_dl/extractor/senmanga.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ class SenmangaChapterExtractor(Extractor):
    subcategory = "chapter"
    directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
    filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
+    archive_fmt = "{manga}_{chapter_string}_{page}"
    pattern = [r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"]
    test = [
        ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -18,6 +18,7 @@ class SlideshareExtractor(Extractor):
    subcategory = "presentation"
    directory_fmt = ["{category}", "{user}"]
    filename_fmt = "{presentation}-{num:>02}.{extension}"
+    archive_fmt = "{presentation}_{num}"
    pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net"
               r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)"]
    test = [
--- a/gallery_dl/extractor/spectrumnexus.py
+++ b/gallery_dl/extractor/spectrumnexus.py
@@ -38,7 +38,7 @@ class SpectrumnexusChapterExtractor(ChapterExtractor):
    category = "spectrumnexus"
    directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
    filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
-
+    archive_fmt = "{manga}_{chapter_string}_{page}"
    pattern = [r"(?:https?://)?view\.thespectrum\.net/series/"
               r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"]
    test = [(("http://view.thespectrum.net/series/"
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -17,7 +17,7 @@ import re
 def _original_image(url):
    match = re.match(
        r"https?://\d+\.media\.tumblr\.com"
-        r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+)_\d+\.([0-9a-z]+)",
+        r"((/[0-9a-f]+)?/tumblr_[^/?&#.]+_)\d+(\.[0-9a-z]+)",
        url)

    if not match:
@@ -26,8 +26,8 @@ def _original_image(url):
    path, key, ext = match.groups()

    return (
-        "".join((root, path, "_raw." if key else "_1280.", ext)),
-        "".join((root, path, "_500.", ext)),
+        "".join((root, path, "raw" if key else "1280", ext)),
+        "".join((root, path, "500", ext)),
        url,
    )

@@ -53,6 +53,7 @@ class TumblrExtractor(Extractor):
    category = "tumblr"
    directory_fmt = ["{category}", "{name}"]
    filename_fmt = "{category}_{blog_name}_{id}o{offset}.{extension}"
+    archive_fmt = "{id}_{offset}"

    def __init__(self, match):
        Extractor.__init__(self)
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ class TwitterTweetExtractor(Extractor):
    subcategory = "tweet"
    directory_fmt = ["{category}", "{user}"]
    filename_fmt = "{tweet_id}_{num}.{extension}"
+    archive_fmt = "{tweet_id}_{num}"
    pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/"
               r"(([^/]+)/status/(\d+))"]
    test = [
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@ class WarosuThreadExtractor(Extractor):
    subcategory = "thread"
    directory_fmt = ["{category}", "{board}", "{thread} - {title}"]
    filename_fmt = "{tim}-{filename}{ext}"
+    archive_fmt = "{board}_{thread}_{tim}"
    pattern = [r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"]
    test = [
        ("https://warosu.org/jp/thread/16656025", {
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -29,6 +29,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
    subcategory = "gallery"
    directory_fmt = ["{category}", "{user[name]}", "{title}"]
    filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+    archive_fmt = "{gallery_id}_{num}"
    pattern = [r"(?:https?://)?(?:www\.)?xvideos\.com"
               r"/profiles/([^/?&#]+)/photos/(\d+)"]
    test = [
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -10,6 +10,7 @@ import sys
 import time
 import json
 import hashlib
+import logging
 from . import extractor, downloader, config, util, output, exception
 from .extractor.message import Message

@@ -152,34 +153,57 @@ class DownloadJob(Job):

    def __init__(self, url, parent=None):
        Job.__init__(self, url, parent)
+        self.log = logging.getLogger("download")
        self.pathfmt = None
+        self.archive = None
        self.sleep = None
        self.downloaders = {}
        self.out = output.select()

-    def handle_url(self, url, keywords):
+    def handle_url(self, url, keywords, fallback=None):
        """Download the resource specified in 'url'"""
-        if self._prepare_download(keywords):
-            dlobj = self.get_downloader(url)
-            if not dlobj.download(url, self.pathfmt):
-                self._report_failure(dlobj)
+        # prepare download
+        self.pathfmt.set_keywords(keywords)
+
+        if self.pathfmt.exists(self.archive):
+            self.out.skip(self.pathfmt.path)
+            return
+
+        if self.sleep:
+            time.sleep(self.sleep)
+
+        # download from URL
+        if not self.get_downloader(url).download(url, self.pathfmt):
+
+            # use fallback URLs if available
+            for num, url in enumerate(fallback or (), 1):
+                self.log.info("Trying fallback URL #%d", num)
+                if self.get_downloader(url).download(url, self.pathfmt):
+                    break
+            else:
+                # download failed
+                self.log.error(
+                    "Failed to download %s", self.pathfmt.filename)
+                return
+
+        # download succeeded
+        if self.archive:
+            self.archive.add()

    def handle_urllist(self, urls, keywords):
        """Download the resource specified in 'url'"""
-        if self._prepare_download(keywords):
-            for num, url in enumerate(urls):
-                dlobj = self.get_downloader(url)
-                if num:
-                    dlobj.log.info("Trying fallback URL #%d", num)
-                if dlobj.download(url, self.pathfmt):
-                    return
-            self._report_failure(dlobj)
+        fallback = iter(urls)
+        url = next(fallback)
+        self.handle_url(url, keywords, fallback)

    def handle_directory(self, keywords):
        """Set and create the target directory for downloads"""
        if not self.pathfmt:
            self.pathfmt = util.PathFormat(self.extractor)
            self.sleep = self.extractor.config("sleep")
+            archive = self.extractor.config("archive")
+            if archive:
+                self.archive = util.DownloadArchive(self.extractor, archive)
        self.pathfmt.set_directory(keywords)

    def handle_queue(self, url, keywords):
@@ -201,18 +225,6 @@ class DownloadJob(Job):
            self.downloaders[scheme] = instance
        return instance

-    def _prepare_download(self, keywords):
-        self.pathfmt.set_keywords(keywords)
-        if self.pathfmt.exists():
-            self.out.skip(self.pathfmt.path)
-            return False
-        if self.sleep:
-            time.sleep(self.sleep)
-        return True
-
-    def _report_failure(self, dlobj):
-        dlobj.log.error("Failed to download %s", self.pathfmt.filename)
-

 class KeywordJob(Job):
    """Print available keywords"""
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -206,6 +206,12 @@ def build_parser():
    )

    selection = parser.add_argument_group("Selection Options")
+    selection.add_argument(
+        "--download-archive",
+        metavar="FILE", dest="archive", action=ConfigAction,
+        help=("Record all downloaded files in the archive file and "
+              "skip downloading any file already in it.")
+    )
    selection.add_argument(
        "--range",
        metavar="RANGE", dest="image_range",
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -19,6 +19,7 @@ import shutil
 import string
 import _string
 import hashlib
+import sqlite3
 import datetime
 import itertools
 import urllib.parse
@@ -373,22 +374,42 @@ class PathFormat():
        if os.altsep:
            self.basedirectory = self.basedirectory.replace(os.altsep, os.sep)

-        skipmode = extractor.config("skip", True)
-        if skipmode == "abort":
-            self.exists = self._exists_abort
-        elif skipmode == "exit":
-            self.exists = self._exists_exit
-        elif not skipmode:
-            self.exists = lambda: False
+        skip = extractor.config("skip", True)
+        if skip:
+            # callable to raise/invoke instead of skipping a file
+            if skip == "abort":
+                self._skipexc = exception.StopExtraction
+            elif skip == "exit":
+                self._skipexc = exit
+            else:
+                self._skipexc = None
+        else:
+            # skipping disabled: same signature as exists(), always False
+            self.exists = lambda archive=None: False

    def open(self, mode="wb"):
        """Open file and return a corresponding file object"""
        return open(self.partpath or self.realpath, mode)

-    def exists(self):
-        """Return True if 'path' is complete and refers to an existing path"""
-        if self.has_extension:
-            return os.path.exists(self.realpath)
+    def exists(self, archive=None):
+        """Return True if the current file should be skipped
+
+        That is the case if 'path' is complete and refers to an existing
+        path, or if 'archive' is given and has an entry for the current
+        keywords. With 'skip' set to "abort" or "exit", StopExtraction
+        is raised or exit() is called instead of returning True.
+        """
+        if (self.has_extension and os.path.exists(self.realpath) or
+                archive and archive.check(self.keywords)):
+            if self._skipexc:
+                raise self._skipexc()
+            if not self.has_extension:
+                # archive match without a known extension:
+                # use an empty one and drop a trailing dot from 'path'
+                self.set_extension("")
+                if self.path[-1] == ".":
+                    self.path = self.path[:-1]
+            return True
        return False

    def set_directory(self, keywords):
@@ -473,16 +483,6 @@ class PathFormat():
        shutil.copyfile(self.partpath, self.realpath)
        os.unlink(self.partpath)

-    def _exists_abort(self):
-        if self.has_extension and os.path.exists(self.realpath):
-            raise exception.StopExtraction()
-        return False
-
-    def _exists_exit(self):
-        if self.has_extension and os.path.exists(self.realpath):
-            exit()
-        return False
-
    @staticmethod
    def adjust_path(path):
        """Enable longer-than-260-character paths on windows"""
@@ -535,3 +535,46 @@ class OAuthSession():
            OAuthSession.quote(str(key)) + "=" + OAuthSession.quote(str(value))
            for key, value in sorted(params.items()) if value
        )
+
+
+class DownloadArchive():
+    """Record of downloaded files, stored in an SQLite3 database"""
+
+    def __init__(self, extractor, path):
+        """Open the archive database at 'path', creating it if necessary
+
+        Archive keys are built by formatting the extractor's category
+        followed by its 'archive_fmt' (or 'filename_fmt' as fallback)
+        with the keywords of a file.
+        """
+        con = sqlite3.connect(path)
+        # no implicit transactions: statements take effect immediately
+        con.isolation_level = None
+        self.cursor = con.cursor()
+        self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+                            "(entry PRIMARY KEY) WITHOUT ROWID")
+        self.keygen = (
+            extractor.category +
+            (extractor.archive_fmt or extractor.filename_fmt)
+        ).format_map
+        self._key = None
+
+    def check(self, kwdict):
+        """Return True if item described by 'kwdict' exists in archive"""
+        self._key = self.keygen(kwdict)
+        self.cursor.execute(
+            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (self._key,))
+        return self.cursor.fetchone() is not None
+
+    def add(self, kwdict=None):
+        """Add an item to the archive
+
+        Without 'kwdict', the last item used in 'check()' gets added;
+        nothing is stored if there is no such item.
+        """
+        if kwdict is not None:
+            self._key = self.keygen(kwdict)
+        if self._key is None:
+            return
+        self.cursor.execute(
+            "INSERT OR IGNORE INTO archive VALUES (?)", (self._key,))