Merge branch 'category'

This commit is contained in:
Mike Fährmann
2016-09-25 17:42:47 +02:00
48 changed files with 169 additions and 220 deletions

View File

@@ -17,7 +17,7 @@ class FourchanThreadExtractor(ChanExtractor):
pattern = [r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+)"]
test = [("https://boards.4chan.org/tg/thread/15396072/", {
"url": "39082ad166161966d7ba8e37f2173a824eb540f0",
"keyword": "9b610fd3674653728516c34ec65925a024cc0074",
"keyword": "38679a7c8054f535cba67cae13eef1ea7dbc8085",
"content": "3081ed85a5afaeb3f430f42540e7bb5eec1908cc",
})]
api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"

View File

@@ -17,7 +17,7 @@ class InfinitychanThreadExtractor(ChanExtractor):
pattern = [r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+)"]
test = [("https://8ch.net/tg/res/175887.html", {
"url": "cb03fdc650ad8e796fdab553fbd5489f468d3f45",
"keyword": "d9388d231db6a0ea3e710a6cf46dc53dbdbb2115",
"keyword": "c2a7f57422558dddaf3467b9a30018e847eb4fad",
"content": "9f51cdfee6942a18011996ca049baeb0a22f931b",
})]
api_url = "https://8ch.net/{board}/res/{thread}.json"

View File

@@ -22,7 +22,7 @@ class BatotoChapterExtractor(AsynchronousExtractor):
pattern = [r"(?:https?://)?(?:www\.)?bato\.to/reader#([0-9a-f]+)"]
test = [("http://bato.to/reader#459878c8fda07502", {
"url": "432d7958506ad913b0a9e42664a89e46a63e9296",
"keyword": "7a3e03c40c8b3c7137c4ebe723b1b9c95a303d81",
"keyword": "75a3a86d32aecfc21c44865b4043490757f73d77",
})]
url = "https://bato.to/"
reader_url = "https://bato.to/areader"
@@ -78,7 +78,6 @@ class BatotoChapterExtractor(AsynchronousExtractor):
manga, pos = extr(page, "document.title = '", " - ", pos)
match = re.match(r"(Vol.(\d+) )?Ch\.([^:]+)(: (.+))?", cinfo)
return {
"category": self.category,
"token": self.token,
"manga": text.unescape(manga),
"volume": match.group(2) or "",

View File

@@ -55,13 +55,10 @@ class BooruExtractor(Extractor):
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
# Override this method in derived classes
return {
"category": self.category,
}
return {}
def get_file_metadata(self, data):
"""Collect metadata for a downloadable file"""
data["category"] = self.category
return text.nameext_from_url(self.get_file_url(data), data)
def get_file_url(self, data):
@@ -114,10 +111,7 @@ class BooruTagExtractor(BooruExtractor):
self.params["tags"] = self.tags
def get_job_metadata(self):
return {
"category": self.category,
"tags": self.tags,
}
return {"tags": self.tags}
class BooruPoolExtractor(BooruExtractor):
@@ -131,10 +125,7 @@ class BooruPoolExtractor(BooruExtractor):
self.params["tags"] = "pool:" + self.pool
def get_job_metadata(self):
return {
"category": self.category,
"pool": self.pool,
}
return {"pool": self.pool}
class BooruPostExtractor(BooruExtractor):

View File

@@ -21,7 +21,6 @@ class ChanExtractor(Extractor):
def __init__(self, board, thread):
Extractor.__init__(self)
self.metadata = {
"category": self.category,
"board": board,
"thread": thread,
}

View File

@@ -21,7 +21,7 @@ class ChronosImageExtractor(Extractor):
url_base = "http://chronos.to/"
test = [("http://chronos.to/bdrmq7rw7v4y", {
"url": "7fcb3fe315c94283644d25ef47a644c2dc8da944",
"keyword": "9c364ef9bd962fe70eca49ef74c1c424486514f9",
"keyword": "04dbc71a1154728d01c931308184050d61c5da55",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
})]
@@ -30,10 +30,6 @@ class ChronosImageExtractor(Extractor):
self.token = match.group(1)
def items(self):
data = {
"category": self.category,
"token": self.token,
}
params = {
"op": "view",
"id": self.token,
@@ -44,7 +40,7 @@ class ChronosImageExtractor(Extractor):
data=params).text
url , pos = text.extract(page, '<br><img src="', '"')
filename, pos = text.extract(page, ' alt="', '"', pos)
text.nameext_from_url(filename, data)
data = text.nameext_from_url(filename, {"token": self.token})
yield Message.Version, 1
yield Message.Directory, data
yield Message.Url, url, data

View File

@@ -17,6 +17,6 @@ class CoreimgImageExtractor(chronos.ChronosImageExtractor):
url_base = "https://coreimg.net/"
test = [("http://coreimg.net/ykcl5al8uzvg", {
"url": "2b32596a2ea66b7cc784e20f3749f75f20998d78",
"keyword": "c81daac4ecc3e44796117cdea5eb6f3b852c2027",
"keyword": "8d71e5b820bc7177baee33ca529c91ae4521299f",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
})]

View File

@@ -21,7 +21,7 @@ class DeviantartUserExtractor(AsynchronousExtractor):
pattern = [r"(?:https?://)?([^\.]+)\.deviantart\.com(?:/gallery)?/?$"]
test = [("http://shimoda7.deviantart.com/gallery/", {
"url": "63bfa8efba199e27181943c9060f6770f91a8441",
"keyword": "c0343b41c28c87254a3f0b3241222e94f780701e",
"keyword": "741bbea4891a23335bb5d119c4a42aeb54702c50",
})]
def __init__(self, match):
@@ -57,10 +57,7 @@ class DeviantartUserExtractor(AsynchronousExtractor):
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
return {
"category": self.category,
"artist": self.artist,
}
return {"artist": self.artist}
def get_image_metadata(self, image):
"""Collect metadata for an image"""
@@ -108,7 +105,7 @@ class DeviantartImageExtractor(Extractor):
pattern = [r"(?:https?://)?[^\.]+\.deviantart\.com/art/.+-(\d+)"]
test = [("http://shimoda7.deviantart.com/art/For-the-sake-of-a-memory-10073852", {
"url": "71345ce3bef5b19bd2a56d7b96e6b5ddba747c2e",
"keyword": "f2dfde276a39990097935ace092811c56bc0bfec",
"keyword": "ccac27b8f740fc943afca9460608e02c6cbcdf96",
})]
def __init__(self, match):
@@ -127,7 +124,7 @@ class DeviantartImageExtractor(Extractor):
('description', '"og:description" content="', '"'),
(None , '<span class="tt-w">', ''),
('date' , 'title="', '"'),
), values={'category': self.category, "index": self.index})[0]
), values={"index": self.index})[0]
data["description"] = text.unescape(text.unescape(data["description"]))
data["artist"] = text.extract(data["url"], "//", ".")[0]
data["date"] = text.extract(data["date"], ", ", " in ", len(data["title"]))[0]

View File

@@ -22,7 +22,7 @@ class DoujinmodeChapterExtractor(Extractor):
r"(?:hentai/|yaoi/|western/)?mangas/([0-9a-f]{36})")]
test = [("http://doujinmode.net/mangas/967836c988a716e9efca06998b7838d09eb5", {
"url": "be5d48a9fd48f09cfcc5d4e51f24bf1100e75502",
"keyword": "710cc9599faf563b0cad836bbc7d85b288fcda3a",
"keyword": "fbccd0416f19080dc2e041917aeff721399adf13",
"content": "a041114e2a8af54d42a4a46a69cae4ebf2641cb1",
})]
url_base = "http://doujinmode.net/mangas/"
@@ -45,7 +45,6 @@ class DoujinmodeChapterExtractor(Extractor):
count, pos = text.extract(page, ' class="manga-count">', '</span>')
title, pos = text.extract(page, '<h2>', ' Images List</h2>', pos)
return {
"category": self.category,
"gallery-id": self.gid,
"title": text.unescape(title),
"count": count,

View File

@@ -23,11 +23,11 @@ class DynastyscansChapterExtractor(Extractor):
test = [
("http://dynasty-scans.com/chapters/hitoribocchi_no_oo_seikatsu_ch33", {
"url": "63950fa1dfdef58ab842c1b9b854c5c1d650cfa0",
"keyword": "7a950a94e76cceb63559de0826cb2d5a1dcaa48a",
"keyword": "81bfda5b98b34ac2a7324bd9e2abad3df9cc7673",
}),
("http://dynasty-scans.com/chapters/new_game_the_spinoff_special_13", {
"url": "6b28c733481ac498da341e85a9eb155864491731",
"keyword": "56ed59442b69d45ee4042d6586b30a72f55c3e12",
"keyword": "93b75d0c0aaeb849c99f2225a4b97f466bc3ace9",
}),
]
url_base = "http://dynasty-scans.com/"
@@ -61,7 +61,6 @@ class DynastyscansChapterExtractor(Extractor):
info
)
return {
"category": self.category,
"manga": text.unescape(match.group(1)),
"chapter": match.group(2) or "",
"title": text.unescape(match.group(3) or ""),

View File

@@ -22,7 +22,7 @@ class ExhentaiGalleryExtractor(Extractor):
filename_fmt = "{gallery-id}_{num:>04}_{image-token}_{name}.{extension}"
pattern = [r"(?:https?://)?(?:g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]
test = [("https://exhentai.org/g/960460/4f0e369d82/", {
"keyword": "c1282ffbe5d452c62dec9dbde4ecb7037525cd64",
"keyword": "623f8c86c9fe38e964682dd4309b96922655b900",
"content": "493d759de534355c9f55f8e365565b62411de146",
})]
api_url = "https://exhentai.org/api.php"
@@ -71,7 +71,6 @@ class ExhentaiGalleryExtractor(Extractor):
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
data = {
"category" : self.category,
"gallery-id" : self.gid,
"gallery-token": self.token,
}

View File

@@ -46,7 +46,7 @@ class HbrowseChapterExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/(c\d+)"]
test = [("http://www.hbrowse.com/10363/c00000", {
"url": "634f4800858913f097bc3b62a8fedaf74b5254bd",
"keyword": "e6263b71f791000ad4bca58bc4d90f79e42e6be6",
"keyword": "c7dc22a10699dee5cf466406fecee6ffa2e6277e",
"content": "44578ebbe176c2c27434966aef22945787e2781e",
})]
url_base = "http://www.hbrowse.com"
@@ -68,7 +68,6 @@ class HbrowseChapterExtractor(Extractor):
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
data = {
"category": self.category,
'gallery-id': self.gid,
"chapter": int(self.chapter[1:]),
}

View File

@@ -54,7 +54,7 @@ class Hentai2readChapterExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?hentai2read\.com/([^/]+)/(\d+)"]
test = [("http://hentai2read.com/amazon_elixir/1/", {
"url": "fb5fc4d7cc194116960eaa648c7e045a6e6f0c11",
"keyword": "03435037539d57ca084c457b5ac4d48928487521",
"keyword": "c05d0d0bbe188926b15a43df1f8f65b8ac11c3fd",
})]
def __init__(self, match):
@@ -78,7 +78,6 @@ class Hentai2readChapterExtractor(Extractor):
title = text.extract(page, "<title>", "</title>")[0]
match = re.match(r"Reading (?:(.+) dj - )?(.+) Hentai - \d+: ", title)
return {
"category": self.category,
"gallery-id": images[0].split("/")[-3],
"chapter": self.chapter,
"count": len(images),

View File

@@ -20,7 +20,7 @@ class HentaiboxChapterExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?hentaibox\.net/[^/]+/(\d+)_\d+_([^/&]+)"]
test = [("http://www.hentaibox.net/hentai-manga/16_18_Original_Amazon-No-Hiyaku-Amazon-Elixir-Decensored", {
"url": "d1a50a9b289d284f178971e01cf312791888e057",
"keyword": "294eda384689d4f1178ec952560d0dedd3e38647",
"keyword": "b4b100f800b716e573e072f01b5d604d9b436b70",
})]
def __init__(self, match):
@@ -44,7 +44,7 @@ class HentaiboxChapterExtractor(Extractor):
("title" , 'content="Read or Download ', ' hentai manga from'),
("series" , ' the series ', ' with ' + self.count),
("language", ' translated pages to ', '.'),
), values={"category": self.category, "count": self.count})[0]
), values={"count": self.count})[0]
data["lang"] = iso639_1.language_to_code(data["language"])
return data

View File

@@ -23,7 +23,7 @@ class HentaifoundryUserExtractor(Extractor):
]
test = [("http://www.hentai-foundry.com/pictures/user/Orzy", {
"url": "236ac02c8f081fee44ad2c2571bf74615633b91e",
"keyword": "f5f1aa78ecbe390fb117a0b599f771cd47df86c6",
"keyword": "9f334f635b71c915b026cf20a65eee065237d452",
})]
url_base = "http://www.hentai-foundry.com/pictures/user/"
@@ -60,7 +60,6 @@ class HentaifoundryUserExtractor(Extractor):
token, pos = text.extract(page, 'hidden" value="', '"')
count, pos = text.extract(page, 'class="active" >Pictures (', ')', pos)
return {
"category": self.category,
"artist": self.artist,
"count": count,
}, token
@@ -115,7 +114,7 @@ class HentaifoundryImageExtractor(Extractor):
r"([^/]+)/(\d+)/[^/]+")]
test = [("http://www.hentai-foundry.com/pictures/user/Orzy/76940/Youmu-Konpaku", {
"url": "50c267b2b2983b98b18fd0d2acbec8ce5ba64c77",
"keyword": "8c9b7054b78fb4f52982c3f21f3ba2a9fcdd5428",
"keyword": "6cee38ac0817783feb6db9944da997bec13d0e19",
})]
def __init__(self, match):
@@ -136,7 +135,6 @@ class HentaifoundryImageExtractor(Extractor):
title, pos = text.extract(page, 'Pictures</a> &raquo; <span>', '<')
url , pos = text.extract(page, '//pictures.hentai-foundry.com', '"', pos)
data = {
"category": self.category,
"artist": self.artist,
"index": self.index,
"title": text.unescape(title),

View File

@@ -21,7 +21,7 @@ class HitomiGalleryExtractor(Extractor):
pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html"]
test = [("https://hitomi.la/galleries/867789.html", {
"url": "23fd59894c3db65aec826aa5efb85f96d2384883",
"keyword": "80395a06b6ba24842c15121d142830bb467ae68b",
"keyword": "03a64d67584afd7b8ad96ecb47acae08ea14d90f",
})]
def __init__(self, match):
@@ -61,7 +61,6 @@ class HitomiGalleryExtractor(Extractor):
series, pos = text.extract(page, '.html">', '</a>', pos)
lang = lang.capitalize()
return {
"category": self.category,
"gallery-id": self.gid,
"title": " ".join(title.split()),
"artist": string.capwords(artist),

View File

@@ -20,7 +20,7 @@ class ImagebamGalleryExtractor(AsynchronousExtractor):
pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+).*"]
test = [("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", {
"url": "d7a4483b6d5ebba81950a349aad58ae034c60eda",
"keyword": "9f54ab808d77f2517444411dfbf8686189c20b43",
"keyword": "e4a9395dbd06d4af3172a6a61c90601bc47ee18c",
"content": "596e6bfa157f2c7169805d50075c2986549973a8",
})]
url_base = "http://www.imagebam.com"
@@ -44,17 +44,12 @@ class ImagebamGalleryExtractor(AsynchronousExtractor):
"""Collect metadata for extractor-job"""
url = self.url_base + "/gallery/" + self.gkey
page = self.request(url, encoding="utf-8").text
data = {
"category": self.category,
"gallery-key": self.gkey,
}
data, _ = text.extract_all(page, (
return text.extract_all(page, (
(None , "<img src='/img/icons/photos.png'", ""),
("title" , "'> ", " <"),
("count" , "'>", " images"),
("first-url", "<a href='http://www.imagebam.com", "'"),
), values=data)
return data
), values={"gallery-key": self.gkey})[0]
def get_images(self, url):
"""Yield all image-urls and -ids for a gallery"""
@@ -71,7 +66,6 @@ class ImagebamGalleryExtractor(AsynchronousExtractor):
yield image_url, image_id
class ImagebamImageExtractor(Extractor):
"""Extractor for single images from imagebam.com"""
category = "imagebam"
@@ -81,7 +75,7 @@ class ImagebamImageExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/image/([0-9a-f]{15})"]
test = [("http://www.imagebam.com/image/94d56c502511890", {
"url": "94add9417c685d113a91bcdda4916e9538b5f8a9",
"keyword": "046f049533126bb0ee7f81419f59371c6903df9e",
"keyword": "fd99b2f45b761d0b639af46740aacd976f5dfcc7",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
})]
@@ -90,10 +84,9 @@ class ImagebamImageExtractor(Extractor):
self.token = match.group(1)
def items(self):
data = {"category": self.category, "token": self.token}
page = self.request("http://www.imagebam.com/image/" + self.token).text
url = text.extract(page, 'property="og:image" content="', '"')[0]
text.nameext_from_url(url, data)
url = text.extract(page, 'property="og:image" content="', '"')[0]
data = text.nameext_from_url(url, {"token": self.token})
yield Message.Version, 1
yield Message.Directory, data
yield Message.Url, url, data

View File

@@ -22,7 +22,7 @@ class ImagefapGalleryExtractor(Extractor):
r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)")]
test = [("http://www.imagefap.com/gallery/6318447", {
"url": "f63e6876df83a40e1a98dad70e46952dd9edb7a7",
"keyword": "eb26d0e62defc1a547b6b854fe0de693055d9f20",
"keyword": "715f99ad154c4cf608afc7cd77dd1e896030646a",
"content": "38e50699db9518ae68648c45ecdd6be614efc324",
})]
@@ -48,7 +48,7 @@ class ImagefapGalleryExtractor(Extractor):
("title" , '<title>Porn pics of ', ' (Page 1)</title>'),
("uploader", '>Uploaded by ', '</font>'),
("count" , ' 1 of ', ' pics"'),
), values={"category": self.category, "gallery-id": self.gid})
), values={"gallery-id": self.gid})
self.image_id = text.extract(page, 'id="img_ed_', '"', pos)[0]
data["title"] = text.unescape(data["title"])
return data
@@ -82,7 +82,7 @@ class ImagefapImageExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)"]
test = [("http://www.imagefap.com/photo/1616331218/", {
"url": "8a05c0ccdcf84e63c962803bc41d247628c549ea",
"keyword": "401ded07ae0b3a8f718e553e506898b34cd92020",
"keyword": "c9880c6731b3fdc6d98d25dbff56f4342c11683e",
"content": "964b8c62c9d5c2a039a2fccf1b1e10aaf7a18a96",
})]
@@ -101,7 +101,6 @@ class ImagefapImageExtractor(Extractor):
"""Collect metadata for extractor-job"""
parts = info["contentUrl"].rsplit("/", 3)
return text.nameext_from_url(parts[3], {
"category": self.category,
"title": text.unescape(info["name"]),
"section": info["section"],
"uploader": info["author"],

View File

@@ -20,7 +20,7 @@ class ImagetwistImageExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?imagetwist\.com/([a-z0-9]{12})"]
test = [("http://imagetwist.com/4e46hv31tu0q/test.jpg", {
"url": "6b3fc0bd1105b698d2d5844658ca674d66b1e2e7",
"keyword": "d599a540ed233bb7b66e4abec30affbad2e44af1",
"keyword": "825d9d1901829da054b6ef9c034229af85e495e2",
"content": "96b1fd099b06faad5879fce23a7e4eb8290d8810",
})]
@@ -34,7 +34,6 @@ class ImagetwistImageExtractor(Extractor):
filename, pos = text.extract(page, ' alt="', '"', pos)
userid , pos = text.extract(url , '/', '/', 29)
data = {
"category": self.category,
"token": self.token,
"user": userid,
}

View File

@@ -21,7 +21,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor):
pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"]
test = [("http://imgbox.com/g/JaX5V5HX7g", {
"url": "c7c3466dde31d4308833816961104c7d1100368d",
"keyword": "23deb783d3afee090f61472b495e797c8f262b93",
"keyword": "cebd7f6868cf84ff492341c936cb6dbe5cde4682",
"content": "d20307dc8511ac24d688859c55abf2e2cc2dd3cc",
})]
url_base = "http://imgbox.com"
@@ -47,7 +47,6 @@ class ImgboxGalleryExtractor(AsynchronousExtractor):
title = text.extract(page, "<h1>", "</h1>")[0]
parts = title.rsplit(" - ", maxsplit=1)
return {
"category": self.category,
"gallery-key": self.key,
"title": text.unescape(parts[0]),
"count": parts[1][:-7],
@@ -79,7 +78,7 @@ class ImgboxImageExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})"]
test = [("http://imgbox.com/qHhw7lpG", {
"url": "d96990ea12223895287d139695077b70dfa0abe4",
"keyword": "c5e87be93fec3122151edf85b6424d1871279590",
"keyword": "ff0524dba869a4b3292d7d4f72f5da4024b4f002",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
})]
@@ -91,8 +90,7 @@ class ImgboxImageExtractor(Extractor):
page = self.request("http://imgbox.com/" + self.key).text
url , pos = text.extract(page, 'src="http://i.', '"')
filename, pos = text.extract(page, ' title="', '"', pos)
data = {"category": self.category, "image-key": self.key}
text.nameext_from_url(filename, data)
data = text.nameext_from_url(filename, {"image-key": self.key})
yield Message.Version, 1
yield Message.Directory, data
yield Message.Url, "http://i." + url, data

View File

@@ -21,7 +21,7 @@ class ImgcandyImageExtractor(Extractor):
r"(?:_(.+))?\.html")]
test = [("http://imgcandy.net/img-57d02527efee8_test-テスト.png.html", {
"url": "bc3c9207b10dbfe8e65ccef5b9e3194a7427b4fa",
"keyword": "381e036374742a091cac7dd7a3eca90ee725afa3",
"keyword": "1ed1587ef38a6b26ce28b35857a78417239d197a",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
})]
@@ -30,12 +30,11 @@ class ImgcandyImageExtractor(Extractor):
self.token, self.filename = match.groups()
def items(self):
data = {"category": self.category, "token": self.token}
params = {"imgContinue": "Continue+to+image+...+"}
page = self.request("http://imgcandy.net/img-" + self.token + ".html",
method="post", data=params).text
url = text.extract(page, "<img class='centred' src='", "'")[0]
text.nameext_from_url(self.filename or url, data)
data = text.nameext_from_url(self.filename or url, {"token": self.token})
yield Message.Version, 1
yield Message.Directory, data
yield Message.Url, url, data

View File

@@ -61,8 +61,6 @@ class ImgchiliImageExtractor(ImgchiliExtractor):
parts = name2.split("in the gallery ")
name = parts[0] if not parts[0].endswith("...") else name1
return text.nameext_from_url(name, {
"category": self.category,
"subcategory": self.subcategory,
"image-id": self.match.group(1),
"title": text.unescape(parts[-1]) if len(parts) > 1 else ""
})
@@ -86,8 +84,6 @@ class ImgchiliAlbumExtractor(ImgchiliExtractor):
def get_job_metadata(self, page):
title = text.extract(page, "<h1>", "</h1>")[0]
return {
"category": self.category,
"subcategory": self.subcategory,
"title": text.unescape(title),
"key": self.match.group(1),
}

View File

@@ -20,7 +20,7 @@ class ImgthGalleryExtractor(Extractor):
pattern = [r"(?:https?://)?imgth\.com/gallery/(\d+)"]
test = [("http://imgth.com/gallery/37/wallpaper-anime", {
"url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",
"keyword": "1b15726d53bc2c08d845fa60ce538396380688df",
"keyword": "3f268fcc18d49ac3799a8f25cc08053e90891955",
})]
def __init__(self, match):
@@ -61,4 +61,4 @@ class ImgthGalleryExtractor(Extractor):
("date" , 'created on ', ' by <'),
(None , 'href="/users/', ''),
("user" , '>', '<'),
), values={"category": self.category, "gallery-id": self.gid})[0]
), values={"gallery-id": self.gid})[0]

View File

@@ -20,7 +20,7 @@ class ImgtrexImageExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?imgtrex\.com/([^/]+)"]
test = [("http://imgtrex.com/im0ypxq0rke4/test-テスト-&<a>.png", {
"url": "c000618bddda42bd599a590b7972c7396d19d8fe",
"keyword": "4d766eae04aa5457bca4992290aa28b76239d287",
"keyword": "58905795a9cd3f17d5ff024fc4d63645795ba23c",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
})]
@@ -29,11 +29,10 @@ class ImgtrexImageExtractor(Extractor):
self.token = match.group(1)
def items(self):
data = {"category": self.category, "token": self.token}
page = self.request("http://imgtrex.com/" + self.token).text
filename, pos = text.extract(page, '<title>ImgTrex: ', '</title>')
url , pos = text.extract(page, '<br>\n<img src="', '"', pos)
text.nameext_from_url(filename, data)
data = text.nameext_from_url(filename, {"token": self.token})
yield Message.Version, 1
yield Message.Directory, data
yield Message.Url, url, data

View File

@@ -22,7 +22,7 @@ class ImgurAlbumExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?imgur\.com/(?:a|gallery)/([^/?&#]+)"]
test = [("https://imgur.com/a/TcBmP", {
"url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563",
"keyword": "5c96eee4df5938ed37f1f95f5c4ef64444bddeb4",
"keyword": "c76bbf86f8f114cdaadab396c0ea4acf47aa44eb",
})]
def __init__(self, match):
@@ -43,16 +43,12 @@ class ImgurAlbumExtractor(Extractor):
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
page = self.request("https://imgur.com/a/" + self.album).text
data = {
"category": self.category,
"album-key": self.album,
}
text.extract_all(page, (
data = text.extract_all(page, (
('title', '<meta property="og:title" content="', '"'),
('count', '"num_images":"', '"'),
('date' , '"datetime":"', ' '),
('time' , '', '"'),
), values=data)
), values={"album-key": self.album})[0]
data["title"] = text.unescape(data["title"])
return data

View File

@@ -21,7 +21,7 @@ class ImgytImageExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?img\.yt/img-([a-z0-9]+)\.html"]
test = [("http://img.yt/img-57a2050547b97.html", {
"url": "6801fac1ff8335bd27a1665ad27ad64cace2cd84",
"keyword": "a20aa2215a4a6d5f4605d6370a8d605b525fc4bc",
"keyword": "7548cc9915f90f5d7ffbafa079085457ae34562c",
"content": "54592f2635674c25677c6872db3709d343cdf92f",
})]
@@ -30,12 +30,12 @@ class ImgytImageExtractor(Extractor):
self.token = match.group(1)
def items(self):
data = {"category": self.category, "token": self.token}
params = {"imgContinue": "Continue+to+image+...+"}
page = self.request("https://img.yt/img-" + self.token + ".html",
method="post", data=params).text
url , pos = text.extract(page, "<img class='centred' src='", "'")
filename, pos = text.extract(page, " alt='", "'", pos)
data = {"token": self.token}
text.nameext_from_url(filename + splitext(url)[1], data)
if url.startswith("http:"):
url = "https:" + url[5:]

View File

@@ -20,7 +20,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousExtractor):
pattern = [r"(?:https?://)?downloads\.khinsider\.com/game-soundtracks/album/(.+)"]
test = [("http://downloads.khinsider.com/game-soundtracks/album/horizon-riders-wii-", {
"url": "35ff4c8310884664408dc5560fda3b06157f7606",
"keyword": "dde50e1f5dbed5ee3f13df4e1bffc58bb9563f22",
"keyword": "d91cf3edee6713b536eaf3995743f0be7dc72f68",
})]
def __init__(self, match):
@@ -45,7 +45,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousExtractor):
("size" , "Total Filesize: <b>", "</b>"),
("date" , "Date added: <b>", "</b>"),
("type" , "Album type: <b>", "</b>"),
), values={"category": self.category})[0]
))[0]
def get_album_tracks(self, page):
"""Collect url and metadata for all tracks of a soundtrack"""

View File

@@ -53,11 +53,11 @@ class KissmangaChapterExtractor(KissmangaExtractor):
test = [
("http://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", {
"url": "4136bcd1c6cecbca8cc2bc965d54f33ef0a97cc0",
"keyword": "892c3e4df03a575a282a5695add986a49623d746",
"keyword": "ab332093a4f2e473a468235bfd624cbe3b19fd7f",
}),
("http://kissmanga.com/Manga/Urban-Tales/a?id=256717", {
"url": "de074848f6c1245204bb9214c12bcc3ecfd65019",
"keyword": "0a98952984941cc2a11892b1cd7b237ffb20adaa",
"keyword": "013aad80e578c6ccd2e1fe47cdc27c12a64f6db2",
})
]
@@ -81,7 +81,6 @@ class KissmangaChapterExtractor(KissmangaExtractor):
r"(?:Vol.0*(\d+) )?(?:Ch.)?0*(\d+)(?:\.0*(\d+))?(?:: (.+))?", cinfo)
chminor = match.group(3)
return {
"category": self.category,
"manga": manga,
"volume": match.group(1) or "",
"chapter": match.group(2),

View File

@@ -21,8 +21,8 @@ class LusciousAlbumExtractor(Extractor):
pattern = [(r"(?:https?://)?(?:www\.)?luscious\.net/c/([^/]+)/"
r"(?:pictures/album|albums)/([^/\d]+(\d+))")]
test = [("https://luscious.net/c/incest_manga/albums/amazon-no-hiyaku-amazon-elixir-english-decensored_261127/view/", {
"url": "319a70261de12620d123add9b519d15b8515b503",
"keyword": "60cc15db2619b8aee47c1527b6326be5a54f5c2f",
"url": "12e1fde5ef3c0d41973a85fb27a602eb922c60aa",
"keyword": "e10c7c070ad730e305024fb37cc70af6b05378dd",
})]
def __init__(self, match):
@@ -50,7 +50,7 @@ class LusciousAlbumExtractor(Extractor):
(None , '<p>Language:', ''),
("language", '\n ', ' '),
("artist" , 'rtist: ', '\n'),
), values={"category": self.category, "gallery-id": self.gid})[0]
), values={"gallery-id": self.gid})[0]
data["lang"] = iso639_1.language_to_code(data["language"])
return data

View File

@@ -49,7 +49,7 @@ class MangahereChapterExtractor(AsynchronousExtractor):
r"([^/]+(?:/v0*(\d+))?/c0*(\d+)(\.\d+)?)")]
test = [("http://www.mangahere.co/manga/dongguo_xiaojie/c003.2/", {
"url": "68efaeed3bc6abb0a0b6f75a5c649c17979e31f1",
"keyword": "f342e3df9fa39eb10cf7ba5ef3300df6ad77f332",
"keyword": "d3fe470e934a9f02ed00d4391b1743970eae82fa",
})]
url_fmt = "http://www.mangahere.co/manga/{}/{}.html"
@@ -76,7 +76,6 @@ class MangahereChapterExtractor(AsynchronousExtractor):
count, pos = text.extract(page, '>', '<', pos-30)
manga = re.match(r"(.+) \d+(\.\d+)? - Read .+ Chapter \d+(\.\d+)? Online", manga).group(1)
return {
"category": self.category,
"manga": text.unescape(manga),
# "title": TODO,
"volume": self.volume or "",

View File

@@ -55,7 +55,7 @@ class MangamintChapterExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?mangamint\.com/([^\?]+-(\d+))"]
test = [("http://www.mangamint.com/mushishi-1", {
"url": "337f46c4dab50f544e9196ced723ac8f70400dd0",
"keyword": "ca4ba6fa84367fd7c345879a17ebaad39b589da5",
"keyword": "de9ea839d231cb9f1590a2a93ca9ab2f8743b39d",
})]
def __init__(self, match):
@@ -80,7 +80,6 @@ class MangamintChapterExtractor(Extractor):
chid , pos = text.extract(page, r'"identifier":"node\/', '"', pos)
match = re.match(r"(.+) (\d+)(\.\d+)?$", manga)
return {
"category": self.category,
"manga": match.group(1),
"chapter": match.group(2),
"chapter-minor": match.group(3) or "",

View File

@@ -33,5 +33,5 @@ class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor):
]
test = [("http://www.mangapanda.com/red-storm/2", {
"url": "4bf4ddf6c50105ec8a37675495ab80c46608275d",
"keyword": "dcb8d655e3f461738c821819bbb8d017bd916713",
"keyword": "89c712f7ed255ec9c1d8e84dcb5a160b6cb4498c",
})]

View File

@@ -51,11 +51,11 @@ class MangaparkChapterExtractor(Extractor):
test = [
("http://mangapark.me/manga/ad-astra-per-aspera-hata-kenjirou/s1/c1.2/1", {
"url": "25d998a70df1fa559afc189ebd17df300b54dc28",
"keyword": "40d60961d7aaf24454d2ab23fbc83f4c55cd4174",
"keyword": "aa0dfbd21a5174b1497bce98182324e5120dd4ff",
}),
("http://mangapark.me/manga/gekkan-shoujo-nozaki-kun/s2/c70/e2/1", {
"url": "8534c8286a18c4db47606f84a4df9f1a42bab291",
"keyword": "f96962442cdd5bc957603831c695159d974b7b93",
"keyword": "df83f2ccde8dd58d6b906a65ae1ecf3bec801567",
})
]
@@ -80,7 +80,6 @@ class MangaparkChapterExtractor(Extractor):
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
data = {
"category": self.category,
"version": self.version,
"volume": self.volume or "",
"chapter": self.chapter,

View File

@@ -50,7 +50,7 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
]
test = [("http://www.mangareader.net/karate-shoukoushi-kohinata-minoru/11", {
"url": "84ffaab4c027ef9022695c53163c3aeabd07ca58",
"keyword": "0df7db81a44ef642922aab798c303d60e2b6802d",
"keyword": "09b4ad57a082eb371dec027ccfc8ed1157c6eac6",
})]
def __init__(self, match):
@@ -74,7 +74,6 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
"""Collect metadata for extractor-job"""
page = self.request(self.url_base + self.url_title).text
data = {
"category": self.category,
"chapter": self.chapter,
"lang": "en",
"language": "English",

View File

@@ -46,7 +46,7 @@ class MangashareChapterExtractor(AsynchronousExtractor):
pattern = [r"(?:https?://)?read\.mangashare\.com/([^/]+/chapter-\d+)"]
test = [("http://read.mangashare.com/Gantz/chapter-331/page001.html", {
"url": "2980fb9548e809dea63d104bc514dcc33bdd9ef7",
"keyword": "4872a5645ab79cb9ecf363a5bf4cb9062fd61eef",
"keyword": "8afc1c2a3e64efa3d2b9ed2359885343f89bdfa9",
})]
url_fmt = "http://read.mangashare.com/{}/page{:>03}.html"
@@ -67,7 +67,6 @@ class MangashareChapterExtractor(AsynchronousExtractor):
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
data = {
"category": self.category,
"lang": "en",
"language": "English",
}

View File

@@ -46,7 +46,6 @@ class MangastreamChapterExtractor(AsynchronousExtractor):
title, pos = text.extract(page, ' - ', '<', pos)
count, pos = text.extract(page, 'Last Page (', ')', pos)
data = {
"category": self.category,
"manga": manga,
"chapter": text.unquote(self.chapter),
"chapter-id": self.ch_id,

View File

@@ -21,7 +21,7 @@ class NhentaiGalleryExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?nhentai\.net/g/(\d+)"]
test = [("http://nhentai.net/g/147850/", {
"url": "199ddd07dded0f69282e09a372710698ea21ab8e",
"keyword": "e00678567c8335289ffcbb2e6980b28d332ee6a7",
"keyword": "c7e37dfe80ca5eee69210c690a1340ea78a932a4",
})]
def __init__(self, match):
@@ -57,7 +57,6 @@ class NhentaiGalleryExtractor(Extractor):
title_en = ginfo["title"].get("english", "")
title_ja = ginfo["title"].get("japanese", "")
return {
"category": self.category,
"gallery-id": self.gid,
"upload-date": ginfo["upload_date"],
"media-id": ginfo["media_id"],

View File

@@ -40,10 +40,7 @@ class NijieExtractor(AsynchronousExtractor):
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
return {
"category": self.category,
"artist-id": self.artist_id,
}
return {"artist-id": self.artist_id}
def get_image_ids(self):
"""Collect all image-ids for a specific artist"""
@@ -83,7 +80,7 @@ class NijieUserExtractor(NijieExtractor):
r"members(?:_illust)?\.php\?id=(\d+)")]
test = [("https://nijie.info/members_illust.php?id=44", {
"url": "585d821df4716b1098660a0be426d01db4b65f2a",
"keyword": "30c981b9d7351ec275b9840d8bc2b4ef3da8c4b4",
"keyword": "7a2dbf8fc0dfdb2af208ecdb8ec7f3186bdc31ab",
})]
def __init__(self, match):
@@ -105,7 +102,7 @@ class NijieImageExtractor(NijieExtractor):
pattern = [r"(?:https?://)?(?:www\.)?nijie\.info/view\.php\?id=(\d+)"]
test = [("https://nijie.info/view.php?id=70720", {
"url": "a10d4995645b5f260821e32c60a35f73546c2699",
"keyword": "1c0b1a2e447d8e1cd4f93c21f71d7fe7de0eeed3",
"keyword": "e454c2bad9b636b90d569881bf4fe8438506e0d2",
"content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6",
})]

View File

@@ -26,8 +26,6 @@ class PinterestExtractor(Extractor):
img = pin["image"]["original"]
url = img["url"]
data = {
"category": self.category,
"subcategory": self.subcategory,
"pin-id": pin["id"],
"note": pin["note"],
"width": img["width"],
@@ -90,8 +88,6 @@ class PinterestBoardExtractor(PinterestExtractor):
def data_from_board(self, board):
"""Get metadata from a board-object"""
data = {
"category": self.category,
"subcategory": self.subcategory,
"user": self.user,
"board-id": board["id"],
"board": board["name"],

View File

@@ -92,7 +92,6 @@ class PixivUserExtractor(Extractor):
"""Prepare a work-dictionary with additional keywords"""
user = work["user"]
url = work["image_urls"]["large"]
work["category"] = self.category
work["artist-id"] = user["id"]
work["artist-name"] = user["name"]
work["artist-nick"] = user["account"]
@@ -130,7 +129,6 @@ class PixivUserExtractor(Extractor):
if not user:
user = self.api.user(self.artist_id)["response"][0]
return {
"category": self.category,
"artist-id": user["id"],
"artist-name": user["name"],
"artist-nick": user["account"],

View File

@@ -27,7 +27,7 @@ class PowermangaChapterExtractor(Extractor):
]
test = [("https://read.powermanga.org/read/one_piece/en/0/803/page/1", {
"url": "e6179c1565068f99180620281f86bdd25be166b4",
"keyword": "1c8593087f4a2e3343966a2900fc67be8e6401f1",
"keyword": "ab66c38e31f1b716ed360ee8c78fd973d7d8693a",
})]
def __init__(self, match):
@@ -63,7 +63,6 @@ class PowermangaChapterExtractor(Extractor):
json_data, pos = text.extract(page, 'var pages = ', ';', pos)
match = re.match(r"(\w+ (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter)
return {
"category": self.category,
"manga": text.unescape(manga),
"chapter": match.group(2) or match.group(1),
"chapter-minor": match.group(3) or "",

View File

@@ -20,7 +20,7 @@ class SankakuTagExtractor(AsynchronousExtractor):
pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/\?tags=([^&]+)"]
test = [("https://chan.sankakucomplex.com/?tags=bonocho", {
"url": "2561ca0d8482ead48f22a7abcd23919cd78344a1",
"keyword": "6282e9a2d5223d635d9be7515f59d87d4b9be732",
"keyword": "5e3a39fdc6698e63ed0054478ebd4ca632ce643e",
})]
url = "https://chan.sankakucomplex.com/"
@@ -42,10 +42,7 @@ class SankakuTagExtractor(AsynchronousExtractor):
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
return {
"category": self.category,
"tags": self.tags,
}
return {"tags": self.tags}
def get_images(self):
params = {

View File

@@ -23,7 +23,7 @@ class SeigaImageExtractor(Extractor):
(r"(?:https?://)?lohas\.nicoseiga\.jp/"
r"(?:priv|o)/[^/]+/\d+/(\d+)")]
test = [("http://seiga.nicovideo.jp/seiga/im5977527", {
"keyword": "e2ea59186c47beb71484ba35d550cf6511ac185a",
"keyword": "fd2628b573d15d1bbdefb219a99b993365b214ed",
"content": "d9202292012178374d57fb0126f6124387265297",
})]
@@ -44,10 +44,7 @@ class SeigaImageExtractor(Extractor):
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
return {
"category": self.category,
"image-id": self.image_id,
}
return {"image-id": self.image_id}
def get_image_url(self, image_id):
"""Get url for an image with id 'image_id'"""

View File

@@ -20,7 +20,7 @@ class SenmangaChapterExtractor(Extractor):
pattern = [r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"]
test = [("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
"url": "32d88382fcad66859d089cd9a61249f375492ec5",
"keyword": "9554ccc7bc32c358b2491c255e614ae908d7d593",
"keyword": "465905e0b69998656f9d59462a9560319941c58d",
"content": "a791dda85ac0d37e3b36d754560cbb65b8dab5b9",
})]
url_base = "http://raw.senmanga.com"
@@ -52,7 +52,6 @@ class SenmangaChapterExtractor(Extractor):
manga, pos = text.extract(title, '| Raw | ', ' | Chapter ')
chapter, pos = text.extract(title, '', ' | Page ', pos)
return {
"category": self.category,
"manga": text.unescape(manga.replace("-", " ")),
"chapter": chapter,
"count": count,

View File

@@ -50,7 +50,7 @@ class SpectrumnexusChapterExtractor(AsynchronousExtractor):
]
test = [("http://view.thespectrum.net/series/toriko.html?ch=Chapter+343&page=1", {
"url": "c0fc7dc594841217cc622a67edd79f06e9900333",
"keyword": "bde9c95a2d0feca0574c7248ed06f1684f86b2ac",
"keyword": "8499166b62db0c87e7109cc5f9aa837b4815dd9c",
})]
def __init__(self, match):
@@ -82,7 +82,6 @@ class SpectrumnexusChapterExtractor(AsynchronousExtractor):
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
data = {
"category": self.category,
"chapter": self.chapter or "",
"volume": self.volume or "",
"identifier": self.identifier.replace("+", " "),

View File

@@ -21,7 +21,7 @@ class TumblrUserExtractor(Extractor):
pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com(?:/page/\d+)?/?$"]
test = [("http://demo.tumblr.com/", {
"url": "d3d2bb185230e537314a0036814050634c730f74",
"keyword": "2ab87097ecafce595dd53d8469b2337ec541bcde",
"keyword": "8704a9bbb65b6e52dc1ccdf2c2449bd4abe3d389",
"content": "31495fdb9f84edbb7f67972746a1521456f649e2",
})]
@@ -47,7 +47,6 @@ class TumblrUserExtractor(Extractor):
def get_job_metadata(self, image_data):
"""Collect metadata for extractor-job"""
data = next(image_data)
data["category"] = self.category
data["user"] = self.user
del data["cname"]
del data["description"]
@@ -97,7 +96,7 @@ class TumblrPostExtractor(TumblrUserExtractor):
pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/post/(\d+)"]
test = [("http://demo.tumblr.com/post/459265350", {
"url": "d3d2bb185230e537314a0036814050634c730f74",
"keyword": "a6a0d99eddfba835e710a584d59b19df1ea5c1ab",
"keyword": "821236db342fb0d1bf8a177ca3108349168e6cd0",
})]
def __init__(self, match):
@@ -111,7 +110,7 @@ class TumblrTagExtractor(TumblrUserExtractor):
pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/tagged/(.+)"]
test = [("http://demo.tumblr.com/tagged/Times Square", {
"url": "d3d2bb185230e537314a0036814050634c730f74",
"keyword": "2ab87097ecafce595dd53d8469b2337ec541bcde",
"keyword": "e182759d3a26c9f72ccc8ddc22a382aad598d6dc",
})]
def __init__(self, match):

View File

@@ -20,7 +20,7 @@ class TurboimagehostImageExtractor(Extractor):
pattern = [r"(?:https?://)?(?:www\.)?turboimagehost\.com/p/((\d+)/[^/]+\.html)"]
test = [("http://www.turboimagehost.com/p/29690902/test--.png.html", {
"url": "c624dc7784de515342117a2678fee6ecf1032d79",
"keyword": "32b27364c3137786ffec8e90b8de453e489abf93",
"keyword": "8f8d105bae58fa33f1b06ca04949d38a1515641f",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
})]
@@ -30,15 +30,11 @@ class TurboimagehostImageExtractor(Extractor):
def items(self):
page = self.request("http://www.turboimagehost.com/p/" + self.part).text
data = {
"category": self.category,
"token": self.token,
}
text.extract_all(page, (
data = text.extract_all(page, (
('width' , 'var imWidth = ', ';'),
('height', 'var imHeight = ', ';'),
('url' , '<a href="http://www.turboimagehost.com"><img src="', '"'),
), values=data)
), values={"token": self.token})[0]
text.nameext_from_url(data["url"], data)
yield Message.Version, 1
yield Message.Directory, data

View File

@@ -23,7 +23,50 @@ class Job():
def run(self):
    """Execute or run the job

    Iterate over all messages produced by the extractor and pass each
    one to its handle_*() method. The keyword-dictionaries of Url and
    Directory messages get 'category' and 'subcategory' added (via
    update_kwdict()) before they are handled.

    Raises ValueError if the extractor announces a message-version
    other than 1.
    """
    for msg in self.extractor:
        kind = msg[0]
        if kind == Message.Url:
            # msg == (Message.Url, url, keywords)
            self.update_kwdict(msg[2])
            self.handle_url(msg[1], msg[2])

        elif kind == Message.Directory:
            # msg == (Message.Directory, keywords)
            self.update_kwdict(msg[1])
            self.handle_directory(msg[1])

        elif kind == Message.Queue:
            self.handle_queue(msg[1])

        elif kind == Message.Headers:
            self.handle_headers(msg[1])

        elif kind == Message.Cookies:
            self.handle_cookies(msg[1])

        elif kind == Message.Version:
            if msg[1] != 1:
                # Raising a plain string is a TypeError in Python 3 and
                # would hide this message; raise a real exception instead.
                raise ValueError(
                    "unsupported message-version ({}, {})".format(
                        self.extractor.category, msg[1]
                    )
                )
            # TODO: support for multiple message versions
def handle_url(self, url, keywords):
    """Handle Message.Url

    Called by run() with the url and the keyword-dictionary of the
    message. Does nothing here; subclasses override it.
    """
def handle_directory(self, keywords):
    """Handle Message.Directory

    Called by run() with the keyword-dictionary of the message.
    Does nothing here; subclasses override it.
    """
    return None
def handle_queue(self, url):
    """Handle Message.Queue

    Called by run() with the url of the message.
    Does nothing here; subclasses override it.
    """
    return None
def handle_headers(self, headers):
    """Handle Message.Headers

    Called by run() with the payload of the message.
    Does nothing here; subclasses override it.
    """
    return None
def handle_cookies(self, cookies):
    """Handle Message.Cookies

    Called by run() with the payload of the message.
    Does nothing here; subclasses override it.
    """
    return None
def update_kwdict(self, kwdict):
    """Store the extractor's 'category' and 'subcategory' in 'kwdict'

    The dictionary is modified in place; existing entries with these
    two keys are overwritten.
    """
    extractor = self.extractor
    kwdict["category"] = extractor.category
    kwdict["subcategory"] = extractor.subcategory
class DownloadJob(Job):
"""Download images into appropriate directory/filename locations"""
@@ -45,44 +88,17 @@ class DownloadJob(Job):
)
def run(self):
for msg in self.extractor:
if msg[0] == Message.Url:
self.download(msg)
Job.run(self)
if self.queue:
for url in self.queue:
try:
DownloadJob(url).run()
except exception.NoExtractorError:
pass
elif msg[0] == Message.Headers:
self.get_downloader("http:").set_headers(msg[1])
elif msg[0] == Message.Cookies:
self.get_downloader("http:").set_cookies(msg[1])
elif msg[0] == Message.Directory:
self.set_directory(msg)
elif msg[0] == Message.Queue:
self.enqueue(msg[1])
elif msg[0] == Message.Version:
if msg[1] != 1:
raise "unsupported message-version ({}, {})".format(
self.extractor.category, msg[1]
)
# TODO: support for multiple message versions
self.run_queue()
def run_queue(self):
"""Run all jobs stored in queue"""
if not self.queue:
return
for url in self.queue:
try:
DownloadJob(url).run()
except exception.NoExtractorError:
pass
def download(self, msg):
"""Download the resource specified in 'msg'"""
_, url, metadata = msg
filename = text.clean_path(self.filename_fmt.format(**metadata))
def handle_url(self, url, keywords):
"""Download the resource specified in 'url'"""
filename = text.clean_path(self.filename_fmt.format(**keywords))
path = os.path.join(self.directory, filename)
realpath = self.adjust_path(path)
if os.path.exists(realpath):
@@ -94,10 +110,10 @@ class DownloadJob(Job):
tries = dlinstance.download(url, file)
self.printer.success(path, tries)
def set_directory(self, msg):
def handle_directory(self, keywords):
"""Set and create the target directory for downloads"""
segments = [
text.clean_path(segment.format(**msg[1]).strip())
text.clean_path(segment.format(**keywords).strip())
for segment in self.directory_fmt
]
self.directory = os.path.join(
@@ -106,6 +122,19 @@ class DownloadJob(Job):
)
os.makedirs(self.adjust_path(self.directory), exist_ok=True)
def handle_queue(self, url):
    """Append 'url' to the work-queue, creating the queue on first use"""
    try:
        pending = self.queue
        pending.append(url)
    except AttributeError:
        # no usable queue yet (attribute missing or without append())
        self.queue = [url]
def handle_headers(self, headers):
    """Pass 'headers' to the downloader returned for "http:" """
    downloader = self.get_downloader("http:")
    downloader.set_headers(headers)
def handle_cookies(self, cookies):
    """Pass 'cookies' to the downloader returned for "http:" """
    downloader = self.get_downloader("http:")
    downloader.set_cookies(cookies)
def get_downloader(self, url):
"""Return, and possibly construct, a downloader suitable for 'url'"""
pos = url.find(":")
@@ -119,13 +148,6 @@ class DownloadJob(Job):
self.downloaders[scheme] = instance
return instance
def enqueue(self, url):
"""Add url to work-queue"""
try:
self.queue.append(url)
except AttributeError:
self.queue = [url]
@staticmethod
def get_base_directory():
"""Return the base-destination-directory for downloads"""
@@ -147,10 +169,12 @@ class KeywordJob(Job):
for msg in self.extractor:
if msg[0] == Message.Url:
print("Keywords for filenames:")
self.update_kwdict(msg[2])
self.print_keywords(msg[2])
return
elif msg[0] == Message.Directory:
print("Keywords for directory names:")
self.update_kwdict(msg[1])
self.print_keywords(msg[1])
@staticmethod
@@ -165,27 +189,27 @@ class KeywordJob(Job):
class UrlJob(Job):
"""Print download urls"""
def run(self):
for msg in self.extractor:
if msg[0] == Message.Url:
print(msg[1])
elif msg[0] == Message.Queue:
try:
UrlJob(msg[1]).run()
except exception.NoExtractorError:
pass
def handle_url(self, url, _):
print(url)
def handle_queue(self, url):
try:
UrlJob(url).run()
except exception.NoExtractorError:
pass
class HashJob(DownloadJob):
"""Generate SHA1 hashes for extractor results"""
class HashIO:
    """Write-only, file-like wrapper around a hash object"""

    def __init__(self, hashobj):
        self.hashobj = hashobj

    def write(self, content):
        """Feed 'content' into the wrapped hash object"""
        digest = self.hashobj
        digest.update(content)
def __init__(self, url, content=False):
@@ -197,25 +221,28 @@ class HashJob(DownloadJob):
if content:
self.fileobj = self.HashIO(self.hash_content)
def download(self, msg):
self.update_url(msg[1])
self.update_keyword(msg[2])
self.update_content(msg[1])
def handle_url(self, url, keywords):
    """Update the URL, keyword and content hashes for one Url message"""
    # order matches the original: url, then keywords, then content
    self.update_url(url)
    self.update_keyword(keywords)
    self.update_content(url)
def set_directory(self, msg):
self.update_keyword(msg[1])
def handle_directory(self, keywords):
    """Feed the directory keywords into the keyword hash"""
    self.update_keyword(keywords)
def enqueue(self, url):
def handle_queue(self, url):
    """Feed a queued url into the URL hash"""
    self.update_url(url)
def update_url(self, url):
    """Feed the encoded 'url' into the URL hash"""
    encoded = url.encode()
    self.hash_url.update(encoded)
def update_keyword(self, kwdict):
    """Feed 'kwdict' into the keyword hash

    The dictionary is serialized as JSON with sorted keys before it is
    encoded and hashed.
    """
    serialized = json.dumps(kwdict, sort_keys=True)
    self.hash_keyword.update(serialized.encode())
def update_content(self, url):
    """Feed the resource at 'url' into the content hash

    Does nothing unless 'self.content' is set; otherwise the downloader
    for 'url' writes into 'self.fileobj'.
    """
    if not self.content:
        return
    downloader = self.get_downloader(url)
    downloader.download(url, self.fileobj)