[plurk] add a 'comments' option (#212)

This commit is contained in:
Mike Fährmann
2019-04-14 21:52:23 +02:00
parent 0b2ff406f6
commit 70be494161
2 changed files with 52 additions and 16 deletions

View File

@@ -627,6 +627,15 @@ Description Download Pixiv's Ugoira animations or ignore them.
=========== ===== =========== =====
extractor.plurk.comments
------------------------
=========== =====
Type ``bool``
Default ``false``
Description Also search Plurk comments for URLs.
=========== =====
extractor.reactor.wait-min & .wait-max extractor.reactor.wait-min & .wait-max
-------------------------------------- --------------------------------------
=========== ===== =========== =====

View File

@@ -9,7 +9,7 @@
"""Extractors for https://www.plurk.com/""" """Extractors for https://www.plurk.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, exception from .. import text, extractor, exception
import datetime import datetime
import json import json
import re import re
@@ -21,14 +21,40 @@ class PlurkExtractor(Extractor):
root = "https://www.plurk.com" root = "https://www.plurk.com"
def items(self): def items(self):
urls = self._urls_ex if self.config("comments", False) else self._urls
yield Message.Version, 1 yield Message.Version, 1
for plurk in self.plurks(): with extractor.blacklist(("plurk",)):
for url in text.extract_iter(plurk["content"], ' href="', '"'): for plurk in self.plurks():
yield Message.Queue, url, plurk for url in urls(plurk):
yield Message.Queue, url, plurk
def plurks(self): def plurks(self):
"""Return an iterable with all relevant 'plurk' objects""" """Return an iterable with all relevant 'plurk' objects"""
@staticmethod
def _urls(obj):
    """Return an iterator over the URLs in an object's 'content'

    Each result is whatever text.extract_iter() finds between
    ' href="' and the following '"' in obj["content"].
    """
    content = obj["content"]
    return text.extract_iter(content, ' href="', '"')
def _urls_ex(self, plurk):
    """Yield the URLs of a 'plurk', followed by those of its comments"""
    # The plurk's own URLs come first ...
    for url in self._urls(plurk):
        yield url

    # ... then the URLs of each comment, in the order the comments arrive.
    for response in self._comments(plurk):
        for url in self._urls(response):
            yield url
def _comments(self, plurk):
    """Return an iterable with a 'plurk's comments

    Sends POST requests to the 'Responses/get' endpoint for the plurk
    identified by plurk["id"], asking for "200" entries per request, and
    yields every entry of each result's 'responses' list.

    Further requests are made for as long as the result's 'has_newer'
    value is truthy, passing the ID of the last received response as
    'from_response_id'.
    """
    url = "https://www.plurk.com/Responses/get"
    data = {"plurk_id": plurk["id"], "count": "200"}

    while True:
        info = self.request(url, "POST", data=data).json()
        responses = info["responses"]
        yield from responses

        # An empty page provides no ID to continue from: stop here instead
        # of raising IndexError on 'responses[-1]' below.
        if not info["has_newer"] or not responses:
            return
        data["from_response_id"] = responses[-1]["id"]
@staticmethod @staticmethod
def _load(data): def _load(data):
if not data: if not data:
@@ -56,13 +82,8 @@ class PlurkTimelineExtractor(PlurkExtractor):
plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0]) plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0])
url = "https://www.plurk.com/TimeLine/getPlurks" url = "https://www.plurk.com/TimeLine/getPlurks"
headers = { data = {"user_id": user_id.strip()}
"Referer": self.root + "/", headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"}
"X-Requested-With": "XMLHttpRequest",
}
data = {
"user_id": user_id.strip(),
}
while plurks: while plurks:
yield from plurks yield from plurks
@@ -70,8 +91,7 @@ class PlurkTimelineExtractor(PlurkExtractor):
offset = datetime.datetime.strptime( offset = datetime.datetime.strptime(
plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z") plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z")
data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z") data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z")
response = self.request( response = self.request(url, "POST", headers=headers, data=data)
url, method="POST", headers=headers, data=data)
plurks = response.json()["plurks"] plurks = response.json()["plurks"]
@@ -79,9 +99,16 @@ class PlurkPostExtractor(PlurkExtractor):
"""Extractor for URLs from a Plurk post""" """Extractor for URLs from a Plurk post"""
subcategory = "post" subcategory = "post"
pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)" pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)"
test = ("https://www.plurk.com/p/i701j1", { test = (
"url": "2115f208564591b8748525c2807a84596aaaaa5f", ("https://www.plurk.com/p/i701j1", {
}) "url": "2115f208564591b8748525c2807a84596aaaaa5f",
"count": 3,
}),
("https://www.plurk.com/p/i701j1", {
"options": (("comments", True),),
"count": ">= 210",
}),
)
def __init__(self, match): def __init__(self, match):
PlurkExtractor.__init__(self, match) PlurkExtractor.__init__(self, match)