[postmill] implement suggestions

This commit is contained in:
blankie
2023-12-23 13:28:36 +11:00
parent fbe14a2745
commit 8a42ea736a
2 changed files with 21 additions and 22 deletions

View File

@@ -2735,7 +2735,7 @@ Description
extractor.[postmill].save-link-post-body extractor.[postmill].save-link-post-body
------------------------ ----------------------------------------
Type Type
``bool`` ``bool``
Default Default

View File

@@ -7,7 +7,6 @@
"""Extractors for Postmill instances""" """Extractors for Postmill instances"""
import re import re
import urllib.parse
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import text, exception from .. import text, exception
@@ -28,8 +27,8 @@ class PostmillExtractor(BaseExtractor):
def items(self): def items(self):
for post_url in self.post_urls(): for post_url in self.post_urls():
response = self.request(post_url) page = self.request(post_url).text
extr = text.extract_from(response.text) extr = text.extract_from(page)
title = text.unescape(extr( title = text.unescape(extr(
'<meta property="og:title" content="', '">')) '<meta property="og:title" content="', '">'))
@@ -52,7 +51,7 @@ class PostmillExtractor(BaseExtractor):
id = int(match.group(2)) id = int(match.group(2))
is_text_post = url.startswith("/") is_text_post = url.startswith("/")
is_image_post = self._search_image_tag(response.text) is not None is_image_post = self._search_image_tag(page) is not None
data = { data = {
"title": title, "title": title,
"date": date, "date": date,
@@ -60,7 +59,7 @@ class PostmillExtractor(BaseExtractor):
"forum": forum, "forum": forum,
"id": id, "id": id,
"flair": [text.unescape(i) for i in text.extract_iter( "flair": [text.unescape(i) for i in text.extract_iter(
response.text, '<span class="flair__label">', '</span>')], page, '<span class="flair__label">', '</span>')],
"instance": self.instance, "instance": self.instance,
} }
@@ -90,32 +89,32 @@ class PostmillSubmissionsExtractor(PostmillExtractor):
def __init__(self, match): def __init__(self, match):
PostmillExtractor.__init__(self, match) PostmillExtractor.__init__(self, match)
self.base = match.group(3) groups = match.groups()
self.sorting_path = match.group(4) or "" self.base = groups[-3]
self.sorting_path = groups[-2] or ""
self.query = {key: value for key, value in text.parse_query( self.query = {key: value for key, value in text.parse_query(
match.group(5) or "").items() if self.acceptable_query(key)} groups[-1]).items() if self.acceptable_query(key)}
def items(self): def items(self):
url = self.root + self.base + self.sorting_path url = self.root + self.base + self.sorting_path
if self.query:
url += "?" + urllib.parse.urlencode(self.query)
while url: while url:
response = self.request(url) response = self.request(url, params=self.query)
if response.history: if response.history:
redirect_url = response.url redirect_url = response.url
if redirect_url == self.root + "/login": if redirect_url == self.root + "/login":
raise exception.StopExtraction( raise exception.StopExtraction(
"HTTP redirect to login page (%s)", redirect_url) "HTTP redirect to login page (%s)", redirect_url)
page = response.text
for nav in text.extract_iter(response.text, for nav in text.extract_iter(page,
'<nav class="submission__nav">', '<nav class="submission__nav">',
'</nav>'): '</nav>'):
post_url = text.unescape(text.extr(nav, '<a href="', '"')) post_url = text.unescape(text.extr(nav, '<a href="', '"'))
yield Message.Queue, text.urljoin(url, post_url), \ yield Message.Queue, text.urljoin(url, post_url), \
{"_extractor": PostmillPostExtractor} {"_extractor": PostmillPostExtractor}
url = text.unescape(text.extr(response.text, url = text.unescape(text.extr(page,
'<link rel="next" href="', '">')) '<link rel="next" href="', '">'))
def acceptable_query(self, key): def acceptable_query(self, key):
@@ -131,14 +130,15 @@ BASE_PATTERN = PostmillExtractor.update({
r"\.onion)"), r"\.onion)"),
} }
}) })
SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" QUERY_RE = r"(?:\?([^#]+))?$"
QUERY_RE = r"(?:\?([^#]+))?" SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \
QUERY_RE
class PostmillPostExtractor(PostmillExtractor): class PostmillPostExtractor(PostmillExtractor):
"""Extractor for a single submission URL""" """Extractor for a single submission URL"""
subcategory = "post" subcategory = "post"
pattern = BASE_PATTERN + r"/f/([\w\d_]+)/(\d+)(?:/.+)?$" pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
example = "https://raddle.me/f/FORUM/123/TITLE" example = "https://raddle.me/f/FORUM/123/TITLE"
def __init__(self, match): def __init__(self, match):
@@ -170,29 +170,28 @@ class PostmillShortURLExtractor(PostmillExtractor):
class PostmillHomeExtractor(PostmillSubmissionsExtractor): class PostmillHomeExtractor(PostmillSubmissionsExtractor):
"""Extractor for the home page""" """Extractor for the home page"""
subcategory = "home" subcategory = "home"
pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE \ pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE
+ QUERY_RE + "$"
example = "https://raddle.me/" example = "https://raddle.me/"
class PostmillForumExtractor(PostmillSubmissionsExtractor): class PostmillForumExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum""" """Extractor for submissions on a forum"""
subcategory = "forum" subcategory = "forum"
pattern = BASE_PATTERN + r"(/f/[\w\d_]+)" + SORTING_RE + QUERY_RE + "$" pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE
example = "https://raddle.me/f/FORUM" example = "https://raddle.me/f/FORUM"
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor): class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions made by a user""" """Extractor for submissions made by a user"""
subcategory = "usersubmissions" subcategory = "usersubmissions"
pattern = BASE_PATTERN + r"(/user/[\w\d_]+/submissions)()" + QUERY_RE + "$" pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE
example = "https://raddle.me/user/USER/submissions" example = "https://raddle.me/user/USER/submissions"
class PostmillTagExtractor(PostmillSubmissionsExtractor): class PostmillTagExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum with a specific tag""" """Extractor for submissions on a forum with a specific tag"""
subcategory = "tag" subcategory = "tag"
pattern = BASE_PATTERN + r"(/tag/[\w\d_]+)" + SORTING_RE + QUERY_RE + "$" pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE
example = "https://raddle.me/tag/TAG" example = "https://raddle.me/tag/TAG"