[instagram] add 'include' option (closes #1180)

Split the functionality of the old 'user' extractor into separate
'posts' and 'highlights' extractors, which respond to virtual URLs
('/<user>/posts' and '/<user>/highlights').
This commit is contained in:
Mike Fährmann
2020-12-20 23:20:32 +01:00
parent 78061658ea
commit bf629a2818
4 changed files with 95 additions and 83 deletions

View File

@@ -1045,15 +1045,22 @@ Description
for details) for details)
extractor.instagram.highlights extractor.instagram.include
------------------------------ ---------------------------
Type Type
``bool`` ``string`` or ``list`` of ``strings``
Default Default
``false`` ``"posts"``
Example
``"stories,highlights,posts"`` or ``["stories", "highlights", "posts"]``
Description Description
Include *Story Highlights* when downloading a user profile. A (comma-separated) list of subcategories to include
(requires authentication) when processing a user profile.
Possible values are
``"posts"``, ``"stories"``, ``"highlights"``, ``"channel"``.
You can use ``"all"`` instead of listing all values separately.
extractor.instagram.videos extractor.instagram.videos

View File

@@ -157,7 +157,7 @@ Turboimagehost https://www.turboimagehost.com/ individual Images
.. |furaffinity-C| replace:: Favorites, Galleries, Posts, Scraps, Search Results, User Profiles .. |furaffinity-C| replace:: Favorites, Galleries, Posts, Scraps, Search Results, User Profiles
.. |hentaifoundry-C| replace:: Favorites, individual Images, Pictures, Popular Images, Recent Images, Scraps, Stories, User Profiles .. |hentaifoundry-C| replace:: Favorites, individual Images, Pictures, Popular Images, Recent Images, Scraps, Stories, User Profiles
.. |imgur-C| replace:: Albums, Favorites, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles .. |imgur-C| replace:: Albums, Favorites, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles
.. |instagram-C| replace:: Channels, Posts, Saved Posts, Stories, Tag Searches, User Profiles .. |instagram-C| replace:: Channels, Highlights, Posts, Saved Posts, Stories, Tag Searches, User Profiles
.. |newgrounds-C| replace:: Art, Audio, Favorites, individual Images, Media Files, Movies, User Profiles .. |newgrounds-C| replace:: Art, Audio, Favorites, individual Images, Media Files, Movies, User Profiles
.. |nijie-C| replace:: Doujin, Favorites, Illustrations, individual Images, User Profiles .. |nijie-C| replace:: Doujin, Favorites, Illustrations, individual Images, User Profiles
.. |pixiv-C| replace:: Favorites, Follows, pixiv.me Links, Rankings, Search Results, User Profiles, individual Images .. |pixiv-C| replace:: Favorites, Follows, pixiv.me Links, Rankings, Search Results, User Profiles, individual Images

View File

@@ -12,11 +12,13 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import itertools
import json import json
import time import time
import re import re
# Common URL prefix: optional "http://" or "https://" scheme, optional
# "www." subdomain, then the literal host "instagram.com".
BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
# BASE_PATTERN followed by one path segment, captured as group 1.
# The negative lookahead rejects segments that are exactly "p", "tv",
# "reel", "explore" or "stories" (i.e. followed directly by "/").
# Extractors append their own suffix (e.g. r"/posts") to this pattern.
USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)"
class InstagramExtractor(Extractor): class InstagramExtractor(Extractor):
"""Base class for instagram extractors""" """Base class for instagram extractors"""
@@ -31,6 +33,7 @@ class InstagramExtractor(Extractor):
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.item = match.group(1)
self.www_claim = "0" self.www_claim = "0"
self.csrf_token = util.generate_csrf_token() self.csrf_token = util.generate_csrf_token()
self._find_tags = re.compile(r"#\w+").findall self._find_tags = re.compile(r"#\w+").findall
@@ -68,15 +71,18 @@ class InstagramExtractor(Extractor):
def request(self, url, **kwargs): def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs) response = Extractor.request(self, url, **kwargs)
if response.history and "/accounts/login/" in response.request.url: if response.history and "/accounts/login/" in response.request.url:
if self._cursor: if self._cursor:
self.log.info("Use '-o cursor=%s' to continue downloading " self.log.info("Use '-o cursor=%s' to continue downloading "
"from the current position", self._cursor) "from the current position", self._cursor)
raise exception.StopExtraction( raise exception.StopExtraction(
"Redirected to login page (%s)", response.request.url) "HTTP redirect to login page (%s)", response.request.url)
www_claim = response.headers.get("x-ig-set-www-claim") www_claim = response.headers.get("x-ig-set-www-claim")
if www_claim is not None: if www_claim is not None:
self.www_claim = www_claim self.www_claim = www_claim
return response return response
def _api_request(self, endpoint, params): def _api_request(self, endpoint, params):
@@ -340,9 +346,9 @@ class InstagramExtractor(Extractor):
if not info["has_next_page"]: if not info["has_next_page"]:
return return
elif not data["edges"] and "_virtual" not in info: elif not data["edges"] and "_virtual" not in info:
s = "" if self.user.endswith("s") else "s" s = "" if self.item.endswith("s") else "s"
raise exception.StopExtraction( raise exception.StopExtraction(
"%s'%s posts are private", self.user, s) "%s'%s posts are private", self.item, s)
variables["after"] = self._cursor = info["end_cursor"] variables["after"] = self._cursor = info["end_cursor"]
self.log.debug("Cursor: %s", self._cursor) self.log.debug("Cursor: %s", self._cursor)
@@ -351,80 +357,62 @@ class InstagramExtractor(Extractor):
class InstagramUserExtractor(InstagramExtractor): class InstagramUserExtractor(InstagramExtractor):
"""Extractor for ProfilePage""" """Extractor for an Instagram user profile"""
subcategory = "user" subcategory = "user"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" pattern = USER_PATTERN + r"/?(?:$|[?#])"
r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
r"([^/?#]+)/?(?:$|[?#])")
test = ( test = (
("https://www.instagram.com/instagram/", { ("https://www.instagram.com/instagram/"),
"range": "1-16",
"count": ">= 16",
}),
# ("https://www.instagram.com/instagram/", {
# "options": (("highlights", True),),
# "pattern": InstagramStoriesExtractor.pattern,
# "range": "1-2",
# "count": 2,
# }),
("https://www.instagram.com/instagram/?hl=en"), ("https://www.instagram.com/instagram/?hl=en"),
) )
def __init__(self, match): def items(self):
InstagramExtractor.__init__(self, match) if self.config("highlights"):
self.user = match.group(1) self.log.warning("'highlights' is deprecated, "
"use '\"include\": \"…,highlights\"' instead")
default = ("highlights", "posts")
else:
default = ("posts",)
base = "{}/{}/".format(self.root, self.item)
stories = "{}/stories/{}/".format(self.root, self.item)
return self._dispatch_extractors((
(InstagramStoriesExtractor , stories),
(InstagramHighlightsExtractor, base + "highlights/"),
(InstagramPostsExtractor , base + "posts/"),
(InstagramChannelExtractor , base + "channel/"),
), default)
class InstagramPostsExtractor(InstagramExtractor):
"""Extractor for ProfilePage posts"""
subcategory = "posts"
pattern = USER_PATTERN + r"/posts"
test = ("https://www.instagram.com/instagram/posts/", {
"range": "1-16",
"count": ">= 16",
})
def posts(self): def posts(self):
url = "{}/{}/".format(self.root, self.user) url = "{}/{}/".format(self.root, self.item)
user = self._extract_profile_page(url) user = self._extract_profile_page(url)
if user.get("highlight_reel_count") and self.config("highlights"):
query_hash = "d4d88dc1500312af6f937f7b804c68c3"
variables = {
"user_id": user["id"],
"include_chaining": False,
"include_reel": True,
"include_suggested_users": False,
"include_logged_out_extras": False,
"include_highlight_reels": True,
"include_live_status": True,
}
data = self._graphql_request(query_hash, variables)
highlights = [
{
"__typename": "GraphReel",
"id" : "highlight:" + edge["node"]["id"],
}
for edge in data["user"]["edge_highlight_reels"]["edges"]
]
else:
highlights = None
query_hash = "003056d32c2554def87228bc3fd9668a" query_hash = "003056d32c2554def87228bc3fd9668a"
variables = {"id": user["id"], "first": 50} variables = {"id": user["id"], "first": 50}
edge = self._get_edge_data(user, "edge_owner_to_timeline_media") edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
posts = self._pagination(query_hash, variables, edge) return self._pagination(query_hash, variables, edge)
return itertools.chain(highlights, posts) if highlights else posts
class InstagramChannelExtractor(InstagramExtractor): class InstagramChannelExtractor(InstagramExtractor):
"""Extractor for ProfilePage channel""" """Extractor for ProfilePage channel"""
subcategory = "channel" subcategory = "channel"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" pattern = USER_PATTERN + r"/channel"
r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
r"([^/?#]+)/channel")
test = ("https://www.instagram.com/instagram/channel/", { test = ("https://www.instagram.com/instagram/channel/", {
"range": "1-16", "range": "1-16",
"count": ">= 16", "count": ">= 16",
}) })
def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.user = match.group(1)
def posts(self): def posts(self):
url = "{}/{}/channel/".format(self.root, self.user) url = "{}/{}/channel/".format(self.root, self.item)
user = self._extract_profile_page(url) user = self._extract_profile_page(url)
query_hash = "bc78b344a68ed16dd5d7f264681c4c76" query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
@@ -436,17 +424,11 @@ class InstagramChannelExtractor(InstagramExtractor):
class InstagramSavedExtractor(InstagramExtractor): class InstagramSavedExtractor(InstagramExtractor):
"""Extractor for ProfilePage saved media""" """Extractor for ProfilePage saved media"""
subcategory = "saved" subcategory = "saved"
# USER_PATTERN already ends with the capture group for the user name
# (group 1, which InstagramExtractor.__init__ stores as self.item).
# Appending a second "([^/?#]+)" would force the name to be split across
# two groups: group 1 loses its last character ("instagram" -> "instagra")
# and one-character names cannot match at all. Only the "/saved" suffix
# belongs here, like the "/posts", "/channel" and "/highlights" patterns.
pattern = USER_PATTERN + r"/saved"
test = ("https://www.instagram.com/instagram/saved/",) test = ("https://www.instagram.com/instagram/saved/",)
def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.user = match.group(1)
def posts(self): def posts(self):
url = "{}/{}/saved/".format(self.root, self.user) url = "{}/{}/saved/".format(self.root, self.item)
user = self._extract_profile_page(url) user = self._extract_profile_page(url)
query_hash = "2ce1d673055b99250e93b6f88f878fde" query_hash = "2ce1d673055b99250e93b6f88f878fde"
@@ -459,22 +441,17 @@ class InstagramTagExtractor(InstagramExtractor):
"""Extractor for TagPage""" """Extractor for TagPage"""
subcategory = "tag" subcategory = "tag"
directory_fmt = ("{category}", "{subcategory}", "{tag}") directory_fmt = ("{category}", "{subcategory}", "{tag}")
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)"
r"/explore/tags/([^/?#]+)")
test = ("https://www.instagram.com/explore/tags/instagram/", { test = ("https://www.instagram.com/explore/tags/instagram/", {
"range": "1-16", "range": "1-16",
"count": ">= 16", "count": ">= 16",
}) })
def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.tag = match.group(1)
def metadata(self): def metadata(self):
return {"tag": self.tag} return {"tag": self.item}
def posts(self): def posts(self):
url = "{}/explore/tags/{}/".format(self.root, self.tag) url = "{}/explore/tags/{}/".format(self.root, self.item)
data = self._extract_shared_data(url) data = self._extract_shared_data(url)
hashtag = data["entry_data"]["TagPage"][0]["graphql"]["hashtag"] hashtag = data["entry_data"]["TagPage"][0]["graphql"]["hashtag"]
@@ -604,14 +581,10 @@ class InstagramPostExtractor(InstagramExtractor):
("https://www.instagram.com/reel/CDg_6Y1pxWu/"), ("https://www.instagram.com/reel/CDg_6Y1pxWu/"),
) )
def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.shortcode = match.group(1)
def posts(self): def posts(self):
query_hash = "a9441f24ac73000fa17fe6e6da11d59d" query_hash = "a9441f24ac73000fa17fe6e6da11d59d"
variables = { variables = {
"shortcode" : self.shortcode, "shortcode" : self.item,
"child_comment_count" : 3, "child_comment_count" : 3,
"fetch_comment_count" : 40, "fetch_comment_count" : 40,
"parent_comment_count" : 24, "parent_comment_count" : 24,
@@ -652,3 +625,34 @@ class InstagramStoriesExtractor(InstagramExtractor):
reel_id = user["id"] reel_id = user["id"]
return ({"__typename": "GraphReel", "id": reel_id},) return ({"__typename": "GraphReel", "id": reel_id},)
class InstagramHighlightsExtractor(InstagramExtractor):
    """Extractor for all Instagram story highlights of a user"""
    subcategory = "highlights"
    pattern = USER_PATTERN + r"/highlights"
    test = ("https://www.instagram.com/instagram/highlights",)

    def posts(self):
        """Return a list with one 'GraphReel' stub per highlight reel.

        The profile page at '<root>/<item>/' is loaded to obtain the
        user's ID, which is then used for a GraphQL query with
        'include_highlight_reels' enabled. Each edge of the returned
        'edge_highlight_reels' becomes a dict whose 'id' is the node ID
        prefixed with 'highlight:'.
        """
        profile_url = "{}/{}/".format(self.root, self.item)
        user_id = self._extract_profile_page(profile_url)["id"]

        variables = dict(
            user_id=user_id,
            include_chaining=False,
            include_reel=True,
            include_suggested_users=False,
            include_logged_out_extras=False,
            include_highlight_reels=True,
            include_live_status=True,
        )
        response = self._graphql_request(
            "d4d88dc1500312af6f937f7b804c68c3", variables)

        reels = []
        for edge in response["user"]["edge_highlight_reels"]["edges"]:
            reels.append({
                "__typename": "GraphReel",
                "id"        : "highlight:" + edge["node"]["id"],
            })
        return reels

View File

@@ -123,6 +123,7 @@ SUBCATEGORY_MAP = {
"story": "", "story": "",
}, },
"instagram": { "instagram": {
"posts": "",
"saved": "Saved Posts", "saved": "Saved Posts",
}, },
"newgrounds": { "newgrounds": {