[instagram] rewrite
(#1113, #1122, #1128, #1130, #1149) Rely on the results of GraphQL queries instead of requesting data for each post separately via '/p/<shortcode>/?__a=1'. This might result in some missing metadata, and there might be some issues for '/channel/' and '/saved/' URLs, but at least downloading from the regular post listings should work without issues and without getting users blocked/banned. TODO: reimplement support for stories
This commit is contained in:
@@ -10,9 +10,8 @@
|
||||
"""Extractors for https://www.instagram.com/"""
|
||||
|
||||
from .common import Extractor, Message
|
||||
from .. import text, exception
|
||||
from .. import text, util, exception
|
||||
from ..cache import cache
|
||||
import itertools
|
||||
import json
|
||||
import time
|
||||
import re
|
||||
@@ -27,43 +26,75 @@ class InstagramExtractor(Extractor):
|
||||
root = "https://www.instagram.com"
|
||||
cookiedomain = ".instagram.com"
|
||||
cookienames = ("sessionid",)
|
||||
_request_interval = 5
|
||||
|
||||
def __init__(self, match):
    """Set up state shared by all Instagram extractors.

    Runs the base ``Extractor`` initialiser with ``match``, then stores:

    * ``_find_tags`` - the ``findall`` method of a compiled ``#\\w+``
      pattern, i.e. a callable returning every ``#word`` in a string;
    * ``csrf_token`` - the value returned by ``util.generate_csrf_token()``.
    """
    Extractor.__init__(self, match)
    hashtag_pattern = re.compile(r'#\w+')
    self._find_tags = hashtag_pattern.findall
    self.csrf_token = util.generate_csrf_token()
|
||||
|
||||
def get_metadata(self):
    """Return extra metadata for every result; the base class adds none.

    Subclasses may override this to return a dict of additional fields.
    """
    return dict()
|
||||
|
||||
def items(self):
|
||||
self.login()
|
||||
yield Message.Version, 1
|
||||
|
||||
data = self.metadata()
|
||||
videos = self.config("videos", True)
|
||||
metadata = self.get_metadata()
|
||||
for data in self.instagrams():
|
||||
data.update(metadata)
|
||||
yield Message.Directory, data
|
||||
|
||||
if data['typename'] == 'GraphHighlightReel':
|
||||
url = '{}/stories/highlights/{}/'.format(self.root, data['id'])
|
||||
data['_extractor'] = InstagramStoriesExtractor
|
||||
yield Message.Queue, url, data
|
||||
else:
|
||||
url = data.get('video_url')
|
||||
for post in self.posts():
|
||||
post = self._parse_post(post)
|
||||
post.update(data)
|
||||
files = post.pop("_files")
|
||||
|
||||
yield Message.Directory, post
|
||||
for file in files:
|
||||
url = file.get('video_url')
|
||||
if not url:
|
||||
url = data['display_url']
|
||||
url = file['display_url']
|
||||
elif not videos:
|
||||
continue
|
||||
yield Message.Url, url, text.nameext_from_url(url, data)
|
||||
file.update(post)
|
||||
yield Message.Url, url, text.nameext_from_url(url, file)
|
||||
|
||||
def metadata(self):
    """Return extra metadata for every post; the base class adds none.

    The default is an empty tuple.  Subclasses may return a mapping instead
    (the tag extractor returns ``{"tag": ...}``).
    """
    return tuple()
|
||||
|
||||
def posts(self):
    """Return the posts to process; the base class yields none.

    Subclasses override this to return an iterable of post objects.
    """
    return tuple()
|
||||
|
||||
def request(self, url, **kwargs):
    """Perform a request and abort if it ended up on the login page.

    Delegates to ``Extractor.request`` with the given arguments.  If the
    response has a redirect history and the final request URL contains
    ``/accounts/login/``, ``exception.StopExtraction`` is raised;
    otherwise the response is returned unchanged.
    """
    result = Extractor.request(self, url, **kwargs)

    # No redirects happened: nothing to check.
    if not result.history:
        return result

    final_url = result.request.url
    if "/accounts/login/" not in final_url:
        return result

    raise exception.StopExtraction(
        "Redirected to login page (%s)", final_url)
|
||||
|
||||
def _graphql_request(self, query_hash, variables):
    """Query the GraphQL endpoint and return the ``data`` part of the reply.

    ``variables`` is serialised with ``json.dumps`` and sent together with
    ``query_hash`` as request parameters to ``<root>/graphql/query/``.
    The extractor's CSRF token is supplied both as the ``X-CSRFToken``
    header and as the ``csrftoken`` cookie.

    Returns the value stored under ``"data"`` in the decoded JSON response.
    """
    token = self.csrf_token
    endpoint = self.root + "/graphql/query/"

    query = {
        "query_hash": query_hash,
        "variables" : json.dumps(variables),
    }
    request_headers = {
        "X-CSRFToken"     : token,
        "X-IG-App-ID"     : "936619743392459",
        "X-IG-WWW-Claim"  : "0",
        "X-Requested-With": "XMLHttpRequest",
    }

    response = self.request(
        endpoint,
        params=query,
        headers=request_headers,
        cookies={"csrftoken": token},
    )
    payload = response.json()
    return payload["data"]
|
||||
|
||||
def login(self):
|
||||
if self._check_cookies(self.cookienames):
|
||||
return
|
||||
username, password = self._get_auth_info()
|
||||
if username:
|
||||
self.session.cookies.set("ig_cb", "1", domain="www.instagram.com")
|
||||
self._update_cookies(self._login_impl(username, password))
|
||||
if not self._check_cookies(self.cookienames):
|
||||
username, password = self._get_auth_info()
|
||||
if username:
|
||||
self.session.cookies.set(
|
||||
'ig_cb', '2', domain='www.instagram.com')
|
||||
self._update_cookies(self._login_impl(username, password))
|
||||
|
||||
self.session.cookies.set(
|
||||
"csrftoken", self.csrf_token, domain=self.cookiedomain)
|
||||
|
||||
@cache(maxage=360*24*3600, keyarg=1)
|
||||
def _login_impl(self, username, password):
|
||||
@@ -98,16 +129,95 @@ class InstagramExtractor(Extractor):
|
||||
for key in ("sessionid", "mid", "csrftoken")
|
||||
}
|
||||
|
||||
def _request_graphql(self, variables, query_hash, csrf=None):
|
||||
headers = {
|
||||
'X-CSRFToken': csrf,
|
||||
'X-IG-App-ID': '936619743392459',
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
def _parse_post(self, post):
|
||||
if post.get("is_video") and "video_url" not in post:
|
||||
url = "{}/tv/{}/".format(self.root, post["shortcode"])
|
||||
post = self._extract_post_page(url)
|
||||
|
||||
owner = post["owner"]
|
||||
data = {
|
||||
'typename' : post['__typename'],
|
||||
"date" : text.parse_timestamp(post["taken_at_timestamp"]),
|
||||
"likes" : post["edge_media_preview_like"]["count"],
|
||||
"owner_id" : owner["id"],
|
||||
"username" : owner.get("username"),
|
||||
"fullname" : owner.get("full_name"),
|
||||
"post_id" : post["id"],
|
||||
"post_shortcode": post["shortcode"],
|
||||
"post_url" : "{}/p/{}/".format(self.root, post["shortcode"]),
|
||||
"description": text.parse_unicode_escapes("\n".join(
|
||||
edge["node"]["text"]
|
||||
for edge in post["edge_media_to_caption"]["edges"]
|
||||
)),
|
||||
}
|
||||
url = '{}/graphql/query/?query_hash={}&variables={}'.format(
|
||||
self.root, query_hash, variables,
|
||||
)
|
||||
return self.request(url, headers=headers).json()
|
||||
|
||||
tags = self._find_tags(data["description"])
|
||||
if tags:
|
||||
data["tags"] = sorted(set(tags))
|
||||
|
||||
location = post.get("location")
|
||||
if location:
|
||||
data["location_id"] = location["id"]
|
||||
data["location_slug"] = location["slug"]
|
||||
data["location_url"] = "{}/explore/locations/{}/{}/".format(
|
||||
self.root, location["id"], location["slug"])
|
||||
|
||||
data["_files"] = files = []
|
||||
if "edge_sidecar_to_children" in post:
|
||||
for num, edge in enumerate(
|
||||
post['edge_sidecar_to_children']['edges'], 1):
|
||||
node = edge["node"]
|
||||
dimensions = node["dimensions"]
|
||||
media = {
|
||||
'num': num,
|
||||
'media_id' : node['id'],
|
||||
'shortcode' : (node.get('shortcode') or
|
||||
self._shortcode_from_id(node["id"])),
|
||||
'display_url': node['display_url'],
|
||||
'video_url' : node.get('video_url'),
|
||||
'width' : dimensions['width'],
|
||||
'height' : dimensions['height'],
|
||||
'sidecar_media_id' : post['id'],
|
||||
'sidecar_shortcode': post['shortcode'],
|
||||
}
|
||||
self._extract_tagged_users(node, media)
|
||||
files.append(media)
|
||||
else:
|
||||
dimensions = post["dimensions"]
|
||||
media = {
|
||||
'media_id' : post['id'],
|
||||
'shortcode' : post['shortcode'],
|
||||
'display_url': post['display_url'],
|
||||
'video_url' : post.get('video_url'),
|
||||
'width' : dimensions['width'],
|
||||
'height' : dimensions['height'],
|
||||
}
|
||||
self._extract_tagged_users(post, media)
|
||||
files.append(media)
|
||||
|
||||
return data
|
||||
|
||||
@staticmethod
def _shortcode_from_id(post_id):
    """Derive a shortcode string from a numeric post ID.

    ``post_id`` is converted with ``int()`` and passed to ``util.bencode``
    together with a 64-character alphabet (``A-Z``, ``a-z``, ``0-9``,
    ``-``, ``_``).
    """
    alphabet = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                "abcdefghijklmnopqrstuvwxyz"
                "0123456789-_")
    numeric_id = int(post_id)
    return util.bencode(numeric_id, alphabet)
|
||||
|
||||
def _extract_tagged_users(self, src, dest):
|
||||
if "edge_media_to_tagged_user" not in src:
|
||||
return
|
||||
edges = src['edge_media_to_tagged_user']['edges']
|
||||
if edges:
|
||||
dest['tagged_users'] = tagged_users = []
|
||||
for edge in edges:
|
||||
user = edge['node']['user']
|
||||
tagged_users.append({
|
||||
'id' : user['id'],
|
||||
'username' : user['username'],
|
||||
'full_name': user['full_name'],
|
||||
})
|
||||
|
||||
def _extract_shared_data(self, url):
|
||||
page = self.request(url).text
|
||||
@@ -122,226 +232,156 @@ class InstagramExtractor(Extractor):
|
||||
json.loads(additional_data.partition(',')[2])
|
||||
return data
|
||||
|
||||
def _extract_postpage(self, url):
|
||||
try:
|
||||
with self.request(url + '?__a=1', fatal=False) as response:
|
||||
media = response.json()['graphql']['shortcode_media']
|
||||
except (KeyError, ValueError) as exc:
|
||||
self.log.warning("Unable to fetch data from '%s': %s: %s",
|
||||
url, exc.__class__.__name__, exc)
|
||||
self.log.debug("Server response: %s", response.text)
|
||||
return ()
|
||||
def _extract_profile_page(self, url):
    """Return the ``user`` object embedded in a profile page.

    Reads ``entry_data`` from the shared data of ``url``.  Raises
    ``exception.NotFoundError("user")`` when it contains ``HttpErrorPage``;
    otherwise returns ``entry_data["ProfilePage"][0]["graphql"]["user"]``.
    """
    entry = self._extract_shared_data(url)["entry_data"]
    if "HttpErrorPage" not in entry:
        profile = entry["ProfilePage"][0]
        return profile["graphql"]["user"]
    raise exception.NotFoundError("user")
|
||||
|
||||
common = {
|
||||
'date': text.parse_timestamp(media['taken_at_timestamp']),
|
||||
'likes': text.parse_int(media['edge_media_preview_like']['count']),
|
||||
'owner_id': media['owner']['id'],
|
||||
'username': media['owner']['username'],
|
||||
'fullname': media['owner']['full_name'],
|
||||
'post_id': media['id'],
|
||||
'post_shortcode': media['shortcode'],
|
||||
'post_url': url,
|
||||
'description': text.parse_unicode_escapes('\n'.join(
|
||||
edge['node']['text']
|
||||
for edge in media['edge_media_to_caption']['edges']
|
||||
)),
|
||||
}
|
||||
|
||||
tags = self._find_tags(common['description'])
|
||||
if tags:
|
||||
common['tags'] = sorted(set(tags))
|
||||
|
||||
location = media['location']
|
||||
if location:
|
||||
common['location_id'] = location['id']
|
||||
common['location_slug'] = location['slug']
|
||||
common['location_url'] = "{}/explore/locations/{}/{}/".format(
|
||||
self.root, location['id'], location['slug'])
|
||||
|
||||
medias = []
|
||||
if media['__typename'] == 'GraphSidecar':
|
||||
for num, edge in enumerate(
|
||||
media['edge_sidecar_to_children']['edges'], 1):
|
||||
children = edge['node']
|
||||
media_data = {
|
||||
'num': num,
|
||||
'media_id': children['id'],
|
||||
'shortcode': children['shortcode'],
|
||||
'typename': children['__typename'],
|
||||
'display_url': children['display_url'],
|
||||
'video_url': children.get('video_url'),
|
||||
'height': text.parse_int(children['dimensions']['height']),
|
||||
'width': text.parse_int(children['dimensions']['width']),
|
||||
'sidecar_media_id': media['id'],
|
||||
'sidecar_shortcode': media['shortcode'],
|
||||
}
|
||||
self._extract_tagged_users(children, media_data)
|
||||
media_data.update(common)
|
||||
medias.append(media_data)
|
||||
|
||||
else:
|
||||
media_data = {
|
||||
'media_id': media['id'],
|
||||
'shortcode': media['shortcode'],
|
||||
'typename': media['__typename'],
|
||||
'display_url': media['display_url'],
|
||||
'video_url': media.get('video_url'),
|
||||
'height': text.parse_int(media['dimensions']['height']),
|
||||
'width': text.parse_int(media['dimensions']['width']),
|
||||
}
|
||||
self._extract_tagged_users(media, media_data)
|
||||
media_data.update(common)
|
||||
medias.append(media_data)
|
||||
|
||||
return medias
|
||||
|
||||
def _extract_stories(self, url):
    """Return a list of metadata dicts for the items of a story reel.

    When ``self.highlight_id`` is set, that ID is queried as a highlight
    reel.  Otherwise the user ID is read from the ``StoriesPage`` entry of
    the shared data found at ``url``; an empty list is returned when that
    entry is missing.

    Each returned dict has ``owner_id``, ``username``, ``date``,
    ``expires``, ``media_id``, ``typename`` and ``display_url``.
    ``GraphStoryImage`` items also get ``height`` and ``width``;
    ``GraphStoryVideo`` items also get ``duration``, ``video_url``,
    ``height`` and ``width``, taken from the first ``video_resources``
    entry.
    """
    if self.highlight_id:
        reel_ids = ''
        highlight_ids = '"{}"'.format(self.highlight_id)
        query_hash = '30a89afdd826d78a5376008a7b81c205'
    else:
        entry_data = self._extract_shared_data(url)['entry_data']

        # If no stories are present the URL redirects to `ProfilePage'
        if 'StoriesPage' not in entry_data:
            return []

        reel_ids = '"{}"'.format(entry_data['StoriesPage'][0]['user']['id'])
        highlight_ids = ''
        query_hash = '0a85e6ea60a4c99edc58ab2f3d17cfdf'

    template = (
        '{{'
        '"reel_ids":[{}],"tag_names":[],"location_ids":[],'
        '"highlight_reel_ids":[{}],"precomposed_overlay":false,'
        '"show_story_viewer_list":true,'
        '"story_viewer_fetch_count":50,"story_viewer_cursor":"",'
        '"stories_video_dash_manifest":false'
        '}}'
    )
    variables = template.format(reel_ids, highlight_ids)
    reels = self._request_graphql(variables, query_hash)['data']['reels_media']

    # If there are stories present but the user is not authenticated or
    # does not have permissions no stories are returned.
    if not reels:
        return []  # no stories present

    results = []
    for item in reels[0]['items']:
        owner = item['owner']
        entry = {
            'owner_id': owner['id'],
            'username': owner['username'],
            'date'    : text.parse_timestamp(item['taken_at_timestamp']),
            'expires' : text.parse_timestamp(item['expiring_at_timestamp']),
            'media_id': item['id'],
            'typename': item['__typename'],
            'display_url': item['display_url'],
        }

        kind = entry['typename']
        if kind == 'GraphStoryVideo':
            resource = item['video_resources'][0]
            entry['duration'] = text.parse_float(item['video_duration'])
            entry['video_url'] = resource['src']
            entry['height'] = text.parse_int(resource['config_height'])
            entry['width'] = text.parse_int(resource['config_width'])
        elif kind == 'GraphStoryImage':
            size = item['dimensions']
            entry['height'] = text.parse_int(size['height'])
            entry['width'] = text.parse_int(size['width'])

        results.append(entry)

    return results
|
||||
|
||||
def _extract_story_highlights(self, shared_data):
    """Return a list describing the highlight reels of a profile.

    The user ID is taken from the ``ProfilePage`` entry of ``shared_data``
    and the CSRF token from ``shared_data['config']``.  One dict with
    ``id``, ``title``, ``owner_id``, ``username`` and ``typename`` is
    returned per edge in the ``edge_highlight_reels`` part of the reply.
    """
    profile = shared_data['entry_data']['ProfilePage'][0]['graphql']
    template = (
        '{{'
        '"user_id":"{}","include_chaining":true,'
        '"include_reel":true,"include_suggested_users":false,'
        '"include_logged_out_extras":false,'
        '"include_highlight_reels":true'
        '}}'
    )
    variables = template.format(profile['user']['id'])

    csrf = shared_data['config']['csrf_token']
    reply = self._request_graphql(
        variables, 'ad99dd9d3646cc3c0dda65debcd266a7', csrf)

    reel_edges = reply['data']['user']['edge_highlight_reels']['edges']
    return [
        {
            'id'      : reel['id'],
            'title'   : reel['title'],
            'owner_id': reel['owner']['id'],
            'username': reel['owner']['username'],
            'typename': reel['__typename'],
        }
        for reel in (item['node'] for item in reel_edges)
    ]
|
||||
|
||||
def _extract_page(self, shared_data, psdf):
|
||||
csrf = shared_data['config']['csrf_token']
|
||||
def _extract_post_page(self, url):
    """Return the ``shortcode_media`` object embedded in a post page.

    Reads ``entry_data`` from the shared data of ``url``.  Raises
    ``exception.NotFoundError("post")`` when it contains ``HttpErrorPage``;
    otherwise returns
    ``entry_data["PostPage"][0]["graphql"]["shortcode_media"]``.
    """
    entry = self._extract_shared_data(url)["entry_data"]
    if "HttpErrorPage" not in entry:
        page = entry["PostPage"][0]
        return page["graphql"]["shortcode_media"]
    raise exception.NotFoundError("post")
|
||||
|
||||
def _pagination(self, query_hash, variables, data):
|
||||
while True:
|
||||
# Deal with different structure of pages: the first page
|
||||
# has interesting data in `entry_data', next pages in `data'.
|
||||
if 'entry_data' in shared_data:
|
||||
entry_data = shared_data['entry_data']
|
||||
if 'HttpErrorPage' in entry_data:
|
||||
return
|
||||
base_shared_data = entry_data[psdf['page']][0]['graphql']
|
||||
for edge in data["edges"]:
|
||||
yield edge["node"]
|
||||
|
||||
# variables_id is available only in the first page
|
||||
variables_id = base_shared_data[psdf['node']][psdf['node_id']]
|
||||
else:
|
||||
base_shared_data = shared_data['data']
|
||||
|
||||
medias = base_shared_data[psdf['node']][psdf['edge_to_medias']]
|
||||
has_next_page = medias['page_info']['has_next_page']
|
||||
shortcodes = [n['node']['shortcode'] for n in medias['edges']]
|
||||
|
||||
for s in shortcodes:
|
||||
url = '{}/p/{}/'.format(self.root, s)
|
||||
yield from self._extract_postpage(url)
|
||||
|
||||
if not has_next_page:
|
||||
break
|
||||
time.sleep(3)
|
||||
end_cursor = medias['page_info']['end_cursor']
|
||||
variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format(
|
||||
psdf['variables_id'],
|
||||
variables_id,
|
||||
end_cursor,
|
||||
)
|
||||
shared_data = self._request_graphql(
|
||||
variables, psdf['query_hash'], csrf,
|
||||
)
|
||||
|
||||
def _extract_tagged_users(self, src_media, dest_dict):
|
||||
edges = src_media['edge_media_to_tagged_user']['edges']
|
||||
if edges:
|
||||
dest_dict['tagged_users'] = tagged_users = []
|
||||
for edge in edges:
|
||||
user = edge['node']['user']
|
||||
tagged_users.append({
|
||||
'id' : user['id'],
|
||||
'username' : user['username'],
|
||||
'full_name': user['full_name'],
|
||||
})
|
||||
info = data["page_info"]
|
||||
if not info["has_next_page"]:
|
||||
return
|
||||
variables["after"] = info["end_cursor"]
|
||||
data = next(iter(self._graphql_request(
|
||||
query_hash, variables)["user"].values()))
|
||||
|
||||
|
||||
class InstagramImageExtractor(InstagramExtractor):
|
||||
"""Extractor for PostPage"""
|
||||
subcategory = "image"
|
||||
class InstagramUserExtractor(InstagramExtractor):
    """Extractor for ProfilePage

    Matches profile URLs (``instagram.com/<user>/``) and lists the posts
    of that user's timeline.
    """
    subcategory = "user"
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
               r"([^/?#]+)/?(?:$|[?#])")
    test = (
        ("https://www.instagram.com/instagram/", {
            "range": "1-16",
            "count": ">= 16",
        }),
        # ("https://www.instagram.com/instagram/", {
        #     "options": (("highlights", True),),
        #     "pattern": InstagramStoriesExtractor.pattern,
        #     "range": "1-2",
        #     "count": 2,
        # }),
        ("https://www.instagram.com/instagram/?hl=en"),
    )

    def __init__(self, match):
        """Store the user name captured by the first group of ``pattern``."""
        InstagramExtractor.__init__(self, match)
        self.user = match.group(1)

    def posts(self):
        """Return an iterator over the user's timeline posts.

        The first page comes from ``edge_owner_to_timeline_media`` of the
        profile page; ``_pagination`` receives the query hash and variables
        for any further pages.
        """
        profile = self._extract_profile_page(
            '{}/{}/'.format(self.root, self.user))
        first_page = profile["edge_owner_to_timeline_media"]

        return self._pagination(
            "003056d32c2554def87228bc3fd9668a",
            {"id": profile["id"], "first": 12},
            first_page,
        )
|
||||
|
||||
|
||||
class InstagramChannelExtractor(InstagramExtractor):
    """Extractor for ProfilePage channel

    Matches ``instagram.com/<user>/channel`` URLs and lists the posts of
    the ``edge_felix_video_timeline`` edge of that profile.
    """
    subcategory = "channel"
    # ``reel/`` is excluded like the other reserved path prefixes, so that
    # a reel URL whose shortcode starts with "channel" is not treated as
    # the channel of a user named "reel" (the user pattern already excludes
    # it and the post pattern handles ``/reel/`` URLs).
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/|reel/)"
               r"([^/?#]+)/channel")
    test = ("https://www.instagram.com/instagram/channel/", {
        "range": "1-16",
        "count": ">= 16",
    })

    def __init__(self, match):
        """Store the user name captured by the first group of ``pattern``."""
        InstagramExtractor.__init__(self, match)
        self.user = match.group(1)

    def posts(self):
        """Return an iterator over the posts of the user's channel.

        The first page comes from ``edge_felix_video_timeline`` of the
        profile page; ``_pagination`` receives the query hash and variables
        for any further pages.
        """
        url = '{}/{}/channel/'.format(self.root, self.user)
        user = self._extract_profile_page(url)
        edge = user["edge_felix_video_timeline"]

        query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
        variables = {"id": user["id"], "first": 12}
        return self._pagination(query_hash, variables, edge)
|
||||
|
||||
|
||||
class InstagramSavedExtractor(InstagramExtractor):
    """Extractor for ProfilePage saved media

    Matches ``instagram.com/<user>/saved`` URLs and lists the posts of the
    ``edge_saved_media`` edge of that profile.
    """
    subcategory = "saved"
    # ``reel/`` is excluded like the other reserved path prefixes, so that
    # a reel URL whose shortcode starts with "saved" is not treated as the
    # saved posts of a user named "reel" (the user pattern already excludes
    # it and the post pattern handles ``/reel/`` URLs).
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/|reel/)"
               r"([^/?#]+)/saved")
    test = ("https://www.instagram.com/instagram/saved/",)

    def __init__(self, match):
        """Store the user name captured by the first group of ``pattern``."""
        InstagramExtractor.__init__(self, match)
        self.user = match.group(1)

    def posts(self):
        """Return an iterator over the user's saved posts.

        The first page comes from ``edge_saved_media`` of the profile page;
        ``_pagination`` receives the query hash and variables for any
        further pages.
        """
        url = '{}/{}/saved/'.format(self.root, self.user)
        user = self._extract_profile_page(url)
        edge = user["edge_saved_media"]

        query_hash = "2ce1d673055b99250e93b6f88f878fde"
        variables = {"id": user["id"], "first": 12}
        return self._pagination(query_hash, variables, edge)
|
||||
|
||||
|
||||
class InstagramTagExtractor(InstagramExtractor):
    """Extractor for TagPage

    Matches ``instagram.com/explore/tags/<tag>`` URLs and lists the posts
    of the ``edge_hashtag_to_media`` edge of that hashtag.
    """
    subcategory = "tag"
    directory_fmt = ("{category}", "{subcategory}", "{tag}")
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/explore/tags/([^/?#]+)")
    test = ("https://www.instagram.com/explore/tags/instagram/", {
        "range": "1-16",
        "count": ">= 16",
    })

    def __init__(self, match):
        """Store the tag name captured by the first group of ``pattern``."""
        InstagramExtractor.__init__(self, match)
        self.tag = match.group(1)

    def metadata(self):
        """Return ``{"tag": <tag>}`` as extra metadata for every post."""
        return dict(tag=self.tag)

    def posts(self):
        """Return an iterator over the posts carrying this hashtag.

        The first page is read from the ``TagPage`` entry of the tag
        page's shared data; further pages are requested by ``_pagination``.
        """
        shared = self._extract_shared_data(
            '{}/explore/tags/{}/'.format(self.root, self.tag))
        hashtag = shared["entry_data"]["TagPage"][0]["graphql"]["hashtag"]
        first_page = hashtag["edge_hashtag_to_media"]

        return self._pagination(
            "9b498c08113f1e09617a1703c22b2f32",
            {"tag_name": hashtag["name"], "first": 12},
            first_page,
        )

    def _pagination(self, query_hash, variables, data):
        """Yield every node of ``data`` and of all following pages.

        ``data`` is an edge object with ``edges`` and ``page_info``.  While
        ``page_info`` reports another page, ``variables["after"]`` is set
        to the end cursor (mutating the passed dict) and the next page is
        read from ``hashtag.edge_hashtag_to_media`` of a GraphQL request.
        """
        page = data
        while True:
            for item in page["edges"]:
                yield item["node"]

            cursor_info = page["page_info"]
            if cursor_info["has_next_page"]:
                variables["after"] = cursor_info["end_cursor"]
                reply = self._graphql_request(query_hash, variables)
                page = reply["hashtag"]["edge_hashtag_to_media"]
            else:
                return
|
||||
|
||||
|
||||
class InstagramPostExtractor(InstagramExtractor):
|
||||
"""Extractor for an Instagram post"""
|
||||
subcategory = "post"
|
||||
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
|
||||
r"/(?:p|tv|reel)/([^/?#]+)")
|
||||
test = (
|
||||
@@ -435,8 +475,8 @@ class InstagramImageExtractor(InstagramExtractor):
|
||||
("https://www.instagram.com/p/B_2lf3qAd3y/", {
|
||||
"keyword": {
|
||||
"tagged_users": [{
|
||||
"id": "1246468638",
|
||||
"username": "kaaymbl",
|
||||
"id" : "1246468638",
|
||||
"username" : "kaaymbl",
|
||||
"full_name": "Call Me Kay",
|
||||
}]
|
||||
}
|
||||
@@ -449,158 +489,14 @@ class InstagramImageExtractor(InstagramExtractor):
|
||||
InstagramExtractor.__init__(self, match)
|
||||
self.shortcode = match.group(1)
|
||||
|
||||
def instagrams(self):
    """Return the media of the single post identified by ``self.shortcode``.

    Builds the ``<root>/p/<shortcode>/`` URL and hands it to
    ``_extract_postpage``.
    """
    post_url = '{}/p/{}/'.format(self.root, self.shortcode)
    media = self._extract_postpage(post_url)
    return media
|
||||
|
||||
|
||||
class InstagramStoriesExtractor(InstagramExtractor):
    """Extractor for StoriesPage

    Matches ``instagram.com/stories/<name>[/<digits>]`` URLs.  The first
    captured group is stored as ``username`` and the optional numeric part
    as ``highlight_id``.
    """
    subcategory = "stories"
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/stories/([^/?#]+)(?:/(\d+))?")
    test = (
        ("https://www.instagram.com/stories/instagram/"),
        ("https://www.instagram.com/stories/highlights/18042509488170095/"),
    )

    def __init__(self, match):
        """Store both groups captured by ``pattern``."""
        InstagramExtractor.__init__(self, match)
        groups = match.groups()
        self.username, self.highlight_id = groups

    def instagrams(self):
        """Return the story items found via ``<root>/stories/<username>/``."""
        stories_url = '{}/stories/{}/'.format(self.root, self.username)
        items = self._extract_stories(stories_url)
        return items
|
||||
|
||||
|
||||
class InstagramSavedExtractor(InstagramExtractor):
    """Extractor for ProfilePage saved media

    Matches ``instagram.com/<user>/saved`` URLs and lists the media of the
    ``edge_saved_media`` edge of that profile.
    """
    subcategory = "saved"
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
               r"([^/?#]+)/saved")
    test = ("https://www.instagram.com/instagram/saved/",)

    def __init__(self, match):
        """Store the user name captured by the first group of ``pattern``."""
        InstagramExtractor.__init__(self, match)
        self.username = match.group(1)

    def instagrams(self):
        """Return the user's saved media via ``_extract_page``."""
        page_url = '{}/{}/saved/'.format(self.root, self.username)
        page_data = self._extract_shared_data(page_url)

        # Describes where the media edge is located and how to page it.
        spec = {
            'page': 'ProfilePage',
            'node': 'user',
            'node_id': 'id',
            'variables_id': 'id',
            'edge_to_medias': 'edge_saved_media',
            'query_hash': '8c86fed24fa03a8a2eea2a70a80c7b6b',
        }
        return self._extract_page(page_data, spec)
|
||||
|
||||
|
||||
class InstagramUserExtractor(InstagramExtractor):
    """Extractor for ProfilePage

    Matches profile URLs (``instagram.com/<user>/``) and lists the media
    of that user's timeline.  When the ``highlights`` option is set, the
    highlight reels are listed before the timeline media.
    """
    subcategory = "user"
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
               r"([^/?#]+)/?(?:$|[?#])")
    test = (
        ("https://www.instagram.com/instagram/", {
            "range": "1-16",
            "count": ">= 16",
        }),
        ("https://www.instagram.com/instagram/", {
            "options": (("highlights", True),),
            "pattern": InstagramStoriesExtractor.pattern,
            "range": "1-2",
            "count": 2,
        }),
        ("https://www.instagram.com/instagram/?hl=en"),
    )

    def __init__(self, match):
        """Store the user name captured by the first group of ``pattern``."""
        InstagramExtractor.__init__(self, match)
        self.username = match.group(1)

    def instagrams(self):
        """Return the user's timeline media, optionally after highlights."""
        page_url = '{}/{}/'.format(self.root, self.username)
        page_data = self._extract_shared_data(page_url)

        # Describes where the media edge is located and how to page it.
        spec = {
            'page': 'ProfilePage',
            'node': 'user',
            'node_id': 'id',
            'variables_id': 'id',
            'edge_to_medias': 'edge_owner_to_timeline_media',
            'query_hash': '15bf78a4ad24e33cbd838fdb31353ac1',
        }
        timeline = self._extract_page(page_data, spec)

        if not self.config('highlights'):
            return timeline

        highlights = self._extract_story_highlights(page_data)
        return itertools.chain(highlights, timeline)
|
||||
|
||||
|
||||
class InstagramChannelExtractor(InstagramExtractor):
    """Extractor for ProfilePage channel

    Matches ``instagram.com/<user>/channel`` URLs and lists the media of
    the ``edge_felix_video_timeline`` edge of that profile.
    """
    subcategory = "channel"
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
               r"([^/?#]+)/channel")
    test = ("https://www.instagram.com/instagram/channel/", {
        "range": "1-16",
        "count": ">= 16",
    })

    def __init__(self, match):
        """Store the user name captured by the first group of ``pattern``."""
        InstagramExtractor.__init__(self, match)
        self.username = match.group(1)

    def instagrams(self):
        """Return the media of the user's channel via ``_extract_page``."""
        page_url = '{}/{}/channel/'.format(self.root, self.username)
        page_data = self._extract_shared_data(page_url)

        # Describes where the media edge is located and how to page it.
        spec = {
            'page': 'ProfilePage',
            'node': 'user',
            'node_id': 'id',
            'variables_id': 'id',
            'edge_to_medias': 'edge_felix_video_timeline',
            'query_hash': 'bc78b344a68ed16dd5d7f264681c4c76',
        }
        return self._extract_page(page_data, spec)
|
||||
|
||||
|
||||
class InstagramTagExtractor(InstagramExtractor):
    """Extractor for TagPage

    Matches ``instagram.com/explore/tags/<tag>`` URLs and lists the media
    of the ``edge_hashtag_to_media`` edge of that hashtag.
    """
    subcategory = "tag"
    directory_fmt = ("{category}", "{subcategory}", "{tag}")
    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/explore/tags/([^/?#]+)")
    test = ("https://www.instagram.com/explore/tags/instagram/", {
        "range": "1-16",
        "count": ">= 16",
    })

    def __init__(self, match):
        """Store the tag name captured by the first group of ``pattern``."""
        InstagramExtractor.__init__(self, match)
        self.tag = match.group(1)

    def get_metadata(self):
        """Return ``{"tag": <tag>}`` as extra metadata for every result."""
        return dict(tag=self.tag)

    def instagrams(self):
        """Return the media carrying this hashtag via ``_extract_page``."""
        page_url = '{}/explore/tags/{}/'.format(self.root, self.tag)
        page_data = self._extract_shared_data(page_url)

        # Describes where the media edge is located and how to page it.
        spec = {
            'page': 'TagPage',
            'node': 'hashtag',
            'node_id': 'name',
            'variables_id': 'tag_name',
            'edge_to_medias': 'edge_hashtag_to_media',
            'query_hash': 'c769cb6c71b24c8a86590b22402fda50',
        }
        return self._extract_page(page_data, spec)
|
||||
def posts(self):
    """Return a 1-tuple holding the post identified by ``self.shortcode``.

    The post is requested through ``_graphql_request``; the
    ``shortcode_media`` member of the result is returned as the only
    element of the tuple.
    """
    query_variables = {
        "shortcode"            : self.shortcode,
        "child_comment_count"  : 3,
        "fetch_comment_count"  : 40,
        "parent_comment_count" : 24,
        "has_threaded_comments": True,
    }
    result = self._graphql_request(
        "a9441f24ac73000fa17fe6e6da11d59d", query_variables)
    media = result["shortcode_media"]
    return (media,)
|
||||
|
||||
Reference in New Issue
Block a user