Add instagram metadata: post_pageurl, post_tags (#743)
* Add instagram metadata: post_pageurl, post_tags

  Add the following metadata for instagram:
  - post_pageurl: JSON string with the URL of the post page
  - post_tags: JSON array with the Instagram tags extracted from the post description

* Oops: rename post_tags to tags for --write-tags

  This way, --write-tags will pick up the post tags.

* Rename to post_url, improve regex
* Add post_url and tags to tests
* Remove duplicate tags and sort them
* Bugfix: don't create empty tag lists
* Metadata: add location
* Metadata: add tagged_users for each media
* Move self._find_tags() to base class
* Make flake happy
This commit is contained in:
@@ -14,6 +14,7 @@ from .. import text, exception
|
||||
from ..cache import cache
|
||||
import itertools
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
class InstagramExtractor(Extractor):
|
||||
@@ -26,6 +27,10 @@ class InstagramExtractor(Extractor):
|
||||
cookiedomain = ".instagram.com"
|
||||
cookienames = ("sessionid",)
|
||||
|
||||
def __init__(self, match):
    """Initialize the base extractor and prepare the hashtag finder.

    After delegating to Extractor.__init__, binds 'self._find_tags' to
    the 'findall' method of a pattern compiled from r'#\w+', so calling
    it on a string returns every '#' followed by word characters.
    """
    Extractor.__init__(self, match)
    # Compile once per extractor instance and keep only the bound
    # 'findall', which is the single operation callers use.
    hashtag_pattern = re.compile(r'#\w+')
    self._find_tags = hashtag_pattern.findall
|
||||
|
||||
def get_metadata(self):
|
||||
return {}
|
||||
|
||||
@@ -133,12 +138,28 @@ class InstagramExtractor(Extractor):
|
||||
'fullname': media['owner']['full_name'],
|
||||
'post_id': media['id'],
|
||||
'post_shortcode': media['shortcode'],
|
||||
'post_url': url,
|
||||
'description': text.parse_unicode_escapes('\n'.join(
|
||||
edge['node']['text']
|
||||
for edge in media['edge_media_to_caption']['edges']
|
||||
)),
|
||||
}
|
||||
|
||||
if self._find_tags(common['description']):
|
||||
common['tags'] = sorted(
|
||||
set(self._find_tags(common['description'])))
|
||||
|
||||
if media['location']:
|
||||
common['location_id'] = media['location']['id']
|
||||
common['location_slug'] = media['location']['slug']
|
||||
common['location_url'] = (
|
||||
'https://www.instagram.com/explore/locations/' +
|
||||
media['location']['id'] +
|
||||
'/' +
|
||||
media['location']['slug'] +
|
||||
'/'
|
||||
)
|
||||
|
||||
medias = []
|
||||
if media['__typename'] == 'GraphSidecar':
|
||||
for num, edge in enumerate(
|
||||
@@ -156,6 +177,7 @@ class InstagramExtractor(Extractor):
|
||||
'sidecar_media_id': media['id'],
|
||||
'sidecar_shortcode': media['shortcode'],
|
||||
}
|
||||
self._extract_tagged_users(children, media_data)
|
||||
media_data.update(common)
|
||||
medias.append(media_data)
|
||||
|
||||
@@ -169,6 +191,7 @@ class InstagramExtractor(Extractor):
|
||||
'height': text.parse_int(media['dimensions']['height']),
|
||||
'width': text.parse_int(media['dimensions']['width']),
|
||||
}
|
||||
self._extract_tagged_users(media, media_data)
|
||||
media_data.update(common)
|
||||
medias.append(media_data)
|
||||
|
||||
@@ -305,6 +328,19 @@ class InstagramExtractor(Extractor):
|
||||
variables, psdf['query_hash'], csrf,
|
||||
)
|
||||
|
||||
def _extract_tagged_users(self, src_media, dest_dict):
|
||||
if src_media['edge_media_to_tagged_user']['edges']:
|
||||
tagged_users = []
|
||||
for num, edge in enumerate(
|
||||
src_media['edge_media_to_tagged_user']['edges'], 1):
|
||||
tagged = edge['node']
|
||||
tagged_data = {
|
||||
'username': tagged['user']['username'],
|
||||
'full_name': tagged['user']['full_name'],
|
||||
}
|
||||
tagged_users.append(tagged_data)
|
||||
dest_dict['tagged_users'] = tagged_users
|
||||
|
||||
|
||||
class InstagramImageExtractor(InstagramExtractor):
|
||||
"""Extractor for PostPage"""
|
||||
@@ -321,10 +357,14 @@ class InstagramImageExtractor(InstagramExtractor):
|
||||
"description": str,
|
||||
"height": int,
|
||||
"likes": int,
|
||||
"location_id": "214424288",
|
||||
"location_slug": "hong-kong",
|
||||
"media_id": "1922949326347663701",
|
||||
"shortcode": "BqvsDleB3lV",
|
||||
"post_id": "1922949326347663701",
|
||||
"post_shortcode": "BqvsDleB3lV",
|
||||
"post_url": "https://www.instagram.com/p/BqvsDleB3lV/",
|
||||
"tags": ["#WHPsquares"],
|
||||
"typename": "GraphImage",
|
||||
"username": "instagram",
|
||||
"width": int,
|
||||
@@ -339,6 +379,7 @@ class InstagramImageExtractor(InstagramExtractor):
|
||||
"sidecar_shortcode": "BoHk1haB5tM",
|
||||
"post_id": "1875629777499953996",
|
||||
"post_shortcode": "BoHk1haB5tM",
|
||||
"post_url": "https://www.instagram.com/p/BoHk1haB5tM/",
|
||||
"num": int,
|
||||
"likes": int,
|
||||
"username": "instagram",
|
||||
@@ -354,7 +395,9 @@ class InstagramImageExtractor(InstagramExtractor):
|
||||
"height": int,
|
||||
"likes": int,
|
||||
"media_id": "1923502432034620000",
|
||||
"post_url": "https://www.instagram.com/p/Bqxp0VSBgJg/",
|
||||
"shortcode": "Bqxp0VSBgJg",
|
||||
"tags": ["#ASMR"],
|
||||
"typename": "GraphVideo",
|
||||
"username": "instagram",
|
||||
"width": int,
|
||||
@@ -370,6 +413,7 @@ class InstagramImageExtractor(InstagramExtractor):
|
||||
"height": int,
|
||||
"likes": int,
|
||||
"media_id": "1806097553666903266",
|
||||
"post_url": "https://www.instagram.com/p/BkQjCfsBIzi/",
|
||||
"shortcode": "BkQjCfsBIzi",
|
||||
"typename": "GraphVideo",
|
||||
"username": "instagram",
|
||||
@@ -381,11 +425,22 @@ class InstagramImageExtractor(InstagramExtractor):
|
||||
("https://www.instagram.com/p/BtOvDOfhvRr/", {
|
||||
"count": 2,
|
||||
"keyword": {
|
||||
"post_url": "https://www.instagram.com/p/BtOvDOfhvRr/",
|
||||
"sidecar_media_id": "1967717017113261163",
|
||||
"sidecar_shortcode": "BtOvDOfhvRr",
|
||||
"video_url": str,
|
||||
}
|
||||
})
|
||||
}),
|
||||
|
||||
# GraphImage with tagged user
|
||||
("https://www.instagram.com/p/B_2lf3qAd3y/", {
|
||||
"keyword": {
|
||||
"tagged_users": [{
|
||||
"full_name": "Call Me Kay",
|
||||
"username": "kaaymbl"
|
||||
}]
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
||||
def __init__(self, match):
|
||||
|
||||
Reference in New Issue
Block a user