Add instagram metadata: post_pageurl, post_tags (#743)

* Add instagram metadata: post_pageurl, post_tags

  Add the following metadata for instagram:
  - post_pageurl: json string with url of the post page
  - post_tags: json array with instagram tags extracted from the post description

* Oops: rename post_tags to tags for --write-tags

  This way, --write-tags will pick up the post tags.

* Rename to post_url, improve regex
* Add post_url and tags to tests
* Remove duplicate tags and sort them
* Bugfix: don't create empty tag lists
* Metadata: add location
* Metadata: add tagged_users for each media
* Move self._find_tags() to base class
* Make flake happy
2020-05-28 21:58:24 +02:00
parent da22ea8ced
commit 62b65e59d0
1 changed files with 56 additions and 1 deletions
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -14,6 +14,7 @@ from .. import text, exception
 from ..cache import cache
 import itertools
 import json
+import re


 class InstagramExtractor(Extractor):
@@ -26,6 +27,10 @@ class InstagramExtractor(Extractor):
    cookiedomain = ".instagram.com"
    cookienames = ("sessionid",)

+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self._find_tags = re.compile(r'#\w+').findall  # list '#word' tags
+
    def get_metadata(self):
        return {}

@@ -133,12 +138,28 @@ class InstagramExtractor(Extractor):
            'fullname': media['owner']['full_name'],
            'post_id': media['id'],
            'post_shortcode': media['shortcode'],
+            'post_url': url,
            'description': text.parse_unicode_escapes('\n'.join(
                edge['node']['text']
                for edge in media['edge_media_to_caption']['edges']
            )),
        }

+        if self._find_tags(common['description']):
+            common['tags'] = sorted(
+                set(self._find_tags(common['description'])))
+
+        if media['location']:
+            common['location_id'] = media['location']['id']
+            common['location_slug'] = media['location']['slug']
+            common['location_url'] = (
+                'https://www.instagram.com/explore/locations/' +
+                media['location']['id'] +
+                '/' +
+                media['location']['slug'] +
+                '/'
+            )
+
        medias = []
        if media['__typename'] == 'GraphSidecar':
            for num, edge in enumerate(
@@ -156,6 +177,7 @@ class InstagramExtractor(Extractor):
                    'sidecar_media_id': media['id'],
                    'sidecar_shortcode': media['shortcode'],
                }
+                self._extract_tagged_users(children, media_data)
                media_data.update(common)
                medias.append(media_data)

@@ -169,6 +191,7 @@ class InstagramExtractor(Extractor):
                'height': text.parse_int(media['dimensions']['height']),
                'width': text.parse_int(media['dimensions']['width']),
            }
+            self._extract_tagged_users(media, media_data)
            media_data.update(common)
            medias.append(media_data)

@@ -305,6 +328,19 @@ class InstagramExtractor(Extractor):
                variables, psdf['query_hash'], csrf,
            )

+    def _extract_tagged_users(self, src_media, dest_dict):
+        """Store users tagged in 'src_media' as dest_dict['tagged_users']"""
+        edges = src_media['edge_media_to_tagged_user']['edges']
+        if edges:
+            # one entry per tagged user; key is left unset when none exist
+            dest_dict['tagged_users'] = [
+                {
+                    'username': edge['node']['user']['username'],
+                    'full_name': edge['node']['user']['full_name'],
+                }
+                for edge in edges
+            ]
+

 class InstagramImageExtractor(InstagramExtractor):
    """Extractor for PostPage"""
@@ -321,10 +357,14 @@ class InstagramImageExtractor(InstagramExtractor):
                "description": str,
                "height": int,
                "likes": int,
+                "location_id": "214424288",
+                "location_slug": "hong-kong",
                "media_id": "1922949326347663701",
                "shortcode": "BqvsDleB3lV",
                "post_id": "1922949326347663701",
                "post_shortcode": "BqvsDleB3lV",
+                "post_url": "https://www.instagram.com/p/BqvsDleB3lV/",
+                "tags": ["#WHPsquares"],
                "typename": "GraphImage",
                "username": "instagram",
                "width": int,
@@ -339,6 +379,7 @@ class InstagramImageExtractor(InstagramExtractor):
                "sidecar_shortcode": "BoHk1haB5tM",
                "post_id": "1875629777499953996",
                "post_shortcode": "BoHk1haB5tM",
+                "post_url": "https://www.instagram.com/p/BoHk1haB5tM/",
                "num": int,
                "likes": int,
                "username": "instagram",
@@ -354,7 +395,9 @@ class InstagramImageExtractor(InstagramExtractor):
                "height": int,
                "likes": int,
                "media_id": "1923502432034620000",
+                "post_url": "https://www.instagram.com/p/Bqxp0VSBgJg/",
                "shortcode": "Bqxp0VSBgJg",
+                "tags": ["#ASMR"],
                "typename": "GraphVideo",
                "username": "instagram",
                "width": int,
@@ -370,6 +413,7 @@ class InstagramImageExtractor(InstagramExtractor):
                "height": int,
                "likes": int,
                "media_id": "1806097553666903266",
+                "post_url": "https://www.instagram.com/p/BkQjCfsBIzi/",
                "shortcode": "BkQjCfsBIzi",
                "typename": "GraphVideo",
                "username": "instagram",
@@ -381,11 +425,22 @@ class InstagramImageExtractor(InstagramExtractor):
        ("https://www.instagram.com/p/BtOvDOfhvRr/", {
            "count": 2,
            "keyword": {
+                "post_url": "https://www.instagram.com/p/BtOvDOfhvRr/",
                "sidecar_media_id": "1967717017113261163",
                "sidecar_shortcode": "BtOvDOfhvRr",
                "video_url": str,
            }
-        })
+        }),
+
+        # GraphImage with tagged user
+        ("https://www.instagram.com/p/B_2lf3qAd3y/", {
+            "keyword": {
+                "tagged_users": [{
+                    "full_name": "Call Me Kay",
+                    "username": "kaaymbl"
+                }]
+            }
+        }),
    )

    def __init__(self, match):