From 69865dcc0567807fc0921337a9a0879610e103a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 22 May 2023 18:30:45 +0200
Subject: [PATCH] [formatter] implement slicing strings as bytes (#4087)

prefixing a slice '[10:30]' with a lowercase b '[b10:30]' encodes
the string to bytes in filesystem encoding before applying the slice
and decodes the result back to a string, ignoring undecodable bytes
---
 docs/formatting.md      | 23 +++++++++++++++--------
 gallery_dl/formatter.py | 30 ++++++++++++++++++++++++++----
 test/test_formatter.py  | 29 +++++++++++++++++++++++++----
 3 files changed, 66 insertions(+), 16 deletions(-)
diff --git a/docs/formatting.md b/docs/formatting.md
index cc2703d2..7c571fdd 100644
--- a/docs/formatting.md
+++ b/docs/formatting.md
@@ -11,14 +11,15 @@ Field names select the metadata value to use in a replacement field.
 
 While simple names are usually enough, more complex forms like accessing values by attribute, element index, or slicing are also supported.
 
-|                      | Example           | Result                 |
-| -------------------- | ----------------- | ---------------------- |
-| Name                 | `{title}`         | `Hello World`          |
-| Element Index        | `{title[6]}`      | `W`                    |
-| Slicing              | `{title[3:8]}`    | `lo Wo`                |
-| Alternatives         | `{empty\|title}`  | `Hello World`          |
-| Element Access       | `{user[name]}`    | `John Doe`             |
-| Attribute Access     | `{extractor.url}` | `https://example.org/` |
+|                      | Example             | Result                 |
+| -------------------- | ------------------- | ---------------------- |
+| Name                 | `{title}`           | `Hello World`          |
+| Element Index        | `{title[6]}`        | `W`                    |
+| Slicing              | `{title[3:8]}`      | `lo Wo`                |
+| Slicing (Bytes)      | `{title_ja[b3:18]}` | `ロー・ワー`           |
+| Alternatives         | `{empty\|title}`    | `Hello World`          |
+| Element Access       | `{user[name]}`      | `John Doe`             |
+| Attribute Access     | `{extractor.url}`   | `https://example.org/` |
 
 All of these methods can be combined as needed.
 For example `{title[24]|empty|extractor.url[15:-1]}` would result in `.org`.
@@ -150,6 +151,12 @@ Format specifiers can be used for advanced formatting by using the options provi
     <td><code>{foo:[1:-1]}</code></td>
     <td><code>oo&nbsp;Ba</code></td>
 </tr>
+<tr>
+    <td><code>[b&lt;start&gt;:&lt;stop&gt;]</code></td>
+    <td>Same as above, but applies to the <a href="https://docs.python.org/3/library/stdtypes.html#bytes"><code>bytes()</code></a> representation of a string in <a href="https://docs.python.org/3/library/sys.html#sys.getfilesystemencoding">filesystem encoding</a></td>
+    <td><code>{foo_ja:[b3:-1]}</code></td>
+    <td><code>ー・バ</code></td>
+</tr>
 <tr>
     <td rowspan="2"><code>L&lt;maxlen&gt;/&lt;repl&gt;/</code></td>
     <td rowspan="2">Replaces the entire output with <code>&lt;repl&gt;</code> if its length exceeds <code>&lt;maxlen&gt;</code></td>
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index fc36fa2c..2ff48c32 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -9,6 +9,7 @@
 """String formatters"""
 
 import os
+import sys
 import time
 import string
 import _string
@@ -255,7 +256,11 @@ def parse_field_name(field_name):
             func = operator.itemgetter
             try:
                 if ":" in key:
-                    key = _slice(key)
+                    if key[0] == "b":
+                        func = _bytesgetter
+                        key = _slice(key[1:])
+                    else:
+                        key = _slice(key)
                 else:
                     key = key.strip("\"'")
             except TypeError:
@@ -276,6 +281,14 @@ def _slice(indices):
     )
 
 
+def _bytesgetter(slice, encoding=sys.getfilesystemencoding()):
+    """Return a function that slices a str as bytes, ignoring decode errors"""
+    def apply_slice_bytes(obj):
+        return obj.encode(encoding)[slice].decode(encoding, "ignore")
+
+    return apply_slice_bytes
+
+
 def _build_format_func(format_spec, default):
     if format_spec:
         return _FORMAT_SPECIFIERS.get(
@@ -295,11 +308,20 @@ def _parse_optional(format_spec, default):
 
 def _parse_slice(format_spec, default):
     indices, _, format_spec = format_spec.partition("]")
-    slice = _slice(indices[1:])
     fmt = _build_format_func(format_spec, default)

-    def apply_slice(obj):
-        return fmt(obj[slice])
+    if indices[1] == "b":  # "b" prefix: slice the encoded bytes instead
+        slice_bytes = _bytesgetter(_slice(indices[2:]))  # skip leading "[b"
+
+        def apply_slice(obj):
+            return fmt(slice_bytes(obj))
+
+    else:
+        slice = _slice(indices[1:])
+
+        def apply_slice(obj):
+            return fmt(obj[slice])
+
     return apply_slice
 
 
diff --git a/test/test_formatter.py b/test/test_formatter.py
index 22589668..1bda9d9c 100644
--- a/test/test_formatter.py
+++ b/test/test_formatter.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Copyright 2021-2022 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -23,6 +23,7 @@ class TestFormatter(unittest.TestCase):
     kwdict = {
         "a": "hElLo wOrLd",
         "b": "äöü",
+        "j": "げんそうきょう",
         "d": {"a": "foo", "b": 0, "c": None},
         "l": ["a", "b", "c"],
         "n": None,
@@ -133,7 +134,7 @@ class TestFormatter(unittest.TestCase):
         self._run_test("{d['a']}", "foo")
         self._run_test('{d["a"]}', "foo")
 
-    def test_slicing(self):
+    def test_slice_str(self):
         v = self.kwdict["a"]
         self._run_test("{a[1:10]}"  , v[1:10])
         self._run_test("{a[-10:-1]}", v[-10:-1])
@@ -165,6 +166,26 @@ class TestFormatter(unittest.TestCase):
         self._run_test("{a:[:50:2]}", v[:50:2])
         self._run_test("{a:[::]}"   , v)
 
+    def test_slice_bytes(self):
+        v = self.kwdict["j"]  # assumes 3 bytes per char (UTF-8)
+        self._run_test("{j[b1:10]}"  , v[1:3])
+        self._run_test("{j[b-10:-1]}", v[-3:-1])
+        self._run_test("{j[b5:]}"    , v[2:])
+        self._run_test("{j[b50:]}"   , v[50:])
+        self._run_test("{j[b:5]}"    , v[:1])
+        self._run_test("{j[b:50]}"   , v[:50])
+        self._run_test("{j[b:]}"     , v)
+        self._run_test("{j[b::]}"    , v)
+
+        self._run_test("{j:[b1:10]}"  , v[1:3])
+        self._run_test("{j:[b-10:-1]}", v[-3:-1])
+        self._run_test("{j:[b5:]}"    , v[2:])
+        self._run_test("{j:[b50:]}"   , v[50:])
+        self._run_test("{j:[b:5]}"    , v[:1])
+        self._run_test("{j:[b:50]}"   , v[:50])
+        self._run_test("{j:[b:]}"     , v)
+        self._run_test("{j:[b::]}"    , v)
+
     def test_maxlen(self):
         v = self.kwdict["a"]
         self._run_test("{a:L5/foo/}" , "foo")
@@ -413,10 +434,10 @@ def noarg():
             fmt4 = formatter.parse("\fM " + path + ":lengths")
 
         self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name")
-        self.assertEqual(fmt2.format_map(self.kwdict), "89")
+        self.assertEqual(fmt2.format_map(self.kwdict), "96")
 
         self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name")
-        self.assertEqual(fmt4.format_map(self.kwdict), "89")
+        self.assertEqual(fmt4.format_map(self.kwdict), "96")
 
         with self.assertRaises(TypeError):
             self.assertEqual(fmt0.format_map(self.kwdict), "")