From 69865dcc0567807fc0921337a9a0879610e103a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 22 May 2023 18:30:45 +0200 Subject: [PATCH] [formatter] implement slicing strings as bytes (#4087) prefixing a slice '[10:30]' with a lowercase b '[b10:30]' encodes the string to bytes in filesystem encoding before applying the slice --- docs/formatting.md | 23 +++++++++++++++-------- gallery_dl/formatter.py | 30 ++++++++++++++++++++++++++---- test/test_formatter.py | 29 +++++++++++++++++++++++++---- 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/docs/formatting.md b/docs/formatting.md index cc2703d2..7c571fdd 100644 --- a/docs/formatting.md +++ b/docs/formatting.md @@ -11,14 +11,15 @@ Field names select the metadata value to use in a replacement field. While simple names are usually enough, more complex forms like accessing values by attribute, element index, or slicing are also supported. -| | Example | Result | -| -------------------- | ----------------- | ---------------------- | -| Name | `{title}` | `Hello World` | -| Element Index | `{title[6]}` | `W` | -| Slicing | `{title[3:8]}` | `lo Wo` | -| Alternatives | `{empty\|title}` | `Hello World` | -| Element Access | `{user[name]}` | `John Doe` | -| Attribute Access | `{extractor.url}` | `https://example.org/` | +| | Example | Result | +| -------------------- | ------------------- | ---------------------- | +| Name | `{title}` | `Hello World` | +| Element Index | `{title[6]}` | `W` | +| Slicing | `{title[3:8]}` | `lo Wo` | +| Slicing (Bytes) | `{title_ja[b3:18]}` | `ロー・ワー` | +| Alternatives | `{empty\|title}` | `Hello World` | +| Element Access | `{user[name]}` | `John Doe` | +| Attribute Access | `{extractor.url}` | `https://example.org/` | All of these methods can be combined as needed. For example `{title[24]|empty|extractor.url[15:-1]}` would result in `.org`. @@ -150,6 +151,12 @@ Format specifiers can be used for advanced formatting by using the options provi {foo:[1:-1]} oo Ba + + [b<start>:<stop>] + Same as above, but applies to the bytes() representation of a string in filesystem encoding + {foo_ja:[b3:-1]} + ー・バ + L<maxlen>/<repl>/ Replaces the entire output with <repl> if its length exceeds <maxlen> diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index fc36fa2c..2ff48c32 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -9,6 +9,7 @@ """String formatters""" import os +import sys import time import string import _string @@ -255,7 +256,11 @@ def parse_field_name(field_name): func = operator.itemgetter try: if ":" in key: - key = _slice(key) + if key[0] == "b": + func = _bytesgetter + key = _slice(key[1:]) + else: + key = _slice(key) else: key = key.strip("\"'") except TypeError: @@ -276,6 +281,14 @@ def _slice(indices): ) +def _bytesgetter(slice, encoding=sys.getfilesystemencoding()): + + def apply_slice_bytes(obj): + return obj.encode(encoding)[slice].decode(encoding, "ignore") + + return apply_slice_bytes + + def _build_format_func(format_spec, default): if format_spec: return _FORMAT_SPECIFIERS.get( @@ -295,11 +308,20 @@ def _parse_optional(format_spec, default): def _parse_slice(format_spec, default): indices, _, format_spec = format_spec.partition("]") - slice = _slice(indices[1:]) fmt = _build_format_func(format_spec, default) - def apply_slice(obj): - return fmt(obj[slice]) + if indices[1] == "b": + slice_bytes = _bytesgetter(_slice(indices[2:])) + + def apply_slice(obj): + return fmt(slice_bytes(obj)) + + else: + slice = _slice(indices[1:]) + + def apply_slice(obj): + return fmt(obj[slice]) + return apply_slice diff --git a/test/test_formatter.py b/test/test_formatter.py index 22589668..1bda9d9c 100644 --- a/test/test_formatter.py +++ b/test/test_formatter.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,6 +23,7 @@ class TestFormatter(unittest.TestCase): kwdict = { "a": "hElLo wOrLd", "b": "äöü", + "j": "げんそうきょう", "d": {"a": "foo", "b": 0, "c": None}, "l": ["a", "b", "c"], "n": None, @@ -133,7 +134,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{d['a']}", "foo") self._run_test('{d["a"]}', "foo") - def test_slicing(self): + def test_slice_str(self): v = self.kwdict["a"] self._run_test("{a[1:10]}" , v[1:10]) self._run_test("{a[-10:-1]}", v[-10:-1]) @@ -165,6 +166,26 @@ class TestFormatter(unittest.TestCase): self._run_test("{a:[:50:2]}", v[:50:2]) self._run_test("{a:[::]}" , v) + def test_slice_bytes(self): + v = self.kwdict["j"] + self._run_test("{j[b1:10]}" , v[1:3]) + self._run_test("{j[b-10:-1]}", v[-3:-1]) + self._run_test("{j[b5:]}" , v[2:]) + self._run_test("{j[b50:]}" , v[50:]) + self._run_test("{j[b:5]}" , v[:1]) + self._run_test("{j[b:50]}" , v[:50]) + self._run_test("{j[b:]}" , v) + self._run_test("{j[b::]}" , v) + + self._run_test("{j:[b1:10]}" , v[1:3]) + self._run_test("{j:[b-10:-1]}", v[-3:-1]) + self._run_test("{j:[b5:]}" , v[2:]) + self._run_test("{j:[b50:]}" , v[50:]) + self._run_test("{j:[b:5]}" , v[:1]) + self._run_test("{j:[b:50]}" , v[:50]) + self._run_test("{j:[b:]}" , v) + self._run_test("{j:[b::]}" , v) + def test_maxlen(self): v = self.kwdict["a"] self._run_test("{a:L5/foo/}" , "foo") @@ -413,10 +434,10 @@ def noarg(): fmt4 = formatter.parse("\fM " + path + ":lengths") self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt2.format_map(self.kwdict), "89") + self.assertEqual(fmt2.format_map(self.kwdict), "96") self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt4.format_map(self.kwdict), "89") + self.assertEqual(fmt4.format_map(self.kwdict), "96") with self.assertRaises(TypeError): self.assertEqual(fmt0.format_map(self.kwdict), "")