From 722d0305b3411c00f2bafb73e9d580b0da1a978a Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Sun, 14 Jul 2024 11:51:13 +0200 Subject: [PATCH] fix(`decode_bytes`): error handling led to data loss in subsequent chunks The new test included here documents how error handling in `decode_bytes()` caused data in a subsequent chunk to be skipped. The cause was that the pointer variable (`position`) was not reset when a chunk (or rather, the joined data of any previous chunks) was fully decoded. As a result, decoding of the next chunk would start from the position last recorded during error handling in a previous chunk. Refs: https://github.com/datalad/datalad-next/pull/744 Closes: https://github.com/datalad/datasalad/issues/38 --- datasalad/itertools/decode_bytes.py | 4 ++++ datasalad/itertools/tests/test_decode_bytes.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/datasalad/itertools/decode_bytes.py b/datasalad/itertools/decode_bytes.py index efe77dd..880a4fc 100644 --- a/datasalad/itertools/decode_bytes.py +++ b/datasalad/itertools/decode_bytes.py @@ -125,6 +125,10 @@ def handle_decoding_error( try: yield joined_data[position:].decode(encoding) joined_data = b'' + # must reset the pointer for successful decoded + # parts too, otherwise we start too far into a new chunk's + # content + position = 0 except UnicodeDecodeError as e: # If an encoding error occurs, we first check whether it was # in the middle of `joined_data` or whether it extends until the diff --git a/datasalad/itertools/tests/test_decode_bytes.py b/datasalad/itertools/tests/test_decode_bytes.py index d7a55a1..bc5132d 100644 --- a/datasalad/itertools/tests/test_decode_bytes.py +++ b/datasalad/itertools/tests/test_decode_bytes.py @@ -40,3 +40,10 @@ def test_no_empty_strings(): def test_multiple_errors(): r = ''.join(decode_bytes([b'08 War \xaf No \xaf More \xaf Trouble.shn.mp3'])) assert r == '08 War \\xaf No \\xaf More \\xaf Trouble.shn.mp3' + + +def test_error_chunks(): + # this verifies that error handling 
in a previous chunk does not + # cause data loss in a subsequent chunk + r = ''.join(decode_bytes([b'08 War \xaf No', b'1234567890'])) + assert r == '08 War \\xaf No1234567890'