# Tests for the byte-budget text helpers imported from lorabot.messages.
from lorabot.messages import split_to_bytes, trim_to_bytes
|
||
|
||
|
||
def test_short_ascii_passthrough():
    """ASCII text far below the byte budget comes back unchanged."""
    text = "hello"
    assert trim_to_bytes(text, 184) == text
|
||
|
||
|
||
def test_exact_fit_passthrough():
    """A string whose encoded size equals the budget is not trimmed."""
    budget = 184
    exact = "a" * budget
    assert trim_to_bytes(exact, budget) == exact
|
||
|
||
|
||
def test_long_ascii_clean_cut():
    """Over-long ASCII input is cut down to exactly the byte budget."""
    budget = 184
    trimmed = trim_to_bytes("x" * 200, budget)
    assert len(trimmed.encode("utf-8")) == budget
    assert trimmed == "x" * budget
|
||
|
||
|
||
def test_emoji_does_not_split():
    """An emoji that would straddle the budget is dropped whole, never cut."""
    # 🎉 encodes to 4 UTF-8 bytes, so a 5-byte budget has room for one emoji
    # plus a single spare byte that must stay unused.
    trimmed = trim_to_bytes("🎉🎉", 5)
    assert trimmed == "🎉"
    assert len(trimmed.encode("utf-8")) == 4
|
||
|
||
|
||
def test_multibyte_at_boundary():
    """A 2-byte character is kept only when the budget covers all of it."""
    text = "aä"  # "a" is 1 byte and "ä" is 2 bytes in UTF-8: 3 bytes total.
    # A 3-byte budget fits the whole string.
    assert trim_to_bytes(text, 3) == "aä"
    # A 2-byte budget has room for only half of "ä", so just "a" survives.
    assert trim_to_bytes(text, 2) == "a"
|
||
|
||
|
||
def test_zero_or_negative_max_bytes():
    """A zero or negative byte budget yields an empty string."""
    for budget in (0, -1):
        assert trim_to_bytes("anything", budget) == ""
|
||
|
||
|
||
def test_empty_input():
    """Trimming an empty string returns an empty string."""
    trimmed = trim_to_bytes("", 184)
    assert trimmed == ""
|
||
|
||
|
||
# split_to_bytes
|
||
|
||
|
||
def test_split_short_input_single_chunk():
    """Text that fits the byte budget is returned as a single chunk."""
    text = "hello"
    assert split_to_bytes(text, 184) == [text]
|
||
|
||
|
||
def test_split_long_input_two_chunks_drops_rest():
    """Only max_chunks full chunks come back; the remaining input is dropped."""
    budget = 180
    pieces = split_to_bytes("x" * 500, budget, max_chunks=2)
    assert pieces == ["x" * budget, "x" * budget]
    total_bytes = sum(len(piece.encode("utf-8")) for piece in pieces)
    assert total_bytes == 360
|
||
|
||
|
||
def test_split_exact_two_chunks_no_third():
    """Input that fills exactly two chunks produces exactly those two chunks."""
    budget = 180
    full_chunk = "x" * budget
    pieces = split_to_bytes(full_chunk * 2, budget, max_chunks=2)
    assert pieces == [full_chunk, full_chunk]
|
||
|
||
|
||
def test_split_does_not_break_multibyte():
    """Chunks contain whole emoji only, never a partial byte sequence."""
    # Four emoji at 4 bytes each is 16 bytes. A 5-byte budget fits exactly one
    # emoji per chunk, and max_chunks=2 caps the result at two chunks.
    pieces = split_to_bytes("🎉🎉🎉🎉", 5, max_chunks=2)
    assert pieces == ["🎉", "🎉"]
    assert all(len(piece.encode("utf-8")) == 4 for piece in pieces)
|
||
|
||
|
||
def test_split_two_byte_char_at_boundary():
    """A 2-byte character on a chunk boundary moves whole into the next chunk."""
    # "abäcd" encodes as a(1) b(1) ä(2) c(1) d(1) = 6 bytes, split with a
    # 3-byte budget:
    #   chunk 1 is "ab": the third byte would be only the first half of "ä".
    #   chunk 2 is "äc": exactly 3 bytes.
    #   "d" does not fit in the two permitted chunks and is dropped.
    chunks = split_to_bytes("abäcd", 3, max_chunks=2)
    # Compare the whole list so a stray third chunk carrying "d" fails the test.
    assert chunks == ["ab", "äc"]
|
||
|
||
|
||
def test_split_empty_input():
    """Splitting an empty string yields no chunks."""
    pieces = split_to_bytes("", 184)
    assert pieces == []
|
||
|
||
|
||
def test_split_zero_max_bytes():
    """A zero byte budget yields no chunks."""
    pieces = split_to_bytes("hi", 0)
    assert pieces == []
|
||
|
||
|
||
def test_split_zero_chunks():
    """Allowing zero chunks yields an empty list."""
    pieces = split_to_bytes("hi", 184, max_chunks=0)
    assert pieces == []
|
||
|
||
|
||
def test_split_concat_is_prefix_of_input():
    """Joined chunks reproduce the start of the input, in order, within budget."""
    # The delivered text must always be a prefix of the original (no rearrangement).
    src = "Hello world! 🎉 This is a longer message that should be split."
    chunks = split_to_bytes(src, 20, max_chunks=2)
    delivered = "".join(chunks)
    # Guard against a vacuous pass: "" is a prefix of every string.
    assert delivered
    assert src.startswith(delivered)
    # The limits passed in the call above must hold for what was delivered.
    assert len(chunks) <= 2
    assert all(len(c.encode("utf-8")) <= 20 for c in chunks)
|