Files
lorabot/tests/test_messages.py
T
2026-05-04 20:52:51 +02:00

99 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from lorabot.messages import split_to_bytes, trim_to_bytes
def test_short_ascii_passthrough():
    """Check that ASCII text well under the byte limit comes back unchanged."""
    text = "hello"
    assert trim_to_bytes(text, 184) == text
def test_exact_fit_passthrough():
    """Check that a string whose size equals the byte limit is not shortened."""
    limit = 184
    exact_fit = "a" * limit
    assert trim_to_bytes(exact_fit, limit) == exact_fit
def test_long_ascii_clean_cut():
    """Check that over-long ASCII input is cut to exactly the byte limit."""
    # 200 single-byte characters against a 184-byte budget.
    trimmed = trim_to_bytes("x" * 200, 184)
    assert len(trimmed.encode("utf-8")) == 184
    assert trimmed == "x" * 184
def test_emoji_does_not_split():
    """Check that a 4-byte emoji is never cut in the middle."""
    # 🎉 encodes to 4 UTF-8 bytes, so a 5-byte limit has room for exactly one
    # emoji; the result must be 4 bytes long, not 5.
    party = "🎉"
    trimmed = trim_to_bytes(party * 2, 5)
    assert trimmed == party
    assert len(trimmed.encode("utf-8")) == 4
def test_multibyte_at_boundary():
    """Check that a 2-byte character at the limit is kept whole or dropped whole.

    The input is "a" followed by U+00E4 (precomposed "ä"): 1 + 2 = 3 UTF-8
    bytes. It is spelled with an escape so the exact code point cannot be lost
    or replaced by a look-alike / decomposed form when the file is copied.
    """
    text = "a\u00e4"
    # Guard the premise of the test itself: the sample really is 3 bytes.
    assert len(text.encode("utf-8")) == 3

    # With a 3-byte budget the whole string fits, so both characters are kept.
    assert trim_to_bytes(text, 3) == text
    # With a 2-byte budget "ä" would be cut in half, so only the leading "a"
    # can be kept.
    assert trim_to_bytes(text, 2) == "a"
def test_zero_or_negative_max_bytes():
    """Check that a zero or negative byte limit produces an empty string."""
    for budget in (0, -1):
        assert trim_to_bytes("anything", budget) == ""
def test_empty_input():
    """Check that trimming an empty string gives back an empty string."""
    trimmed = trim_to_bytes("", 184)
    assert trimmed == ""
# split_to_bytes
def test_split_short_input_single_chunk():
    """Check that text fitting in one chunk is returned as a one-item list."""
    text = "hello"
    assert split_to_bytes(text, 184) == [text]
def test_split_long_input_two_chunks_drops_rest():
    """Check that input beyond ``max_chunks`` full chunks is discarded."""
    # 500 bytes of input, 2 chunks of 180 bytes: only 360 bytes are delivered.
    full_chunk = "x" * 180
    pieces = split_to_bytes("x" * 500, 180, max_chunks=2)
    assert pieces == [full_chunk, full_chunk]
    delivered_bytes = sum(len(piece.encode("utf-8")) for piece in pieces)
    assert delivered_bytes == 360
def test_split_exact_two_chunks_no_third():
    """Check that input filling exactly two chunks yields exactly two chunks."""
    full_chunk = "x" * 180
    pieces = split_to_bytes("x" * 360, 180, max_chunks=2)
    assert pieces == [full_chunk, full_chunk]
def test_split_does_not_break_multibyte():
    """Check that chunk boundaries never fall inside a 4-byte emoji."""
    # Four emoji at 4 bytes each (16 bytes). A 5-byte chunk budget leaves room
    # for one whole emoji per chunk, and max_chunks=2 keeps only the first two.
    party = "🎉"
    pieces = split_to_bytes(party * 4, 5, max_chunks=2)
    assert pieces == [party, party]
    sizes = [len(piece.encode("utf-8")) for piece in pieces]
    assert sizes == [4] * len(pieces)
def test_split_two_byte_char_at_boundary():
    """Check that a 2-byte character on a chunk boundary moves to the next chunk."""
    # "abäcd" is a, b, ä (2 bytes), c, d = 6 bytes, split with 3 bytes/chunk.
    pieces = split_to_bytes("abäcd", 3, max_chunks=2)
    # The third byte would be the first half of "ä", so chunk 1 stops at "ab".
    assert pieces[0] == "ab"
    # Chunk 2 then holds "ä" (2 bytes) + "c" (1 byte) = 3 bytes.
    assert pieces[1] == "äc"
    # The trailing "d" does not fit in the two allowed chunks and is dropped.
def test_split_empty_input():
    """Check that splitting an empty string produces no chunks."""
    pieces = split_to_bytes("", 184)
    assert pieces == []
def test_split_zero_max_bytes():
    """Check that a zero-byte chunk budget produces no chunks."""
    pieces = split_to_bytes("hi", 0)
    assert pieces == []
def test_split_zero_chunks():
    """Check that ``max_chunks=0`` produces no chunks even when the text fits."""
    pieces = split_to_bytes("hi", 184, max_chunks=0)
    assert pieces == []
def test_split_concat_is_prefix_of_input():
    """Check that the joined chunks form a prefix of the original text."""
    # Splitting may drop the tail, but must never reorder or alter what it keeps.
    message = "Hello world! 🎉 This is a longer message that should be split."
    delivered = "".join(split_to_bytes(message, 20, max_chunks=2))
    assert message.startswith(delivered)