# lorabot/src/lorabot/messages.py

"""Helpers for shaping outgoing mesh messages."""

from __future__ import annotations


def trim_to_bytes(text: str, max_bytes: int) -> str:
    """Return the longest prefix of ``text`` whose UTF-8 encoding fits in ``max_bytes``.

    The cut always lands on a character boundary, so we never emit invalid
    UTF-8 to the radio. A multi-byte character that ends exactly at the budget
    is kept; only a character that would straddle the limit is dropped.

    Returns ``""`` when ``max_bytes`` is not positive or when even the first
    character does not fit in the budget.
    """
    if max_bytes <= 0:
        return ""
    encoded = text.encode("utf-8")
    if len(encoded) <= max_bytes:
        return text
    # ``encoded[end]`` is the first byte that would be dropped; it exists
    # because the encoding is longer than ``max_bytes``. If it is a
    # continuation byte (10xxxxxx) the cut sits inside a character, so move the
    # cut back until the first dropped byte starts a new character.
    end = max_bytes
    while end > 0 and (encoded[end] & 0xC0) == 0x80:
        end -= 1
    # The slice now ends on a character boundary, so a strict decode is safe.
    return encoded[:end].decode("utf-8")


def split_to_bytes(text: str, max_bytes: int, max_chunks: int = 2) -> list[str]:
    """Split ``text`` into up to ``max_chunks`` UTF-8-safe chunks of ``max_bytes`` each.

    Each chunk is the longest run of whole characters, starting where the
    previous chunk ended, whose UTF-8 encoding is at most ``max_bytes`` bytes.
    Splits never land inside a multi-byte UTF-8 sequence. Text that does not
    fit in ``max_chunks`` chunks is dropped.

    Returns an empty list when ``text`` is empty or when ``max_bytes`` or
    ``max_chunks`` is not positive. Splitting stops early, returning the chunks
    built so far, if the next character alone is wider than ``max_bytes``.
    """
    if max_bytes <= 0 or max_chunks <= 0 or not text:
        return []
    encoded = text.encode("utf-8")
    n = len(encoded)
    chunks: list[str] = []
    pos = 0
    for _ in range(max_chunks):
        if pos >= n:
            break
        end = min(pos + max_bytes, n)
        if end < n:
            # ``encoded[end]`` would be the first byte of the next chunk. If it
            # is a continuation byte (10xxxxxx) the split is inside a
            # character, so move it back to that character's lead byte. A
            # character that ends exactly at ``end`` stays in this chunk.
            while end > pos and (encoded[end] & 0xC0) == 0x80:
                end -= 1
        if end == pos:
            # The next character does not fit in ``max_bytes`` at all; stop
            # rather than emit an empty chunk.
            break
        chunks.append(encoded[pos:end].decode("utf-8"))
        pos = end
    return chunks