Add MessageEntity.adjust_message_entities_to_utf_16 Utility Function (#4323)

Co-authored-by: Bibo-Joshi <22366557+Bibo-Joshi@users.noreply.github.com> Co-authored-by: Harshil <37377066+harshil21@users.noreply.github.com>
2024-11-21 22:56:38 +01:00 · 2024-07-06 22:08:29 +08:00 · 2024-07-06 22:08:29 +08:00 · 42d7c8c477
commit 42d7c8c477
parent 8018e5ff3f
2 changed files with 100 additions and 1 deletions
--- a/telegram/_messageentity.py
+++ b/telegram/_messageentity.py
@ -18,7 +18,9 @@
 # along with this program.  If not, see [http://www.gnu.org/licenses/].
 """This module contains an object that represents a Telegram MessageEntity."""
-from typing import TYPE_CHECKING, Final, List, Optional
+import copy
 import itertools
 from typing import TYPE_CHECKING, Dict, Final, List, Optional, Sequence
 from telegram import constants
 from telegram._telegramobject import TelegramObject
@ -142,6 +144,81 @@ class MessageEntity(TelegramObject):
        return super().de_json(data=data, bot=bot)
    @staticmethod
    def adjust_message_entities_to_utf_16(
        text: str, entities: Sequence["MessageEntity"]
    ) -> Sequence["MessageEntity"]:
        """Utility functionality for converting the offset and length of entities from
        Unicode (:obj:`str`) to UTF-16 (``utf-16-le`` encoded :obj:`bytes`).
        Tip:
            Only the offsets and lengths calulated in UTF-16 is acceptable by the Telegram Bot API.
            If they are calculated using the Unicode string (:obj:`str` object), errors may occur
            when the text contains characters that are not in the Basic Multilingual Plane (BMP).
            For more information, see `Unicode <https://en.wikipedia.org/wiki/Unicode>`_ and
            `Plane (Unicode) <https://en.wikipedia.org/wiki/Plane_(Unicode)>`_.
        .. versionadded:: NEXT.VERSION
        Examples:
            Below is a snippet of code that demonstrates how to use this function to convert
            entities from Unicode to UTF-16 space. The ``unicode_entities`` are calculated in
            Unicode and the `utf_16_entities` are calculated in UTF-16.
            .. code-block:: python
                text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍"
                unicode_entities = [
                    MessageEntity(offset=2, length=4, type=MessageEntity.BOLD),
                    MessageEntity(offset=9, length=6, type=MessageEntity.ITALIC),
                    MessageEntity(offset=28, length=3, type=MessageEntity.UNDERLINE),
                ]
                utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(
                    text, unicode_entities
                )
                await bot.send_message(
                    chat_id=123,
                    text=text,
                    entities=utf_16_entities,
                )
                # utf_16_entities[0]: offset=3, length=4
                # utf_16_entities[1]: offset=11, length=6
                # utf_16_entities[2]: offset=30, length=6
        Args:
            text (:obj:`str`): The text that the entities belong to
            entities (Sequence[:class:`telegram.MessageEntity`]): Sequence of entities
                with offset and length calculated in Unicode
        Returns:
            Sequence[:class:`telegram.MessageEntity`]: Sequence of entities
            with offset and length calculated in UTF-16 encoding
        """
        # get sorted positions
        positions = sorted(itertools.chain(*((x.offset, x.offset + x.length) for x in entities)))
        accumulated_length = 0
        # calculate the length of each slice text[:position] in utf-16 accordingly,
        # store the position translations
        position_translation: Dict[int, int] = {}
        for i, position in enumerate(positions):
            last_position = positions[i - 1] if i > 0 else 0
            text_slice = text[last_position:position]
            accumulated_length += len(text_slice.encode("utf-16-le")) // 2
            position_translation[position] = accumulated_length
        # get the final output entites
        out = []
        for entity in entities:
            translated_positions = position_translation[entity.offset]
            translated_length = (
                position_translation[entity.offset + entity.length] - translated_positions
            )
            new_entity = copy.copy(entity)
            with new_entity._unfrozen():
                new_entity.offset = translated_positions
                new_entity.length = translated_length
            out.append(new_entity)
        return out
    ALL_TYPES: Final[List[str]] = list(constants.MessageEntityType)
    """List[:obj:`str`]: A list of all available message entity types."""
    BLOCKQUOTE: Final[str] = constants.MessageEntityType.BLOCKQUOTE
--- a/tests/test_messageentity.py
+++ b/tests/test_messageentity.py
@ -16,6 +16,9 @@
 #
 # You should have received a copy of the GNU Lesser Public License
 # along with this program.  If not, see [http://www.gnu.org/licenses/].
 import random
 from typing import List, Tuple
 import pytest
 from telegram import MessageEntity, User
@ -81,6 +84,25 @@ class TestMessageEntityWithoutRequest(TestMessageEntityBase):
        entity = MessageEntity(type="url", offset=0, length=1)
        assert entity.type is MessageEntityType.URL
    def test_fix_utf16(self):
        text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍"
        inputs_outputs: List[Tuple[Tuple[int, int, str], Tuple[int, int]]] = [
            ((2, 4, MessageEntity.BOLD), (3, 4)),
            ((9, 6, MessageEntity.ITALIC), (11, 6)),
            ((28, 3, MessageEntity.UNDERLINE), (30, 6)),
        ]
        random.shuffle(inputs_outputs)
        unicode_entities = [
            MessageEntity(offset=_input[0], length=_input[1], type=_input[2])
            for _input, _ in inputs_outputs
        ]
        utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(text, unicode_entities)
        for out_entity, input_output in zip(utf_16_entities, inputs_outputs):
            _, output = input_output
            offset, length = output
            assert out_entity.offset == offset
            assert out_entity.length == length
    def test_equality(self):
        a = MessageEntity(MessageEntity.BOLD, 2, 3)
        b = MessageEntity(MessageEntity.BOLD, 2, 3)