Add MessageEntity.adjust_message_entities_to_utf_16 Utility Function (#4323)

Co-authored-by: Bibo-Joshi <22366557+Bibo-Joshi@users.noreply.github.com>
Co-authored-by: Harshil <37377066+harshil21@users.noreply.github.com>
2024-12-22 06:25:12 +01:00 · 2024-07-06 22:08:29 +08:00 · 2024-07-06 22:08:29 +08:00 · 42d7c8c477
commit 42d7c8c477
parent 8018e5ff3f
2 changed files with 100 additions and 1 deletions
--- a/telegram/_messageentity.py
+++ b/telegram/_messageentity.py
@ -18,7 +18,9 @@
 # along with this program.  If not, see [http://www.gnu.org/licenses/].
 """This module contains an object that represents a Telegram MessageEntity."""

-from typing import TYPE_CHECKING, Final, List, Optional
+import copy
+import itertools
+from typing import TYPE_CHECKING, Dict, Final, List, Optional, Sequence

 from telegram import constants
 from telegram._telegramobject import TelegramObject
@ -142,6 +144,81 @@ class MessageEntity(TelegramObject):

        return super().de_json(data=data, bot=bot)

+    @staticmethod
+    def adjust_message_entities_to_utf_16(
+        text: str, entities: Sequence["MessageEntity"]
+    ) -> Sequence["MessageEntity"]:
+        """Utility function for converting the offsets and lengths of entities from
+        Unicode (:obj:`str`) to UTF-16 (``utf-16-le`` encoded :obj:`bytes`).
+
+        Tip:
+            Only offsets and lengths calculated in UTF-16 are accepted by the Telegram Bot API.
+            If they are calculated using the Unicode string (:obj:`str` object), errors may occur
+            when the text contains characters that are not in the Basic Multilingual Plane (BMP).
+            For more information, see `Unicode <https://en.wikipedia.org/wiki/Unicode>`_ and
+            `Plane (Unicode) <https://en.wikipedia.org/wiki/Plane_(Unicode)>`_.
+
+        .. versionadded:: NEXT.VERSION
+
+        Examples:
+            Below is a snippet of code that demonstrates how to use this function to convert
+            entities from Unicode to UTF-16 space. The ``unicode_entities`` are calculated in
+            Unicode and the ``utf_16_entities`` are calculated in UTF-16.
+
+            .. code-block:: python
+
+                text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍"
+                unicode_entities = [
+                    MessageEntity(offset=2, length=4, type=MessageEntity.BOLD),
+                    MessageEntity(offset=9, length=6, type=MessageEntity.ITALIC),
+                    MessageEntity(offset=28, length=3, type=MessageEntity.UNDERLINE),
+                ]
+                utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(
+                    text, unicode_entities
+                )
+                await bot.send_message(
+                    chat_id=123,
+                    text=text,
+                    entities=utf_16_entities,
+                )
+                # utf_16_entities[0]: offset=3, length=4
+                # utf_16_entities[1]: offset=11, length=6
+                # utf_16_entities[2]: offset=30, length=6
+
+        Args:
+            text (:obj:`str`): The text that the entities belong to
+            entities (Sequence[:class:`telegram.MessageEntity`]): Sequence of entities
+                with offset and length calculated in Unicode
+
+        Returns:
+            Sequence[:class:`telegram.MessageEntity`]: Sequence of entities
+            with offset and length calculated in UTF-16 encoding
+        """
+        # get sorted positions
+        positions = sorted(itertools.chain(*((x.offset, x.offset + x.length) for x in entities)))
+        accumulated_length = 0
+        # calculate the length of each slice text[:position] in utf-16 accordingly,
+        # store the position translations
+        position_translation: Dict[int, int] = {}
+        for i, position in enumerate(positions):
+            last_position = positions[i - 1] if i > 0 else 0
+            text_slice = text[last_position:position]
+            accumulated_length += len(text_slice.encode("utf-16-le")) // 2
+            position_translation[position] = accumulated_length
+        # get the final output entities
+        out = []
+        for entity in entities:
+            translated_positions = position_translation[entity.offset]
+            translated_length = (
+                position_translation[entity.offset + entity.length] - translated_positions
+            )
+            new_entity = copy.copy(entity)
+            with new_entity._unfrozen():
+                new_entity.offset = translated_positions
+                new_entity.length = translated_length
+            out.append(new_entity)
+        return out
+
    ALL_TYPES: Final[List[str]] = list(constants.MessageEntityType)
    """List[:obj:`str`]: A list of all available message entity types."""
    BLOCKQUOTE: Final[str] = constants.MessageEntityType.BLOCKQUOTE
--- a/tests/test_messageentity.py
+++ b/tests/test_messageentity.py
@ -16,6 +16,9 @@
 #
 # You should have received a copy of the GNU Lesser Public License
 # along with this program.  If not, see [http://www.gnu.org/licenses/].
+import random
+from typing import List, Tuple
+
 import pytest

 from telegram import MessageEntity, User
@ -81,6 +84,25 @@ class TestMessageEntityWithoutRequest(TestMessageEntityBase):
        entity = MessageEntity(type="url", offset=0, length=1)
        assert entity.type is MessageEntityType.URL

+    def test_fix_utf16(self):
+        text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍"
+        inputs_outputs: List[Tuple[Tuple[int, int, str], Tuple[int, int]]] = [
+            ((2, 4, MessageEntity.BOLD), (3, 4)),
+            ((9, 6, MessageEntity.ITALIC), (11, 6)),
+            ((28, 3, MessageEntity.UNDERLINE), (30, 6)),
+        ]
+        random.shuffle(inputs_outputs)
+        unicode_entities = [
+            MessageEntity(offset=_input[0], length=_input[1], type=_input[2])
+            for _input, _ in inputs_outputs
+        ]
+        utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(text, unicode_entities)
+        for out_entity, input_output in zip(utf_16_entities, inputs_outputs):
+            _, output = input_output
+            offset, length = output
+            assert out_entity.offset == offset
+            assert out_entity.length == length
+
    def test_equality(self):
        a = MessageEntity(MessageEntity.BOLD, 2, 3)
        b = MessageEntity(MessageEntity.BOLD, 2, 3)