From 42d7c8c4770dd36ccb0b5893c410a380e03236ac Mon Sep 17 00:00:00 2001 From: Antares Date: Sat, 6 Jul 2024 22:08:29 +0800 Subject: [PATCH] Add `MessageEntity.adjust_message_entities_to_utf_16` Utility Function (#4323) Co-authored-by: Bibo-Joshi <22366557+Bibo-Joshi@users.noreply.github.com> Co-authored-by: Harshil <37377066+harshil21@users.noreply.github.com> --- telegram/_messageentity.py | 79 ++++++++++++++++++++++++++++++++++++- tests/test_messageentity.py | 22 +++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/telegram/_messageentity.py b/telegram/_messageentity.py index bbea88d10..bc80c2dcb 100644 --- a/telegram/_messageentity.py +++ b/telegram/_messageentity.py @@ -18,7 +18,9 @@ # along with this program. If not, see [http://www.gnu.org/licenses/]. """This module contains an object that represents a Telegram MessageEntity.""" -from typing import TYPE_CHECKING, Final, List, Optional +import copy +import itertools +from typing import TYPE_CHECKING, Dict, Final, List, Optional, Sequence from telegram import constants from telegram._telegramobject import TelegramObject @@ -142,6 +144,81 @@ class MessageEntity(TelegramObject): return super().de_json(data=data, bot=bot) + @staticmethod + def adjust_message_entities_to_utf_16( + text: str, entities: Sequence["MessageEntity"] + ) -> Sequence["MessageEntity"]: + """Utility functionality for converting the offset and length of entities from + Unicode (:obj:`str`) to UTF-16 (``utf-16-le`` encoded :obj:`bytes`). + + Tip: + Only the offsets and lengths calulated in UTF-16 is acceptable by the Telegram Bot API. + If they are calculated using the Unicode string (:obj:`str` object), errors may occur + when the text contains characters that are not in the Basic Multilingual Plane (BMP). + For more information, see `Unicode `_ and + `Plane (Unicode) `_. + + .. versionadded:: NEXT.VERSION + + Examples: + Below is a snippet of code that demonstrates how to use this function to convert + entities from Unicode to UTF-16 space. The ``unicode_entities`` are calculated in + Unicode and the `utf_16_entities` are calculated in UTF-16. + + .. code-block:: python + + text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍" + unicode_entities = [ + MessageEntity(offset=2, length=4, type=MessageEntity.BOLD), + MessageEntity(offset=9, length=6, type=MessageEntity.ITALIC), + MessageEntity(offset=28, length=3, type=MessageEntity.UNDERLINE), + ] + utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16( + text, unicode_entities + ) + await bot.send_message( + chat_id=123, + text=text, + entities=utf_16_entities, + ) + # utf_16_entities[0]: offset=3, length=4 + # utf_16_entities[1]: offset=11, length=6 + # utf_16_entities[2]: offset=30, length=6 + + Args: + text (:obj:`str`): The text that the entities belong to + entities (Sequence[:class:`telegram.MessageEntity`]): Sequence of entities + with offset and length calculated in Unicode + + Returns: + Sequence[:class:`telegram.MessageEntity`]: Sequence of entities + with offset and length calculated in UTF-16 encoding + """ + # get sorted positions + positions = sorted(itertools.chain(*((x.offset, x.offset + x.length) for x in entities))) + accumulated_length = 0 + # calculate the length of each slice text[:position] in utf-16 accordingly, + # store the position translations + position_translation: Dict[int, int] = {} + for i, position in enumerate(positions): + last_position = positions[i - 1] if i > 0 else 0 + text_slice = text[last_position:position] + accumulated_length += len(text_slice.encode("utf-16-le")) // 2 + position_translation[position] = accumulated_length + # get the final output entites + out = [] + for entity in entities: + translated_positions = position_translation[entity.offset] + translated_length = ( + position_translation[entity.offset + entity.length] - translated_positions + ) + new_entity = copy.copy(entity) + with new_entity._unfrozen(): + new_entity.offset = translated_positions + new_entity.length = translated_length + out.append(new_entity) + return out + ALL_TYPES: Final[List[str]] = list(constants.MessageEntityType) """List[:obj:`str`]: A list of all available message entity types.""" BLOCKQUOTE: Final[str] = constants.MessageEntityType.BLOCKQUOTE diff --git a/tests/test_messageentity.py b/tests/test_messageentity.py index b14ec79c3..8bab9fec7 100644 --- a/tests/test_messageentity.py +++ b/tests/test_messageentity.py @@ -16,6 +16,9 @@ # # You should have received a copy of the GNU Lesser Public License # along with this program. If not, see [http://www.gnu.org/licenses/]. +import random +from typing import List, Tuple + import pytest from telegram import MessageEntity, User @@ -81,6 +84,25 @@ class TestMessageEntityWithoutRequest(TestMessageEntityBase): entity = MessageEntity(type="url", offset=0, length=1) assert entity.type is MessageEntityType.URL + def test_fix_utf16(self): + text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍" + inputs_outputs: List[Tuple[Tuple[int, int, str], Tuple[int, int]]] = [ + ((2, 4, MessageEntity.BOLD), (3, 4)), + ((9, 6, MessageEntity.ITALIC), (11, 6)), + ((28, 3, MessageEntity.UNDERLINE), (30, 6)), + ] + random.shuffle(inputs_outputs) + unicode_entities = [ + MessageEntity(offset=_input[0], length=_input[1], type=_input[2]) + for _input, _ in inputs_outputs + ] + utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(text, unicode_entities) + for out_entity, input_output in zip(utf_16_entities, inputs_outputs): + _, output = input_output + offset, length = output + assert out_entity.offset == offset + assert out_entity.length == length + def test_equality(self): a = MessageEntity(MessageEntity.BOLD, 2, 3) b = MessageEntity(MessageEntity.BOLD, 2, 3)