mirror of
https://github.com/python-telegram-bot/python-telegram-bot.git
synced 2024-11-21 22:56:38 +01:00
Add MessageEntity.adjust_message_entities_to_utf_16
Utility Function (#4323)
Co-authored-by: Bibo-Joshi <22366557+Bibo-Joshi@users.noreply.github.com> Co-authored-by: Harshil <37377066+harshil21@users.noreply.github.com>
This commit is contained in:
parent
8018e5ff3f
commit
42d7c8c477
2 changed files with 100 additions and 1 deletions
|
@ -18,7 +18,9 @@
|
||||||
# along with this program. If not, see [http://www.gnu.org/licenses/].
|
# along with this program. If not, see [http://www.gnu.org/licenses/].
|
||||||
"""This module contains an object that represents a Telegram MessageEntity."""
|
"""This module contains an object that represents a Telegram MessageEntity."""
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, Final, List, Optional
|
import copy
|
||||||
|
import itertools
|
||||||
|
from typing import TYPE_CHECKING, Dict, Final, List, Optional, Sequence
|
||||||
|
|
||||||
from telegram import constants
|
from telegram import constants
|
||||||
from telegram._telegramobject import TelegramObject
|
from telegram._telegramobject import TelegramObject
|
||||||
|
@ -142,6 +144,81 @@ class MessageEntity(TelegramObject):
|
||||||
|
|
||||||
return super().de_json(data=data, bot=bot)
|
return super().de_json(data=data, bot=bot)
|
||||||
|
|
||||||
|
@staticmethod
def adjust_message_entities_to_utf_16(
    text: str, entities: Sequence["MessageEntity"]
) -> Sequence["MessageEntity"]:
    """Utility functionality for converting the offset and length of entities from
    Unicode (:obj:`str`) to UTF-16 (``utf-16-le`` encoded :obj:`bytes`).

    Tip:
        Only offsets and lengths calculated in UTF-16 are accepted by the Telegram Bot API.
        If they are calculated using the Unicode string (:obj:`str` object), errors may occur
        when the text contains characters that are not in the Basic Multilingual Plane (BMP).
        For more information, see `Unicode <https://en.wikipedia.org/wiki/Unicode>`_ and
        `Plane (Unicode) <https://en.wikipedia.org/wiki/Plane_(Unicode)>`_.

    .. versionadded:: NEXT.VERSION

    Examples:
        Below is a snippet of code that demonstrates how to use this function to convert
        entities from Unicode to UTF-16 space. The ``unicode_entities`` are calculated in
        Unicode and the ``utf_16_entities`` are calculated in UTF-16.

        .. code-block:: python

            text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍"
            unicode_entities = [
                MessageEntity(offset=2, length=4, type=MessageEntity.BOLD),
                MessageEntity(offset=9, length=6, type=MessageEntity.ITALIC),
                MessageEntity(offset=28, length=3, type=MessageEntity.UNDERLINE),
            ]
            utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(
                text, unicode_entities
            )
            await bot.send_message(
                chat_id=123,
                text=text,
                entities=utf_16_entities,
            )
            # utf_16_entities[0]: offset=3, length=4
            # utf_16_entities[1]: offset=11, length=6
            # utf_16_entities[2]: offset=30, length=6

    Args:
        text (:obj:`str`): The text that the entities belong to
        entities (Sequence[:class:`telegram.MessageEntity`]): Sequence of entities
            with offset and length calculated in Unicode

    Returns:
        Sequence[:class:`telegram.MessageEntity`]: Sequence of entities
        with offset and length calculated in UTF-16 encoding. The result has the same
        order as ``entities``; the input entities themselves are not modified.
    """
    # Each entity contributes two boundaries: its start and its (exclusive) end.
    # Sorting lets the text be walked once from left to right; duplicates are dropped
    # because a repeated boundary would only add an empty slice.
    positions = sorted(
        set(itertools.chain.from_iterable((x.offset, x.offset + x.length) for x in entities))
    )

    # Map every boundary (in code points) to the number of UTF-16 code units that
    # precede it, encoding only the text between consecutive boundaries each time.
    position_translation: Dict[int, int] = {}
    accumulated_length = 0
    last_position = 0
    for position in positions:
        text_slice = text[last_position:position]
        # "utf-16-le" emits no BOM, so bytes // 2 is exactly the code unit count.
        accumulated_length += len(text_slice.encode("utf-16-le")) // 2
        position_translation[position] = accumulated_length
        last_position = position

    # Build the final output entities in the caller's original order.
    out = []
    for entity in entities:
        translated_offset = position_translation[entity.offset]
        translated_end = position_translation[entity.offset + entity.length]
        # Work on a shallow copy so the caller's entities keep their Unicode values.
        new_entity = copy.copy(entity)
        # NOTE(review): attribute assignment is done inside ``_unfrozen()``, presumably
        # because entities are otherwise immutable - confirm against TelegramObject.
        with new_entity._unfrozen():
            new_entity.offset = translated_offset
            new_entity.length = translated_end - translated_offset
        out.append(new_entity)
    return out
|
||||||
|
|
||||||
ALL_TYPES: Final[List[str]] = list(constants.MessageEntityType)
|
ALL_TYPES: Final[List[str]] = list(constants.MessageEntityType)
|
||||||
"""List[:obj:`str`]: A list of all available message entity types."""
|
"""List[:obj:`str`]: A list of all available message entity types."""
|
||||||
BLOCKQUOTE: Final[str] = constants.MessageEntityType.BLOCKQUOTE
|
BLOCKQUOTE: Final[str] = constants.MessageEntityType.BLOCKQUOTE
|
||||||
|
|
|
@ -16,6 +16,9 @@
|
||||||
#
|
#
|
||||||
# You should have received a copy of the GNU Lesser Public License
|
# You should have received a copy of the GNU Lesser Public License
|
||||||
# along with this program. If not, see [http://www.gnu.org/licenses/].
|
# along with this program. If not, see [http://www.gnu.org/licenses/].
|
||||||
|
import random
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from telegram import MessageEntity, User
|
from telegram import MessageEntity, User
|
||||||
|
@ -81,6 +84,25 @@ class TestMessageEntityWithoutRequest(TestMessageEntityBase):
|
||||||
entity = MessageEntity(type="url", offset=0, length=1)
|
entity = MessageEntity(type="url", offset=0, length=1)
|
||||||
assert entity.type is MessageEntityType.URL
|
assert entity.type is MessageEntityType.URL
|
||||||
|
|
||||||
|
def test_fix_utf16(self):
    """Check that Unicode offsets/lengths are translated to UTF-16 code units.

    The entities are shuffled before conversion so that the test also verifies that the
    result does not depend on the input order and that the output order matches the input.
    """
    text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍"
    # ((unicode offset, unicode length, type), (expected utf-16 offset, expected length))
    inputs_outputs: List[Tuple[Tuple[int, int, str], Tuple[int, int]]] = [
        ((2, 4, MessageEntity.BOLD), (3, 4)),
        ((9, 6, MessageEntity.ITALIC), (11, 6)),
        ((28, 3, MessageEntity.UNDERLINE), (30, 6)),
    ]
    random.shuffle(inputs_outputs)
    unicode_entities = [
        MessageEntity(offset=_input[0], length=_input[1], type=_input[2])
        for _input, _ in inputs_outputs
    ]
    utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(text, unicode_entities)
    # zip() stops at the shorter sequence, so guard against silently dropped entities.
    assert len(utf_16_entities) == len(inputs_outputs)
    for out_entity, (_, (offset, length)) in zip(utf_16_entities, inputs_outputs):
        assert out_entity.offset == offset
        assert out_entity.length == length
|
||||||
|
|
||||||
def test_equality(self):
|
def test_equality(self):
|
||||||
a = MessageEntity(MessageEntity.BOLD, 2, 3)
|
a = MessageEntity(MessageEntity.BOLD, 2, 3)
|
||||||
b = MessageEntity(MessageEntity.BOLD, 2, 3)
|
b = MessageEntity(MessageEntity.BOLD, 2, 3)
|
||||||
|
|
Loading…
Reference in a new issue