Add methods to parse entities in Message

Should close #400. * Add parse_entity * Add parse_entities * Add MessageEntity types as constants to MessageEntity. * Add MAX_MESSAGE_ENTITIES to constants.py. Note: this value was found by experimentation rather than taken from the API docs. * Add tests for parse_entity and parse_entities
2025-02-27 14:25:14 +01:00 · 2016-09-07 08:49:09 +02:00 · 2016-09-07 08:49:09 +02:00 · 6647ae3c25
commit 6647ae3c25
parent e4a132c0e4
4 changed files with 135 additions and 0 deletions
--- a/telegram/constants.py
+++ b/telegram/constants.py
@ -32,6 +32,12 @@ Attributes:
        limit, but eventually you'll begin receiving 429 errors.
    MAX_MESSAGES_PER_SECOND (int)
    MAX_MESSAGES_PER_MINUTE_PER_GROUP (int)
+
+The following constants have been found by experimentation:
+
+Attributes:
+    MAX_MESSAGE_ENTITIES (int): Max number of entities that can be in a message.
+        (Beyond this cap Telegram will simply ignore further formatting styles)
 """

 MAX_MESSAGE_LENGTH = 4096
@ -45,3 +51,4 @@ MAX_FILESIZE_UPLOAD = int(50E6)  # (50MB)
 MAX_MESSAGES_PER_SECOND_PER_CHAT = 1
 MAX_MESSAGES_PER_SECOND = 30
 MAX_MESSAGES_PER_MINUTE_PER_GROUP = 20
+MAX_MESSAGE_ENTITIES = 100
--- a/telegram/message.py
+++ b/telegram/message.py
@ -19,6 +19,7 @@
 # along with this program.  If not, see [http://www.gnu.org/licenses/].
 """This module contains a object that represents a Telegram Message."""

+import sys
 from datetime import datetime
 from time import mktime

@ -244,3 +245,55 @@ class Message(TelegramObject):
        except AttributeError:
            # Python 3 (< 3.3) and Python 2
            return int(mktime(dt_obj.timetuple()))
+
+    def parse_entity(self, entity):
+        """
+        Returns the text from a given :class:`telegram.MessageEntity`.
+
+        Note:
+            This method is present because Telegram calculates the offset and length in
+            UTF-16 code units, which some versions of Python don't handle automatically.
+            (That is, you can't just slice ``Message.text`` with the offset and length.)
+
+        Args:
+            entity (MessageEntity): The entity to extract the text from. It must be an entity that
+                belongs to this message.
+
+        Returns:
+            str: The text of the given entity
+        """
+        # On a narrow build str is already indexed in UTF-16 code units, so slice directly
+        if sys.maxunicode == 0xffff:
+            return self.text[entity.offset:entity.offset + entity.length]
+        else:
+            entity_text = self.text.encode('utf-16-le')
+            entity_text = entity_text[entity.offset * 2:(entity.offset + entity.length) * 2]
+
+        return entity_text.decode('utf-16-le')
+
+    def parse_entities(self, types=None):
+        """
+        Returns a ``dict`` that maps :class:`telegram.MessageEntity` to ``str``.
+        It contains the entities of this message, filtered by their ``type`` attribute, as keys and
+        the text that belongs to each entity as the values of the ``dict``.
+
+        Note:
+            This method should always be used instead of the ``entities`` attribute, since it
+            calculates the correct substring from the message text based on UTF-16 code units.
+            See ``parse_entity`` for more info.
+
+        Args:
+            types (Optional[list]): List of ``MessageEntity`` types as strings. If the ``type``
+                attribute of an entity is contained in this list, it will be returned.
+                Defaults to a list of all types. All types can be found as constants in
+                :class:`telegram.MessageEntity`.
+
+        Returns:
+            dict[:class:`telegram.MessageEntity`, ``str``]: A dictionary of entities mapped to the
+                text that belongs to them, calculated based on UTF-16 code units.
+        """
+        if types is None:
+            types = MessageEntity.ALL_TYPES
+
+        return {entity: self.parse_entity(entity)
+                for entity in self.entities if entity.type in types}
--- a/telegram/messageentity.py
+++ b/telegram/messageentity.py
@ -68,3 +68,17 @@ class MessageEntity(TelegramObject):
            entities.append(MessageEntity.de_json(entity))

        return entities
+
+    MENTION = 'mention'
+    HASHTAG = 'hashtag'
+    BOT_COMMAND = 'bot_command'
+    URL = 'url'
+    EMAIL = 'email'
+    BOLD = 'bold'
+    ITALIC = 'italic'
+    CODE = 'code'
+    PRE = 'pre'
+    TEXT_LINK = 'text_link'
+    TEXT_MENTION = 'text_mention'
+    ALL_TYPES = [MENTION, HASHTAG, BOT_COMMAND, URL, EMAIL, BOLD, ITALIC, CODE, PRE, TEXT_LINK,
+                 TEXT_MENTION]
--- a/tests/test_message.py
+++ b/tests/test_message.py
@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# A library that provides a Python interface to the Telegram Bot API
+# Copyright (C) 2015-2016
+# Leandro Toledo de Souza <devs@python-telegram-bot.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see [http://www.gnu.org/licenses/].
+"""This module contains a object that represents Tests for Telegram Message"""
+
+import sys
+import unittest
+
+sys.path.append('.')
+
+import telegram
+from tests.base import BaseTest
+
+
+class MessageTest(BaseTest, unittest.TestCase):
+    """This object represents Tests for Telegram MessageTest."""
+
+    def test_parse_entity(self):
+        text = (b'\\U0001f469\\u200d\\U0001f469\\u200d\\U0001f467'
+                b'\\u200d\\U0001f467\\U0001f431http://google.com').decode('unicode-escape')
+        entity = telegram.MessageEntity(type=telegram.MessageEntity.URL, offset=13, length=17)
+        message = telegram.Message(
+            message_id=1, from_user=None, date=None, chat=None, text=text, entities=[entity])
+        self.assertEqual(message.parse_entity(entity), 'http://google.com')
+
+    def test_parse_entities(self):
+        text = (b'\\U0001f469\\u200d\\U0001f469\\u200d\\U0001f467'
+                b'\\u200d\\U0001f467\\U0001f431http://google.com').decode('unicode-escape')
+        entity = telegram.MessageEntity(type=telegram.MessageEntity.URL, offset=13, length=17)
+        entity_2 = telegram.MessageEntity(type=telegram.MessageEntity.BOLD, offset=13, length=1)
+        message = telegram.Message(
+            message_id=1,
+            from_user=None,
+            date=None,
+            chat=None,
+            text=text,
+            entities=[entity_2, entity])
+        self.assertDictEqual(
+            message.parse_entities(telegram.MessageEntity.URL), {entity: 'http://google.com'})
+        self.assertDictEqual(message.parse_entities(), {entity: 'http://google.com',
+                                                        entity_2: 'h'})
+
+
+if __name__ == '__main__':
+    unittest.main()