Add Internal Constants for Encodings (#4378)

2024-12-21 22:15:09 +01:00 · 2024-07-21 21:13:30 +02:00 · 2024-07-21 21:13:30 +02:00 · 0913b859d7
commit 0913b859d7
parent c3f17bb18e
15 changed files with 62 additions and 25 deletions
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@ -97,6 +97,7 @@ The following wonderful people contributed directly or indirectly to this projec
 - `Oleg Sushchenko <https://github.com/feuillemorte>`_
 - `Or Bin <https://github.com/OrBin>`_
 - `overquota <https://github.com/overquota>`_
+- `Pablo Martinez <https://github.com/elpekenin>`_
 - `Paradox <https://github.com/paradox70>`_
 - `Patrick Hofmann <https://github.com/PH89>`_
 - `Paul Larsen <https://github.com/PaulSonOfLars>`_
--- a/telegram/_files/inputfile.py
+++ b/telegram/_files/inputfile.py
@ -23,6 +23,7 @@ from typing import IO, Optional, Union
 from uuid import uuid4

 from telegram._utils.files import load_file
+from telegram._utils.strings import TextEncoding
 from telegram._utils.types import FieldTuple

 _DEFAULT_MIME_TYPE = "application/octet-stream"
@ -74,7 +75,7 @@ class InputFile:
        if isinstance(obj, bytes):
            self.input_file_content: bytes = obj
        elif isinstance(obj, str):
-            self.input_file_content = obj.encode("utf-8")
+            self.input_file_content = obj.encode(TextEncoding.UTF_8)
        else:
            reported_filename, self.input_file_content = load_file(obj)
            filename = filename or reported_filename
--- a/telegram/_games/game.py
+++ b/telegram/_games/game.py
@ -24,6 +24,7 @@ from telegram._files.photosize import PhotoSize
 from telegram._messageentity import MessageEntity
 from telegram._telegramobject import TelegramObject
 from telegram._utils.argumentparsing import parse_sequence_arg
+from telegram._utils.strings import TextEncoding
 from telegram._utils.types import JSONDict

 if TYPE_CHECKING:
@ -157,10 +158,10 @@ class Game(TelegramObject):
        if not self.text:
            raise RuntimeError("This Game has no 'text'.")

-        entity_text = self.text.encode("utf-16-le")
+        entity_text = self.text.encode(TextEncoding.UTF_16_LE)
        entity_text = entity_text[entity.offset * 2 : (entity.offset + entity.length) * 2]

-        return entity_text.decode("utf-16-le")
+        return entity_text.decode(TextEncoding.UTF_16_LE)

    def parse_text_entities(self, types: Optional[List[str]] = None) -> Dict[MessageEntity, str]:
        """
--- a/telegram/_message.py
+++ b/telegram/_message.py
@ -68,6 +68,7 @@ from telegram._utils.argumentparsing import parse_sequence_arg
 from telegram._utils.datetime import extract_tzinfo_from_defaults, from_timestamp
 from telegram._utils.defaultvalue import DEFAULT_NONE, DefaultValue
 from telegram._utils.entities import parse_message_entities, parse_message_entity
+from telegram._utils.strings import TextEncoding
 from telegram._utils.types import (
    CorrectOptionID,
    FileInput,
@ -1516,8 +1517,8 @@ class Message(MaybeInaccessibleMessage):
            raise RuntimeError("This message has neither text nor caption.")

        # Telegram wants the position in UTF-16 code units, so we have to calculate in that space
-        utf16_text = text.encode("utf-16-le")
-        utf16_quote = quote.encode("utf-16-le")
+        utf16_text = text.encode(TextEncoding.UTF_16_LE)
+        utf16_quote = quote.encode(TextEncoding.UTF_16_LE)
        effective_index = index or 0

        matches = list(re.finditer(re.escape(utf16_quote), utf16_text))
@ -4479,7 +4480,7 @@ class Message(MaybeInaccessibleMessage):
        if message_text is None:
            return None

-        utf_16_text = message_text.encode("utf-16-le")
+        utf_16_text = message_text.encode(TextEncoding.UTF_16_LE)
        html_text = ""
        last_offset = 0

@ -4543,7 +4544,9 @@ class Message(MaybeInaccessibleMessage):
            # text is part of the parent entity
            html_text += (
                escape(
-                    utf_16_text[last_offset * 2 : (entity.offset - offset) * 2].decode("utf-16-le")
+                    utf_16_text[last_offset * 2 : (entity.offset - offset) * 2].decode(
+                        TextEncoding.UTF_16_LE
+                    )
                )
                + insert
            )
@ -4551,7 +4554,7 @@ class Message(MaybeInaccessibleMessage):
            last_offset = entity.offset - offset + entity.length

        # see comment above
-        html_text += escape(utf_16_text[last_offset * 2 :].decode("utf-16-le"))
+        html_text += escape(utf_16_text[last_offset * 2 :].decode(TextEncoding.UTF_16_LE))

        return html_text

@ -4680,7 +4683,7 @@ class Message(MaybeInaccessibleMessage):
        if message_text is None:
            return None

-        utf_16_text = message_text.encode("utf-16-le")
+        utf_16_text = message_text.encode(TextEncoding.UTF_16_LE)
        markdown_text = ""
        last_offset = 0

@ -4773,7 +4776,7 @@ class Message(MaybeInaccessibleMessage):
            markdown_text += (
                escape_markdown(
                    utf_16_text[last_offset * 2 : (entity.offset - offset) * 2].decode(
-                        "utf-16-le"
+                        TextEncoding.UTF_16_LE
                    ),
                    version=version,
                )
@ -4784,7 +4787,7 @@ class Message(MaybeInaccessibleMessage):

        # see comment above
        markdown_text += escape_markdown(
-            utf_16_text[last_offset * 2 :].decode("utf-16-le"),
+            utf_16_text[last_offset * 2 :].decode(TextEncoding.UTF_16_LE),
            version=version,
        )

--- a/telegram/_messageentity.py
+++ b/telegram/_messageentity.py
@ -26,6 +26,7 @@ from telegram import constants
 from telegram._telegramobject import TelegramObject
 from telegram._user import User
 from telegram._utils import enum
+from telegram._utils.strings import TextEncoding
 from telegram._utils.types import JSONDict

 if TYPE_CHECKING:
@ -203,7 +204,7 @@ class MessageEntity(TelegramObject):
        for i, position in enumerate(positions):
            last_position = positions[i - 1] if i > 0 else 0
            text_slice = text[last_position:position]
-            accumulated_length += len(text_slice.encode("utf-16-le")) // 2
+            accumulated_length += len(text_slice.encode(TextEncoding.UTF_16_LE)) // 2
            position_translation[position] = accumulated_length
        # get the final output entities
        out = []
--- a/telegram/_passport/credentials.py
+++ b/telegram/_passport/credentials.py
@ -39,6 +39,7 @@ except ImportError:

 from telegram._telegramobject import TelegramObject
 from telegram._utils.argumentparsing import parse_sequence_arg
+from telegram._utils.strings import TextEncoding
 from telegram._utils.types import JSONDict
 from telegram.error import PassportDecryptionError

@ -98,7 +99,7 @@ def decrypt(secret, hash, data):
@no_type_check
 def decrypt_json(secret, hash, data):
    """Decrypts data using secret and hash and then decodes utf-8 string and loads json"""
-    return json.loads(decrypt(secret, hash, data).decode("utf-8"))
+    return json.loads(decrypt(secret, hash, data).decode(TextEncoding.UTF_8))


 class EncryptedCredentials(TelegramObject):
--- a/telegram/_utils/entities.py
+++ b/telegram/_utils/entities.py
@ -26,6 +26,7 @@ Warning:
 from typing import Dict, Optional, Sequence

 from telegram._messageentity import MessageEntity
+from telegram._utils.strings import TextEncoding


 def parse_message_entity(text: str, entity: MessageEntity) -> str:
@ -38,10 +39,10 @@ def parse_message_entity(text: str, entity: MessageEntity) -> str:
    Returns:
        :obj:`str`: The text of the given entity.
    """
-    entity_text = text.encode("utf-16-le")
+    entity_text = text.encode(TextEncoding.UTF_16_LE)
    entity_text = entity_text[entity.offset * 2 : (entity.offset + entity.length) * 2]

-    return entity_text.decode("utf-16-le")
+    return entity_text.decode(TextEncoding.UTF_16_LE)


 def parse_message_entities(
--- a/telegram/_utils/strings.py
+++ b/telegram/_utils/strings.py
@ -24,6 +24,23 @@ Warning:
    the changelog.
 """

+from telegram._utils.enum import StringEnum
+
+# TODO: Remove this when https://github.com/PyCQA/pylint/issues/6887 is resolved.
+# pylint: disable=invalid-enum-extension,invalid-slots
+
+
+class TextEncoding(StringEnum):
+    """This enum contains encoding schemes for text.
+
+    .. versionadded:: NEXT.VERSION
+    """
+
+    __slots__ = ()
+
+    UTF_8 = "utf-8"
+    UTF_16_LE = "utf-16-le"
+

 def to_camel_case(snake_str: str) -> str:
    """Converts a snake_case string to camelCase.
--- a/telegram/request/_baserequest.py
+++ b/telegram/request/_baserequest.py
@ -26,6 +26,7 @@ from typing import AsyncContextManager, Final, List, Optional, Tuple, Type, Type
 from telegram._utils.defaultvalue import DEFAULT_NONE as _DEFAULT_NONE
 from telegram._utils.defaultvalue import DefaultValue
 from telegram._utils.logging import get_logger
+from telegram._utils.strings import TextEncoding
 from telegram._utils.types import JSONDict, ODVInput
 from telegram._utils.warnings import warn
 from telegram._version import __version__ as ptb_ver
@ -403,7 +404,7 @@ class BaseRequest(
        Raises:
            TelegramError: If loading the JSON data failed
        """
-        decoded_s = payload.decode("utf-8", "replace")
+        decoded_s = payload.decode(TextEncoding.UTF_8, "replace")
        try:
            return json.loads(decoded_s)
        except ValueError as exc:
--- a/telegram/request/_requestdata.py
+++ b/telegram/request/_requestdata.py
@ -21,6 +21,7 @@ import json
 from typing import Any, Dict, List, Optional, Union, final
 from urllib.parse import urlencode

+from telegram._utils.strings import TextEncoding
 from telegram._utils.types import UploadFileDict
 from telegram.request._requestparameter import RequestParameter

@ -109,7 +110,7 @@ class RequestData:
            To use a custom library for JSON encoding, you can directly encode the keys of
            :attr:`parameters` - note that string valued keys should not be JSON encoded.
        """
-        return json.dumps(self.json_parameters).encode("utf-8")
+        return json.dumps(self.json_parameters).encode(TextEncoding.UTF_8)

    @property
    def multipart_data(self) -> UploadFileDict:
--- a/tests/_files/test_inputfile.py
+++ b/tests/_files/test_inputfile.py
@ -24,6 +24,7 @@ from io import BytesIO
 import pytest

 from telegram import InputFile
+from telegram._utils.strings import TextEncoding
 from tests.auxil.files import data_file
 from tests.auxil.slots import mro_slots

@ -150,17 +151,17 @@ class TestInputFileWithRequest:
        await (await message.document.get_file()).download_to_memory(out=out)
        out.seek(0)

-        assert out.read().decode("utf-8") == "PTB Rocks! ⅞"
+        assert out.read().decode(TextEncoding.UTF_8) == "PTB Rocks! ⅞"

    async def test_send_string(self, bot, chat_id):
        # We test this here and not at the respective test modules because it's not worth
        # duplicating the test for the different methods
        message = await bot.send_document(
-            chat_id, InputFile(data_file("text_file.txt").read_text(encoding="utf-8"))
+            chat_id, InputFile(data_file("text_file.txt").read_text(encoding=TextEncoding.UTF_8))
        )
        out = BytesIO()

        await (await message.document.get_file()).download_to_memory(out=out)
        out.seek(0)

-        assert out.read().decode("utf-8") == "PTB Rocks! ⅞"
+        assert out.read().decode(TextEncoding.UTF_8) == "PTB Rocks! ⅞"
--- a/tests/auxil/ci_bots.py
+++ b/tests/auxil/ci_bots.py
@ -22,6 +22,8 @@ import json
 import os
 import random

+from telegram._utils.strings import TextEncoding
+
 # Provide some public fallbacks so it's easy for contributors to run tests on their local machine
 # These bots are only able to talk in our test chats, so they are quite useless for other
 # purposes than testing.
@ -42,10 +44,12 @@ GITHUB_ACTION = os.getenv("GITHUB_ACTION", None)
 BOTS = os.getenv("BOTS", None)
 JOB_INDEX = os.getenv("JOB_INDEX", None)
 if GITHUB_ACTION is not None and BOTS is not None and JOB_INDEX is not None:
-    BOTS = json.loads(base64.b64decode(BOTS).decode("utf-8"))
+    BOTS = json.loads(base64.b64decode(BOTS).decode(TextEncoding.UTF_8))
    JOB_INDEX = int(JOB_INDEX)

-FALLBACKS = json.loads(base64.b64decode(FALLBACKS).decode("utf-8"))  # type: list[dict[str, str]]
+FALLBACKS = json.loads(
+    base64.b64decode(FALLBACKS).decode(TextEncoding.UTF_8)
+)  # type: list[dict[str, str]]


 class BotInfoProvider:
--- a/tests/auxil/networking.py
+++ b/tests/auxil/networking.py
@ -23,6 +23,7 @@ import pytest
 from httpx import AsyncClient, AsyncHTTPTransport, Response

 from telegram._utils.defaultvalue import DEFAULT_NONE
+from telegram._utils.strings import TextEncoding
 from telegram._utils.types import ODVInput
 from telegram.error import BadRequest, RetryAfter, TimedOut
 from telegram.request import HTTPXRequest, RequestData
@ -103,7 +104,7 @@ async def send_webhook_message(
        content_len = None
        payload = None
    else:
-        payload = bytes(payload_str, encoding="utf-8")
+        payload = bytes(payload_str, encoding=TextEncoding.UTF_8)

    if content_len == -1:
        content_len = len(payload)
--- a/tests/request/test_request.py
+++ b/tests/request/test_request.py
@ -31,6 +31,7 @@ import pytest
 from httpx import AsyncHTTPTransport

 from telegram._utils.defaultvalue import DEFAULT_NONE
+from telegram._utils.strings import TextEncoding
 from telegram.error import (
    BadRequest,
    ChatMigrated,
@ -247,7 +248,7 @@ class TestRequestWithoutRequest:
        else:
            match = "Unknown HTTPError"

-        server_response = json.dumps(response_data).encode("utf-8")
+        server_response = json.dumps(response_data).encode(TextEncoding.UTF_8)

        monkeypatch.setattr(
            httpx_request,
--- a/tests/test_enum_types.py
+++ b/tests/test_enum_types.py
@ -19,6 +19,8 @@
 import re
 from pathlib import Path

+from telegram._utils.strings import TextEncoding
+
 telegram_root = Path(__file__).parent.parent / "telegram"
 telegram_ext_root = telegram_root / "ext"
 exclude_dirs = {
@ -46,7 +48,7 @@ def test_types_are_converted_to_enum():
            # We don't check tg.ext.
            continue

-        text = path.read_text(encoding="utf-8")
+        text = path.read_text(encoding=TextEncoding.UTF_8)
        for match in re.finditer(pattern, text):
            if any(exclude_pattern.match(match.group(0)) for exclude_pattern in exclude_patterns):
                continue