Implement message entity parsing

2025-01-10 20:12:25 +01:00 · 2022-05-24 13:11:15 +04:00 · 2022-05-24 13:11:15 +04:00 · 06c265cd11
commit 06c265cd11
parent 2de428e8eb
2 changed files with 116 additions and 3 deletions
--- a/src/types/message.rs
+++ b/src/types/message.rs
@ -6,9 +6,9 @@ use serde::{Deserialize, Serialize};
 use crate::types::{
    Animation, Audio, BareChatId, Chat, ChatId, Contact, Dice, Document, Game,
    InlineKeyboardMarkup, Invoice, Location, MessageAutoDeleteTimerChanged, MessageEntity,
-    PassportData, PhotoSize, Poll, ProximityAlertTriggered, Sticker, SuccessfulPayment, True, User,
-    Venue, Video, VideoChatEnded, VideoChatParticipantsInvited, VideoChatScheduled,
-    VideoChatStarted, VideoNote, Voice, WebAppData,
+    MessageEntityRef, PassportData, PhotoSize, Poll, ProximityAlertTriggered, Sticker,
+    SuccessfulPayment, True, User, Venue, Video, VideoChatEnded, VideoChatParticipantsInvited,
+    VideoChatScheduled, VideoChatStarted, VideoNote, Voice, WebAppData,
 };

 /// This object represents a message.
@ -1095,6 +1095,18 @@ impl Message {
        // /[a-zA-Z0-9_]{5,32}/ and chat/message ids are integers.
        Some(reqwest::Url::parse(&url).unwrap())
    }
+
+    /// Returns the entities of the message text with their offsets converted
+    /// from UTF-16 code units to UTF-8 byte offsets.
+    ///
+    /// Yields `None` when either [`text`](Self::text) or
+    /// [`entities`](Self::entities) is `None`; otherwise both are handed to
+    /// [`MessageEntityRef::parse`].
+    pub fn parse_entities(&self) -> Option<Vec<MessageEntityRef<'_>>> {
+        let text = self.text()?;
+        let entities = self.entities()?;
+
+        Some(MessageEntityRef::parse(text, entities))
+    }
+
+    /// Returns the entities of the message *caption* with their offsets
+    /// converted from UTF-16 code units to UTF-8 byte offsets.
+    ///
+    /// Yields `None` when the message has no caption or no caption entities;
+    /// otherwise both are handed to [`MessageEntityRef::parse`].
+    pub fn parse_caption_entities(&self) -> Option<Vec<MessageEntityRef<'_>>> {
+        // The caption and its entities must be used together: entity offsets
+        // are relative to the caption, not to the message text.
+        self.caption()
+            .zip(self.caption_entities())
+            .map(|(t, e)| MessageEntityRef::parse(t, e))
+    }
 }

 #[cfg(test)]
--- a/src/types/message_entity.rs
+++ b/src/types/message_entity.rs
@ -1,3 +1,5 @@
+use std::{cmp, ops::Range};
+
 use serde::{Deserialize, Serialize};

 use crate::types::{User, UserId};
@ -19,6 +21,18 @@ pub struct MessageEntity {
    pub length: usize,
 }

+/// A "parsed" [`MessageEntity`].
+///
+/// [`MessageEntity`] has offsets in UTF-**16** code units, but in Rust we
+/// mostly work with UTF-**8**. In order to use an entity we need to convert
+/// UTF-16 offsets to UTF-8 ones. This type represents a message entity with
+/// converted offsets and a reference to the text.
+///
+/// Values are created with [`MessageEntityRef::parse`].
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct MessageEntityRef<'a> {
+    /// The whole text the entity was parsed from (not only the entity's part).
+    message: &'a str,
+    /// Range of the entity inside `message`, as UTF-8 byte offsets once
+    /// `parse` has converted them.
+    range: Range<usize>,
+    /// Kind of the entity, borrowed from the original [`MessageEntity`].
+    kind: &'a MessageEntityKind,
+}
+
 impl MessageEntity {
    pub const fn new(kind: MessageEntityKind, offset: usize, length: usize) -> Self {
        Self {
@ -140,6 +154,93 @@ impl MessageEntity {
    }
 }

+impl<'a> MessageEntityRef<'a> {
+    /// Returns the kind of this entity, borrowed from the original
+    /// [`MessageEntity`].
+    pub fn kind(&self) -> &'a MessageEntityKind {
+        self.kind
+    }
+
+    /// Returns the part of the message text covered by this entity.
+    ///
+    /// # Panics
+    ///
+    /// String slicing panics if the stored range is out of bounds of the
+    /// message or does not lie on `char` boundaries. This can happen when
+    /// [`parse`](Self::parse) was given offsets past the end of the text,
+    /// since those are left unconverted.
+    pub fn text(&self) -> &'a str {
+        &self.message[self.range.clone()]
+    }
+
+    /// Returns the range of this entity inside
+    /// [`message_text`](Self::message_text).
+    pub fn range(&self) -> Range<usize> {
+        self.range.clone()
+    }
+
+    /// Returns the start of the entity's range (inclusive).
+    pub fn start(&self) -> usize {
+        self.range.start
+    }
+
+    /// Returns the end of the entity's range (exclusive).
+    pub fn end(&self) -> usize {
+        self.range.end
+    }
+
+    /// Returns the length of the entity's range, i.e. `end - start`.
+    pub fn len(&self) -> usize {
+        self.range.len()
+    }
+
+    /// Returns the whole text this entity was parsed from, not only the part
+    /// covered by the entity.
+    pub fn message_text(&self) -> &'a str {
+        self.message
+    }
+
+    /// Builds one [`MessageEntityRef`] per element of `entities`, converting
+    /// the UTF-16 offsets of each [`MessageEntity`] into UTF-8 byte offsets
+    /// into `text`.
+    ///
+    /// The result has the same order as `entities`. All start/end offsets are
+    /// converted in a single pass over the characters of `text`.
+    ///
+    /// Offsets that lie past the end of `text` are never reached by that pass
+    /// and keep their original (UTF-16) value.
+    ///
+    /// # Panics
+    ///
+    /// Panics with "Invalid utf-16 offset" if an offset within the text does
+    /// not fall on a character boundary, i.e. it points between the two
+    /// UTF-16 code units of a single character.
+    pub fn parse(text: &'a str, entities: &'a [MessageEntity]) -> Vec<Self> {
+        // This creates entities with **wrong** offsets (UTF-16) that we later patch.
+        let mut entities: Vec<_> = entities
+            .iter()
+            .map(|e| Self {
+                message: text,
+                range: e.offset..e.offset + e.length,
+                kind: &e.kind,
+            })
+            .collect();
+
+        // Convert offsets
+
+        // References to all offsets that need patching
+        let mut offsets: Vec<&mut usize> = entities
+            .iter_mut()
+            .flat_map(
+                |Self {
+                     range: Range { start, end },
+                     ..
+                 }| [start, end],
+            )
+            .collect();
+
+        // Sort in decreasing order, so the smallest elements are at the end and can be
+        // removed more easily
+        offsets.sort_unstable_by_key(|&&mut offset| cmp::Reverse(offset));
+
+        // The accumulator holds the UTF-8 and UTF-16 lengths of the text that
+        // precedes the current character `c`. The fold's result is discarded:
+        // it only exists to drive the patching side effect and to stop early.
+        let _ = text
+            .chars()
+            .chain(['\0']) // this is needed to process offset pointing at the end of the string
+            .try_fold((0, 0), |(len_utf8, len_utf16), c| {
+                // Stop if there are no more offsets to patch
+                if offsets.is_empty() {
+                    return None;
+                }
+
+                // Patch all offsets that can be patched
+                while offsets
+                    .last()
+                    .map(|&&mut offset| offset <= len_utf16)
+                    .unwrap_or(false)
+                {
+                    let offset = offsets.pop().unwrap();
+                    // A smaller offset here means it was skipped over, i.e. it
+                    // points into the middle of the previous character.
+                    assert_eq!(*offset, len_utf16, "Invalid utf-16 offset");
+
+                    // Patch the offset to be UTF-8
+                    *offset = len_utf8;
+                }
+
+                // Update "running" length
+                Some((len_utf8 + c.len_utf8(), len_utf16 + c.len_utf16()))
+            });
+
+        entities
+    }
+}
+
 #[serde_with_macros::skip_serializing_none]
 #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
 #[serde(rename_all = "snake_case")]