Implement message entity parsing

This commit is contained in:
Maybe Waffle 2022-05-24 13:11:15 +04:00
parent 2de428e8eb
commit 06c265cd11
2 changed files with 116 additions and 3 deletions

View file

@ -6,9 +6,9 @@ use serde::{Deserialize, Serialize};
use crate::types::{
Animation, Audio, BareChatId, Chat, ChatId, Contact, Dice, Document, Game,
InlineKeyboardMarkup, Invoice, Location, MessageAutoDeleteTimerChanged, MessageEntity,
PassportData, PhotoSize, Poll, ProximityAlertTriggered, Sticker, SuccessfulPayment, True, User,
Venue, Video, VideoChatEnded, VideoChatParticipantsInvited, VideoChatScheduled,
VideoChatStarted, VideoNote, Voice, WebAppData,
MessageEntityRef, PassportData, PhotoSize, Poll, ProximityAlertTriggered, Sticker,
SuccessfulPayment, True, User, Venue, Video, VideoChatEnded, VideoChatParticipantsInvited,
VideoChatScheduled, VideoChatStarted, VideoNote, Voice, WebAppData,
};
/// This object represents a message.
@ -1095,6 +1095,18 @@ impl Message {
// /[a-zA-Z0-9_]{5,32}/ and chat/message ids are integers.
Some(reqwest::Url::parse(&url).unwrap())
}
/// Parses the entities of this message's text into [`MessageEntityRef`]s.
///
/// Returns `None` when either [`text`](Self::text) or
/// [`entities`](Self::entities) returns `None`; otherwise the result of
/// [`MessageEntityRef::parse`] applied to both.
pub fn parse_entities(&self) -> Option<Vec<MessageEntityRef<'_>>> {
    let text = self.text()?;
    let entities = self.entities()?;

    Some(MessageEntityRef::parse(text, entities))
}
/// Parses the entities of this message's caption into
/// [`MessageEntityRef`]s.
///
/// Caption entity offsets are relative to the caption, so they are resolved
/// against the caption (and not against the message text).
///
/// Returns `None` when the message has no caption or no caption entities.
pub fn parse_caption_entities(&self) -> Option<Vec<MessageEntityRef<'_>>> {
    // NOTE(review): relies on the `caption` / `caption_entities` accessors of
    // `Message`, the caption counterparts of `text` / `entities`.
    self.caption()
        .zip(self.caption_entities())
        .map(|(t, e)| MessageEntityRef::parse(t, e))
}
}
#[cfg(test)]

View file

@ -1,3 +1,5 @@
use std::{cmp, ops::Range};
use serde::{Deserialize, Serialize};
use crate::types::{User, UserId};
@ -19,6 +21,18 @@ pub struct MessageEntity {
pub length: usize,
}
/// A "parsed" [`MessageEntity`].
///
/// [`MessageEntity`] has offsets in UTF-**16** code units, but in Rust we
/// mostly work with UTF-**8**. In order to use an entity we need to convert
/// UTF-16 offsets to UTF-8 ones. This type represents a message entity with
/// converted offsets and a reference to the text.
///
/// Values of this type are created by `MessageEntityRef::parse`.
pub struct MessageEntityRef<'a> {
/// The whole text the entity was parsed against (not only the part covered
/// by the entity).
message: &'a str,
/// The part of `message` covered by the entity, as a range of UTF-8 byte
/// offsets.
range: Range<usize>,
/// The kind of the entity, borrowed from the original [`MessageEntity`].
kind: &'a MessageEntityKind,
}
impl MessageEntity {
pub const fn new(kind: MessageEntityKind, offset: usize, length: usize) -> Self {
Self {
@ -140,6 +154,93 @@ impl MessageEntity {
}
}
impl<'a> MessageEntityRef<'a> {
    /// Returns the kind of this entity.
    pub fn kind(&self) -> &'a MessageEntityKind {
        self.kind
    }

    /// Returns the part of the message text that is covered by this entity.
    pub fn text(&self) -> &'a str {
        &self.message[self.range.clone()]
    }

    /// Returns the range of UTF-8 byte offsets covered by this entity.
    ///
    /// The range is relative to [`message_text`](Self::message_text).
    pub fn range(&self) -> Range<usize> {
        self.range.clone()
    }

    /// Returns the UTF-8 byte offset at which this entity starts.
    pub fn start(&self) -> usize {
        self.range.start
    }

    /// Returns the UTF-8 byte offset at which this entity ends (exclusive).
    pub fn end(&self) -> usize {
        self.range.end
    }

    /// Returns the length of this entity in UTF-8 bytes.
    pub fn len(&self) -> usize {
        self.range.len()
    }

    /// Returns `true` if this entity covers no text at all.
    pub fn is_empty(&self) -> bool {
        self.range.is_empty()
    }

    /// Returns the whole text this entity was parsed against.
    pub fn message_text(&self) -> &'a str {
        self.message
    }

    /// Converts `entities` (with UTF-16 offsets) that belong to `text` into
    /// [`MessageEntityRef`]s (with UTF-8 offsets).
    ///
    /// The returned vector has one element per input entity, in the same
    /// order. Offsets that point past the end of `text` are clamped to the end
    /// of `text`, so [`text`](Self::text) never slices out of bounds.
    ///
    /// # Panics
    ///
    /// Panics if an offset lies inside `text` but not on a character boundary
    /// (i.e. it points into the middle of a UTF-16 surrogate pair).
    pub fn parse(text: &'a str, entities: &'a [MessageEntity]) -> Vec<Self> {
        // This creates entities with **wrong** offsets (UTF-16) that we later patch.
        let mut entities: Vec<_> = entities
            .iter()
            .map(|e| Self {
                message: text,
                // Saturate instead of overflowing on absurd input; the value is
                // clamped to the text length below anyway.
                range: e.offset..e.offset.saturating_add(e.length),
                kind: &e.kind,
            })
            .collect();

        // References to all offsets that need patching
        let mut offsets: Vec<&mut usize> = entities
            .iter_mut()
            .flat_map(
                |Self {
                     range: Range { start, end },
                     ..
                 }| [start, end],
            )
            .collect();

        // Sort in decreasing order, so the smallest elements are at the end and can be
        // removed more easily
        offsets.sort_unstable_by_key(|&&mut offset| cmp::Reverse(offset));

        // "Running" lengths of the already processed prefix of `text`
        let mut len_utf8 = 0;
        let mut len_utf16 = 0;

        // The trailing '\0' is needed to process offsets pointing at the end of the
        // string
        for c in text.chars().chain(['\0']) {
            // Stop if there are no more offsets to patch
            if offsets.is_empty() {
                break;
            }

            // Patch all offsets that can be patched
            while offsets
                .last()
                .map_or(false, |&&mut offset| offset <= len_utf16)
            {
                let offset = offsets.pop().unwrap();
                assert_eq!(*offset, len_utf16, "Invalid utf-16 offset");

                // Patch the offset to be UTF-8
                *offset = len_utf8;
            }

            len_utf8 += c.len_utf8();
            len_utf16 += c.len_utf16();
        }

        // Whatever is left points past the end of the text. Leaving such offsets
        // untouched would keep UTF-16 values in a UTF-8 range, so clamp them to
        // the end of the text instead.
        for offset in offsets {
            *offset = text.len();
        }

        entities
    }
}
#[serde_with_macros::skip_serializing_none]
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]