mirror of
https://github.com/teloxide/teloxide.git
synced 2025-01-10 20:12:25 +01:00
Implement message entity parsing
This commit is contained in:
parent
2de428e8eb
commit
06c265cd11
2 changed files with 116 additions and 3 deletions
|
@ -6,9 +6,9 @@ use serde::{Deserialize, Serialize};
|
|||
use crate::types::{
|
||||
Animation, Audio, BareChatId, Chat, ChatId, Contact, Dice, Document, Game,
|
||||
InlineKeyboardMarkup, Invoice, Location, MessageAutoDeleteTimerChanged, MessageEntity,
|
||||
PassportData, PhotoSize, Poll, ProximityAlertTriggered, Sticker, SuccessfulPayment, True, User,
|
||||
Venue, Video, VideoChatEnded, VideoChatParticipantsInvited, VideoChatScheduled,
|
||||
VideoChatStarted, VideoNote, Voice, WebAppData,
|
||||
MessageEntityRef, PassportData, PhotoSize, Poll, ProximityAlertTriggered, Sticker,
|
||||
SuccessfulPayment, True, User, Venue, Video, VideoChatEnded, VideoChatParticipantsInvited,
|
||||
VideoChatScheduled, VideoChatStarted, VideoNote, Voice, WebAppData,
|
||||
};
|
||||
|
||||
/// This object represents a message.
|
||||
|
@ -1095,6 +1095,18 @@ impl Message {
|
|||
// /[a-zA-Z0-9_]{5,32}/ and chat/message ids are integers.
|
||||
Some(reqwest::Url::parse(&url).unwrap())
|
||||
}
|
||||
|
||||
pub fn parse_entities(&self) -> Option<Vec<MessageEntityRef<'_>>> {
|
||||
self.text()
|
||||
.zip(self.entities())
|
||||
.map(|(t, e)| MessageEntityRef::parse(t, e))
|
||||
}
|
||||
|
||||
pub fn parse_caption_entities(&self) -> Option<Vec<MessageEntityRef<'_>>> {
|
||||
self.text()
|
||||
.zip(self.entities())
|
||||
.map(|(t, e)| MessageEntityRef::parse(t, e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
use std::{cmp, ops::Range};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::types::{User, UserId};
|
||||
|
@ -19,6 +21,18 @@ pub struct MessageEntity {
|
|||
pub length: usize,
|
||||
}
|
||||
|
||||
/// A "parsed" [`MessageEntity`].
|
||||
///
|
||||
/// [`MessageEntity`] has offsets in UTF-**16** code units, but in Rust we
|
||||
/// mostly work with UTF-**8**. In order to use an entity we need to convert
|
||||
/// UTF-16 offsets to UTF-8 ones. This type represents a message entity with
|
||||
/// converted offsets and a reference to the text.
|
||||
pub struct MessageEntityRef<'a> {
|
||||
message: &'a str,
|
||||
range: Range<usize>,
|
||||
kind: &'a MessageEntityKind,
|
||||
}
|
||||
|
||||
impl MessageEntity {
|
||||
pub const fn new(kind: MessageEntityKind, offset: usize, length: usize) -> Self {
|
||||
Self {
|
||||
|
@ -140,6 +154,93 @@ impl MessageEntity {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> MessageEntityRef<'a> {
|
||||
pub fn kind(&self) -> &'a MessageEntityKind {
|
||||
self.kind
|
||||
}
|
||||
|
||||
pub fn text(&self) -> &'a str {
|
||||
&self.message[self.range.clone()]
|
||||
}
|
||||
|
||||
pub fn range(&self) -> Range<usize> {
|
||||
self.range.clone()
|
||||
}
|
||||
|
||||
pub fn start(&self) -> usize {
|
||||
self.range.start
|
||||
}
|
||||
|
||||
pub fn end(&self) -> usize {
|
||||
self.range.end
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.range.len()
|
||||
}
|
||||
|
||||
pub fn message_text(&self) -> &'a str {
|
||||
self.message
|
||||
}
|
||||
|
||||
pub fn parse(text: &'a str, entities: &'a [MessageEntity]) -> Vec<Self> {
|
||||
// This creates entities with **wrong** offsets (UTF-16) that we later patch.
|
||||
let mut entities: Vec<_> = entities
|
||||
.iter()
|
||||
.map(|e| Self {
|
||||
message: text,
|
||||
range: e.offset..e.offset + e.length,
|
||||
kind: &e.kind,
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Convert offsets
|
||||
|
||||
// References to all offsets that need patching
|
||||
let mut offsets: Vec<&mut usize> = entities
|
||||
.iter_mut()
|
||||
.flat_map(
|
||||
|Self {
|
||||
range: Range { start, end },
|
||||
..
|
||||
}| [start, end],
|
||||
)
|
||||
.collect();
|
||||
|
||||
// Sort in decreasing order, so the smallest elements are at the end and can be
|
||||
// removed more easily
|
||||
offsets.sort_unstable_by_key(|&&mut offset| cmp::Reverse(offset));
|
||||
|
||||
let _ = text
|
||||
.chars()
|
||||
.chain(['\0']) // this is needed to process offset pointing at the end of the string
|
||||
.try_fold((0, 0), |(len_utf8, len_utf16), c| {
|
||||
// Stop if there are no more offsets to patch
|
||||
if offsets.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Patch all offsets that can be patched
|
||||
while offsets
|
||||
.last()
|
||||
.map(|&&mut offset| offset <= len_utf16)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
let offset = offsets.pop().unwrap();
|
||||
assert_eq!(*offset, len_utf16, "Invalid utf-16 offset");
|
||||
|
||||
// Patch the offset to be UTF-8
|
||||
*offset = len_utf8;
|
||||
}
|
||||
|
||||
// Update "running" length
|
||||
Some((len_utf8 + c.len_utf8(), len_utf16 + c.len_utf16()))
|
||||
});
|
||||
|
||||
entities
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_with_macros::skip_serializing_none]
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
|
|
Loading…
Reference in a new issue