python-telegram-bot/tests/test_official/scraper.py
2025-01-01 14:51:12 +01:00

136 lines
4.7 KiB
Python

#!/usr/bin/env python
#
# A library that provides a Python interface to the Telegram Bot API
# Copyright (C) 2015-2025
# Leandro Toledo de Souza <devs@python-telegram-bot.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser Public License for more details.
#
# You should have received a copy of the GNU Lesser Public License
# along with this program. If not, see [http://www.gnu.org/licenses/].
"""This module contains functions which are used to scrape the official Bot API documentation."""
import asyncio
from dataclasses import dataclass
from typing import Literal, overload
import httpx
from bs4 import BeautifulSoup, Tag
from tests.test_official.exceptions import IGNORED_OBJECTS
from tests.test_official.helpers import (
find_next_sibling_until,
is_parameter_required_by_tg,
is_pascal_case,
)
@dataclass(slots=True, frozen=True)
class TelegramParameter:
"""Represents the scraped Telegram parameter. Contains all relevant attributes needed for
comparison. Relevant for both TelegramMethod and TelegramClass."""
param_name: str
param_type: str
param_required: bool
param_description: str
@dataclass(slots=True, frozen=True)
class TelegramClass:
"""Represents the scraped Telegram class. Contains all relevant attributes needed for
comparison."""
class_name: str
class_parameters: list[TelegramParameter]
# class_description: str
@dataclass(slots=True, frozen=True)
class TelegramMethod:
"""Represents the scraped Telegram method. Contains all relevant attributes needed for
comparison."""
method_name: str
method_parameters: list[TelegramParameter]
# method_description: str
@dataclass(slots=True, frozen=False)
class Scraper:
request: httpx.Response | None = None
soup: BeautifulSoup | None = None
async def make_request(self) -> None:
async with httpx.AsyncClient() as client:
self.request = await client.get("https://core.telegram.org/bots/api", timeout=10)
self.soup = BeautifulSoup(self.request.text, "html.parser")
@overload
def parse_docs(
self, doc_type: Literal["method"]
) -> tuple[list[TelegramMethod], list[str]]: ...
@overload
def parse_docs(self, doc_type: Literal["class"]) -> tuple[list[TelegramClass], list[str]]: ...
def parse_docs(self, doc_type):
argvalues = []
names: list[str] = []
if self.request is None:
asyncio.run(self.make_request())
for unparsed in self.soup.select("h4 > a.anchor"):
if "-" not in unparsed["name"]:
h4: Tag | None = unparsed.parent
name = h4.text
if h4 is None:
raise AssertionError("h4 is None")
if doc_type == "method" and name[0].lower() == name[0]:
params = parse_table_for_params(h4)
obj = TelegramMethod(method_name=name, method_parameters=params)
argvalues.append(obj)
names.append(name)
elif doc_type == "class" and is_pascal_case(name) and name not in IGNORED_OBJECTS:
params = parse_table_for_params(h4)
obj = TelegramClass(class_name=name, class_parameters=params)
argvalues.append(obj)
names.append(name)
return argvalues, names
def collect_methods(self) -> tuple[list[TelegramMethod], list[str]]:
return self.parse_docs("method")
def collect_classes(self) -> tuple[list[TelegramClass], list[str]]:
return self.parse_docs("class")
def parse_table_for_params(h4: Tag) -> list[TelegramParameter]:
"""Parses the Telegram doc table and outputs a list of TelegramParameter objects."""
table = find_next_sibling_until(h4, "table", h4.find_next_sibling("h4"))
if not table:
return []
params = []
for tr in table.find_all("tr")[1:]:
fields = []
for td in tr.find_all("td"):
param = td.text
fields.append(param)
param_name = fields[0]
param_type = fields[1]
param_required = is_parameter_required_by_tg(fields[2])
param_desc = fields[-1] # since length can be 2 or 3, but desc is always the last
params.append(TelegramParameter(param_name, param_type, param_required, param_desc))
return params