mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-22 23:37:43 +01:00
extend info about translations (url, photo, binding, obj structure); add translations hashtag; improve error handling of translations.
This commit is contained in:
parent
81c2684608
commit
ec10d811e6
3 changed files with 31 additions and 7 deletions
|
@ -41,7 +41,7 @@ STATUS_TO_EMOJI = {
|
|||
}
|
||||
|
||||
AVAILABLE_HASHTAGS = {
|
||||
'web_res', 'web', 'server', 'test_server', 'client', 'ios', 'macos', 'android'
|
||||
'web_res', 'web', 'server', 'test_server', 'client', 'ios', 'macos', 'android', 'translations'
|
||||
}
|
||||
HASHTAGS_PATTERNS = {
|
||||
# A regex would be more flexible here; for example, it would help with the double-hashtag issue ('#web #web_res') when data/res has not changed
|
||||
|
@ -53,6 +53,7 @@ HASHTAGS_PATTERNS = {
|
|||
'ios': os.path.join(ROOT_TREE_DIR, 'client', 'ios-beta'),
|
||||
'macos': os.path.join(ROOT_TREE_DIR, 'client', 'macos-beta'),
|
||||
'android': os.path.join(ROOT_TREE_DIR, 'client', 'android-beta'),
|
||||
'translations': os.path.join(ROOT_TREE_DIR, 'web', 'translations.telegram.org'),
|
||||
}
|
||||
# order is important!
|
||||
PATHS_TO_REMOVE_FROM_ALERT = [
|
||||
|
|
|
@ -15,7 +15,6 @@ from typing import List
|
|||
import aiofiles
|
||||
import aiohttp
|
||||
from aiohttp import ClientConnectorError, ServerDisconnectedError
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import ccl_bplist
|
||||
|
||||
|
@ -344,6 +343,11 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.
|
|||
|
||||
|
||||
async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession) -> str:
|
||||
import cssutils
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
css_parser = cssutils.CSSParser(loglevel=logging.FATAL, raiseExceptions=False)
|
||||
|
||||
headers = {'X-Requested-With': 'XMLHttpRequest'}
|
||||
content = dict()
|
||||
|
||||
|
@ -356,7 +360,7 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.Clie
|
|||
async with session.post(
|
||||
f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
if (499 < response.status < 600) or (response.status != 200):
|
||||
logger.debug(f'Resend cuz {response.status}')
|
||||
new_offset = offset
|
||||
else:
|
||||
|
@ -369,15 +373,33 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.Clie
|
|||
for tr_item in tr_items:
|
||||
tr_key = tr_item.find_next('div', {'class': 'tr-value-key'}).text
|
||||
|
||||
tr_values = tr_item.find_all('span', {'class': 'value'})
|
||||
tr_values_content = [tr_value.decode_contents() for tr_value in tr_values]
|
||||
tr_url = tr_item.find_next('div', {'class': 'tr-key-row'})['data-href']
|
||||
tr_url = f'https://translations.telegram.org{tr_url}'
|
||||
|
||||
content[tr_key] = tr_values_content
|
||||
tr_photo = tr_item.find_next('a', {'class': 'tr-value-photo'})
|
||||
if tr_photo:
|
||||
tr_photo = css_parser.parseStyle(tr_photo['style']).backgroundImage[5:-2]
|
||||
|
||||
tr_has_binding = tr_item.find_next('span', {'class': 'has-1binding binding'})
|
||||
tr_has_binding = tr_has_binding is not None
|
||||
|
||||
tr_values = tr_item.find_all('span', {'class': 'value'})
|
||||
tr_value_singular, *tr_value_plural = [tr_value.decode_contents() for tr_value in tr_values]
|
||||
tr_values = {'singular': tr_value_singular}
|
||||
if tr_value_plural:
|
||||
tr_values['plural'] = tr_value_plural[0]
|
||||
|
||||
content[tr_key] = {
|
||||
'url': tr_url,
|
||||
'photo_url': tr_photo,
|
||||
'has_binding': tr_has_binding is not None,
|
||||
'values': tr_values,
|
||||
}
|
||||
|
||||
new_offset = offset + 200
|
||||
|
||||
new_offset and await _get_page(new_offset)
|
||||
except (TimeoutError, ClientConnectorError):
|
||||
except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
|
||||
logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
|
||||
await _get_page(offset)
|
||||
|
||||
|
|
|
@ -4,4 +4,5 @@ aiofiles==0.6.0
|
|||
git+https://github.com/pyrogram/pyrogram@7f9e841ccd44246ad855ad4855a6431a5823c554
|
||||
TgCrypto==1.2.3
|
||||
beautifulsoup4==4.11.1
|
||||
cssutils==2.4.2
|
||||
# uvloop==0.16.0
|
Loading…
Reference in a new issue