From 14fb9bc6fc65a498b37a9ccc4dc992760093f59f Mon Sep 17 00:00:00 2001
From: "Il'ya (Marshal)"
Date: Mon, 18 Apr 2022 00:11:21 +0200
Subject: [PATCH] a little speedup

---
 make_files_tree.py         | 25 +++++++++++++------------
 make_tracked_links_list.py |  9 +++++++--
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/make_files_tree.py b/make_files_tree.py
index fb91f2a671..84effb9c5d 100644
--- a/make_files_tree.py
+++ b/make_files_tree.py
@@ -13,7 +13,7 @@ from typing import List
 
 import aiofiles
 import aiohttp
-from aiohttp import ClientConnectorError
+from aiohttp import ClientConnectorError, ServerDisconnectedError
 
 PROTOCOL = 'https://'
 ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
@@ -229,15 +229,15 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.Clie
             ) as response:
                 if response.status != 200:
                     logger.debug(f'Resend cuz {response.status}')
-                    return await asyncio.gather(_get_page(offset))
+                    return await _get_page(offset)
 
                 json = await response.json(encoding='UTF-8')
                 if 'more_html' in json and json['more_html']:
                     content.append(json['more_html'])
-                    await asyncio.gather(_get_page(offset + 200))
+                    await _get_page(offset + 200)
         except (TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
-            await asyncio.gather(_get_page(offset))
+            await _get_page(offset)
 
     await _get_page(0)
 
@@ -267,7 +267,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
             if response.status // 100 == 5:
                 logger.warning(f'Error 5XX. Retrying {url}')
-                return await asyncio.gather(crawl(url, session))
+                return await crawl(url, session)
 
             if response.status not in {200, 304}:
                 if response.status != 302:
@@ -318,18 +318,19 @@ async def crawl(url: str, session: aiohttp.ClientSession):
                     logger.info(f'Write to {filename}')
                     await f.write(content)
 
-    except (TimeoutError, ClientConnectorError):
+    except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
         logger.warning(f'Client or timeout error. Retrying {url}')
-        await asyncio.gather(crawl(url, session))
+        await crawl(url, session)
 
 
 async def start(url_list: set[str]):
     async with aiohttp.ClientSession(connector=CONNECTOR) as session:
-        await asyncio.gather(*[crawl(url, session) for url in url_list])
-
-        # yeap it will be called each run, and what? ;d
-        await download_telegram_android_beta_and_extract_resources(session)
-        await download_telegram_macos_beta_and_extract_resources(session)
+        await asyncio.gather(
+            *[crawl(url, session) for url in url_list],
+            # yeap it will be called each run, and what? ;d
+            download_telegram_android_beta_and_extract_resources(session),
+            download_telegram_macos_beta_and_extract_resources(session),
+        )
 
 
 if __name__ == '__main__':
diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py
index d896c3f9dc..d73d091936 100644
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@@ -71,6 +71,11 @@ CRAWL_RULES = {
             '',  # all
         }
     },
+    'osx.telegram.org': {
+        'deny': {
+            'updates/Telegram'
+        }
+    },
     'bugs.telegram.org': {  # crawl first page of cards sorted by rating
         'deny': {
             # r'/c/[0-9]+/[0-9]+',  # disable comments
@@ -286,7 +291,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             if response.status // 100 == 5:
                 VISITED_LINKS.remove(url)
                 logger.warning(f'Error 5XX. Retrying {url}')
-                return await asyncio.gather(crawl(url, session))
+                return await crawl(url, session)
 
             if response.status not in {200, 304}:
                 if response.status != 302:
@@ -329,7 +334,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             logger.warning(f'Client or timeout error. Retrying {url}')
             VISITED_LINKS.remove(url)
             # sleep + count of attempts?
-            await asyncio.gather(crawl(url, session))
+            await crawl(url, session)
 
 
 async def start(url_list: set[str]):
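
Why this is a speedup: awaiting asyncio.gather() with a single coroutine wraps
it in a Task and returns a one-element list just to await one thing, so a plain
await does the same work with less overhead (and return await crawl(url, session)
now returns the coroutine's value itself rather than a one-element list). The
larger win is in start(): the two download_*_and_extract_resources coroutines
previously ran one after the other, and only after every crawl task had
finished; scheduling them inside the same asyncio.gather call lets them overlap
with the crawl tasks. The patch also adds aiohttp's ServerDisconnectedError to
the retried exceptions, which previously was not caught and would propagate out
of crawl().

A minimal, self-contained sketch of the scheduling difference, not taken from
the repository: fetch, before, after, and the sleep durations are hypothetical
stand-ins for crawl() and the two download coroutines.

    import asyncio
    import time


    async def fetch(delay: float) -> None:
        # Hypothetical stand-in for crawl() / download_*_and_extract_resources().
        await asyncio.sleep(delay)


    async def before() -> None:
        # Old flow: the two long downloads start only after every crawl finished.
        await asyncio.gather(*[fetch(0.1) for _ in range(5)])
        await fetch(0.5)
        await fetch(0.5)


    async def after() -> None:
        # New flow: one gather schedules everything at once, so the work overlaps.
        await asyncio.gather(
            *[fetch(0.1) for _ in range(5)],
            fetch(0.5),
            fetch(0.5),
        )


    for runner in (before, after):
        t0 = time.perf_counter()
        asyncio.run(runner())
        print(f'{runner.__name__}: {time.perf_counter() - t0:.2f}s')
    # Expected: before ~1.1s (0.1 + 0.5 + 0.5), after ~0.5s (the longest delay).

The retry-by-recursion pattern (return await crawl(url, session)) keeps its
existing behavior; as the '# sleep + count of attempts?' comment hints, a
backoff and an attempt limit would make it more robust, but that is outside
the scope of this patch.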