From 0b8627332639840eabbead5c397423f1b2f40946 Mon Sep 17 00:00:00 2001
From: "Il'ya (Marshal)"
Date: Mon, 21 Jun 2021 16:09:49 +0200
Subject: [PATCH] add logging of skips

---
 make_files_tree.py         |  5 +++++
 make_tracked_links_list.py | 12 ++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/make_files_tree.py b/make_files_tree.py
index fe74888c1a..6e38aabb77 100644
--- a/make_files_tree.py
+++ b/make_files_tree.py
@@ -38,7 +38,12 @@ async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
+            if response.status == 302:
+                return
+
             if response.status != 200:
+                content = await response.text()
+                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                 return
 
             # bypass external slashes and so on
diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py
index 6d3e174099..368923df9b 100644
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@@ -8,7 +8,7 @@ from time import time
 from urllib.parse import unquote
 
 import aiohttp
-from aiohttp import ClientConnectorError
+from aiohttp import ClientConnectorError, ServerDisconnectedError
 
 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
@@ -196,10 +196,14 @@ async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
-            status_code = response.status
             content_type = response.headers.get('content-type')
 
-            if status_code != 200:
+            if response.status == 302:
+                return
+
+            if response.status != 200:
+                content = await response.text()
+                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                 return
 
             if 'text/html' in content_type:
@@ -234,7 +238,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         logger.warning('Codec can\'t decode byte. So its was a tgs file')
     except ClientConnectorError:
         logger.warning(f'Wrong link: {url}')
-    except TimeoutError:
+    except (ServerDisconnectedError, TimeoutError):
         logger.warning(f'Retrying {url}')
         VISITED_LINKS.remove(url)
         await asyncio.gather(crawl(url, session))
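
For context, a minimal, self-contained sketch of how the status handling and
retry path in make_tracked_links_list.py read once this patch is applied.
PROTOCOL, VISITED_LINKS, logger and the log messages mirror identifiers
visible in the hunks above; the TIMEOUT value, the VISITED_LINKS bookkeeping
at the top of crawl(), and the elided parsing step are assumptions, since
they live outside the changed lines.

import asyncio
import logging

import aiohttp
from aiohttp import ClientConnectorError, ServerDisconnectedError

PROTOCOL = 'https://'
TIMEOUT = aiohttp.ClientTimeout(total=30)  # placeholder; the repo defines its own TIMEOUT
VISITED_LINKS = set()

logger = logging.getLogger(__name__)


async def crawl(url: str, session: aiohttp.ClientSession):
    VISITED_LINKS.add(url)  # assumed here so the retry's remove() below is safe
    try:
        logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
            if response.status == 302:
                # redirects are requested with allow_redirects=False and are
                # expected, so they are skipped without logging
                return

            if response.status != 200:
                # new in this commit: keep the response body so skipped URLs
                # can be diagnosed from the debug log
                content = await response.text()
                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                return

            ...  # parse the page and queue discovered links (elided)
    except ClientConnectorError:
        logger.warning(f'Wrong link: {url}')
    except (ServerDisconnectedError, asyncio.TimeoutError):
        # the patch catches the built-in TimeoutError; asyncio.TimeoutError is
        # used in this sketch for pre-3.11 Pythons, where the two differ
        # (they are the same class on 3.11+)
        logger.warning(f'Retrying {url}')
        VISITED_LINKS.remove(url)
        await asyncio.gather(crawl(url, session))

The design split is that 302 responses return silently, since skipping
redirects is intentional behaviour, while every other non-200 status now
records the response body at debug level before the URL is dropped, and a
dropped server connection is treated like a timeout: the URL is removed from
VISITED_LINKS and crawled again.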