add logging of skips

Author: Il'ya (Marshal)
Date:   2021-06-21 16:09:49 +02:00
parent  cbf993be5e
commit  0b86273326
2 changed files with 13 additions and 4 deletions

File 1 of 2 (filename not shown in this view):

@@ -38,7 +38,12 @@ async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
             if response.status == 302:
                 return
+
+            if response.status != 200:
+                content = await response.text()
+                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
+                return
 
             # bypass external slashes and so on
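For readers skimming the diff, here is a minimal, self-contained sketch of the skip-logging pattern this hunk introduces. It is not the repository's exact code: the logger setup, the example URL in main(), and the stubbed-out link parsing are assumptions; only PROTOCOL, the status checks, and the log messages mirror the diff.

# Minimal sketch of the skip logging added in this hunk; logger setup, the
# example URL in main(), and the elided link parsing are assumptions.
import asyncio
import logging

import aiohttp

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('crawler-sketch')

PROTOCOL = 'https://'


async def crawl(url: str, session: aiohttp.ClientSession):
    logger.info(f'Process {url}')
    async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
        if response.status == 302:
            # redirects are still skipped silently
            return

        if response.status != 200:
            # new in this commit: log which URL was skipped, why, and what came back
            content = await response.text()
            logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
            return

        # ...parse the page and queue newly found links (elided)...


async def main():
    async with aiohttp.ClientSession() as session:
        await crawl('telegram.org/robots.txt', session)


if __name__ == '__main__':
    asyncio.run(main())

Because the request is made with allow_redirects=False, a 302 arrives as the response itself instead of being followed, which is why it stays a silent skip while every other non-200 status is now logged together with its body.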

File 2 of 2 (filename not shown in this view):

@@ -8,7 +8,7 @@ from time import time
 from urllib.parse import unquote
 
 import aiohttp
-from aiohttp import ClientConnectorError
+from aiohttp import ClientConnectorError, ServerDisconnectedError
 
 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
@@ -196,10 +196,14 @@ async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
             status_code = response.status
             content_type = response.headers.get('content-type')
 
-            if status_code != 200:
+            if response.status == 302:
+                return
+            if response.status != 200:
+                content = await response.text()
+                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                 return
 
             if 'text/html' in content_type:
@@ -234,7 +238,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         logger.warning('Codec can\'t decode byte. So its was a tgs file')
     except ClientConnectorError:
         logger.warning(f'Wrong link: {url}')
-    except TimeoutError:
+    except (ServerDisconnectedError, TimeoutError):
         logger.warning(f'Retrying {url}')
         VISITED_LINKS.remove(url)
         await asyncio.gather(crawl(url, session))
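A hedged sketch of the retry path in the second file after this change: a ServerDisconnectedError is now handled like a timeout, i.e. the URL is removed from VISITED_LINKS and crawled again, while an unreachable host is only logged. VISITED_LINKS, TIMEOUT, the logger, and the simplified request body are assumptions standing in for the rest of the module; the sketch also catches asyncio.TimeoutError, since on Python versions before 3.11 aiohttp timeouts raise that rather than the builtin TimeoutError.

# Sketch of the retry handling after this commit; VISITED_LINKS, TIMEOUT,
# PROTOCOL and the logger are assumed stand-ins for the module's real ones.
import asyncio
import logging

import aiohttp
from aiohttp import ClientConnectorError, ServerDisconnectedError

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('crawler-sketch')

PROTOCOL = 'https://'
TIMEOUT = aiohttp.ClientTimeout(total=10)
VISITED_LINKS = set()


async def crawl(url: str, session: aiohttp.ClientSession):
    if url in VISITED_LINKS:
        return
    VISITED_LINKS.add(url)

    try:
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
            await response.read()  # page handling elided in this sketch
    except ClientConnectorError:
        # broken or unreachable link: log it and move on
        logger.warning(f'Wrong link: {url}')
    except (ServerDisconnectedError, TimeoutError, asyncio.TimeoutError):
        # transient failure: forget the visit so the URL can be crawled again
        logger.warning(f'Retrying {url}')
        VISITED_LINKS.remove(url)
        await asyncio.gather(crawl(url, session))

Only the exception tuple and the ServerDisconnectedError import actually changed in the commit; the warning, the removal from VISITED_LINKS, and the recursive asyncio.gather(crawl(...)) call were already present as context lines.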