mirror of https://github.com/MarshalX/telegram-crawler.git (synced 2025-02-01 04:43:52 +01:00)
add max count of attempts with log delays

commit c54cccc460 (parent: f0ce4fe1dc)
1 changed file with 13 additions and 4 deletions
@@ -1,6 +1,7 @@
 import asyncio
 import hashlib
 import json
+import math
 import logging
 import os
 import platform
@@ -493,13 +494,21 @@ class RetryError(Exception):


 async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str = OUTPUT_SITES_FOLDER):
-    while True:
+    attempt = 0
+    while attempt < 100:  # around 6 minutes
+        attempt += 1
         try:
             await _crawl(url, session, output_dir)
-        except (RetryError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
-            logger.warning(f'Client or timeout error. Retrying {url}')
+        except (RetryError, ServerDisconnectedError, TimeoutError, ClientConnectorError) as e:
+            logger.warning(f'Client or timeout error ({repr(e)}). Retrying {url}')
         else:
-            break
+            return
+
+        delay = math.log(attempt)
+        logger.info(f'Sleep for {delay}. Attempt {attempt}. URL: {url}')
+        await asyncio.sleep(delay)
+
+    logger.info(f'Max amount of attempts has been reached ({url})')


 async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
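The "around 6 minutes" comment can be checked directly: the loop sleeps math.log(attempt) seconds after each failed attempt, so the worst-case total back-off is the sum of ln(n) for n = 1..100, i.e. ln(100!) ≈ 364 s. A minimal standalone sketch (not part of the commit) that verifies the figure:

import math

# Worst case: all 100 attempts fail, sleeping math.log(attempt) seconds
# after each one, exactly as in the crawl() loop above.
total = sum(math.log(attempt) for attempt in range(1, 101))
print(f'total back-off: {total:.1f} s (~{total / 60:.1f} min)')
# total back-off: 363.7 s (~6.1 min)

Note also that the success path now returns from crawl() instead of breaking out of the loop, so the "Max amount of attempts has been reached" line is logged only when all 100 attempts have genuinely been exhausted.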