mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2025-03-15 13:22:43 +01:00
Fix resending of requests;
fix timeout settings; fix handling of invalid URLs; change the schedule of list generation for testing.
This commit is contained in:
parent
eb723304ce
commit
2dffddeaae
2 changed files with 6 additions and 3 deletions
|
@@ -2,7 +2,7 @@ name: Generate or update list of tracked links
|
|||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 * * * *'
|
||||
- cron: '* * * * *'
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
|
|
@@ -179,7 +179,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
|
||||
try:
|
||||
logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
|
||||
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
|
||||
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
|
||||
status_code = response.status
|
||||
content_type = response.headers.get('content-type')
|
||||
|
||||
|
@@ -214,8 +214,11 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
|
||||
except UnicodeDecodeError:
|
||||
logger.warning('Codec can\'t decode byte. So its was a tgs file')
|
||||
except (TimeoutError, ClientConnectorError):
|
||||
except ClientConnectorError:
|
||||
logger.warning(f'Wrong link: {url}')
|
||||
except TimeoutError:
|
||||
logger.warning(f'Retrying {url}')
|
||||
VISITED_LINKS.remove(url)
|
||||
await asyncio.gather(crawl(url, session))
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue