mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2025-03-24 00:59:25 +01:00
add logging of skips
This commit is contained in:
parent
cbf993be5e
commit
0b86273326
2 changed files with 13 additions and 4 deletions
|
@ -38,7 +38,12 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
try:
|
try:
|
||||||
logger.info(f'Process {url}')
|
logger.info(f'Process {url}')
|
||||||
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
|
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
|
||||||
|
if response.status == 302:
|
||||||
|
return
|
||||||
|
|
||||||
if response.status != 200:
|
if response.status != 200:
|
||||||
|
content = await response.text()
|
||||||
|
logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
|
||||||
return
|
return
|
||||||
|
|
||||||
# bypass external slashes and so on
|
# bypass external slashes and so on
|
||||||
|
|
|
@ -8,7 +8,7 @@ from time import time
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
from aiohttp import ClientConnectorError
|
from aiohttp import ClientConnectorError, ServerDisconnectedError
|
||||||
|
|
||||||
PROTOCOL = 'https://'
|
PROTOCOL = 'https://'
|
||||||
BASE_URL = 'telegram.org'
|
BASE_URL = 'telegram.org'
|
||||||
|
@ -196,10 +196,14 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
try:
|
try:
|
||||||
logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
|
logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
|
||||||
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
|
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
|
||||||
status_code = response.status
|
|
||||||
content_type = response.headers.get('content-type')
|
content_type = response.headers.get('content-type')
|
||||||
|
|
||||||
if status_code != 200:
|
if response.status == 302:
|
||||||
|
return
|
||||||
|
|
||||||
|
if response.status != 200:
|
||||||
|
content = await response.text()
|
||||||
|
logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
|
||||||
return
|
return
|
||||||
|
|
||||||
if 'text/html' in content_type:
|
if 'text/html' in content_type:
|
||||||
|
@ -234,7 +238,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
logger.warning('Codec can\'t decode byte. So its was a tgs file')
|
logger.warning('Codec can\'t decode byte. So its was a tgs file')
|
||||||
except ClientConnectorError:
|
except ClientConnectorError:
|
||||||
logger.warning(f'Wrong link: {url}')
|
logger.warning(f'Wrong link: {url}')
|
||||||
except TimeoutError:
|
except (ServerDisconnectedError, TimeoutError):
|
||||||
logger.warning(f'Retrying {url}')
|
logger.warning(f'Retrying {url}')
|
||||||
VISITED_LINKS.remove(url)
|
VISITED_LINKS.remove(url)
|
||||||
await asyncio.gather(crawl(url, session))
|
await asyncio.gather(crawl(url, session))
|
||||||
|
|
Loading…
Add table
Reference in a new issue