From 0b8627332639840eabbead5c397423f1b2f40946 Mon Sep 17 00:00:00 2001
From: "Il'ya (Marshal)"
Date: Mon, 21 Jun 2021 16:09:49 +0200
Subject: [PATCH] add logging of skips

---
 make_files_tree.py         |  5 +++++
 make_tracked_links_list.py | 12 ++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/make_files_tree.py b/make_files_tree.py
index fe74888c1a..6e38aabb77 100644
--- a/make_files_tree.py
+++ b/make_files_tree.py
@@ -38,7 +38,12 @@ async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
+            if response.status == 302:
+                return
+
             if response.status != 200:
+                content = await response.text()
+                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                 return
 
             # bypass external slashes and so on
diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py
index 6d3e174099..368923df9b 100644
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@@ -8,7 +8,7 @@ from time import time
 from urllib.parse import unquote
 
 import aiohttp
-from aiohttp import ClientConnectorError
+from aiohttp import ClientConnectorError, ServerDisconnectedError
 
 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
@@ -196,10 +196,14 @@ async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
-            status_code = response.status
             content_type = response.headers.get('content-type')
 
-            if status_code != 200:
+            if response.status == 302:
+                return
+
+            if response.status != 200:
+                content = await response.text()
+                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                 return
 
             if 'text/html' in content_type:
@@ -234,7 +238,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         logger.warning('Codec can\'t decode byte. So its was a tgs file')
     except ClientConnectorError:
         logger.warning(f'Wrong link: {url}')
-    except TimeoutError:
+    except (ServerDisconnectedError, TimeoutError):
         logger.warning(f'Retrying {url}')
         VISITED_LINKS.remove(url)
         await asyncio.gather(crawl(url, session))
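
For context, a minimal, self-contained sketch of how the status handling and
retry path in make_tracked_links_list.py read once this patch is applied.
PROTOCOL, VISITED_LINKS, logger and the log messages mirror identifiers
visible in the hunks above; the TIMEOUT value, the VISITED_LINKS bookkeeping
at the top of crawl(), and the elided parsing step are assumptions, since
they live outside the changed lines.

import asyncio
import logging

import aiohttp
from aiohttp import ClientConnectorError, ServerDisconnectedError

PROTOCOL = 'https://'
TIMEOUT = aiohttp.ClientTimeout(total=30)  # placeholder; the repo defines its own TIMEOUT
VISITED_LINKS = set()

logger = logging.getLogger(__name__)


async def crawl(url: str, session: aiohttp.ClientSession):
    VISITED_LINKS.add(url)  # assumed here so the retry's remove() below is safe
    try:
        logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
            if response.status == 302:
                # redirects are requested with allow_redirects=False and are
                # expected, so they are skipped without logging
                return

            if response.status != 200:
                # new in this commit: keep the response body so skipped URLs
                # can be diagnosed from the debug log
                content = await response.text()
                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                return

            ...  # parse the page and queue discovered links (elided)
    except ClientConnectorError:
        logger.warning(f'Wrong link: {url}')
    except (ServerDisconnectedError, asyncio.TimeoutError):
        # the patch catches the built-in TimeoutError; asyncio.TimeoutError is
        # used in this sketch for pre-3.11 Pythons, where the two differ
        # (they are the same class on 3.11+)
        logger.warning(f'Retrying {url}')
        VISITED_LINKS.remove(url)
        await asyncio.gather(crawl(url, session))

The design split is that 302 responses return silently, since skipping
redirects is intentional behaviour, while every other non-200 status now
records the response body at debug level before the URL is dropped, and a
dropped server connection is treated like a timeout: the URL is removed from
VISITED_LINKS and crawled again.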