From cd5653605f3e1d71f7470876523efa00a594d8d0 Mon Sep 17 00:00:00 2001
From: "Il'ya (Marshal)"
Date: Sat, 24 Apr 2021 11:29:19 +0200
Subject: [PATCH] rewrite logic of rules system (now it uses regex); fix
 trailing slash handling; retry requests on connection errors

---
 make_tracked_links_list.py | 127 ++++++++++++++++++++-----------------
 1 file changed, 68 insertions(+), 59 deletions(-)

diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py
index 65c2b727f8..7acde345b6 100644
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@@ -7,65 +7,59 @@ from time import time
 from urllib.parse import unquote
 
 import aiohttp
+from aiohttp import ClientConnectorError
 
 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
 # its necessary to help crawler to find more links
 HIDDEN_URLS = {
-    'corefork.telegram.org',
+    # 'corefork.telegram.org',
 
     'telegram.org/privacy/gmailbot',
     'telegram.org/tos',
     'telegram.org/tour',
 
-    'translations.telegram.org',
-    'translations.telegram.org/en/android',
-    'translations.telegram.org/en/ios',
-    'translations.telegram.org/en/tdesktop',
-    'translations.telegram.org/en/macos',
-    'translations.telegram.org/en/android_x',
+    'desktop.telegram.org/changelog',
 }
 BASE_URL_REGEX = r'telegram.org'
 
-# disable crawling sub links for specific domains and url patches
-EXCLUDE_RULES = {
-    # '*' means exclude all
+# disable crawling sub links for specific domains and url patterns
+CRAWL_RULES = {
+    # every rule is a regex
+    # empty string means match any url
+    # allow rules have higher priority than deny rules
     'translations.telegram.org': {
-        # 'max_count_of_slashes': 3,
-        'patches': {
-            '*',
+        'allow': {
+            r'^[^/]*$',  # root
+            r'org/[^/]*/$',  # 1 lvl sub
+            r'/en/[a-z_]+/$'  # 1 lvl after /en/
+        },
+        'deny': {
+            '',  # all
         }
     },
-    'bugs.telegram.org': {
-        'patches': {
-            'c/',
+    'bugs.telegram.org': {  # crawl first page of cards sorted by rating
+        'deny': {
+            r'/c/[0-9]+/[0-9]+',  # disable comments
         },
     },
     'instantview.telegram.org': {
-        'patches': {
+        'allow': {
+            'contest/winners'
+        },
+        'deny': {
             'file/',
-            'templates/',
+            r'templates/.+',
             'samples/',
             'contest/',
         },
     },
-    'corefork.telegram.org': {
-        'patches': {
-            'file/',
-
-            'tdlib/docs/',
-
-            'constructor/',
-            'method/',
-            'type/',
-        },
-    },
     'core.telegram.org': {
-        'patches': {
+        'deny': {
             'file/',
 
-            'tdlib/docs/',
+            'tdlib/docs/classtd',
 
             'constructor/',
             'method/',
@@ -73,7 +67,7 @@ EXCLUDE_RULES = {
         },
     },
     'telegram.org': {
-        'patches': {
+        'deny': {
             'file/',
         },
     }
@@ -89,6 +83,7 @@ OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
 
 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False)
+TIMEOUT = aiohttp.ClientTimeout(total=30)
 
 logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
@@ -97,25 +92,28 @@ VISITED_LINKS = set()
 LINKS_TO_TRACK = set()
 
 
-def should_exclude(url: str, direct_link=None) -> bool:
-    if not direct_link:
-        direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
-    domain_exclude_rules = EXCLUDE_RULES.get(direct_link, dict())
+def should_exclude(url: str) -> bool:
+    direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
+    domain_rules = CRAWL_RULES.get(direct_link)
+    if not domain_rules:
+        return False
 
-    max_count_of_slashes = domain_exclude_rules.get('max_count_of_slashes')
-    exclude_patches = domain_exclude_rules.get('patches', set())
+    allow_rules = domain_rules.get('allow', set())
+    deny_rules = domain_rules.get('deny', set())
 
-    if '*' in exclude_patches:
-        return True
+    exclude = False
 
-    if max_count_of_slashes and max_count_of_slashes < url.count('/'):
-        return True
+    for regex in deny_rules:
+        if re.search(regex, url):
+            exclude = True
+            break
 
-    for path in exclude_patches:
-        if path in url:
-            return True
+    for regex in allow_rules:
+        if re.search(regex, url):
+            exclude = False
+            break
 
-    return False
+    return exclude
 
 
 def find_absolute_links(html: str) -> set[str]:
@@ -136,17 +134,16 @@ def find_relative_links(html: str, cur_link: str) -> set[str]:
 
     links = re.findall(regex, html)
     for link in links:
-        if should_exclude(link, direct_cur_link):
-            continue
-
+        # bypass //www.apple and etc shit ;d
         if link.startswith('/'):
-            # bypass //www.apple and etc shit ;d
+            # absolute links starting with double slash
            if find_absolute_links(link):
-                # absolute links starting with double slash
-                if not should_exclude(link):
+                if not should_exclude(link[1::]):
                     relative_links.add(link[1::])
         else:
-            relative_links.add(f'{direct_cur_link}/{link}')
+            url = f'{direct_cur_link}/{link}'
+            if not should_exclude(url):
+                relative_links.add(url)
 
     return relative_links
 
@@ -173,11 +170,10 @@ def cleanup_links(links: set[str]) -> set[str]:
 
 
 async def crawl(url: str, session: aiohttp.ClientSession):
-    if url.endswith('/'):
-        url = url[:-1:]
-    if url in VISITED_LINKS or '"' in url:
+    without_trailing_slash = url[:-1:] if url.endswith('/') else url
+    if without_trailing_slash in VISITED_LINKS:
         return
-    VISITED_LINKS.add(url)
+    VISITED_LINKS.add(without_trailing_slash)
 
     try:
         logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
@@ -185,6 +181,17 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             status_code = response.status
             content_type = response.headers.get('content-type')
 
+            # if it was a redirect to a link with a trailing slash - handle this url
+            if 300 < status_code < 400:
+                location = response.headers.get('location', '')
+                # todo rewrite logic
+                if without_trailing_slash in location:
+                    if not should_exclude(location):
+                        # nice shit bro
+                        logger.info(f'Trailing slash. {location}')
+                        cleaned_link = list(cleanup_links({location}))[0]
+                        await asyncio.gather(crawl(cleaned_link, session))
+
             if status_code != 200:
                 return
 
@@ -206,8 +213,10 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             else:
                 # TODO track hashes of image/svg/video content types
                 logger.info(f'Unhandled type: {content_type}')
-    except:
-        logger.warning('Mb codec can\'t decode byte. So its was a tgs file')
+    except UnicodeDecodeError:
+        logger.warning('Codec can\'t decode byte. So it was a tgs file')
+    except ClientConnectorError:
+        await asyncio.gather(crawl(url, session))
 
 
 async def start(url_list: set[str]):
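
Note on the new rules system: CRAWL_RULES replaces EXCLUDE_RULES and every entry is now a regex. should_exclude() first marks a url as excluded if any 'deny' pattern matches, then clears the flag if any 'allow' pattern also matches, so allow rules win over deny rules. The snippet below is a minimal standalone sketch of that resolution order; the resolve() helper and the __main__ demo are illustrative only and not part of the patch, while the rule set is copied from the translations.telegram.org entry above.

import re

# rule set copied from the patch: deny '' matches every url,
# the allow patterns carve out the pages that should still be crawled
RULES = {
    'translations.telegram.org': {
        'allow': {
            r'^[^/]*$',        # root
            r'org/[^/]*/$',    # 1 lvl sub
            r'/en/[a-z_]+/$',  # 1 lvl after /en/
        },
        'deny': {
            '',  # all
        },
    },
}


def resolve(domain: str, url: str) -> bool:
    """Return True if the url should be excluded, mirroring should_exclude()."""
    rules = RULES.get(domain)
    if not rules:
        return False

    excluded = any(re.search(regex, url) for regex in rules.get('deny', set()))
    if excluded and any(re.search(regex, url) for regex in rules.get('allow', set())):
        excluded = False  # an allow match overrides a deny match
    return excluded


if __name__ == '__main__':
    print(resolve('translations.telegram.org', 'translations.telegram.org'))              # False - root is allowed
    print(resolve('translations.telegram.org', 'translations.telegram.org/en/android/'))  # False - 1 lvl after /en/
    print(resolve('translations.telegram.org', 'translations.telegram.org/en/android/untracked_string'))  # True - denied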