From 1f8a2678bafabc404303ed5c3b003d591c581c16 Mon Sep 17 00:00:00 2001
From: "Ilya (Marshal)"
Date: Sun, 2 Jun 2024 16:05:39 +0200
Subject: [PATCH] control concurrent requests; a little optimization; upgrade python and deps

---
 .github/workflows/make_tracked_links_list.yml |  6 +-
 make_tracked_links_list.py                    | 67 ++++++++++++-------
 requirements.txt                              |  6 +-
 tracked_links.txt                             |  2 -
 4 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/make_tracked_links_list.yml b/.github/workflows/make_tracked_links_list.yml
index a10ee5d535..c94f3eddbf 100644
--- a/.github/workflows/make_tracked_links_list.yml
+++ b/.github/workflows/make_tracked_links_list.yml
@@ -18,14 +18,14 @@ jobs:
 
     steps:
       - name: Clone.
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
         with:
           token: ${{ secrets.PAT }}
 
       - name: Setup Python.
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: 3.12
 
       - name: Install dependencies.
         run: |
diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py
index 22c261affe..3dc6c058e2 100644
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@@ -2,7 +2,9 @@ import asyncio
 import logging
 import os
 import re
+from asyncio import Queue
 from asyncio.exceptions import TimeoutError
+from functools import cache
 from html import unescape
 from time import time
 from typing import Set
@@ -11,6 +13,7 @@ from urllib.parse import unquote
 
 import aiohttp
 from aiohttp import ClientConnectorError, ServerDisconnectedError
 
+
 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
 # it's necessary to help crawler to find more links
@@ -21,11 +24,12 @@ HIDDEN_URLS = {
     'corefork.telegram.org/getProxyConfig',
 
     'telegram.org/privacy/gmailbot',
-    'telegram.org/tos',
     'telegram.org/tos/mini-apps',
     'telegram.org/tos/p2pl',
     'telegram.org/tour',
     'telegram.org/evolution',
+    'telegram.org/tos/bots',
+    'telegram.org/tos/business',
 
     'desktop.telegram.org/changelog',
     'td.telegram.org/current',
@@ -133,6 +137,8 @@ CRAWL_RULES = {
         r'apps$',
         r'img/emoji/.+',
         r'img/StickerExample.psd$',
+        r'/privacy$',  # geolocation-dependent
+        r'/tos$',  # geolocation-dependent
     },
 },
 'webz.telegram.org': {
@@ -180,7 +186,7 @@ HEADERS = {
     'TE': 'trailers',
 }
 
-logging.basicConfig(format='%(message)s', level=logging.DEBUG)
+logging.basicConfig(format='%(asctime)s %(levelname)s - %(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 VISITED_LINKS = set()
@@ -188,7 +194,11 @@ LINKS_TO_TRACK = set()
 LINKS_TO_TRANSLATIONS = set()
 LINKS_TO_TRACKABLE_RESOURCES = set()
 
+WORKERS_COUNT = 30
+WORKERS_TASK_QUEUE = Queue()
+
 
+@cache
 def should_exclude(url: str) -> bool:
     direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
     domain_rules = CRAWL_RULES.get(direct_link)
@@ -210,6 +220,9 @@ def should_exclude(url: str) -> bool:
                 exclude = False
                 break
 
+    if exclude:
+        logger.debug('Exclude %s by rules', url)
+
     return exclude
@@ -254,7 +267,7 @@ def find_relative_scripts(code: str, cur_link: str) -> Set[str]:
             # dirty magic for specific cases
             if '/' in link:  # path to file from the root
                 url = f'{direct_cur_link}/{link}'
-            else:  # its relative link from current folder. Not from the root
+            else:  # it is a relative link from the current folder, not from the root
                 current_folder_link, *_ = cur_link.rsplit('/', 1)
                 url = f'{current_folder_link}/{link}'
@@ -341,17 +354,18 @@ class ServerSideError(Exception):
     pass
 
 
-async def crawl(url: str, session: aiohttp.ClientSession):
-    while True:
+async def crawl_worker(session: aiohttp.ClientSession):
+    while not WORKERS_TASK_QUEUE.empty():
+        url = WORKERS_TASK_QUEUE.get_nowait()
+
         try:
             await _crawl(url, session)
         except (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}')
+            WORKERS_TASK_QUEUE.put_nowait(url)
             if url in VISITED_LINKS:
                 VISITED_LINKS.remove(url)
-        else:
-            break
 
 
 async def _crawl(url: str, session: aiohttp.ClientSession):
@@ -360,7 +374,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
     VISITED_LINKS.add(url)
 
     try:
-        logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
+        logger.debug('[%s] Process %s', len(VISITED_LINKS), url)
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
             content_type = response.headers.get('content-type')
@@ -372,20 +386,20 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
             if response.status not in {200, 304}:
                 if response.status != 302:
                     content = await response.text(encoding='UTF-8')
-                    logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
+                    logger.warning(f'Skip {url} because status code == {response.status}. Content: {content}')
 
                 return
 
             if is_textable_content_type(content_type):
-                # raw content will be cached by aiohttp. Don't worry about it
+                # aiohttp will cache the raw content; we don't need to worry about it
                 raw_content = await response.read()
                 content = await response.text(encoding='UTF-8')
 
                 if is_translation_url(url):
                     LINKS_TO_TRANSLATIONS.add(url)
-                    logger.info(f'add {url} to LINKS_TO_TRANSLATIONS')
+                    logger.debug('Add %s to LINKS_TO_TRANSLATIONS', url)
                 else:
                     LINKS_TO_TRACK.add(url)
-                    logger.info(f'add {url} to LINKS_TO_TRACK')
+                    logger.debug('Add %s to LINKS_TO_TRACK', url)
 
                 absolute_links = cleanup_links(find_absolute_links(content))
@@ -396,33 +410,40 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
                 relative_links = cleanup_links(relative_links_finder(content, url))
 
                 sub_links = absolute_links | relative_links
-                await asyncio.gather(*[crawl(url, session) for url in sub_links])
+                for sub_url in sub_links:
+                    if sub_url not in VISITED_LINKS:
+                        WORKERS_TASK_QUEUE.put_nowait(sub_url)
             elif is_trackable_content_type(content_type):
                 LINKS_TO_TRACKABLE_RESOURCES.add(url)
-                logger.info(f'add {url} to LINKS_TO_TRACKABLE_RESOURCES')
+                logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES', url)
             else:
                 # for example, zip with update of macOS client
-                logger.info(f'Unhandled type: {content_type} from {url}')
+                logger.warning(f'Unhandled type: {content_type} from {url}')
 
-            # telegram url can work with and without trailing slash (no redirect). P.S. not on every subdomain ;d
-            # so this is a problem when we have random behavior with link will be added
-            # this if resolve this issue. If available both link we prefer without trailing slash
+            # Telegram URLs can work both with and without a trailing slash (no redirect).
+            # Note: not on every subdomain ;d
+            # This causes random behavior in which variant of a link gets added.
+            # The check below resolves it: when both variants are present,
+            # we keep the one without a trailing slash.
             for links_set in (LINKS_TO_TRACK, LINKS_TO_TRANSLATIONS, LINKS_TO_TRACKABLE_RESOURCES):
                 without_trailing_slash = url[:-1:] if url.endswith('/') else url
                 if without_trailing_slash in links_set and f'{without_trailing_slash}/' in links_set:
                     links_set.remove(f'{without_trailing_slash}/')
-                    logger.info(f'remove {without_trailing_slash}/')
+                    logger.debug('Remove %s/', without_trailing_slash)
     except UnicodeDecodeError:
         logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
 
         if raw_content.startswith(b'GIF'):
             LINKS_TO_TRACKABLE_RESOURCES.add(url)
-            logger.info(f'add {url} to LINKS_TO_TRACKABLE_RESOURCES (raw content)')
+            logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES (raw content)', url)
 
 
 async def start(url_list: Set[str]):
+    for url in url_list:
+        WORKERS_TASK_QUEUE.put_nowait(url)
+
     async with aiohttp.ClientSession(connector=CONNECTOR, headers=HEADERS) as session:
-        await asyncio.gather(*[crawl(url, session) for url in url_list])
+        await asyncio.gather(*[crawl_worker(session) for _ in range(WORKERS_COUNT)])
 
 
 if __name__ == '__main__':
@@ -443,8 +464,8 @@ if __name__ == '__main__':
         CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS
 
         logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}')
-        logger.info(f'Deleted: {OLD_URL_LIST - CURRENT_URL_LIST}')
-        logger.info(f'Added: {CURRENT_URL_LIST - OLD_URL_LIST}')
+        logger.info(f'Deleted ({len(OLD_URL_LIST - CURRENT_URL_LIST)}): {OLD_URL_LIST - CURRENT_URL_LIST}')
+        logger.info(f'Added ({len(CURRENT_URL_LIST - OLD_URL_LIST)}): {CURRENT_URL_LIST - OLD_URL_LIST}')
     except IOError:
         pass
diff --git a/requirements.txt b/requirements.txt
index c693fa3ed4..404703675b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-aiohttp==3.7.4.post0
-aiodns==3.0.0
+aiohttp==3.9.5
+aiodns==3.2.0
 aiofiles==0.6.0
 git+https://github.com/MarshalX/pyrogram
 TgCrypto==1.2.3
 beautifulsoup4==4.11.1
 cssutils==2.4.2
 requests==2.31.0
-# uvloop==0.16.0
+# uvloop==0.19.0
diff --git a/tracked_links.txt b/tracked_links.txt
index 07cfa1df92..a187701030 100644
--- a/tracked_links.txt
+++ b/tracked_links.txt
@@ -7558,11 +7558,9 @@ telegram.org/js/tgsticker-worker.js
 telegram.org/js/tgsticker.js
 telegram.org/js/widget-frame.js
 telegram.org/press
-telegram.org/privacy
 telegram.org/privacy/gmailbot
 telegram.org/support
 telegram.org/t.me/PremiumBot
-telegram.org/tos
 telegram.org/tos/bot-developers
 telegram.org/tos/bots
 telegram.org/tos/business
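
Note on the concurrency change (an explanatory sketch, not part of the patch): the old code spawned one crawl() task per discovered link via asyncio.gather, so the number of in-flight requests was unbounded. The patch switches to a fixed pool of WORKERS_COUNT workers draining a shared asyncio.Queue, which caps concurrency at the pool size. Below is a minimal, runnable sketch of the same pattern; the names (worker, main), the example URLs, and the sleep stand-in for the HTTP request are all illustrative, not taken from the repo:

    import asyncio

    WORKER_COUNT = 5  # illustrative; the patch uses WORKERS_COUNT = 30

    async def worker(name: str, queue: asyncio.Queue) -> None:
        # Pull items until the queue is drained; on failure, re-enqueue
        # the item for another attempt, mirroring crawl_worker() above.
        while not queue.empty():
            url = queue.get_nowait()
            try:
                await asyncio.sleep(0.1)  # stand-in for the real HTTP request
                print(f'{name} processed {url}')
            except Exception:
                queue.put_nowait(url)

    async def main() -> None:
        queue: asyncio.Queue = asyncio.Queue()
        for i in range(20):
            queue.put_nowait(f'example.org/page/{i}')  # hypothetical URLs

        # Concurrency is bounded by the pool size rather than by the
        # number of discovered links.
        await asyncio.gather(*(worker(f'worker-{i}', queue) for i in range(WORKER_COUNT)))

    asyncio.run(main())

One property of the "while not queue.empty()" exit condition: a worker may finish while a peer is still mid-request and about to enqueue new sub-links. The crawl still completes, because the enqueuing worker re-checks the queue after each item it processes, but parallelism can taper off toward the end of a run.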