Mirror of https://github.com/MarshalX/telegram-crawler.git, synced 2025-03-15 13:22:43 +01:00
stel dev layer, headers, loop instead of rec; workflow timeout; aiohttp limit increase.
parent cdc433274c
commit c6e13dc54d
4 changed files with 116 additions and 67 deletions
.github/workflows/make_files_tree.yml (vendored): 2 changes
@@ -12,7 +12,7 @@ jobs:
   fetch_new_content:
     name: Make files tree
     runs-on: macos-10.15
-    # timeout-minutes: 10
+    timeout-minutes: 10
 
     steps:
@@ -44,9 +44,27 @@ SPARKLE_SE_REGEX = r';se=(.*?);'
 SPARKLE_SIG_TEMPLATE = f';sig={DYNAMIC_PART_MOCK};'
 SPARKLE_SE_TEMPLATE = f';se={DYNAMIC_PART_MOCK};'
 
+stel_dev_layer = 132
+
 # unsecure but so simple
-CONNECTOR = aiohttp.TCPConnector(ssl=False)
+CONNECTOR = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
 TIMEOUT = aiohttp.ClientTimeout(total=10)
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'DNT': '1',
+    'Connection': 'keep-alive',
+    'Cookie': f'stel_ln=en; stel_dev_layer={stel_dev_layer}',
+    'Upgrade-Insecure-Requests': '1',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'none',
+    'Sec-Fetch-User': '?1',
+    'Cache-Control': 'max-age=0',
+    'TE': 'trailers',
+}
 
 logging.basicConfig(format='%(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)
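Both crawler scripts gain the same networking setup here: a TCPConnector with forced connection close and a higher simultaneous-connection limit, a 10-second total timeout, and browser-like request headers that carry the stel_dev_layer cookie. Below is a minimal, self-contained sketch of how such a setup is typically wired into an aiohttp session; the target URL and the trimmed header set are illustrative only and not taken from the diff.

    import asyncio
    import aiohttp

    STEL_DEV_LAYER = 132  # mirrors the stel_dev_layer constant added by the commit

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0',
        'Cookie': f'stel_ln=en; stel_dev_layer={STEL_DEV_LAYER}',
    }

    async def main() -> None:
        # force_close=True disables keep-alive reuse of connections;
        # limit=300 raises the connector's cap on simultaneous connections (default 100).
        connector = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(connector=connector, headers=HEADERS, timeout=timeout) as session:
            async with session.get('https://core.telegram.org') as response:  # example URL
                print(response.status, response.content_type)

    if __name__ == '__main__':
        asyncio.run(main())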
@@ -56,7 +74,7 @@ def get_hash(data: bytes) -> str:
     return hashlib.sha256(data).hexdigest()
 
 
-async def download_file(url, path, session):
+async def download_file(url: str, path: str, session: aiohttp.ClientSession):
     async with session.get(url) as response:
         if response.status != 200:
             return
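The download_file hunk only shows the new type-annotated signature and the status check. A hedged sketch of what a complete body along those lines could look like follows; the write step is an assumption, since the diff truncates the function here.

    import aiofiles
    import aiohttp

    async def download_file(url: str, path: str, session: aiohttp.ClientSession) -> None:
        async with session.get(url) as response:
            if response.status != 200:
                return
            # Assumed continuation: write the response body to disk in binary mode.
            async with aiofiles.open(path, 'wb') as f:
                await f.write(await response.read())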
@@ -437,70 +455,72 @@ class RetryError(Exception):
 
 
-async def crawl(url: str, session: aiohttp.ClientSession):
-    try:
-        logger.info(f'Process {url}')
-        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
-            if response.status // 100 == 5:
-                msg = f'Error 5XX. Retrying {url}'
-                logger.warning(msg)
-                raise RetryError(msg)
-
-            if response.status not in {200, 304}:
-                if response.status != 302:
-                    content = await response.text()
-                    logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
-                return
-
-            # bypass external slashes and so on
-            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
-
-            is_hashable_only = is_hashable_only_content_type(response.content_type)
-            # amazing dirt for media files like
-            # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
-            # with response content type HTML instead of image. Shame on you
-            # sometimes it returns correct type. noice load balancing
-            is_sucking_file = '/file/' in url and 'text' in response.content_type
-
-            # handle pure domains and html pages without ext in url
-            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
-
-            # I don't add ext by content type for images and so on cuz TG servers sucks.
-            # Some servers do not return correct content type. Some servers do...
-            if is_hashable_only or is_sucking_file:
-                ext = ''
-
-            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
-
-            if is_sucking_file or is_hashable_only:
-                content = await response.read()
-                async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
-                    await f.write(get_hash(content))
-                return
-
-            content = await response.text(encoding='UTF-8')
-            if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
-                content = await collect_translations_paginated_content(url, session)
-
-            content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
-            content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
-            content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
-            content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
-            content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
-            content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
-            content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
-            content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
-
-            async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
-                logger.info(f'Write to {filename}')
-                await f.write(content)
-    except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
-        logger.warning(f'Client or timeout error. Retrying {url}')
-        await crawl(url, session)
+async def crawl(url: str, session: aiohttp.ClientSession):
+    # f*ck this shit. I believe it's temp solution
+    if 'css/telegram.css' in url:
+        return
+
+    ok = False
+    while not ok:
+        try:
+            await _crawl(url, session)
+            ok = True
+        except (RetryError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
+            logger.warning(f'Client or timeout error. Retrying {url}')
+
+
+async def _crawl(url: str, session: aiohttp.ClientSession):
+    logger.info(f'Process {url}')
+    async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT, headers=HEADERS) as response:
+        if response.status // 100 == 5:
+            msg = f'Error 5XX. Retrying {url}'
+            logger.warning(msg)
+            raise RetryError(msg)
+
+        if response.status not in {200, 304}:
+            if response.status != 302:
+                content = await response.text()
+                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
+            return
+
+        # bypass external slashes and so on
+        url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
+
+        is_hashable_only = is_hashable_only_content_type(response.content_type)
+        # amazing dirt for media files like
+        # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
+        # with response content type HTML instead of image. Shame on you
+        # sometimes it returns correct type. noice load balancing
+        is_sucking_file = '/file/' in url and 'text' in response.content_type
+
+        # handle pure domains and html pages without ext in url
+        ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
+
+        # I don't add ext by content type for images and so on cuz TG servers sucks.
+        # Some servers do not return correct content type. Some servers do...
+        if is_hashable_only or is_sucking_file:
+            ext = ''
+
+        filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+        if is_sucking_file or is_hashable_only:
+            content = await response.read()
+            async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
+                await f.write(get_hash(content))
+            return
+
+        content = await response.text(encoding='UTF-8')
+        if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
+            content = await collect_translations_paginated_content(url, session)
+
+        content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
+        content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
+        content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
+        content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
+        content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
+        content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
+        content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
+        content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
+
+        async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
+            logger.info(f'Write to {filename}')
+            await f.write(content)
 
 
 async def start(url_list: set[str], mode: int):
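This is the "loop instead of rec" part of the commit: instead of crawl awaiting itself after every client or timeout error, which grows the coroutine chain with each retry, a thin crawl wrapper now loops until a single _crawl attempt succeeds, and a 5XX answer surfaces as RetryError so it joins the same retry path. Below is a distilled sketch of the pattern, with fetch_once standing in for _crawl and built-in exceptions standing in for the aiohttp errors.

    import asyncio
    import logging
    from typing import Awaitable, Callable

    logger = logging.getLogger(__name__)

    class RetryError(Exception):
        """Raised for 5XX responses that should simply be retried."""

    async def crawl(url: str, fetch_once: Callable[[str], Awaitable[None]]) -> None:
        # Loop-based retry: a failed attempt repeats the loop body in the same
        # stack frame instead of recursing with `await crawl(url, session)`.
        ok = False
        while not ok:
            try:
                await fetch_once(url)
                ok = True
            except (RetryError, asyncio.TimeoutError, ConnectionError):
                logger.warning(f'Client or timeout error. Retrying {url}')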
@@ -137,9 +137,27 @@ DOM_ATTRS = ['href', 'src']
 OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
 COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OUTPUT_FILENAME)
 
+stel_dev_layer = 132
+
 # unsecure but so simple
-CONNECTOR = aiohttp.TCPConnector(ssl=False)
+CONNECTOR = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
 TIMEOUT = aiohttp.ClientTimeout(total=10)
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'DNT': '1',
+    'Connection': 'keep-alive',
+    'Cookie': f'stel_ln=en; stel_dev_layer={stel_dev_layer}',
+    'Upgrade-Insecure-Requests': '1',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'none',
+    'Sec-Fetch-User': '?1',
+    'Cache-Control': 'max-age=0',
+    'TE': 'trailers',
+}
 
 logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
@@ -275,7 +293,22 @@ def is_trackable_content_type(content_type) -> bool:
     return False
 
 
-async def crawl(url: str, session: aiohttp.ClientSession):
+class ServerSideError(Exception):
+    pass
+
+
+async def crawl(url: str, session: aiohttp.ClientSession):
+    ok = False
+    while not ok:
+        try:
+            await _crawl(url, session)
+            ok = True
+        except (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
+            logger.warning(f'Client or timeout error. Retrying {url}')
+            VISITED_LINKS.remove(url)
+
+
+async def _crawl(url: str, session: aiohttp.ClientSession):
     if url in VISITED_LINKS:
         return
     VISITED_LINKS.add(url)
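The link-list crawler gets the same loop-based retry, but with one extra step: _crawl marks the URL as visited before fetching, so the wrapper has to drop it from VISITED_LINKS again after a failure, otherwise the next attempt would bail out as already visited. A self-contained sketch of that interaction follows; the session argument and the actual fetching are omitted to keep it short.

    import asyncio
    import logging

    logger = logging.getLogger(__name__)

    VISITED_LINKS: set[str] = set()

    class ServerSideError(Exception):
        """Signals a 5XX answer that should be retried."""

    async def crawl(url: str) -> None:
        ok = False
        while not ok:
            try:
                await _crawl(url)
                ok = True
            except (ServerSideError, asyncio.TimeoutError, ConnectionError):
                logger.warning(f'Client or timeout error. Retrying {url}')
                # Un-mark the URL so the retry is not skipped as already visited.
                VISITED_LINKS.remove(url)

    async def _crawl(url: str) -> None:
        if url in VISITED_LINKS:
            return
        VISITED_LINKS.add(url)
        # ... fetch and parse the page here ...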
@@ -288,7 +321,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         if response.status // 100 == 5:
             VISITED_LINKS.remove(url)
             logger.warning(f'Error 5XX. Retrying {url}')
-            return await crawl(url, session)
+            raise ServerSideError()
 
         if response.status not in {200, 304}:
             if response.status != 302:
@@ -327,15 +360,10 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
     # except ClientConnectorError:
     #     logger.warning(f'Wrong link: {url}')
-    except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
-        logger.warning(f'Client or timeout error. Retrying {url}')
-        VISITED_LINKS.remove(url)
-        # sleep + count of attempts?
-        await crawl(url, session)
 
 
 async def start(url_list: set[str]):
-    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
+    async with aiohttp.ClientSession(connector=CONNECTOR, headers=HEADERS) as session:
         await asyncio.gather(*[crawl(url, session) for url in url_list])
 
 
@@ -2,4 +2,5 @@ aiohttp==3.7.4.post0
 aiodns==3.0.0
 aiofiles==0.6.0
 Pyrogram==2.0.19
 TgCrypto==1.2.3
+# uvloop==0.16.0