mb speedup, mb not

Il'ya (Marshal) 2022-04-24 08:45:20 +02:00
parent a845c8446c
commit d0ab3c176a
2 changed files with 78 additions and 50 deletions

make_files_tree.py

@@ -4,6 +4,7 @@ import os
import platform
import re
import shutil
import sys
import zipfile
import hashlib
from asyncio.exceptions import TimeoutError
@@ -57,8 +58,10 @@ async def download_file(url, path, session):
if response.status != 200:
return
async with aiofiles.open(path, mode='wb') as f:
await f.write(await response.read())
content = await response.read()
async with aiofiles.open(path, mode='wb') as f:
await f.write(content)
async def get_download_link_of_latest_appcenter_release(parameterized_url: str, session: aiohttp.ClientSession):
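Editor's note: the download_file change above splits the work into "fetch, then write" — the body is read into memory first and only then is the output file opened, so the file handle is not held across the network read. A minimal sketch of the resulting shape; the session.get line is assumed from the unchanged code above the hunk, and since the flattened diff does not show whether the final write sits inside or after the response block, the sketch puts it after:

import aiofiles
import aiohttp


async def download_file(url, path, session: aiohttp.ClientSession):
    async with session.get(url) as response:
        if response.status != 200:
            return
        # Buffer the whole body while the response is open...
        content = await response.read()

    # ...then open the output file only for the actual write.
    async with aiofiles.open(path, mode='wb') as f:
        await f.write(content)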
@@ -88,25 +91,25 @@ async def get_download_link_of_latest_appcenter_release(parameterized_url: str, session: aiohttp.ClientSession):
async def track_additional_files(
files_to_track: List[str], input_dir_name: str, output_dir_name: str, encoding='utf-8', save_hash_only=False
):
kwargs = {'mode': 'r', 'encoding': encoding}
if save_hash_only:
kwargs['mode'] = 'rb'
del kwargs['encoding']
for file in files_to_track:
async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
content = await r_file.read()
if save_hash_only:
content = get_hash(content)
else:
content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
filename = os.path.join(OUTPUT_FOLDER, output_dir_name, file)
os.makedirs(os.path.dirname(filename), exist_ok=True)
async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
kwargs = {'mode': 'r', 'encoding': encoding}
if save_hash_only:
kwargs['mode'] = 'rb'
del kwargs['encoding']
async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
content = await r_file.read()
if save_hash_only:
await w_file.write(get_hash(content))
continue
content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
await w_file.write(content)
await w_file.write(content)
async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
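Editor's note: in track_additional_files the per-file work is regrouped — the open-mode kwargs are now built inside the loop, and the hash / placeholder-rewrite logic moved into the writer block with an early continue, so each iteration reads once and writes once. A condensed per-file sketch of that flow; track_file is an illustrative name and get_hash here is only a stand-in for the module's existing helper (the real hashing algorithm is not shown in this diff):

import hashlib
import os
import re

import aiofiles


def get_hash(data: bytes) -> str:
    # Placeholder for the module's own helper; the exact algorithm is an assumption.
    return hashlib.sha256(data).hexdigest()


async def track_file(input_path, output_path, save_hash_only=False, encoding='utf-8'):
    # Binary read when only a hash is kept, text read otherwise.
    kwargs = {'mode': 'rb'} if save_hash_only else {'mode': 'r', 'encoding': encoding}

    async with aiofiles.open(input_path, **kwargs) as r_file:
        content = await r_file.read()

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    async with aiofiles.open(output_path, 'w', encoding='utf-8') as w_file:
        if save_hash_only:
            await w_file.write(get_hash(content))
            return

        # Normalise volatile apktool identifiers before tracking the file.
        content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
        content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
        await w_file.write(content)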
@@ -120,7 +123,20 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
client_folder_name = 'macos'
client_archive_name = 'macos.zip'
await download_file(download_url, client_archive_name, session)
assets_output_dir = 'macos_assets'
assets_filename = 'Assets.car'
assets_extractor = 'acextract'
tool_archive_name = f'{assets_extractor}.zip'
tool_download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
if 'darwin' not in platform.system().lower():
await download_file(download_url, client_archive_name, session)
else:
await asyncio.gather(
download_file(download_url, client_archive_name, session),
download_file(tool_download_url, tool_archive_name, session),
)
# synced
with zipfile.ZipFile(client_archive_name, 'r') as f:
@@ -145,14 +161,6 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
cleanup1()
return
assets_output_dir = 'macos_assets'
assets_extractor = 'acextract'
assets_filename = 'Assets.car'
tool_archive_name = f'{assets_extractor}.zip'
download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
await download_file(download_url, tool_archive_name, session)
# synced
with zipfile.ZipFile(tool_archive_name, 'r') as f:
f.extractall(assets_extractor)
@@ -191,8 +199,10 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.
if not download_url:
return
await download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session)
await download_file(download_url, 'android.apk', session)
await asyncio.gather(
download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session),
download_file(download_url, 'android.apk', session),
)
def cleanup():
os.path.isdir('android') and shutil.rmtree('android')
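Editor's note: both the Android and the macOS paths now start their two downloads with asyncio.gather instead of awaiting them back to back, which (together with the new shell wrapper at the end of the commit) is where the hoped-for speedup would come from: apktool and the APK, or the client archive and acextract on a macOS host, are fetched concurrently over the same session. A self-contained sketch of the pattern; fetch_size is an illustrative name and the example.com URLs are placeholders:

import asyncio

import aiohttp


async def fetch_size(url, session):
    # Stand-in for download_file: pull the body and report its length.
    async with session.get(url) as response:
        return len(await response.read())


async def main():
    async with aiohttp.ClientSession() as session:
        # Both requests are in flight at the same time, so total wall time
        # is roughly the slower of the two instead of their sum.
        sizes = await asyncio.gather(
            fetch_size('https://example.com/tool.jar', session),
            fetch_size('https://example.com/app.apk', session),
        )
        print(sizes)


asyncio.run(main())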
@@ -224,17 +234,20 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.Clie
data = {'offset': offset, 'more': 1}
try:
new_offset = None
async with session.post(
f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
) as response:
if response.status != 200:
logger.debug(f'Resend cuz {response.status}')
return await _get_page(offset)
new_offset = offset
else:
json = await response.json(encoding='UTF-8')
if 'more_html' in json and json['more_html']:
content.append(json['more_html'])
new_offset = offset + 200
json = await response.json(encoding='UTF-8')
if 'more_html' in json and json['more_html']:
content.append(json['more_html'])
await _get_page(offset + 200)
new_offset and await _get_page(new_offset)
except (TimeoutError, ClientConnectorError):
logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
await _get_page(offset)
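Editor's note: the pagination helper no longer recurses into _get_page while the POST response is still open; it records new_offset inside the async with block and issues the follow-up request only after the block has exited, so the pooled connection is released before the next page is fetched. The trailing "new_offset and await _get_page(new_offset)" is a short-circuit way of saying "only recurse if there is a next step". A toy, runnable illustration of that control flow; fake_response and get_page are stand-ins, not the crawler's own code:

import asyncio
import contextlib


@contextlib.asynccontextmanager
async def fake_response(offset):
    # Stands in for "async with session.post(...) as response".
    yield {'more_html': f'chunk@{offset}'} if offset < 400 else {}


async def get_page(offset, content):
    new_offset = None
    async with fake_response(offset) as payload:
        if payload.get('more_html'):
            content.append(payload['more_html'])
            new_offset = offset + 200

    # Recurse only after the response context has closed.
    if new_offset is not None:
        await get_page(new_offset, content)


async def main():
    content = []
    await get_page(0, content)
    print(content)  # ['chunk@0', 'chunk@200']


asyncio.run(main())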
@@ -261,13 +274,18 @@ def is_hashable_only_content_type(content_type) -> bool:
return False
class RetryError(Exception):
...
async def crawl(url: str, session: aiohttp.ClientSession):
try:
logger.info(f'Process {url}')
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
if response.status // 100 == 5:
logger.warning(f'Error 5XX. Retrying {url}')
return await crawl(url, session)
msg = f'Error 5XX. Retrying {url}'
logger.warning(msg)
raise RetryError(msg)
if response.status not in {200, 304}:
if response.status != 302:
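Editor's note: a 5XX response in crawl no longer triggers an immediate recursive call while the failing response is still open; the new RetryError is raised instead, which unwinds the async with response block before any retry happens. Where RetryError is caught is not visible in the shown hunks (presumably the existing retry handling further down was extended), so the toy below only illustrates the raise-and-retry shape, not the crawler itself:

import asyncio


class RetryError(Exception):
    ...


async def crawl(url, attempt=1):
    try:
        # Stand-in for "async with session.get(...) as response": fail once, then succeed.
        status = 500 if attempt == 1 else 200
        if status // 100 == 5:
            raise RetryError(f'Error 5XX. Retrying {url}')
        return status
    except RetryError:
        # The retry starts only after the original request context would have closed.
        return await crawl(url, attempt + 1)


print(asyncio.run(crawl('telegram.org')))  # 200 on the second attempt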
@@ -306,16 +324,16 @@ async def crawl(url: str, session: aiohttp.ClientSession):
if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
content = await collect_translations_paginated_content(url, session)
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
logger.info(f'Write to {filename}')
await f.write(content)
except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
@@ -323,21 +341,25 @@ async def crawl(url: str, session: aiohttp.ClientSession):
await crawl(url, session)
async def start(url_list: set[str]):
async def start(url_list: set[str], mode: int):
async with aiohttp.ClientSession(connector=CONNECTOR) as session:
await asyncio.gather(
mode == 0 and await asyncio.gather(
*[crawl(url, session) for url in url_list],
# yeap it will be called each run, and what? ;d
download_telegram_android_beta_and_extract_resources(session),
download_telegram_macos_beta_and_extract_resources(session),
)
mode == 1 and await asyncio.gather(*[crawl(url, session) for url in url_list])
mode == 2 and await download_telegram_android_beta_and_extract_resources(session)
mode == 3 and await download_telegram_macos_beta_and_extract_resources(session)
if __name__ == '__main__':
run_mode = int(sys.argv[1]) if len(sys.argv) > 1 else 0
with open(INPUT_FILENAME, 'r') as f:
tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
start_time = time()
asyncio.get_event_loop().run_until_complete(start(tracked_urls))
logger.info(f'Stop crawling content. {time() - start_time} sec.')
logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
asyncio.get_event_loop().run_until_complete(start(tracked_urls, run_mode))
logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.')
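Editor's note: start now takes a mode so that a single process can do just one slice of the work — 0 keeps the old behaviour (crawl plus both resource downloads, gathered together), 1 only crawls the tracked URLs, 2 only handles the Android beta, 3 only the macOS beta — and __main__ reads the mode from sys.argv with 0 as the default. The "mode == N and await ..." lines are short-circuit one-liners: the await runs only when the comparison is true. A small sketch of the same dispatch written with plain if statements, with placeholder bodies standing in for the real coroutines:

import asyncio
import sys


async def start(url_list: set, mode: int):
    if mode in (0, 1):
        print(f'crawl {len(url_list)} urls')
    if mode in (0, 2):
        print('refresh android beta resources')
    if mode in (0, 3):
        print('refresh macos beta resources')


if __name__ == '__main__':
    run_mode = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    asyncio.run(start({'telegram.org'}, run_mode))

Running, say, "python make_files_tree.py 2" therefore refreshes only the Android resources.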

make_files_tree.sh (new file, 6 additions)

@@ -0,0 +1,6 @@
#!/bin/bash
python make_files_tree.py > /dev/null 1 &
python make_files_tree.py > /dev/null 2 &
python make_files_tree.py > /dev/null 3 &
wait
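Editor's note: the new make_files_tree.sh fans the work out over three background processes (modes 1, 2 and 3) and waits for all of them, so the crawl and the two resource downloads run in separate interpreters rather than in one event loop. On the argument placement: bash strips the "> /dev/null" redirection from the command line before executing it, so the trailing 1/2/3 still reaches the script as its first argument. A tiny Python check of what the script sees when launched that way (stderr is used because stdout goes to /dev/null):

# Launched as: python make_files_tree.py > /dev/null 1 &
import sys

print(sys.argv, file=sys.stderr)  # ['make_files_tree.py', '1'] -> run_mode = 1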