mb speedup, mb not

Il'ya (Marshal) 2022-04-24 08:45:20 +02:00
parent a845c8446c
commit d0ab3c176a
2 changed files with 78 additions and 50 deletions

make_files_tree.py

@@ -4,6 +4,7 @@ import os
 import platform
 import re
 import shutil
+import sys
 import zipfile
 import hashlib
 from asyncio.exceptions import TimeoutError
@@ -57,8 +58,10 @@ async def download_file(url, path, session):
         if response.status != 200:
             return

-        async with aiofiles.open(path, mode='wb') as f:
-            await f.write(await response.read())
+        content = await response.read()
+
+        async with aiofiles.open(path, mode='wb') as f:
+            await f.write(content)


 async def get_download_link_of_latest_appcenter_release(parameterized_url: str, session: aiohttp.ClientSession):
@@ -88,25 +91,25 @@ async def get_download_link_of_latest_appcenter_release(parameterized_url: str,
 async def track_additional_files(
         files_to_track: List[str], input_dir_name: str, output_dir_name: str, encoding='utf-8', save_hash_only=False
 ):
+    kwargs = {'mode': 'r', 'encoding': encoding}
+    if save_hash_only:
+        kwargs['mode'] = 'rb'
+        del kwargs['encoding']
+
     for file in files_to_track:
+        async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
+            content = await r_file.read()
+
+        if save_hash_only:
+            content = get_hash(content)
+        else:
+            content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
+            content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
+
         filename = os.path.join(OUTPUT_FOLDER, output_dir_name, file)
         os.makedirs(os.path.dirname(filename), exist_ok=True)

         async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
-            kwargs = {'mode': 'r', 'encoding': encoding}
-            if save_hash_only:
-                kwargs['mode'] = 'rb'
-                del kwargs['encoding']
-
-            async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
-                content = await r_file.read()
-
-            if save_hash_only:
-                await w_file.write(get_hash(content))
-                continue
-
-            content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
-            content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
-
             await w_file.write(content)


 async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
@@ -120,7 +123,20 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
     client_folder_name = 'macos'
     client_archive_name = 'macos.zip'

-    await download_file(download_url, client_archive_name, session)
+    assets_output_dir = 'macos_assets'
+    assets_filename = 'Assets.car'
+    assets_extractor = 'acextract'
+    tool_archive_name = f'{assets_extractor}.zip'
+    tool_download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
+
+    if 'darwin' not in platform.system().lower():
+        await download_file(download_url, client_archive_name, session)
+    else:
+        await asyncio.gather(
+            download_file(download_url, client_archive_name, session),
+            download_file(tool_download_url, tool_archive_name, session),
+        )

     # synced
     with zipfile.ZipFile(client_archive_name, 'r') as f:
@@ -145,14 +161,6 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
         cleanup1()
         return

-    assets_output_dir = 'macos_assets'
-    assets_extractor = 'acextract'
-    assets_filename = 'Assets.car'
-    tool_archive_name = f'{assets_extractor}.zip'
-    download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
-    await download_file(download_url, tool_archive_name, session)
-
     # synced
     with zipfile.ZipFile(tool_archive_name, 'r') as f:
         f.extractall(assets_extractor)
@@ -191,8 +199,10 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession):
     if not download_url:
         return

-    await download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session)
-    await download_file(download_url, 'android.apk', session)
+    await asyncio.gather(
+        download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session),
+        download_file(download_url, 'android.apk', session),
+    )

     def cleanup():
         os.path.isdir('android') and shutil.rmtree('android')
@@ -224,17 +234,20 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession):
         data = {'offset': offset, 'more': 1}

         try:
+            new_offset = None
+
             async with session.post(
                     f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
             ) as response:
                 if response.status != 200:
                     logger.debug(f'Resend cuz {response.status}')
-                    return await _get_page(offset)
+                    new_offset = offset
+                else:
+                    json = await response.json(encoding='UTF-8')
+                    if 'more_html' in json and json['more_html']:
+                        content.append(json['more_html'])
+                        new_offset = offset + 200

-                json = await response.json(encoding='UTF-8')
-                if 'more_html' in json and json['more_html']:
-                    content.append(json['more_html'])
-                    await _get_page(offset + 200)
+            new_offset and await _get_page(new_offset)
         except (TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
             await _get_page(offset)
@@ -261,13 +274,18 @@ def is_hashable_only_content_type(content_type) -> bool:
     return False


+class RetryError(Exception):
+    ...
+
+
 async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
             if response.status // 100 == 5:
-                logger.warning(f'Error 5XX. Retrying {url}')
-                return await crawl(url, session)
+                msg = f'Error 5XX. Retrying {url}'
+                logger.warning(msg)
+                raise RetryError(msg)

             if response.status not in {200, 304}:
                 if response.status != 302:
@@ -306,16 +324,16 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
                 content = await collect_translations_paginated_content(url, session)

+            content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
+            content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
+            content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
+            content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
+            content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
+            content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
+            content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
+            content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
+
             async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
-                content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
-                content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
-                content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
-                content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
-                content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
-                content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
-                content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
-                content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
-
                 logger.info(f'Write to {filename}')
                 await f.write(content)
         except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
@@ -323,21 +341,25 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         await crawl(url, session)


-async def start(url_list: set[str]):
+async def start(url_list: set[str], mode: int):
     async with aiohttp.ClientSession(connector=CONNECTOR) as session:
-        await asyncio.gather(
+        mode == 0 and await asyncio.gather(
             *[crawl(url, session) for url in url_list],
-            # yeap it will be called each run, and what? ;d
             download_telegram_android_beta_and_extract_resources(session),
             download_telegram_macos_beta_and_extract_resources(session),
         )
+        mode == 1 and await asyncio.gather(*[crawl(url, session) for url in url_list])
+        mode == 2 and await download_telegram_android_beta_and_extract_resources(session)
+        mode == 3 and await download_telegram_macos_beta_and_extract_resources(session)


 if __name__ == '__main__':
+    run_mode = int(sys.argv[1]) if len(sys.argv) > 1 else 0
+
     with open(INPUT_FILENAME, 'r') as f:
         tracked_urls = set([l.replace('\n', '') for l in f.readlines()])

-    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
     start_time = time()
-    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
-    logger.info(f'Stop crawling content. {time() - start_time} sec.')
+    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
+    asyncio.get_event_loop().run_until_complete(start(tracked_urls, run_mode))
+    logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.')
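
The recurring change in this file is replacing back-to-back awaits with asyncio.gather so that independent downloads overlap, and reading a response before opening the output file so handles stay open only as long as needed. A minimal, self-contained sketch of that download pattern, assuming aiohttp and aiofiles are installed; the URLs and file names are placeholders, not values from the repository:

import asyncio

import aiofiles
import aiohttp


async def download_file(url, path, session):
    # read the body first, then write it out asynchronously
    async with session.get(url) as response:
        if response.status != 200:
            return
        content = await response.read()

    async with aiofiles.open(path, mode='wb') as f:
        await f.write(content)


async def main():
    async with aiohttp.ClientSession() as session:
        # both downloads are awaited together instead of one after the other
        await asyncio.gather(
            download_file('https://example.org/a.zip', 'a.zip', session),
            download_file('https://example.org/b.zip', 'b.zip', session),
        )


if __name__ == '__main__':
    asyncio.run(main())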

make_files_tree.sh (new file, 6 additions)

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+python make_files_tree.py > /dev/null 1 &
+python make_files_tree.py > /dev/null 2 &
+python make_files_tree.py > /dev/null 3 &
+wait
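
The wrapper fans the work out across three background interpreter processes, one per mode: bash applies the `> /dev/null` redirection wherever it appears on the line, so each command effectively runs `python make_files_tree.py <mode>` with stdout discarded, and `wait` blocks until all three workers exit. Inside start, the mode dispatch relies on `and` short-circuiting rather than if blocks. A small, self-contained sketch of that idiom in isolation; the task names here are illustrative, not from the repository:

import asyncio


async def task(name):
    print(f'running {name}')


async def dispatch(mode: int):
    # `and` short-circuits: the coroutine on the right is only created
    # and awaited when the comparison on the left is true
    mode == 0 and await task('crawl + android + macos')
    mode == 1 and await task('crawl only')
    mode == 2 and await task('android resources only')
    mode == 3 and await task('macos resources only')


asyncio.run(dispatch(2))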