mb speedup, mb not

Il'ya (Marshal) 2022-04-24 08:45:20 +02:00
parent a845c8446c
commit d0ab3c176a
2 changed files with 78 additions and 50 deletions

make_files_tree.py

@@ -4,6 +4,7 @@ import os
 import platform
 import re
 import shutil
+import sys
 import zipfile
 import hashlib
 from asyncio.exceptions import TimeoutError
@@ -57,8 +58,10 @@ async def download_file(url, path, session):
         if response.status != 200:
             return

-        async with aiofiles.open(path, mode='wb') as f:
-            await f.write(await response.read())
+        content = await response.read()
+
+        async with aiofiles.open(path, mode='wb') as f:
+            await f.write(content)


 async def get_download_link_of_latest_appcenter_release(parameterized_url: str, session: aiohttp.ClientSession):
@@ -88,25 +91,25 @@ async def get_download_link_of_latest_appcenter_release(parameterized_url: str,
 async def track_additional_files(
         files_to_track: List[str], input_dir_name: str, output_dir_name: str, encoding='utf-8', save_hash_only=False
 ):
+    kwargs = {'mode': 'r', 'encoding': encoding}
+    if save_hash_only:
+        kwargs['mode'] = 'rb'
+        del kwargs['encoding']
+
     for file in files_to_track:
+        async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
+            content = await r_file.read()
+
+        if save_hash_only:
+            content = get_hash(content)
+        else:
+            content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
+            content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
+
         filename = os.path.join(OUTPUT_FOLDER, output_dir_name, file)
         os.makedirs(os.path.dirname(filename), exist_ok=True)

         async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
-            kwargs = {'mode': 'r', 'encoding': encoding}
-            if save_hash_only:
-                kwargs['mode'] = 'rb'
-                del kwargs['encoding']
-
-            async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
-                content = await r_file.read()
-
-            if save_hash_only:
-                await w_file.write(get_hash(content))
-                continue
-
-            content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
-            content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
-
             await w_file.write(content)


 async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
@@ -120,7 +123,20 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
     client_folder_name = 'macos'
     client_archive_name = 'macos.zip'

-    await download_file(download_url, client_archive_name, session)
+    assets_output_dir = 'macos_assets'
+    assets_filename = 'Assets.car'
+    assets_extractor = 'acextract'
+    tool_archive_name = f'{assets_extractor}.zip'
+    tool_download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
+
+    if 'darwin' not in platform.system().lower():
+        await download_file(download_url, client_archive_name, session)
+    else:
+        await asyncio.gather(
+            download_file(download_url, client_archive_name, session),
+            download_file(tool_download_url, tool_archive_name, session),
+        )

     # synced
     with zipfile.ZipFile(client_archive_name, 'r') as f:
@@ -145,14 +161,6 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
         cleanup1()
         return

-    assets_output_dir = 'macos_assets'
-    assets_extractor = 'acextract'
-    assets_filename = 'Assets.car'
-    tool_archive_name = f'{assets_extractor}.zip'
-    download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
-    await download_file(download_url, tool_archive_name, session)
-
     # synced
     with zipfile.ZipFile(tool_archive_name, 'r') as f:
         f.extractall(assets_extractor)
@@ -191,8 +199,10 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession):
     if not download_url:
         return

-    await download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session)
-    await download_file(download_url, 'android.apk', session)
+    await asyncio.gather(
+        download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session),
+        download_file(download_url, 'android.apk', session),
+    )

     def cleanup():
         os.path.isdir('android') and shutil.rmtree('android')
@@ -224,17 +234,20 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession):
         data = {'offset': offset, 'more': 1}

         try:
+            new_offset = None
+
             async with session.post(
                     f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
             ) as response:
                 if response.status != 200:
                     logger.debug(f'Resend cuz {response.status}')
-                    return await _get_page(offset)
+                    new_offset = offset
+                else:
+                    json = await response.json(encoding='UTF-8')
+                    if 'more_html' in json and json['more_html']:
+                        content.append(json['more_html'])
+                        new_offset = offset + 200

-                json = await response.json(encoding='UTF-8')
-                if 'more_html' in json and json['more_html']:
-                    content.append(json['more_html'])
-                    await _get_page(offset + 200)
+            new_offset and await _get_page(new_offset)
         except (TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
             await _get_page(offset)
@@ -261,13 +274,18 @@ def is_hashable_only_content_type(content_type) -> bool:
     return False


+class RetryError(Exception):
+    ...
+
+
 async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
             if response.status // 100 == 5:
-                logger.warning(f'Error 5XX. Retrying {url}')
-                return await crawl(url, session)
+                msg = f'Error 5XX. Retrying {url}'
+                logger.warning(msg)
+                raise RetryError(msg)

             if response.status not in {200, 304}:
                 if response.status != 302:
@@ -306,16 +324,16 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
                 content = await collect_translations_paginated_content(url, session)

+            content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
+            content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
+            content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
+            content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
+            content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
+            content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
+            content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
+            content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
+
             async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
-                content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
-                content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
-                content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
-                content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
-                content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
-                content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
-                content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
-                content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
-
                 logger.info(f'Write to {filename}')
                 await f.write(content)
         except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
@@ -323,21 +341,25 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         await crawl(url, session)


-async def start(url_list: set[str]):
+async def start(url_list: set[str], mode: int):
     async with aiohttp.ClientSession(connector=CONNECTOR) as session:
-        await asyncio.gather(
+        mode == 0 and await asyncio.gather(
             *[crawl(url, session) for url in url_list],
-            # yeap it will be called each run, and what? ;d
             download_telegram_android_beta_and_extract_resources(session),
             download_telegram_macos_beta_and_extract_resources(session),
         )
+        mode == 1 and await asyncio.gather(*[crawl(url, session) for url in url_list])
+        mode == 2 and await download_telegram_android_beta_and_extract_resources(session)
+        mode == 3 and await download_telegram_macos_beta_and_extract_resources(session)


 if __name__ == '__main__':
+    run_mode = int(sys.argv[1]) if len(sys.argv) > 1 else 0
+
     with open(INPUT_FILENAME, 'r') as f:
         tracked_urls = set([l.replace('\n', '') for l in f.readlines()])

-    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
     start_time = time()
-    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
-    logger.info(f'Stop crawling content. {time() - start_time} sec.')
+    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
+    asyncio.get_event_loop().run_until_complete(start(tracked_urls, run_mode))
+    logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.')
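
The recurring change in this file is replacing back-to-back awaits with asyncio.gather so that independent downloads overlap, and reading a response before opening the output file so handles stay open only as long as needed. A minimal, self-contained sketch of that download pattern, assuming aiohttp and aiofiles are installed; the URLs and file names are placeholders, not values from the repository:

import asyncio

import aiofiles
import aiohttp


async def download_file(url, path, session):
    # read the body first, then write it out asynchronously
    async with session.get(url) as response:
        if response.status != 200:
            return
        content = await response.read()

    async with aiofiles.open(path, mode='wb') as f:
        await f.write(content)


async def main():
    async with aiohttp.ClientSession() as session:
        # both downloads are awaited together instead of one after the other
        await asyncio.gather(
            download_file('https://example.org/a.zip', 'a.zip', session),
            download_file('https://example.org/b.zip', 'b.zip', session),
        )


if __name__ == '__main__':
    asyncio.run(main())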

make_files_tree.sh (new file, 6 additions)

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+python make_files_tree.py > /dev/null 1 &
+python make_files_tree.py > /dev/null 2 &
+python make_files_tree.py > /dev/null 3 &
+wait
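
The wrapper fans the work out across three background interpreter processes, one per mode: bash applies the `> /dev/null` redirection wherever it appears on the line, so each command effectively runs `python make_files_tree.py <mode>` with stdout discarded, and `wait` blocks until all three workers exit. Inside start, the mode dispatch relies on `and` short-circuiting rather than if blocks. A small, self-contained sketch of that idiom in isolation; the task names here are illustrative, not from the repository:

import asyncio


async def task(name):
    print(f'running {name}')


async def dispatch(mode: int):
    # `and` short-circuits: the coroutine on the right is only created
    # and awaited when the comparison on the left is true
    mode == 0 and await task('crawl + android + macos')
    mode == 1 and await task('crawl only')
    mode == 2 and await task('android resources only')
    mode == 3 and await task('macos resources only')


asyncio.run(dispatch(2))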