mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2025-01-05 18:38:19 +01:00
Maybe a speedup, maybe not
This commit is contained in:
parent
a845c8446c
commit
d0ab3c176a
2 changed files with 78 additions and 50 deletions
|
@ -4,6 +4,7 @@ import os
|
|||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import zipfile
|
||||
import hashlib
|
||||
from asyncio.exceptions import TimeoutError
|
||||
|
@ -57,8 +58,10 @@ async def download_file(url, path, session):
|
|||
if response.status != 200:
|
||||
return
|
||||
|
||||
async with aiofiles.open(path, mode='wb') as f:
|
||||
await f.write(await response.read())
|
||||
content = await response.read()
|
||||
|
||||
async with aiofiles.open(path, mode='wb') as f:
|
||||
await f.write(content)
|
||||
|
||||
|
||||
async def get_download_link_of_latest_appcenter_release(parameterized_url: str, session: aiohttp.ClientSession):
|
||||
|
@ -88,25 +91,25 @@ async def get_download_link_of_latest_appcenter_release(parameterized_url: str,
|
|||
async def track_additional_files(
|
||||
files_to_track: List[str], input_dir_name: str, output_dir_name: str, encoding='utf-8', save_hash_only=False
|
||||
):
|
||||
kwargs = {'mode': 'r', 'encoding': encoding}
|
||||
if save_hash_only:
|
||||
kwargs['mode'] = 'rb'
|
||||
del kwargs['encoding']
|
||||
|
||||
for file in files_to_track:
|
||||
async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
|
||||
content = await r_file.read()
|
||||
|
||||
if save_hash_only:
|
||||
content = get_hash(content)
|
||||
else:
|
||||
content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
|
||||
content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
|
||||
|
||||
filename = os.path.join(OUTPUT_FOLDER, output_dir_name, file)
|
||||
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
||||
async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
|
||||
kwargs = {'mode': 'r', 'encoding': encoding}
|
||||
if save_hash_only:
|
||||
kwargs['mode'] = 'rb'
|
||||
del kwargs['encoding']
|
||||
|
||||
async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
|
||||
content = await r_file.read()
|
||||
|
||||
if save_hash_only:
|
||||
await w_file.write(get_hash(content))
|
||||
continue
|
||||
|
||||
content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
|
||||
content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
|
||||
await w_file.write(content)
|
||||
await w_file.write(content)
|
||||
|
||||
|
||||
async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
|
||||
|
@ -120,7 +123,20 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.Cl
|
|||
client_folder_name = 'macos'
|
||||
client_archive_name = 'macos.zip'
|
||||
|
||||
await download_file(download_url, client_archive_name, session)
|
||||
assets_output_dir = 'macos_assets'
|
||||
assets_filename = 'Assets.car'
|
||||
assets_extractor = 'acextract'
|
||||
tool_archive_name = f'{assets_extractor}.zip'
|
||||
|
||||
tool_download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
|
||||
|
||||
if 'darwin' not in platform.system().lower():
|
||||
await download_file(download_url, client_archive_name, session)
|
||||
else:
|
||||
await asyncio.gather(
|
||||
download_file(download_url, client_archive_name, session),
|
||||
download_file(tool_download_url, tool_archive_name, session),
|
||||
)
|
||||
|
||||
# synced
|
||||
with zipfile.ZipFile(client_archive_name, 'r') as f:
|
||||
|
@ -145,14 +161,6 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.Cl
|
|||
cleanup1()
|
||||
return
|
||||
|
||||
assets_output_dir = 'macos_assets'
|
||||
assets_extractor = 'acextract'
|
||||
assets_filename = 'Assets.car'
|
||||
tool_archive_name = f'{assets_extractor}.zip'
|
||||
|
||||
download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
|
||||
await download_file(download_url, tool_archive_name, session)
|
||||
|
||||
# synced
|
||||
with zipfile.ZipFile(tool_archive_name, 'r') as f:
|
||||
f.extractall(assets_extractor)
|
||||
|
@ -191,8 +199,10 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.
|
|||
if not download_url:
|
||||
return
|
||||
|
||||
await download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session)
|
||||
await download_file(download_url, 'android.apk', session)
|
||||
await asyncio.gather(
|
||||
download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session),
|
||||
download_file(download_url, 'android.apk', session),
|
||||
)
|
||||
|
||||
def cleanup():
|
||||
os.path.isdir('android') and shutil.rmtree('android')
|
||||
|
@ -224,17 +234,20 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.Clie
|
|||
data = {'offset': offset, 'more': 1}
|
||||
|
||||
try:
|
||||
new_offset = None
|
||||
async with session.post(
|
||||
f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
logger.debug(f'Resend cuz {response.status}')
|
||||
return await _get_page(offset)
|
||||
new_offset = offset
|
||||
else:
|
||||
json = await response.json(encoding='UTF-8')
|
||||
if 'more_html' in json and json['more_html']:
|
||||
content.append(json['more_html'])
|
||||
new_offset = offset + 200
|
||||
|
||||
json = await response.json(encoding='UTF-8')
|
||||
if 'more_html' in json and json['more_html']:
|
||||
content.append(json['more_html'])
|
||||
await _get_page(offset + 200)
|
||||
new_offset and await _get_page(new_offset)
|
||||
except (TimeoutError, ClientConnectorError):
|
||||
logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
|
||||
await _get_page(offset)
|
||||
|
@ -261,13 +274,18 @@ def is_hashable_only_content_type(content_type) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
class RetryError(Exception):
|
||||
...
|
||||
|
||||
|
||||
async def crawl(url: str, session: aiohttp.ClientSession):
|
||||
try:
|
||||
logger.info(f'Process {url}')
|
||||
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
|
||||
if response.status // 100 == 5:
|
||||
logger.warning(f'Error 5XX. Retrying {url}')
|
||||
return await crawl(url, session)
|
||||
msg = f'Error 5XX. Retrying {url}'
|
||||
logger.warning(msg)
|
||||
raise RetryError(msg)
|
||||
|
||||
if response.status not in {200, 304}:
|
||||
if response.status != 302:
|
||||
|
@ -306,16 +324,16 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
|
||||
content = await collect_translations_paginated_content(url, session)
|
||||
|
||||
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
|
||||
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
|
||||
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
|
||||
content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
|
||||
content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
|
||||
content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
|
||||
content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
|
||||
content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
|
||||
content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
|
||||
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
|
||||
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
|
||||
content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
|
||||
content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
|
||||
content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
|
||||
content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
|
||||
content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
|
||||
content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
|
||||
|
||||
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
|
||||
logger.info(f'Write to {filename}')
|
||||
await f.write(content)
|
||||
except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
|
||||
|
@ -323,21 +341,25 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
await crawl(url, session)
|
||||
|
||||
|
||||
async def start(url_list: set[str]):
|
||||
async def start(url_list: set[str], mode: int):
|
||||
async with aiohttp.ClientSession(connector=CONNECTOR) as session:
|
||||
await asyncio.gather(
|
||||
mode == 0 and await asyncio.gather(
|
||||
*[crawl(url, session) for url in url_list],
|
||||
# yes, these will be called on every run -- so what? ;d
|
||||
download_telegram_android_beta_and_extract_resources(session),
|
||||
download_telegram_macos_beta_and_extract_resources(session),
|
||||
)
|
||||
mode == 1 and await asyncio.gather(*[crawl(url, session) for url in url_list])
|
||||
mode == 2 and await download_telegram_android_beta_and_extract_resources(session)
|
||||
mode == 3 and await download_telegram_macos_beta_and_extract_resources(session)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
run_mode = int(sys.argv[1]) if len(sys.argv) > 1 else 0
|
||||
|
||||
with open(INPUT_FILENAME, 'r') as f:
|
||||
tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
|
||||
|
||||
logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
|
||||
start_time = time()
|
||||
asyncio.get_event_loop().run_until_complete(start(tracked_urls))
|
||||
logger.info(f'Stop crawling content. {time() - start_time} sec.')
|
||||
logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
|
||||
asyncio.get_event_loop().run_until_complete(start(tracked_urls, run_mode))
|
||||
logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.')
|
||||
|
|
6
make_files_tree.sh
Normal file
6
make_files_tree.sh
Normal file
|
@ -0,0 +1,6 @@
|
|||
#!/bin/bash
|
||||
|
||||
python make_files_tree.py > /dev/null 1 &
|
||||
python make_files_tree.py > /dev/null 2 &
|
||||
python make_files_tree.py > /dev/null 3 &
|
||||
wait
|
Loading…
Reference in a new issue