mirror of https://github.com/MarshalX/telegram-crawler.git, synced 2024-11-23 07:49:20 +01:00
mb speedup, mb not
parent a845c8446c
commit d0ab3c176a
2 changed files with 78 additions and 50 deletions

make_files_tree.py
(indentation-only changes are shown as unchanged context in the hunks below)

@@ -4,6 +4,7 @@ import os
 import platform
 import re
 import shutil
+import sys
 import zipfile
 import hashlib
 from asyncio.exceptions import TimeoutError
@@ -57,8 +58,10 @@ async def download_file(url, path, session):
         if response.status != 200:
             return
 
+        content = await response.read()
+
         async with aiofiles.open(path, mode='wb') as f:
-            await f.write(await response.read())
+            await f.write(content)
 
 
 async def get_download_link_of_latest_appcenter_release(parameterized_url: str, session: aiohttp.ClientSession):
@@ -88,24 +91,24 @@ async def get_download_link_of_latest_appcenter_release(parameterized_url: str, session: aiohttp.ClientSession):
 async def track_additional_files(
         files_to_track: List[str], input_dir_name: str, output_dir_name: str, encoding='utf-8', save_hash_only=False
 ):
-    for file in files_to_track:
-        filename = os.path.join(OUTPUT_FOLDER, output_dir_name, file)
-        os.makedirs(os.path.dirname(filename), exist_ok=True)
-        async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
     kwargs = {'mode': 'r', 'encoding': encoding}
     if save_hash_only:
         kwargs['mode'] = 'rb'
         del kwargs['encoding']
 
+    for file in files_to_track:
         async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file:
             content = await r_file.read()
 
         if save_hash_only:
-                await w_file.write(get_hash(content))
-                continue
+            content = get_hash(content)
+        else:
             content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
             content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)
 
+        filename = os.path.join(OUTPUT_FOLDER, output_dir_name, file)
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+        async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
             await w_file.write(content)
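
Note on this hunk: the loop now reads and prepares the content first (hash or normalized text) and only afterwards creates and opens the output file, so both branches share a single write path and the output file is no longer held open while the input is being read.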
@@ -120,7 +123,20 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
     client_folder_name = 'macos'
     client_archive_name = 'macos.zip'
 
+    assets_output_dir = 'macos_assets'
+    assets_filename = 'Assets.car'
+    assets_extractor = 'acextract'
+    tool_archive_name = f'{assets_extractor}.zip'
+
+    tool_download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
+
+    if 'darwin' not in platform.system().lower():
         await download_file(download_url, client_archive_name, session)
+    else:
+        await asyncio.gather(
+            download_file(download_url, client_archive_name, session),
+            download_file(tool_download_url, tool_archive_name, session),
+        )
 
     # synced
     with zipfile.ZipFile(client_archive_name, 'r') as f:
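
The else branch above starts both downloads concurrently with asyncio.gather; the darwin check matters because acextract runs only on macOS, so other hosts skip fetching it. A minimal, self-contained sketch of why gathering can be faster (illustrative names and simulated I/O, not code from the commit):

import asyncio
import time


async def fake_download(name: str, seconds: float) -> None:
    # stand-in for a network-bound download_file() call
    await asyncio.sleep(seconds)


async def main() -> None:
    start = time.monotonic()
    await fake_download('client', 1.0)   # back to back: the waits run in series
    await fake_download('tool', 1.0)
    print(f'sequential: {time.monotonic() - start:.1f}s')   # ~2.0s

    start = time.monotonic()
    await asyncio.gather(                # concurrent: the waits overlap
        fake_download('client', 1.0),
        fake_download('tool', 1.0),
    )
    print(f'gathered:   {time.monotonic() - start:.1f}s')   # ~1.0s


asyncio.run(main())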
@@ -145,14 +161,6 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
         cleanup1()
         return
 
-    assets_output_dir = 'macos_assets'
-    assets_extractor = 'acextract'
-    assets_filename = 'Assets.car'
-    tool_archive_name = f'{assets_extractor}.zip'
-
-    download_url = 'https://github.com/bartoszj/acextract/releases/download/2.2/acextract.zip'
-    await download_file(download_url, tool_archive_name, session)
-
     # synced
     with zipfile.ZipFile(tool_archive_name, 'r') as f:
         f.extractall(assets_extractor)
@@ -191,8 +199,10 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession):
     if not download_url:
         return
 
-    await download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session)
-    await download_file(download_url, 'android.apk', session)
+    await asyncio.gather(
+        download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session),
+        download_file(download_url, 'android.apk', session),
+    )
 
     def cleanup():
         os.path.isdir('android') and shutil.rmtree('android')
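
Same pattern as the macOS hunk above: the apktool jar and the APK are now fetched concurrently instead of one after the other.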
@@ -224,17 +234,20 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession):
         data = {'offset': offset, 'more': 1}
 
         try:
+            new_offset = None
             async with session.post(
                     f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
             ) as response:
                 if response.status != 200:
                     logger.debug(f'Resend cuz {response.status}')
-                    return await _get_page(offset)
+                    new_offset = offset
+                else:
                     json = await response.json(encoding='UTF-8')
                     if 'more_html' in json and json['more_html']:
                         content.append(json['more_html'])
-                        await _get_page(offset + 200)
+                        new_offset = offset + 200
 
+            new_offset and await _get_page(new_offset)
         except (TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
             await _get_page(offset)
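
One caveat in the rewritten retry path (an observation, not part of the commit): the new line relies on truthiness, and 0 is falsy in Python, so if the initial offset is 0, a non-200 response for the first page is silently never retried:

new_offset = 0                   # first page came back non-200
new_offset and print('retry')    # short-circuits; prints nothing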
@@ -261,13 +274,18 @@ def is_hashable_only_content_type(content_type) -> bool:
     return False
 
 
+class RetryError(Exception):
+    ...
+
+
 async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'Process {url}')
         async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
             if response.status // 100 == 5:
-                logger.warning(f'Error 5XX. Retrying {url}')
-                return await crawl(url, session)
+                msg = f'Error 5XX. Retrying {url}'
+                logger.warning(msg)
+                raise RetryError(msg)
 
             if response.status not in {200, 304}:
                 if response.status != 302:
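
crawl() no longer retries 5XX responses by calling itself; it raises RetryError instead. No handler for RetryError appears in this diff, so presumably the task is meant to fail fast and leave retrying to whatever supervises the run.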
@@ -306,7 +324,6 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
                 content = await collect_translations_paginated_content(url, session)
 
-            async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
             content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
             content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
             content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
@@ -316,6 +333,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
             content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
 
+            async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
                 logger.info(f'Write to {filename}')
                 await f.write(content)
     except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
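
Taken together, this hunk and the previous one move the aiofiles.open of the output file from before the re.sub normalization to just before the write, so the file is opened only once the final content is ready.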
@@ -323,21 +341,25 @@
         await crawl(url, session)
 
 
-async def start(url_list: set[str]):
+async def start(url_list: set[str], mode: int):
     async with aiohttp.ClientSession(connector=CONNECTOR) as session:
-        await asyncio.gather(
+        mode == 0 and await asyncio.gather(
             *[crawl(url, session) for url in url_list],
-            # yeap it will be called each run, and what? ;d
             download_telegram_android_beta_and_extract_resources(session),
             download_telegram_macos_beta_and_extract_resources(session),
         )
+        mode == 1 and await asyncio.gather(*[crawl(url, session) for url in url_list])
+        mode == 2 and await download_telegram_android_beta_and_extract_resources(session)
+        mode == 3 and await download_telegram_macos_beta_and_extract_resources(session)
 
 
 if __name__ == '__main__':
+    run_mode = int(sys.argv[1]) if len(sys.argv) > 1 else 0
+
     with open(INPUT_FILENAME, 'r') as f:
         tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
 
-    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
     start_time = time()
-    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
-    logger.info(f'Stop crawling content. {time() - start_time} sec.')
+    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
+    asyncio.get_event_loop().run_until_complete(start(tracked_urls, run_mode))
+    logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.')
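
The 'mode == N and await ...' lines dispatch on the new command-line mode via short-circuit evaluation: the right-hand side of 'and' is evaluated (and the coroutine created) only when the comparison is true. A minimal, runnable illustration of the pattern (names made up, not from the repo):

import asyncio


async def job(name: str) -> None:
    print(f'{name} ran')


async def start(mode: int) -> None:
    # only the branch whose comparison is true is awaited;
    # the other right-hand sides are never evaluated at all
    mode == 0 and await job('crawl + android + macos')
    mode == 1 and await job('crawl only')


asyncio.run(start(1))   # prints: crawl only ran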

make_files_tree.sh (new file, 6 lines)

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+python make_files_tree.py > /dev/null 1 &
+python make_files_tree.py > /dev/null 2 &
+python make_files_tree.py > /dev/null 3 &
+wait
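
A note on the redirections: bash permits a redirection before the remaining arguments, so 'python make_files_tree.py > /dev/null 1 &' passes 1 as sys.argv[1] (the run mode read in __main__ above) with stdout discarded; it is equivalent to 'python make_files_tree.py 1 > /dev/null &'. The three modes run as parallel background jobs, and wait blocks until all of them exit; presumably this is the hoped-for speedup of the commit title.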